def mode(x: pd.Series, w: Union[Window, int] = Window(None, 0)) -> pd.Series:
    """
    Most common value in series over given window

    :param x: series: timeseries
    :param w: Window or int: size of window and ramp up to use. e.g. Window(22, 10) where 22 is the window size
              and 10 the ramp up value. Window size defaults to length of series.
    :return: timeseries of mode value

    **Usage**

    Computes the `mode <https://en.wikipedia.org/wiki/Mode_(statistics)>`_ over a given window. For each window,
    this function will return the most common value of all elements in the window. If there are multiple values
    with the same frequency of occurrence, will return the smallest value.

    If window is not provided, computes mode over the full series.

    **Examples**

    Generate price series and compute mode over :math:`22` observations

    >>> prices = generate_series(100)
    >>> mode(prices, 22)

    **See also**

    :func:`mean` :func:`median`
    """
    w = normalize_window(x, w)
    assert x.index.is_monotonic_increasing, "series index is monotonic increasing"
    if isinstance(w.w, pd.DateOffset):
        values = [stats.mode(x.loc[(x.index > idx - w.w) & (x.index <= idx)]).mode[0] for idx in x.index]
        return apply_ramp(pd.Series(values, index=x.index, dtype=np.dtype(float)), w)
    else:
        return apply_ramp(x.rolling(w.w, 0).apply(lambda y: stats.mode(y).mode, raw=True), w)
def process_merge_severity_type(df, severity_type, event_type):
    # severity type
    severity_type = severity_type.merge(df[['id', 'fault_severity', 'source']], on='id')
    unique_severity_type = pd.DataFrame(severity_type['severity_type'].value_counts())
    unique_severity_type['PercTrain'] = severity_type.pivot_table(
        values='source', index='severity_type',
        aggfunc=lambda x: sum(x == 'train') / float(len(x)))
    unique_severity_type['severity_mode'] = severity_type.loc[
        severity_type['source'] == 'train'].pivot_table(
            values='fault_severity', index='severity_type',
            aggfunc=lambda x: mode(x)[0])
    severity_type_merge = severity_type.pivot_table(values='source',
                                                    index='id',
                                                    columns='severity_type',
                                                    aggfunc=lambda x: len(x),
                                                    fill_value=0)
    return df.merge(severity_type_merge, left_on='id', right_index=True)
def subpage_squash(packet_lists, min_duplicates=3, ignore_empty=False): """Yields squashed subpages.""" spdict = defaultdict(list) for pl in packet_lists: if len(pl) > 1: subpage = Subpage.from_packets(pl, ignore_empty=ignore_empty) spdict[(subpage.mrag.magazine, subpage.header.page, subpage.header.subpage)].append(subpage) for splist in tqdm(spdict.values(), unit=' Subpages'): if len(splist) >= min_duplicates: numbers = mode(np.stack( [np.clip(sp.numbers, -100, -1) for sp in splist]), axis=0)[0][0].astype(np.int64) s = Subpage(numbers=numbers) for row in range(29): if row in [26, 27, 28]: for dc in range(16): if s.number(row, dc) > -100: packets = [ sp.packet(row, dc) for sp in splist if sp.number(row, dc) > -100 ] arr = np.stack([p[3:] for p in packets]) s.packet(row, dc)[:3] = packets[0][:3] if row == 27: s.packet(row, dc)[3:] = mode( arr, axis=0)[0][0].astype(np.uint8) else: t = arr.astype(np.uint32) t = t[:, 0::3] | (t[:, 1::3] << 8) | ( t[:, 2::3] << 16) result = mode(t, axis=0)[0][0].astype(np.uint32) s.packet(row, dc)[3::3] = result & 0xff s.packet(row, dc)[4::3] = (result >> 8) & 0xff s.packet(row, dc)[5::3] = (result >> 16) & 0xff else: if s.number(row) > -100: packets = [ sp.packet(row) for sp in splist if sp.number(row) > -100 ] arr = np.stack([p[2:] for p in packets]) s.packet(row)[:2] = packets[0][:2] s.packet(row)[2:] = mode(arr, axis=0)[0][0].astype(np.uint8) yield s
def segment_majority_vote_indices(self, interval_size, em_iters): num_clusters = len(self.gmm_list) # Resegment data based on likelihood scoring likelihoods = self.gmm_list[0].score(self.X) for g in self.gmm_list[1:]: likelihoods = np.column_stack((likelihoods, g.score(self.X))) if num_clusters == 1: most_likely = np.zeros(len(self.X)) else: most_likely = likelihoods.argmax(axis=1) # Across 2.5 secs of observations, vote on which cluster they should be associated with iter_training = {} for i in range(interval_size, self.N, interval_size): arr = np.array(most_likely[(range(i - interval_size, i))]) max_gmm = int(stats.mode(arr)[0][0]) iter_training.setdefault((self.gmm_list[max_gmm], max_gmm), []).append((i - interval_size, i)) arr = np.array(most_likely[(range( (self.N / interval_size) * interval_size, self.N))]) max_gmm = int(stats.mode(arr)[0][0]) iter_training.setdefault((self.gmm_list[max_gmm], max_gmm),[]).\ append((self.N/interval_size*interval_size, self.N)) iter_bic_dict = {} iter_bic_list = [] for gp, e_tuple_list in iter_training.iteritems(): g = gp[0] p = gp[1] cluster_indices = np.array(range(e_tuple_list[0][0], e_tuple_list[0][1], 1), dtype=np.int32) for d in e_tuple_list[1:]: cluster_indices = np.concatenate((cluster_indices,\ np.array(range(d[0],d[1],1),\ dtype=np.int32))) g.train_on_subset(self.X, cluster_indices, max_em_iters=em_iters) iter_bic_list.append((g, cluster_indices)) iter_bic_dict[p] = cluster_indices return iter_bic_dict, iter_bic_list, most_likely
def segment_majority_vote(self, interval_size, em_iters): num_clusters = len(self.gmm_list) # Resegment data based on likelihood scoring score_time = time.time() likelihoods = self.gmm_list[0].score(self.X) for g in self.gmm_list[1:]: likelihoods = np.column_stack((likelihoods, g.score(self.X))) if num_clusters == 1: most_likely = np.zeros(len(self.X)) else: most_likely = likelihoods.argmax(axis=1) self.ftime.write("Score: {0}\n".format(time.time() - score_time)) # Across 2.5 secs of observations, vote on which cluster they should be associated with iter_training = {} segment_time = time.time() for i in range(interval_size, self.N, interval_size): arr = np.array(most_likely[(range(i-interval_size, i))]) max_gmm = int(stats.mode(arr)[0][0]) iter_training.setdefault((self.gmm_list[max_gmm],max_gmm),[]).append(self.X[i-interval_size:i,:]) arr = np.array(most_likely[(range((self.N/interval_size)*interval_size, self.N))]) max_gmm = int(stats.mode(arr)[0][0]) iter_training.setdefault((self.gmm_list[max_gmm], max_gmm),[]).append(self.X[(self.N/interval_size)*interval_size:self.N,:]) iter_bic_dict = {} iter_bic_list = [] # for each gmm, append all the segments and retrain for gp, data_list in iter_training.iteritems(): g = gp[0] p = gp[1] cluster_data = data_list[0] for d in data_list[1:]: cluster_data = np.concatenate((cluster_data, d)) g.train(cluster_data, max_em_iters=em_iters) iter_bic_list.append((g,cluster_data)) iter_bic_dict[p] = cluster_data self.ftime.write("Segment: {0}\n".format(time.time() - segment_time)) return iter_bic_dict, iter_bic_list, most_likely
def segment_majority_vote(self, interval_size, em_iters): num_clusters = len(self.gmm_list) # Resegment data based on likelihood scoring likelihoods = self.gmm_list[0].score(self.X) for g in self.gmm_list[1:]: likelihoods = np.column_stack((likelihoods, g.score(self.X))) if num_clusters == 1: most_likely = np.zeros(len(self.X)) else: most_likely = likelihoods.argmax(axis=1) # Across 2.5 secs of observations, vote on which cluster they should be associated with iter_training = {} for i in range(interval_size, self.N, interval_size): arr = np.array(most_likely[(range(i - interval_size, i))]) max_gmm = int(stats.mode(arr)[0][0]) iter_training.setdefault((self.gmm_list[max_gmm], max_gmm), []).append(self.X[i - interval_size:i, :]) arr = np.array(most_likely[(range( (self.N / interval_size) * interval_size, self.N))]) max_gmm = int(stats.mode(arr)[0][0]) iter_training.setdefault((self.gmm_list[max_gmm], max_gmm), []).append( self.X[(self.N / interval_size) * interval_size:self.N, :]) iter_bic_dict = {} iter_bic_list = [] # for each gmm, append all the segments and retrain for gp, data_list in iter_training.iteritems(): g = gp[0] p = gp[1] cluster_data = data_list[0] for d in data_list[1:]: cluster_data = np.concatenate((cluster_data, d)) g.train(cluster_data, max_em_iters=em_iters) iter_bic_list.append((g, cluster_data)) iter_bic_dict[p] = cluster_data return iter_bic_dict, iter_bic_list, most_likely
def main():
    print "Starting..."
    RE = ReccomendationEngine()
    tweets = RE.get_corpus()
    model, centroids = RE.get_Kmeans()
    user_name = raw_input('Please enter a Twitter username to provide a recommendation for: ')
    recommendations = RE.get_recommendations(user_name)
    print "Follow these users?\n"
    for i in recommendations[:5]:
        print i
    print
    evaluation = raw_input('How did I do at suggesting followers (on a scale of 1 to 5)? ')
    feedback = client.twitter_data.user_feedback
    print "Thanks for the feedback, we'll do better next time!"
    feedback.insert({'user': user_name,
                     'rating': evaluation,
                     'date_time': datetime.now()})
    print 'Now updating scores'
    for i in users.find():
        users.update({"_id": i["_id"]},
                     {'$set': {'cluster_score': int(mode(RE.model.predict(RE.vect.transform(i['tweets'])),
                                                         axis=None)[0])}})
def optimize(self, J_test, n_evals):
    # J_test is a 1 by n_theta array
    ranked_idxs = np.argsort(-self.J_train)  # sort each row
    best_per_scene = ranked_idxs[:, 0]
    idx_to_eval = int(mode(best_per_scene)[0])
    plan_dists = self.computePlanDist(self.J_train)
    B_values_context = np.mean(self.J_train, axis=0) + np.std(self.J_train, axis=0)
    evaluated_idxs = np.array([idx_to_eval])
    for n in range(n_evals):
        B_values_plan = np.transpose(J_test[0, evaluated_idxs]) + \
            plan_dists[evaluated_idxs, :]
        B_values_plan = np.asmatrix(B_values_plan)
        B_values = np.min(np.r_[B_values_plan, B_values_context], axis=0)
        # Already-evaluated plans keep their measured value rather than the bound
        for eval_idx in evaluated_idxs:
            B_values[0, eval_idx] = J_test[0, eval_idx]
        # Pick the plan with the largest B-value as the next one to evaluate
        sorted_Bval_idxs = np.argsort(-B_values)
        idx_to_eval = sorted_Bval_idxs[0, 0]
        if n != n_evals - 1:
            evaluated_idxs = np.r_[evaluated_idxs, idx_to_eval]
def _fix_cachedcurrent(self, ensoindices): """ Private function to convert cached ENSO indices. This function is used to ensure that: * the frequency of the indices matches the frequency of the indicator. * the date range of the indices matches the date range of the indicator. * the shape of the indices matches the shape of the indicator. """ _currentfreq = ensoindices._dates.freq _freq = self._dates.freq # Check the frequency, and convert if needed if _currentfreq > _freq: # To a lower frequency ('Q'/'A') if self.ndim == 2: conversionfunc = None else: conversionfunc = lambda x: mode(x)[0].squeeze() ensoindices = ensoindices.convert(_freq, func=conversionfunc) elif _currentfreq < _freq: # To a higher frequency (eg, 'D') ensoindices = backward_fill(ensoindices.convert(_freq)) # Reset to the original frequency if needed... (start, end) = self._dates.flat[[0, -1]] if tuple(ensoindices._dates.flat[[0, -1]]) != (start, end): ensoindices = ts.adjust_endpoints(ensoindices, start_date=start, end_date=end) # Reset the shape of the indices if ensoindices.shape != self.shape: ensoindices.shape = self.shape return ensoindices
def update_db(self):
    for i in users.find():
        users.update({"_id": i["_id"]},
                     {'$set': {'cluster_score': int(mode(model.predict(RE.vect.transform(i['tweets'])),
                                                         axis=None)[0])}})
def spectral(tweetfile, npmifile, dictfile, k, noc):
    Ptmp = textscan(npmifile, '([^ ]*) ([^ ]*) ([^ ]*)')
    PP = textscan(dictfile, '(.*) (.*)', (int, str))
    PP[0] -= 1
    PMI = ssp.coo_matrix(
        (Ptmp[2], (Ptmp[0] - 1, Ptmp[1] - 1)),
        (PP[0].shape[0], PP[0].shape[0])
    ).tocsr()
    W = knnmatrix(PMI, k)
    # This is hideous and wrong and it must be fixed
    W = ssp.csr_matrix(minimum(W.todense(), W.T.todense()))
    s, comp = ssp.csgraph.connected_components(W, directed=False)
    comp_mode = mstats.mode(comp)[0]
    inds = comp == comp_mode
    inds = [x for x in range(W.shape[0]) if inds[x]]
    WW = W[inds, :][:, inds]
    P = PP[1][inds]
    ids = P
    X = WW
    c = spectral_clustering(X, n_clusters=noc, eigen_solver='arpack')
    fid = open("".join(['cl.', tweetfile, '-', str(noc)]), 'w')
    for i in range(max(c) + 1):
        cl = [x for x in range(len(c)) if c[x] == i]
        b, wordsix = centralityn(cl, X, ids)
        for j in range(len(b)):
            word = wordsix[j]
            fid.write('%s %d %.5f\n' % (word, i, b[j]))
def __zonal_stats(self, f, masked): for op, alias in self.operations_idx.iteritems(): if op == 'min': v = float(masked.min()) elif op == 'max': v = float(masked.max()) elif op == 'sum': v = float(masked.sum()) elif op == 'count': v = int(masked.count()) elif op == 'mean': v = float(masked.mean()) elif op == 'std': v = float(masked.std()) elif op == 'unique': v = numpy.unique(masked.compressed()).size elif op == 'range': v = float(masked.max()) - float(masked.min()) elif op == 'var': v = float(masked.var()) elif op == 'median': v = float(numpy.ma.median(masked)) elif op == 'mode': v = float(mode(masked, axis=None)[0][0]) else: self.logger.warning('Unrecognized operation "{}".'.format(op)) continue self.layer.startEditing() f[alias] = NULL if numpy.isnan(v) else v self.layer.updateFeature(f) self.layer.commitChanges()
def mode(x):
    """
    Computes a single value for the mode of an array
    :param x: array-like input
    :return: returns the mode of an array
    """
    return mstats.mode(x, axis=None)[0]
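# A minimal usage sketch mirroring the wrapper above (assuming scipy.stats.mstats
# is the `mstats` it refers to): with axis=None the input is flattened and a
# ModeResult of (modal value, count) is returned, so [0] keeps only the value.
import numpy as np
from scipy.stats import mstats

if __name__ == '__main__':
    arr = np.array([[1, 2, 2],
                    [3, 2, 1]])
    print(mstats.mode(arr, axis=None)[0])  # prints the modal value, 2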
def main(): map_obstacles = np.loadtxt('obstacles_map.txt') laser_obstacles = np.loadtxt('obstacles_laser.txt') # Center the data map_obstacles = center(map_obstacles) laser_obstacles = center(laser_obstacles) # Translate and rotate the measurements true_rotation = np.pi / 10 true_translation = np.array([10, 10]) laser_trans = translate(laser_obstacles, true_translation) laser_rot = rotate(laser_trans, true_rotation) # Run k-means map_code, map_dist = sp.kmeans(map_obstacles, 10) code, dist = sp.vq(laser_obstacles, map_code) m = stats.mode(code) pos = map_code[m[0][0]] print "Code:", map_code print "Dist:", dist print "Most likely cluster:", m print "Which corresponds to:", pos # plot_super(map_obstacles, laser_obstacles, "Original") plot_super(map_obstacles, map_code, "Map's k-means centers") plot_super(map_obstacles, np.vstack([laser_obstacles, pos]), "The Cluster") # plot_super(laser_obstacles, laser_code, "Laser's k-means centers") # plot_super(map_obstacles, laser_trans, "Measure misaligned with map") # plot_super(map_obstacles, laser_reloc, "Measure realigned with map") plt.show()
def __zonal_stats(self, f, masked): attrs = f.attributes() for op, idx in self.operations_idx.iteritems(): if op == 'min': v = float(masked.min()) elif op == 'max': v = float(masked.max()) elif op == 'sum': v = float(masked.sum()) elif op == 'count': v = int(masked.count()) elif op == 'mean': v = float(masked.mean()) elif op == 'std': v = float(masked.std()) elif op == 'unique': v = numpy.unique(masked.compressed()).size elif op == 'range': v = float(masked.max()) - float(masked.min()) elif op == 'var': v = float(masked.var()) elif op == 'median': v = float(numpy.ma.median(masked)) elif op == 'mode': v = float(mode(masked, axis=None)[0][0]) else: self.logger.warning('Unrecognized operation "{}".'.format(op)) continue attrs.insert(idx, None if numpy.isnan(v) else v) return attrs
def binner(x, y, w_sta, nbins, rang = None, ebar = False, per = None) : from numpy import array, digitize, lexsort, linspace from numpy.ma import average, median ind = lexsort((y, x)) xs, ys = x[ind], y[ind] if rang is None : mn, mx = min(xs), max(xs) else : mn, mx = rang bins = linspace(mn, mx, nbins + 1) x_cen = (bins[: - 1] + bins[1:])*0.5 bins = linspace(mn, mx, nbins) ibins = digitize(xs, bins) if w_sta == "median" : y_sta = array([median(ys[ibins == i]) for i in range(1, bins.size + 1)]) elif w_sta == "mean" : y_sta = array([average(ys[ibins == i]) for i in range(1, bins.size + 1)]) elif w_sta == "mode" : y_sta = array([mode(ys[ibins == i])[0] for i in range(1, bins.size + 1)]) if ebar == False : return x_cen, y_sta elif ebar == True and per == None : myer = abs(array([scoreatpercentile(ys[ibins == i], 15.8) for i in range(1, bins.size + 1)]) - y_sta) pyer = abs(array([scoreatpercentile(ys[ibins == i], 84.0) for i in range(1, bins.size + 1)]) - y_sta) yer = array([myer, pyer]) return x_cen, y_sta, yer elif ebar == True and per != None : myer = abs(array([scoreatpercentile(ys[ibins == i], per[0]) for i in range(1, bins.size + 1)]) - y_sta) pyer = abs(array([scoreatpercentile(ys[ibins == i], per[1]) for i in range(1, bins.size + 1)]) - y_sta) yer = array([myer, pyer]) return x_cen, y_sta, yer
def usdn_energy_v_hops(df_dict): """Plot usdn energy vs hops.""" try: if 'app' in df_dict and 'pow' in df_dict: pow_df = df_dict['pow'] app_df = df_dict['app'] else: raise Exception('ERROR: Correct df(s) not in dict!') except Exception: traceback.print_exc() sys.exit(0) # Add hops to node_df. N.B. cols with NaN are always converted to float hops = app_df[['src', 'hops']].groupby('src').agg(lambda x: mode(x)[0]) pow_df = pow_df.join(hops['hops'].astype(int)) df = pow_df.groupby('hops')['all_rdc'] \ .apply(lambda x: x.mean()) \ .reset_index() \ .set_index('hops') x = df.index.tolist() y = df['all_rdc'].tolist() cpplot.plot_bar(df, 'usdn_energy_v_hops', sim_dir, x, y, xlabel='Hops', ylabel='Radio duty cycle (%)')
def show_dist(start_stocks=StartingPortfolio.stocks, lc=True): ages = [] for iteration in range(0, iterations): ages.append(simulate(start_stocks, lc)) min_age = min(ages) max_age = max(ages) for pa in range(60, 66): print('probability for age ', pa, ': ', calc_probability(ages, pa)) print('mean: ', mean(ages)) print('stdev: ', stdev(ages)) print('variance: ', variance(ages)) print('mode: ', mode(ages)) print('median: ', median(ages)) print('min age: ', min_age) print('max age: ', max_age) print('interval (max-min): ', max_age - min_age) sns.distplot(ages, norm_hist=True, kde=False, bins=np.arange(min_age, max_age + 1)) # with open('ages100.txt', mode='wt', encoding='utf-8') as file: # file.write('\n'.join((str(i) for i in set(ages)))) # sns.kdeplot(ages, shade=True) # pyplot.hist(ages, density=True) pyplot.show()
def getModeWeek(dfDat): # find the most frequent week when max revenue occurs : 1 = week 1; 2 = week 2 # from all merged 6w data dfDat['week'] = np.where((dfDat['nWeek'] % 2 == 0), 2, 1) datWeek = dfDat[['rid', 'sic', 'week']] # mode of timeOfDay for every rid+sic key grpWeekRev = datWeek.groupby(['rid', 'sic']) grpName = [] modeWeek = 0 dfModeWeek = pd.DataFrame({'rid': [], 'sic': [], 'weekMaxRev': []}) for name, group in grpWeekRev: grpName.append(group) for jNum in range(0, len(grpName)): tmpArr = grpName[jNum] tmpArr = tmpArr.set_index(['rid', 'sic']) modeTmp = mode(tmpArr['week']) modeWeek = modeTmp[0][0] dfWeek = pd.DataFrame({ 'rid': [(tmpArr.index[0][0])], 'sic': [(tmpArr.index[0][1])], 'weekMaxRev': [modeWeek] }) dfModeWeek = dfModeWeek.append(dfWeek, ignore_index=True) del tmpArr, modeTmp, modeWeek, dfWeek, grpWeekRev, grpName, datWeek return dfModeWeek
def usdn_pdr_v_hops(df_dict):
    """Plot usdn PDR vs hops."""
    try:
        if 'app' in df_dict:
            app_df = df_dict['app']
        else:
            raise Exception('ERROR: Correct df(s) not in dict!')
    except Exception:
        traceback.print_exc()
        sys.exit(0)
    # Get hops for each node. N.B. cols with NaN are always converted to float
    df = app_df[['src', 'hops']].groupby('src').agg(lambda x: mode(x)[0])
    # Calculate PDR
    df['pdr'] = app_df.groupby('src')['drpd'] \
        .apply(lambda x: ratio(len(x), x.sum()))
    df = df.groupby('hops')['pdr'] \
        .apply(lambda x: x.mean()) \
        .reset_index() \
        .set_index('hops')
    x = df.index.tolist()
    y = df['pdr'].tolist()
    cpplot.plot_bar(df, 'usdn_pdr_v_hops', sim_dir, x, y,
                    xlabel='Hops', ylabel='PDR (%)')
def getModeTime(dfDat): # find the most frequent time of day and datetime when max revenue occurs # from all merged 6W data # mode of timeOfDay for every rid+sic key grpTimeOfDay = dfDat.groupby(['rid', 'sic']) grpName = [] modeTime = 0 dfModeTime = pd.DataFrame({'rid': [], 'sic': [], 'timeMaxRev': []}) for name, group in grpTimeOfDay: grpName.append(group) for jNum in range(0, len(grpName)): tmpArr = grpName[jNum] tmpArr = tmpArr.set_index(['rid', 'sic']) modeTmp = mode(tmpArr['daytime']) modeTime = modeTmp[0][0] dfTimeOfDay = pd.DataFrame({ 'rid': [(tmpArr.index[0][0])], 'sic': [(tmpArr.index[0][1])], 'timeMaxRev': [modeTime] }) dfModeTime = dfModeTime.append(dfTimeOfDay, ignore_index=True) del tmpArr, modeTmp, modeTime, dfTimeOfDay, grpTimeOfDay, grpName return dfModeTime
def balanced_resample(data, labels):
    """Do a balanced resampling of data and labels, returning them
    See the test routine at the bottom for an example of behavior
    """
    most_common, num_required = mstats.mode(labels)
    possible_labels = np.unique(labels)

    data_resampled = []
    labels_resampled = []
    for possible_label in possible_labels:
        in_this_label = labels == possible_label

        data_buffered = np.array([])
        data_buffered = np.reshape(data_buffered, (0, data.shape[1]))
        labels_buffered = np.array([])

        while len(data_buffered) < num_required:
            data_buffered = np.vstack([data_buffered, data[in_this_label]])
            labels_buffered = np.hstack([labels_buffered, labels[in_this_label]])

        single_data_resampled, single_labels_resampled = utils.resample(
            data_buffered,
            labels_buffered,
            n_samples=int(num_required),
            replace=True
        )
        data_resampled.append(single_data_resampled)
        labels_resampled.append(single_labels_resampled)

    return np.vstack(data_resampled).astype(data.dtype), np.hstack(labels_resampled).astype(labels.dtype)
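# A hedged sketch of the same balancing idea, assuming `utils` above refers to
# sklearn.utils: each class is resampled with replacement up to the majority
# class size, which mstats.mode reports as (value, count).
import numpy as np
from scipy.stats import mstats
from sklearn.utils import resample

data = np.array([[0.0], [0.1], [0.2], [1.0], [1.1]])
labels = np.array([0, 0, 0, 1, 1])

_, num_required = mstats.mode(labels)
n_req = int(num_required[0])                       # majority class size (3)
balanced = [resample(data[labels == c], labels[labels == c],
                     n_samples=n_req, replace=True)
            for c in np.unique(labels)]
X = np.vstack([d for d, _ in balanced])
y = np.hstack([l for _, l in balanced])
print(X.shape, y.shape)                            # (6, 1) (6,)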
def mode(x: pd.Series, w: int = 0) -> pd.Series:
    """
    Most common value in series over given window

    :param x: series: timeseries
    :param w: window: number of observations to use (defaults to length of series)
    :return: timeseries of mode value

    **Usage**

    Computes the `mode <https://en.wikipedia.org/wiki/Mode_(statistics)>`_ over a given window. For each window,
    this function will return the most common value of all elements in the window. If there are multiple values
    with the same frequency of occurrence, will return the smallest value.

    If window is not provided, computes mode over the full series.

    **Examples**

    Generate price series and compute mode over :math:`22` observations

    >>> prices = generate_series(100)
    >>> mode(prices, 22)

    **See also**

    :func:`mean` :func:`median`
    """
    w = w or x.size
    assert x.index.is_monotonic_increasing, "series index is monotonic increasing"
    return x.rolling(w, 0).apply(lambda y: stats.mode(y).mode, raw=True)
def get_info_gain(data):
    info_gains = []
    g1 = []
    g2 = []
    for val in np.unique(data[:, 0]):
        m = mode(data[:, 0])
        m = float(m[1])  # modal count of the split column (recomputed as the mean below)
        n = len(data[:, 0])
        p = dict(Counter(data[:, 0]))
        m = 0
        for i in p:
            m += p[i] * i
        m = m / float(n)
        # Impute missing values in the split column with the column mean
        for i in range(len(data[:, 0])):
            if is_missing(data[i, 0]):
                data[i, 0] = m
        g1 = data[data[:, 0] <= val][:, -1]
        g2 = data[data[:, 0] > val][:, -1]
        if g1.shape[0] != 0:
            ent1 = float(g1.shape[0]) * calc_info_gain(g1) / data.shape[0]
        else:
            ent1 = 0
        if g2.shape[0] != 0:
            ent2 = float(g2.shape[0]) * calc_info_gain(g2) / data.shape[0]
        else:
            ent2 = 0
        avg_ent = ent1 + ent2
        info_gains.append((avg_ent, val))
    return min(info_gains)
def get_gts_by_indiv(self, correct_for_odd_major = True): cp_2_thresh=1.0 mode_label = int(mode(self.labels)[0][0]) indiv_labels = np.array(self.labels) mu_args = np.argsort(self.all_uniq_mus) ordered_labels = self.all_uniq_labels[mu_args] ordered_mus = self.all_uniq_mus[mu_args] d_from_2 = np.absolute(ordered_mus-2.0) labels_to_gt = {} """ if there is something that looks like a 2, use it to callibrate others assign 2 to the closest 1, then assign the rest as +-1 in the order make sure that you aren't assigning -1 genotypes then finally, consolidate w/ the mus """ if np.amin(d_from_2)<cp_2_thresh: idx = np.argmin(d_from_2) idx_cp = 2 else: idx = 0 idx_cp = round(ordered_mus[0]) for i,l in enumerate(ordered_labels): labels_to_gt[l] = idx_cp-(idx-i) ## ensure no -1s while min(labels_to_gt.values())<0: print "<0's detected..." new_labels_to_gt = {} for l, gt in labels_to_gt.iteritems(): new_labels_to_gt[l] = gt+1 labels_to_gt = new_labels_to_gt ##correct for odd major alleles out of HWE if correct_for_odd_major and (labels_to_gt[mode_label] %2 == 1) and np.sum(indiv_labels==mode_label) >= .5*(indiv_labels.shape[0]): d=0 if self.label_to_mu[mode_label]-labels_to_gt[mode_label]>0 or min(labels_to_gt.values())==0: d=1 else: d=-1 new_labels_to_gt = {} for l, gt in labels_to_gt.iteritems(): new_labels_to_gt[l] = gt+d labels_to_gt = new_labels_to_gt gts_by_indiv = {} for i, indiv in enumerate(self.indivs): gts_by_indiv[indiv] = int(labels_to_gt[self.labels[i]]) new_labels_to_gt = {k:int(v) for k,v in labels_to_gt.iteritems()} labels_to_gt = new_labels_to_gt gt_to_labels = {v:k for k,v in labels_to_gt.iteritems()} return gts_by_indiv, gt_to_labels, labels_to_gt
def subpage_squash(packet_lists, min_duplicates=3):
    """Yields squashed subpages."""
    spdict = defaultdict(list)
    for pl in packet_lists:
        subpage = Subpage.from_packets(pl)
        spdict[(subpage.mrag.magazine, subpage.header.page,
                subpage.header.subpage)].append(subpage)

    for splist in tqdm(spdict.values(), unit=' Subpages'):
        if len(splist) >= min_duplicates:
            arr = mode(np.stack([sp[:] for sp in splist]),
                       axis=0)[0][0].astype(np.uint8)
            numbers = mode(np.stack(
                [np.clip(sp.numbers, -100, -1) for sp in splist]),
                axis=0)[0][0].astype(np.int64)
            yield Subpage(arr, numbers)
def segment_majority_vote_indices(self, interval_size, em_iters): num_clusters = len(self.gmm_list) # Resegment data based on likelihood scoring likelihoods = self.gmm_list[0].score(self.X) for g in self.gmm_list[1:]: likelihoods = np.column_stack((likelihoods, g.score(self.X))) if num_clusters == 1: most_likely = np.zeros(len(self.X)) else: most_likely = likelihoods.argmax(axis=1) # Across 2.5 secs of observations, vote on which cluster they should be associated with iter_training = {} for i in range(interval_size, self.N, interval_size): arr = np.array(most_likely[(range(i-interval_size, i))]) max_gmm = int(stats.mode(arr)[0][0]) iter_training.setdefault((self.gmm_list[max_gmm],max_gmm),[]).append((i-interval_size,i)) arr = np.array(most_likely[(range((self.N/interval_size)*interval_size, self.N))]) max_gmm = int(stats.mode(arr)[0][0]) iter_training.setdefault((self.gmm_list[max_gmm], max_gmm),[]).\ append((self.N/interval_size*interval_size, self.N)) iter_bic_dict = {} iter_bic_list = [] for gp, e_tuple_list in iter_training.iteritems(): g = gp[0] p = gp[1] cluster_indices = np.array(range(e_tuple_list[0][0], e_tuple_list[0][1],1), dtype=np.int32) for d in e_tuple_list[1:]: cluster_indices = np.concatenate((cluster_indices,\ np.array(range(d[0],d[1],1),\ dtype=np.int32))) g.train_on_subset(self.X, cluster_indices, max_em_iters=em_iters) iter_bic_list.append((g,cluster_indices)) iter_bic_dict[p] = cluster_indices return iter_bic_dict, iter_bic_list, most_likely
def row_squash(packet_iter, n_rows):
    for l_list in split_seq(packet_iter, n_rows):
        a = numpy.array([numpy.fromstring(l.to_bytes(), dtype=numpy.uint8)
                         for l in l_list])
        best, counts = mode(a)
        best = best[0].astype(numpy.uint8)
        p = Packet.from_bytes(best)
        p._offset = l_list[0]._offset
        yield p
def predict(self, x):
    prediction_matrix = numpy.zeros((x.shape[0], self.n_classifiers))
    n = 0
    for classifier in self.classifiers:
        prediction = classifier.predict(x)
        prediction_matrix[:, n] = prediction
        n += 1
    return mode(prediction_matrix, 1)[0].reshape(1, -1)[0]
def baseline(labels):
    """
    Baseline for testing, predict majority value for any input.
    Usually, the majority is positive feedback.
    """
    mode = mstats.mode(labels).mode[0]
    print('method: BASELINE, ACC: %.3f' % (accuracy([mode] * len(labels), labels)))
    print()
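# A self-contained sketch of the majority-class baseline above, with a stand-in
# accuracy helper (the original `accuracy` function is not shown here).
import numpy as np
from scipy.stats import mstats

def accuracy(predicted, actual):
    return np.mean(np.array(predicted) == np.array(actual))

labels = np.array([1, 1, 1, 0, 1, 0])
majority = mstats.mode(labels).mode[0]                       # majority label: 1
print('baseline ACC: %.3f' % accuracy([majority] * len(labels), labels))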
def tonique(self, percent, method):
    """
    Get the tonic frequency, defined as the mode of the last frequencies
    of the array. These are selected by the percent argument.
    Two methods are possible: pdf or mode.

    Input :
    -----------
    percent (optional) : a percentage of the number of frames from the total
        size of the frequencies array to give the last frequencies.
        Default percent = 8

    Output :
    -----------
    M : the mode
    N : the mode converted inside an octave
    Final_Freqs : the last frequencies according to the percentage
    """
    self.percent = percent
    self.method = method
    L = len(self.freq)
    Nb_Frames = L * self.percent / 100
    Final_Freqs = self.freq[(L - Nb_Frames):L]

    if self.method == "pdf":
        # Fold into the same octave centered on the mode, then locate the density peak
        self.final_pdf = gaussian_kde(Final_Freqs)
        lmax = numpy.argmax(self.final_pdf(self.x)) + self.xmin
        return self.final_pdf, lmax, Final_Freqs

    if self.method == "mode":
        M = mode(Final_Freqs)
        if M[0] > mode(self.freq)[0] * 2:
            N = M[0] / 2
        elif M[0] < mode(self.freq)[0] / 2:
            N = M[0] * 2
        else:
            N = M[0]
        return M[0], int(N.tolist()[0]), Final_Freqs
def get_recommendations(self, username):
    tweets = api.user_timeline(username)
    raw_text = [i.text for i in tweets]
    # clean text
    text = []
    for i in raw_text:
        i = ' '.join(filter(lambda x: bool(wordnet.synsets(x)), i.split(' ')))
        if len(i) > 5:
            text.append(i)
    # find users like you
    user_score = int(mode([self.model.predict(self.vect.transform([i])) for i in text],
                          axis=None)[0])
    for i in users.find():
        users.update({"_id": i["_id"]},
                     {'$set': {'cluster_score': int(mode(self.model.predict(self.vect.transform(i['tweets'])),
                                                         axis=None)[0])}})
    recs = [(i['user_id'], i['screen_name']) for i in users.find({'cluster_score': user_score})]
    return recs
def mode_from_str_list(score_list):
    """Takes a string like x1,x2,... which comes out of the database
    and computes the mode
    """
    scores = []
    for s in score_list.split(","):
        # the list may contain NULLs, which int() cannot parse
        try:
            scores.append(int(s))
        except ValueError:
            pass
    return mode(scores)[0][0]
def df_numerical_summary(df): """ automatically selects all the numeric columns in a dataframe and provides their summary statistics Parameters: ___________ df: Dataframe Returns: ________ Dataframe """ ndf = df.select_dtypes(include=[np.number]) dft = ndf.describe().T nunique = [n for n in ndf.apply(pd.Series.nunique)] nzeros = [((ndf[col] == 0).sum()) for col in ndf] kurt = [x for x in ndf.kurtosis()] skewness = [x for x in ndf.skew()] modes = [x for x in ndf.apply(lambda x: mode(x, axis=None)[0]).iloc[0]] ranges = DataFrame({ "1%": ndf.quantile(0.01), "5%": ndf.quantile(0.05), "95%": ndf.quantile(0.95), "99%": ndf.quantile(0.99) }) infodf = dft.assign(nunique=nunique, mode=modes, median=ndf.apply(np.median), nzeros=nzeros, kurtosis=kurt, skewness=skewness, iqr=dft['75%'] - dft['25%']).join( na_count(ndf)).join(ranges) def Round(x): return np.round(x, 2) rnd = [ 'count', 'mean', 'mode', 'median', 'std', 'min', 'max', 'nunique', 'nzeros', 'miss', 'kurtosis', 'skewness', '25%', '50%', '75%', '1%', '95%', '99%', '5%', 'iqr' ] infodf[rnd] = infodf[rnd].apply(Round) infodf = infodf[[ 'count', 'mean', 'mode', 'median', 'std', 'min', 'max', 'nunique', 'nzeros', 'kurtosis', 'skewness', 'miss', 'miss_percent', 'iqr', '1%', '5%', '25%', '50%', '75%', '95%', '99%' ]] return infodf
def row_squash(packet_iter, n_rows): for l_list in split_seq(packet_iter, n_rows): a = numpy.array([ numpy.fromstring(l.to_bytes(), dtype=numpy.uint8) for l in l_list ]) best, counts = mode(a) best = best[0].astype(numpy.uint8) p = Packet.from_bytes(best) p._offset = l_list[0]._offset yield p
def test_mode(self): a1 = [0,0,0,1,1,1,2,3,3,3,3,4,5,6,7] a2 = np.reshape(a1, (3,5)) ma1 = ma.masked_where(ma.array(a1) > 2,a1) ma2 = ma.masked_where(a2 > 2, a2) assert_equal(mstats.mode(a1, axis=None), (3,4)) assert_equal(mstats.mode(ma1, axis=None), (0,3)) assert_equal(mstats.mode(a2, axis=None), (3,4)) assert_equal(mstats.mode(ma2, axis=None), (0,3)) assert_equal(mstats.mode(a2, axis=0), ([[0,0,0,1,1]],[[1,1,1,1,1]])) assert_equal(mstats.mode(ma2, axis=0), ([[0,0,0,1,1]],[[1,1,1,1,1]])) assert_equal(mstats.mode(a2, axis=-1), ([[0],[3],[3]], [[3],[3],[1]])) assert_equal(mstats.mode(ma2, axis=-1), ([[0],[1],[0]], [[3],[1],[0]]))
def test_mode(self): a1 = [0,0,0,1,1,1,2,3,3,3,3,4,5,6,7] a2 = np.reshape(a1, (3,5)) ma1 = ma.masked_where(ma.array(a1)>2,a1) ma2 = ma.masked_where(a2>2, a2) assert_equal(mstats.mode(a1, axis=None), (3,4)) assert_equal(mstats.mode(ma1, axis=None), (0,3)) assert_equal(mstats.mode(a2, axis=None), (3,4)) assert_equal(mstats.mode(ma2, axis=None), (0,3)) assert_equal(mstats.mode(a2, axis=0), ([[0,0,0,1,1]],[[1,1,1,1,1]])) assert_equal(mstats.mode(ma2, axis=0), ([[0,0,0,1,1]],[[1,1,1,1,1]])) assert_equal(mstats.mode(a2, axis=-1), ([[0],[3],[3]], [[3],[3],[1]])) assert_equal(mstats.mode(ma2, axis=-1), ([[0],[1],[0]], [[3],[1],[0]]))
def segment_majority_vote(self): num_clusters = len(self.gmm_list) # Resegment data based on likelihood scoring likelihoods = self.gmm_list[0].score(self.X) for g in self.gmm_list[1:]: likelihoods = np.column_stack((likelihoods, g.score(self.X))) most_likely = likelihoods.argmax(axis=1) # Across 2.5 secs of observations, vote on which cluster they should be associated with iter_training = {} interval_size = 250 for i in range(interval_size, self.N, interval_size): arr = np.array(most_likely[(range(i-interval_size, i))]) max_gmm = int(stats.mode(arr)[0][0]) iter_training.setdefault((self.gmm_list[max_gmm],max_gmm),[]).append(self.X[i-interval_size:i,:]) arr = np.array(most_likely[(range((self.N/interval_size)*interval_size, self.N))]) max_gmm = int(stats.mode(arr)[0][0]) iter_training.setdefault((self.gmm_list[max_gmm], max_gmm),[]).append(self.X[(self.N/interval_size)*interval_size:self.N,:]) iter_bic_dict = {} iter_bic_list = [] cluster_count = 0 for gp, data_list in iter_training.iteritems(): g = gp[0] p = gp[1] cluster_data = data_list[0] for d in data_list[1:]: cluster_data = np.concatenate((cluster_data, d)) cluster_data = np.ascontiguousarray(cluster_data) g.train(cluster_data) iter_bic_list.append((g,cluster_data)) iter_bic_dict[p] = cluster_data cluster_count += 1 return iter_bic_dict, iter_bic_list, most_likely
def transmode(freqlist, freqref=300):
    """Transpose all the frequencies by setting the _mode_ on a given reference frequency.

    Args:
        freqlist (numpy.array) : A list of frequencies to be transposed.
        freqref (int): The frequency reference to be transposed to. Default = 300.

    Return:
        transfreq (numpy.array): a list of the transposed frequencies.
    """
    transfreq = freqlist * float(freqref) / mode(freqlist)[0]
    return transfreq
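# A small worked example of the transposition above: the most frequent value
# (200 Hz here) is mapped onto freqref and every other frequency is scaled by
# the same ratio.
import numpy as np
from scipy.stats import mode

freqlist = np.array([200.0, 200.0, 300.0, 400.0])
freqref = 300
transfreq = freqlist * float(freqref) / mode(freqlist)[0]
print(transfreq)   # [300. 300. 450. 600.]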
def binner(x, y, w_sta, nbins, rang=None, ebar=False, per=None): from numpy import array, digitize, lexsort, linspace from numpy.ma import average, median ind = lexsort((y, x)) xs, ys = x[ind], y[ind] if rang is None: mn, mx = min(xs), max(xs) else: mn, mx = rang bins = linspace(mn, mx, nbins + 1) x_cen = (bins[:-1] + bins[1:]) * 0.5 bins = linspace(mn, mx, nbins) ibins = digitize(xs, bins) if w_sta == "median": y_sta = array( [median(ys[ibins == i]) for i in range(1, bins.size + 1)]) elif w_sta == "mean": y_sta = array( [average(ys[ibins == i]) for i in range(1, bins.size + 1)]) elif w_sta == "mode": y_sta = array( [mode(ys[ibins == i])[0] for i in range(1, bins.size + 1)]) if ebar == False: return x_cen, y_sta elif ebar == True and per == None: myer = abs( array([ scoreatpercentile(ys[ibins == i], 15.8) for i in range(1, bins.size + 1) ]) - y_sta) pyer = abs( array([ scoreatpercentile(ys[ibins == i], 84.0) for i in range(1, bins.size + 1) ]) - y_sta) yer = array([myer, pyer]) return x_cen, y_sta, yer elif ebar == True and per != None: myer = abs( array([ scoreatpercentile(ys[ibins == i], per[0]) for i in range(1, bins.size + 1) ]) - y_sta) pyer = abs( array([ scoreatpercentile(ys[ibins == i], per[1]) for i in range(1, bins.size + 1) ]) - y_sta) yer = array([myer, pyer]) return x_cen, y_sta, yer
def is_var(self, indiv_id, g, force_not_mode = False): idx = g.indivs.index(indiv_id) if (not force_not_mode) and len(np.unique(self.labels))>2: return True else: idx = g.indivs.index(indiv_id) m = mode(self.labels)[0] if self.labels[idx] != m: return True return False
def MCSpreds(probs1, probs2, probs3, probs4):
    preds1 = np.argmax(probs1, axis=1)
    preds2 = np.argmax(probs2, axis=1)
    preds3 = np.argmax(probs3, axis=1)
    preds4 = np.argmax(probs4, axis=1)
    print("preds1 uniques are: %s" % (np.unique(preds1)))
    # argmax returns 1-D arrays, so stack them as columns before the row-wise vote
    combined_preds = np.column_stack((preds1, preds2, preds3, preds4))
    mode_preds = mode(combined_preds, axis=1)
    return mode_preds
def find_timestep(data, n_random_values=10):
    # Random positions, leaving room for the following sample
    pos_ini = np.random.randint(0, data.shape[0] - 1, n_random_values, 'int64')
    pos_end = pos_ini + 1

    time = data.index.values
    time_ini = time[pos_ini]
    time_end = time[pos_end]
    time_difference = time_end - time_ini
    time_step = mode(time_difference)[0][0]
    return pd.to_timedelta(time_step)
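# A hedged sketch of the same idea on an assumed regular 1-minute index: the
# modal difference between consecutive timestamps (in nanoseconds here, to keep
# scipy's mode on plain integers) is the sampling step.
import numpy as np
import pandas as pd
from scipy.stats import mode

idx = pd.date_range('2020-01-01', periods=100, freq='1min')
diffs = np.diff(idx.values).astype(np.int64)        # nanoseconds between samples
step_ns = np.atleast_1d(mode(diffs)[0])[0]          # modal step, scipy-version agnostic
print(pd.to_timedelta(step_ns, unit='ns'))          # 0 days 00:01:00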
def is_var(self, indiv_id, g, force_not_mode=False): idx = g.indivs.index(indiv_id) if (not force_not_mode) and len(np.unique(self.labels)) > 2: return True else: idx = g.indivs.index(indiv_id) m = mode(self.labels)[0] if self.labels[idx] != m: return True return False
def predict(self, te):
    te_data = te.data
    n_movs, n_cells, n_samples, n_trials = te_data.shape
    cell_predictions = -np.ones((n_movs, n_cells, n_trials)).astype(int)
    final_predictions = -np.ones((n_movs, n_trials)).astype(int)
    for i_s in range(n_movs):
        for i_r in range(n_trials):
            for i_n in range(n_cells):
                r = te_data[i_s, i_n, :, i_r]
                dists = [np.linalg.norm(r - self.templates[i_s_p, i_n, :])
                         for i_s_p in range(self.n_stim)]
                cell_predictions[i_s, i_n, i_r] = np.argmin(dists)
            final_predictions[i_s, i_r] \
                = mode(cell_predictions[i_s, :, i_r])[0][0].astype(int)
    return final_predictions
def where2prd(train, test, smoteit=True):
    "WHERE2"
    t = discreteNums(train, map(lambda x: x.cells, train._rows))
    myTree = tdiv(t)
    testCase = test._rows
    rows, preds = [], []
    for tC in testCase:
        newRow = tC
        loc = drop(tC, myTree)  # Drop a test case in the tree & see where it lands
        if not loc.kids:
            rows.extend(loc.rows)
        else:
            for k in loc.kids:
                rows.extend(k.rows)
        vals = [r.cells[-2] for r in rows]
        preds.append([mode([k for k in vals])[0].tolist()])
    return preds
def common_value_imputer(data, add_binary=False):
    """
    A function for filling missing values in a dataset with the most common value for each feature.

    :param data: dataset
    :param add_binary: add additional columns with a missing/not-missing mask
    :return: dataset without missing values
    """
    X = np.array(data)
    mask = X != X
    for col in range(X.shape[1]):
        X[mask[:, col], col] = mode(X[~mask[:, col], col])[0][0]
    if add_binary:
        X = _add_missing_binary(X, mask)
    return X
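# A hedged usage sketch of the imputation idea above, assuming NaN marks the
# missing entries (the `X != X` mask only catches NaN-style missingness).
import numpy as np
from scipy.stats import mode

X = np.array([[1.0, 7.0],
              [np.nan, 7.0],
              [1.0, np.nan],
              [2.0, 5.0]])
filled = X.copy()
mask = filled != filled                    # True where NaN
for col in range(filled.shape[1]):
    fill = np.atleast_1d(mode(filled[~mask[:, col], col])[0])[0]
    filled[mask[:, col], col] = fill
print(filled)                              # NaNs replaced by the column modes (1.0 and 7.0)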
def transmode(self):
    """Transpose all the frequencies by setting the mode on a given reference frequency.

    :param freqref: The frequency reference to be transposed to. Default = 300
    :param ref: The note reference: mode or tonic. Default = mode
    :return: the transposed frequencies
    """
    if self.transpositionref == "mode":
        interv_transpo = mode(self.freq)[0] / self.freqref
    if self.transpositionref == "tonic":
        T = float(self.tonique(self.percent, self.method)[1])
        print "Tonic:", T
        if T > self.freqref:
            interv_transpo = T / self.freqref
        if T < self.freqref:
            interv_transpo = self.freqref / T
    print "Transposition interval:", interv_transpo
    self.freqtransposed = self.freq / interv_transpo
    return self.freqtransposed
def subpage_squash(packet_iter, minimum_dups=3, pages=All, yield_func=packets): subpages = defaultdict(list) for pl in paginate(packet_iter, pages=pages, yield_func=packet_lists, drop_empty=True): subpagekey = (pl[0].mrag.magazine, pl[0].header.page, pl[0].header.subpage) arr = numpy.zeros((42, 32), dtype=numpy.uint8) for p in pl: arr[:,p.mrag.row] = p._original_bytes subpages[subpagekey].append(arr) for arrlist in subpages.itervalues(): if len(arrlist) >= minimum_dups: arr = mode(numpy.array(arrlist), axis=0)[0][0].astype(numpy.uint8) packets = [] for i in range(32): if arr[:,i].any(): packets.append(Packet.from_bytes(arr[:,i])) for item in yield_func(packets): yield item
def test_indices_convert(self): "Test the conversion of ensoindices from one frequency to another." ensoi = self.ensoi series = ClimateSeries(np.random.rand(len(ensoi)), dates=ensoi._dates, ensoindicator = ensoi) series.set_ensoindices(minimum_size=5, reference_season='NDJ') control = ts.time_series([ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-1,-1, -1,-1,-1,-1,-1, 0, 0,-1,-1,-1,-1,-1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,+1,+1, +1,+1,+1,+1,+1,+1,+1, 0, 0, 0, 0, 0,], dates=ensoi.dates) assert_equal(series.ensoindices, control) # Conversion 'M' to 'D' dseries = series.convert('D') assert_equal(dseries.ensoindices, ts.lib.backward_fill(control.convert('D'))) # Conversion 'M' to 'A' aseries = series.convert('A', func=ma.mean) assert_equal(aseries.ensoindices, mode(control.convert('A'), axis=1)[0].squeeze())
def multi_window(sig, win):
    '''
    algorithm picking arrival times
    the maximum amplitudes correspond to sending and arriving times
    n-dimensional extension

    sig - (N,M) numpy array
          N - number of sonic tracks
          M - data points of oscilloscope
    win - 3-element list
    '''
    sig0 = sig - mode(sig, axis=1)[0]  # remove shift in amplitude
    E = sig0**2
    N = E.shape[1] - win[2] - win[0] - 1
    BTA = np.zeros((E.shape[0], N))  # before term average
    ATA = np.zeros((E.shape[0], N))  # after term average
    DTA = np.zeros((E.shape[0], N))  # delayed term average
    iterator = np.arange(N)
    for i in np.nditer(iterator):
        BTA[:, i] = np.mean(E[:, i:i + win[0]], axis=1)
        ATA[:, i] = np.mean(E[:, i + win[0]:i + win[0] + win[1]], axis=1)
        DTA[:, i] = np.mean(E[:, i + win[0]:i + win[0] + win[2]], axis=1)
    r = ATA / BTA + DTA / BTA
    return r / 10
def execute(self):
    likelihoods = self.cluster_list[0].get_gmm().score(self.X)
    self.cluster_list[0].reset_data()
    for cluster in self.cluster_list[1:]:
        likelihoods = numpy.column_stack((likelihoods, cluster.get_gmm().score(self.X)))
        cluster.reset_data()

    if self.number_of_clusters == 1:
        self.most_likely = numpy.zeros(len(self.X))
    else:
        self.most_likely = likelihoods.argmax(axis=1)

    # Across 250 frames of observations,
    # vote on which cluster they should be associated with
    data_range = range(0, self.N, self.interval_size)
    if data_range[-1] < self.N:
        data_range.append(self.N)

    for i, v in enumerate(data_range[0:len(data_range) - 1]):
        current_segment_indexes = range(data_range[i], data_range[i + 1])
        current_segment_scores = numpy.array(self.most_likely[current_segment_indexes])
        most_likely_gmm_class = int(stats.mode(current_segment_scores)[0][0])
        current_segment_data = self.X[current_segment_indexes, :]
        segment = Segment(
            data_range[i],
            data_range[i + 1],
            current_segment_data,
            self.cluster_list[most_likely_gmm_class].get_name()
        )
        self.cluster_list[most_likely_gmm_class].add_segment(segment)

    new_cluster_list = []
    for cluster in self.cluster_list:
        if len(cluster.get_segments()) > 0:
            cluster.train_gmm()
            new_cluster_list.append(cluster)
    return new_cluster_list
def test_frame(frame, desc):
    surf = cv2.SURF(400)
    # kp, descriptor = surf.detectAndCompute(cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY), None)
    kp, descriptor = surf.detectAndCompute(frame, None)
    if descriptor is None:
        return -1
    points = descriptor.shape[0]
    matches = np.zeros((points, 1))
    best_poke_desc = np.zeros((points, 151))
    best_poke_desc[:, 0] = 151
    for pt in range(0, points):
        d = descriptor[pt, :].reshape(1, 128)
        for poke in range(1, 151):
            best_poke_desc[pt, poke] = np.min(sp.cdist(desc[poke], d, 'Euclidean'))
        matches[pt] = np.argmin(best_poke_desc[pt, :])
    val, count = mstats.mode(matches)
    if count[0][0] < 4:
        return -1
    return val[0][0]