Example #1
def mode(x: pd.Series, w: Union[Window, int] = Window(None, 0)) -> pd.Series:
    """
    Most common value in series over given window

    :param x: series: timeseries
    :param w: Window or int: size of window and ramp up to use. e.g. Window(22, 10) where 22 is the window size
              and 10 the ramp up value. Window size defaults to length of series.
    :return: timeseries of mode value

    **Usage**

    Computes the `mode <https://en.wikipedia.org/wiki/Mode_(statistics)>`_ over a given window. For each window, this
    function will return the most common value of all elements in the window. If there are multiple values with the same
    frequency of occurrence, it will return the smallest value.

    If window is not provided, computes mode over the full series.

    **Examples**

    Generate price series and compute mode over :math:`22` observations

    >>> prices = generate_series(100)
    >>> mode(prices, 22)

    **See also**

    :func:`mean` :func:`median`
    """
    w = normalize_window(x, w)
    assert x.index.is_monotonic_increasing, "series index must be monotonic increasing"
    if isinstance(w.w, pd.DateOffset):
        values = [stats.mode(x.loc[(x.index > idx - w.w) & (x.index <= idx)]).mode[0] for idx in x.index]
        return apply_ramp(pd.Series(values, index=x.index, dtype=np.dtype(float)), w)
    else:
        return apply_ramp(x.rolling(w.w, 0).apply(lambda y: stats.mode(y).mode, raw=True), w)
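Note (not part of the example above): the docstring states that ties resolve to the smallest value; that behaviour comes from scipy.stats.mode. Below is a minimal, self-contained sketch of the same rolling computation, assuming only numpy, pandas and scipy are installed (generate_series, Window and apply_ramp are helpers of the surrounding library, so a hand-built series stands in for them).

import numpy as np
import pandas as pd
from scipy import stats

def window_mode(values):
    # Mode of one window; scipy.stats.mode resolves ties to the smallest value.
    return float(np.atleast_1d(stats.mode(values, axis=None).mode)[0])

prices = pd.Series([3.0, 1.0, 3.0, 2.0, 2.0, 1.0, 1.0])  # stand-in for generate_series()

# min_periods=1 mimics rolling(w, 0): the window "ramps up" at the start of the series.
print(prices.rolling(3, min_periods=1).apply(window_mode, raw=True))
# On the second window [3.0, 1.0] the counts tie, so the smaller value 1.0 is returned.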
Example #2
def process_merge_severity_type(df, severity_type, event_type):
    # severity type
    severity_type = severity_type.merge(df[['id', 'fault_severity', 'source']],
                                        on='id')
    unique_severity_type = pd.DataFrame(
        severity_type['severity_type'].value_counts())
    unique_severity_type['PercTrain'] = severity_type.pivot_table(
        values='source',
        index='severity_type',
        aggfunc=lambda x: sum(x == 'train') / float(len(x)))
    unique_severity_type['severity_mode'] = severity_type.loc[
        severity_type['source'] == 'train'].pivot_table(
            values='fault_severity',
            index='severity_type',
            aggfunc=lambda x: mode(x)[0])
    severity_type.loc[event_type['source'] == 'train'].pivot_table(
        values='fault_severity',
        index='severity_type',
        aggfunc=lambda x: mode(x))
    severity_type_merge = severity_type.pivot_table(values='source',
                                                    index='id',
                                                    columns='severity_type',
                                                    aggfunc=lambda x: len(x),
                                                    fill_value=0)
    return df.merge(severity_type_merge, left_on='id', right_index=True)
Example #3
def subpage_squash(packet_lists, min_duplicates=3, ignore_empty=False):
    """Yields squashed subpages."""

    spdict = defaultdict(list)
    for pl in packet_lists:
        if len(pl) > 1:
            subpage = Subpage.from_packets(pl, ignore_empty=ignore_empty)
            spdict[(subpage.mrag.magazine, subpage.header.page,
                    subpage.header.subpage)].append(subpage)

    for splist in tqdm(spdict.values(), unit=' Subpages'):
        if len(splist) >= min_duplicates:
            numbers = mode(np.stack(
                [np.clip(sp.numbers, -100, -1) for sp in splist]),
                           axis=0)[0][0].astype(np.int64)
            s = Subpage(numbers=numbers)
            for row in range(29):
                if row in [26, 27, 28]:
                    for dc in range(16):
                        if s.number(row, dc) > -100:
                            packets = [
                                sp.packet(row, dc) for sp in splist
                                if sp.number(row, dc) > -100
                            ]
                            arr = np.stack([p[3:] for p in packets])
                            s.packet(row, dc)[:3] = packets[0][:3]
                            if row == 27:
                                s.packet(row, dc)[3:] = mode(
                                    arr, axis=0)[0][0].astype(np.uint8)
                            else:
                                t = arr.astype(np.uint32)
                                t = t[:, 0::3] | (t[:, 1::3] << 8) | (
                                    t[:, 2::3] << 16)
                                result = mode(t,
                                              axis=0)[0][0].astype(np.uint32)
                                s.packet(row, dc)[3::3] = result & 0xff
                                s.packet(row, dc)[4::3] = (result >> 8) & 0xff
                                s.packet(row, dc)[5::3] = (result >> 16) & 0xff
                else:
                    if s.number(row) > -100:
                        packets = [
                            sp.packet(row) for sp in splist
                            if sp.number(row) > -100
                        ]
                        arr = np.stack([p[2:] for p in packets])
                        s.packet(row)[:2] = packets[0][:2]
                        s.packet(row)[2:] = mode(arr,
                                                 axis=0)[0][0].astype(np.uint8)

            yield s
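Note (not part of the project code above): the core of subpage_squash is an element-wise majority vote over a stack of noisy copies of the same packet. A small sketch of that step alone, assuming only numpy and scipy; the Subpage/Packet classes are not needed for the idea.

import numpy as np
from scipy import stats

# Three noisy copies of the same 6-byte payload; two copies each carry one corrupted byte.
copies = np.stack([
    np.array([32, 65, 66, 67, 32, 13], dtype=np.uint8),
    np.array([32, 65, 98, 67, 32, 13], dtype=np.uint8),
    np.array([32, 65, 66, 67, 33, 13], dtype=np.uint8),
])

# Element-wise vote down the stack (axis=0), as done per packet above.
voted = stats.mode(copies, axis=0)
squashed = np.atleast_2d(voted.mode)[0].astype(np.uint8)
print(squashed)  # [32 65 66 67 32 13] -- isolated corruption is voted away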
Example #4
    def segment_majority_vote_indices(self, interval_size, em_iters):

        num_clusters = len(self.gmm_list)

        # Resegment data based on likelihood scoring
        likelihoods = self.gmm_list[0].score(self.X)
        for g in self.gmm_list[1:]:
            likelihoods = np.column_stack((likelihoods, g.score(self.X)))

        if num_clusters == 1:
            most_likely = np.zeros(len(self.X))
        else:
            most_likely = likelihoods.argmax(axis=1)

        # Across 2.5 secs of observations, vote on which cluster they should be associated with

        iter_training = {}

        for i in range(interval_size, self.N, interval_size):
            arr = np.array(most_likely[(range(i - interval_size, i))])
            max_gmm = int(stats.mode(arr)[0][0])
            iter_training.setdefault((self.gmm_list[max_gmm], max_gmm),
                                     []).append((i - interval_size, i))

        arr = np.array(most_likely[(range(
            (self.N / interval_size) * interval_size, self.N))])
        max_gmm = int(stats.mode(arr)[0][0])
        iter_training.setdefault((self.gmm_list[max_gmm], max_gmm),[]).\
                                  append((self.N/interval_size*interval_size, self.N))
        iter_bic_dict = {}
        iter_bic_list = []

        for gp, e_tuple_list in iter_training.iteritems():
            g = gp[0]
            p = gp[1]

            cluster_indices = np.array(range(e_tuple_list[0][0],
                                             e_tuple_list[0][1], 1),
                                       dtype=np.int32)
            for d in e_tuple_list[1:]:
                cluster_indices = np.concatenate((cluster_indices,\
                                                  np.array(range(d[0],d[1],1),\
                                                  dtype=np.int32)))

            g.train_on_subset(self.X, cluster_indices, max_em_iters=em_iters)

            iter_bic_list.append((g, cluster_indices))
            iter_bic_dict[p] = cluster_indices

        return iter_bic_dict, iter_bic_list, most_likely
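Note (not part of the project code above): the resegmentation step assigns each fixed-length interval of frames to the cluster that wins a majority vote of the per-frame labels. A simplified numpy/scipy sketch of that voting loop; the real method keys the dict by (gmm, index) and handles the tail interval separately because of Python 2 integer division.

import numpy as np
from scipy import stats

# Per-frame cluster labels, e.g. the argmax of per-GMM likelihoods, voted in chunks of 5.
most_likely = np.array([0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 2, 2, 2, 2, 1, 2, 2])
interval_size = 5
N = len(most_likely)

votes = {}
for start in range(0, N, interval_size):
    chunk = most_likely[start:start + interval_size]  # the last chunk may be shorter
    winner = int(np.atleast_1d(stats.mode(chunk, axis=None).mode)[0])
    votes.setdefault(winner, []).append((start, min(start + interval_size, N)))

print(votes)  # {0: [(0, 5)], 1: [(5, 10)], 2: [(10, 15), (15, 17)]}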
Example #5
    def segment_majority_vote(self, interval_size, em_iters):
        
        num_clusters = len(self.gmm_list)

        # Resegment data based on likelihood scoring
        score_time = time.time()
        likelihoods = self.gmm_list[0].score(self.X)
        for g in self.gmm_list[1:]:
            likelihoods = np.column_stack((likelihoods, g.score(self.X)))

        if num_clusters == 1:
            most_likely = np.zeros(len(self.X))
        else:
            most_likely = likelihoods.argmax(axis=1)
        self.ftime.write("Score: {0}\n".format(time.time() - score_time))


        # Across 2.5 secs of observations, vote on which cluster they should be associated with

        iter_training = {}
        segment_time = time.time()
        for i in range(interval_size, self.N, interval_size):

            arr = np.array(most_likely[(range(i-interval_size, i))])
            max_gmm = int(stats.mode(arr)[0][0])
            iter_training.setdefault((self.gmm_list[max_gmm],max_gmm),[]).append(self.X[i-interval_size:i,:])

        arr = np.array(most_likely[(range((self.N/interval_size)*interval_size, self.N))])
        max_gmm = int(stats.mode(arr)[0][0])
        iter_training.setdefault((self.gmm_list[max_gmm], max_gmm),[]).append(self.X[(self.N/interval_size)*interval_size:self.N,:])
        
        iter_bic_dict = {}
        iter_bic_list = []

        # for each gmm, append all the segments and retrain
        for gp, data_list in iter_training.iteritems():
            g = gp[0]
            p = gp[1]
            cluster_data =  data_list[0]

            for d in data_list[1:]:
                cluster_data = np.concatenate((cluster_data, d))

            g.train(cluster_data, max_em_iters=em_iters)

            iter_bic_list.append((g,cluster_data))
            iter_bic_dict[p] = cluster_data

        self.ftime.write("Segment: {0}\n".format(time.time() - segment_time))
        return iter_bic_dict, iter_bic_list, most_likely
Example #6
File: cluster.py Project: ppr10/gmm
    def segment_majority_vote(self, interval_size, em_iters):

        num_clusters = len(self.gmm_list)

        # Resegment data based on likelihood scoring
        likelihoods = self.gmm_list[0].score(self.X)
        for g in self.gmm_list[1:]:
            likelihoods = np.column_stack((likelihoods, g.score(self.X)))

        if num_clusters == 1:
            most_likely = np.zeros(len(self.X))
        else:
            most_likely = likelihoods.argmax(axis=1)

        # Across 2.5 secs of observations, vote on which cluster they should be associated with

        iter_training = {}

        for i in range(interval_size, self.N, interval_size):

            arr = np.array(most_likely[(range(i - interval_size, i))])
            max_gmm = int(stats.mode(arr)[0][0])
            iter_training.setdefault((self.gmm_list[max_gmm], max_gmm),
                                     []).append(self.X[i - interval_size:i, :])

        arr = np.array(most_likely[(range(
            (self.N / interval_size) * interval_size, self.N))])
        max_gmm = int(stats.mode(arr)[0][0])
        iter_training.setdefault((self.gmm_list[max_gmm], max_gmm), []).append(
            self.X[(self.N / interval_size) * interval_size:self.N, :])

        iter_bic_dict = {}
        iter_bic_list = []

        # for each gmm, append all the segments and retrain
        for gp, data_list in iter_training.iteritems():
            g = gp[0]
            p = gp[1]
            cluster_data = data_list[0]

            for d in data_list[1:]:
                cluster_data = np.concatenate((cluster_data, d))

            g.train(cluster_data, max_em_iters=em_iters)

            iter_bic_list.append((g, cluster_data))
            iter_bic_dict[p] = cluster_data

        return iter_bic_dict, iter_bic_list, most_likely
Example #7
def main():
	print "Starting...."
	RE = ReccomendationEngine()
	tweets = RE.get_corpus()
	model, centroids = RE.get_Kmeans()
	user_name = raw_input('Please enter twitter username to provide reccomendation for:')
	recommendations = RE.get_recommendations(user_name)
	print "Follow these users?\n"
	for i in recommendations[:5]:
		print i
		print

	evaluation = raw_input('How did I do at suggesting followers (on a scale of 1 to 5)?')
	feedback = client.twitter_data.user_feedback

	print "Thanks for the feedback, we'll do better next time!"
	feedback.insert({'user' : user_name,
                    'rating': evaluation,
                    'date_time': datetime.now()
                    })

	print 'Now updating scores'
	for i in users.find():
    		users.update({"_id": i["_id"]},
                 {'$set': {'cluster_score': int(mode(RE.model.predict(RE.vect.transform(i['tweets'])), 
                 			axis=None)[0]
                 			)}
                 })
Example #8
	def optimize(self,J_test,n_evals):
		# J_test is a 1 by n_theta array
		ranked_idxs = np.argsort(-self.J_train) # sort each row
		best_per_scene = ranked_idxs[:,0]
		idx_to_eval = int(mode(best_per_scene)[0])
		
		
		plan_dists = self.computePlanDist(self.J_train)
		
		B_values_context = np.mean(self.J_train,axis=0)+np.std(self.J_train,axis=0)
		evaluated_idxs = np.array([idx_to_eval])
		for n in range(n_evals):
			B_values_plan = np.transpose( J_test[0,evaluated_idxs] ) + \
					  plan_dists[evaluated_idxs,:]
			B_values_plan = np.asmatrix(B_values_plan)
			B_values = np.min(np.r_[B_values_plan,B_values_context],axis=0)
			#TODO: how come I cannot slice this?
			for eval_idx in evaluated_idxs:
				B_values[0,eval_idx] = \
					J_test[0,eval_idx]
#			idx_to_eval=np.argmax(B_values)
			sorted_Bval_idxs = np.argsort(-B_values)
			
			idx_to_eval = sorted_Bval_idxs[0, 0]
			if n != n_evals - 1:
				evaluated_idxs = np.r_[evaluated_idxs, idx_to_eval]
		import pdb; pdb.set_trace()
Example #9
 def _fix_cachedcurrent(self, ensoindices):
     """
 Private function to convert cached ENSO indices.
 This function is used to ensure that:
 * the frequency of the indices matches the frequency of the indicator.
 * the date range of the indices matches the date range of the indicator.
 * the shape of the indices matches the shape of the indicator.
     """
     _currentfreq = ensoindices._dates.freq
     _freq = self._dates.freq
     # Check the frequency, and convert if needed
     if _currentfreq > _freq:
         # To a lower frequency ('Q'/'A')
         if self.ndim == 2:
             conversionfunc = None
         else:
             conversionfunc = lambda x: mode(x)[0].squeeze()
         ensoindices = ensoindices.convert(_freq, func=conversionfunc)
     elif _currentfreq < _freq:
         # To a higher frequency (eg, 'D')
         ensoindices = backward_fill(ensoindices.convert(_freq))
     # Reset to the original frequency if needed...
     (start, end) = self._dates.flat[[0, -1]]
     if tuple(ensoindices._dates.flat[[0, -1]]) != (start, end):
         ensoindices = ts.adjust_endpoints(ensoindices, start_date=start, end_date=end)
     # Reset the shape of the indices
     if ensoindices.shape != self.shape:
         ensoindices.shape = self.shape
     return ensoindices
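Note (not part of the project code above): the conversion to a lower frequency takes the mode of the indices that fall inside each coarser period (scikits.timeseries convert() with a per-period function). A rough pandas analogue of that idea, assuming only numpy, pandas and scipy.

import numpy as np
import pandas as pd
from scipy import stats

# Monthly ENSO-phase indices (-1, 0, +1); downsample to one value per year by taking the mode.
monthly = pd.Series([1, 1, 1, 0, 0, 0, 0, 0, -1, -1, -1, -1],
                    index=pd.date_range("2000-01-01", periods=12, freq="MS"))

annual = monthly.groupby(monthly.index.year).agg(
    lambda x: int(np.atleast_1d(stats.mode(x, axis=None).mode)[0]))
print(annual)  # 2000 -> 0, the phase that occurs most often within the year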
Example #10
 def update_db(self):
     for i in users.find():
         users.update({"_id": i["_id"]},
              {'$set': {'cluster_score': int(mode(model.predict(RE.vect.transform(i['tweets'])), 
                         axis=None)[0]
                         )}
              })
Example #11
def spectral(tweetfile,npmifile,dictfile,k,noc):
	Ptmp=textscan(npmifile,'([^ ]*) ([^ ]*) ([^ ]*)');
	PP=textscan(dictfile,'(.*) (.*)',(int,str));
	PP[0] -= 1
	PMI=ssp.coo_matrix(
		(Ptmp[2],(Ptmp[0]-1,Ptmp[1]-1)),
		(PP[0].shape[0],PP[0].shape[0])
	).tocsr();

	W=knnmatrix(PMI,k);
	# This is hideous and wrong and it must be fixed
	W=ssp.csr_matrix(minimum(W.todense(),W.T.todense()))
	
	s,comp = ssp.csgraph.connected_components(W,directed=False)
	comp_mode = mstats.mode(comp)[0]
	inds = comp==comp_mode
	inds = [x for x in range(W.shape[0]) if inds[x]]
	WW = W[inds,:][:,inds]
	P=PP[1][inds];

	ids = P;
	X = WW;

	c = spectral_clustering(X,n_clusters=noc, eigen_solver='arpack')
	fid=file("".join(['cl.',tweetfile,'-',str(noc)]),'w');
	for i in range(max(c)+1):
		cl=[x for x in range(len(c)) if c[x] == i]
		b,wordsix = centralityn(cl,X,ids);
		for j in range(len(b)):
			word=wordsix[j];
			fid.write('%s %d %.5f\n'%(word,i,b[j]));
Example #12
    def __zonal_stats(self, f, masked):
        for op, alias in self.operations_idx.iteritems():
            if op == 'min':
                v = float(masked.min())
            elif op == 'max':
                v = float(masked.max())
            elif op == 'sum':
                v = float(masked.sum())
            elif op == 'count':
                v = int(masked.count())
            elif op == 'mean':
                v = float(masked.mean())
            elif op == 'std':
                v = float(masked.std())
            elif op == 'unique':
                v = numpy.unique(masked.compressed()).size
            elif op == 'range':
                v = float(masked.max()) - float(masked.min())
            elif op == 'var':
                v = float(masked.var())
            elif op == 'median':
                v = float(numpy.ma.median(masked))
            elif op == 'mode':
                v = float(mode(masked, axis=None)[0][0])
            else:
                self.logger.warning('Unrecognized operation "{}".'.format(op))
                continue

            self.layer.startEditing()

            f[alias] = NULL if numpy.isnan(v) else v
            self.layer.updateFeature(f)

            self.layer.commitChanges()
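Note (not part of the project code above): for the 'mode' operation the raster cells are a masked array, so the nodata cells have to be ignored. scipy.stats.mstats.mode does exactly that; a tiny sketch, assuming only numpy and scipy.

import numpy as np
import numpy.ma as ma
from scipy.stats import mstats

# A small zone where -9999 marks nodata; masked cells are ignored by mstats.mode.
raster = ma.masked_equal(np.array([[7, 7, -9999],
                                   [3, 7,     3],
                                   [3, -9999,  7]]), -9999)

v = float(np.atleast_1d(mstats.mode(raster, axis=None).mode)[0])
print(v)  # 7.0 -- 7 appears four times among the unmasked cells, 3 only three times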
Example #13
File: getdata.py Project: yiorg/lambda
def mode(x):
    """
    Computes a single value for the mode of an array
    :param x:
    :return: returns the mode of an array
    """
    return mstats.mode(x, axis=None)[0]
Example #14
def main():
    map_obstacles = np.loadtxt('obstacles_map.txt')
    laser_obstacles = np.loadtxt('obstacles_laser.txt')

    # Center the data
    map_obstacles = center(map_obstacles)
    laser_obstacles = center(laser_obstacles)

    # Translate and rotate the measurements
    true_rotation = np.pi / 10
    true_translation = np.array([10, 10])

    laser_trans = translate(laser_obstacles, true_translation)
    laser_rot = rotate(laser_trans, true_rotation)

    # Run k-means
    map_code, map_dist = sp.kmeans(map_obstacles, 10)
    code, dist = sp.vq(laser_obstacles, map_code)

    m = stats.mode(code)
    pos = map_code[m[0][0]]

    print "Code:", map_code
    print "Dist:", dist
    print "Most likely cluster:", m
    print "Which corresponds to:", pos

    # plot_super(map_obstacles, laser_obstacles, "Original")
    plot_super(map_obstacles, map_code, "Map's k-means centers")
    plot_super(map_obstacles, np.vstack([laser_obstacles, pos]), "The Cluster")
    # plot_super(laser_obstacles, laser_code, "Laser's k-means centers")
    # plot_super(map_obstacles, laser_trans, "Measure misaligned with map")
    # plot_super(map_obstacles, laser_reloc, "Measure realigned with map")
    plt.show()
Example #15
    def __zonal_stats(self, f, masked):
        attrs = f.attributes()

        for op, idx in self.operations_idx.iteritems():
            if op == 'min':
                v = float(masked.min())
            elif op == 'max':
                v = float(masked.max())
            elif op == 'sum':
                v = float(masked.sum())
            elif op == 'count':
                v = int(masked.count())
            elif op == 'mean':
                v = float(masked.mean())
            elif op == 'std':
                v = float(masked.std())
            elif op == 'unique':
                v = numpy.unique(masked.compressed()).size
            elif op == 'range':
                v = float(masked.max()) - float(masked.min())
            elif op == 'var':
                v = float(masked.var())
            elif op == 'median':
                v = float(numpy.ma.median(masked))
            elif op == 'mode':
                v = float(mode(masked, axis=None)[0][0])
            else:
                self.logger.warning('Unrecognized operation "{}".'.format(op))
                continue

            attrs.insert(idx, None if numpy.isnan(v) else v)
        return attrs
Example #16
def binner(x, y, w_sta, nbins, rang = None, ebar = False, per = None) :
	from numpy import array, digitize, lexsort, linspace
	from numpy.ma import average, median

	ind    = lexsort((y, x))
	xs, ys = x[ind], y[ind]

	if rang is None : mn, mx = min(xs), max(xs)
	else            : mn, mx = rang
	
	bins  = linspace(mn, mx, nbins + 1)
	x_cen = (bins[: - 1] + bins[1:])*0.5
	bins  = linspace(mn, mx, nbins)
	ibins = digitize(xs, bins)

	if w_sta   == "median" : y_sta = array([median(ys[ibins == i]) for i in range(1, bins.size + 1)])
	elif w_sta == "mean"   : y_sta = array([average(ys[ibins == i]) for i in range(1, bins.size + 1)])
	elif w_sta == "mode"   : y_sta = array([mode(ys[ibins == i])[0] for i in range(1, bins.size + 1)])

	if ebar   == False                : return x_cen, y_sta
	elif ebar == True and per == None :
		myer = abs(array([scoreatpercentile(ys[ibins == i], 15.8) for i in range(1, bins.size + 1)]) - y_sta)
		pyer = abs(array([scoreatpercentile(ys[ibins == i], 84.0) for i in range(1, bins.size + 1)]) - y_sta)
		yer  = array([myer, pyer])
		return x_cen, y_sta, yer

	elif ebar == True and per != None :
		myer = abs(array([scoreatpercentile(ys[ibins == i], per[0]) for i in range(1, bins.size + 1)]) - y_sta)
		pyer = abs(array([scoreatpercentile(ys[ibins == i], per[1]) for i in range(1, bins.size + 1)]) - y_sta)
		yer = array([myer, pyer])
		return x_cen, y_sta, yer
Example #17
def usdn_energy_v_hops(df_dict):
    """Plot usdn energy vs hops."""
    try:
        if 'app' in df_dict and 'pow' in df_dict:
            pow_df = df_dict['pow']
            app_df = df_dict['app']
        else:
            raise Exception('ERROR: Correct df(s) not in dict!')
    except Exception:
        traceback.print_exc()
        sys.exit(0)

    # Add hops to node_df. N.B. cols with NaN are always converted to float
    hops = app_df[['src', 'hops']].groupby('src').agg(lambda x: mode(x)[0])
    pow_df = pow_df.join(hops['hops'].astype(int))

    df = pow_df.groupby('hops')['all_rdc']    \
               .apply(lambda x: x.mean()) \
               .reset_index()             \
               .set_index('hops')
    x = df.index.tolist()
    y = df['all_rdc'].tolist()
    cpplot.plot_bar(df,
                    'usdn_energy_v_hops',
                    sim_dir,
                    x,
                    y,
                    xlabel='Hops',
                    ylabel='Radio duty cycle (%)')
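Note (not part of the project code above): the hop count per node is taken as the mode of the hop values reported by that node, via groupby('src').agg(lambda x: mode(x)[0]). A minimal sketch of that aggregation, assuming only numpy, pandas and scipy.

import numpy as np
import pandas as pd
from scipy.stats import mode

# Per-packet log: each source reports a hop count that occasionally flaps.
app_df = pd.DataFrame({"src": [1, 1, 1, 2, 2, 2, 2],
                       "hops": [2, 2, 3, 4, 4, 4, 5]})

hops = app_df.groupby("src")["hops"].agg(
    lambda x: int(np.atleast_1d(mode(x, axis=None).mode)[0]))
print(hops)  # src 1 -> 2, src 2 -> 4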
Example #18
def show_dist(start_stocks=StartingPortfolio.stocks, lc=True):
    ages = []
    for iteration in range(0, iterations):
        ages.append(simulate(start_stocks, lc))

    min_age = min(ages)
    max_age = max(ages)
    for pa in range(60, 66):
        print('probability for age ', pa, ': ', calc_probability(ages, pa))
    print('mean: ', mean(ages))
    print('stdev: ', stdev(ages))
    print('variance: ', variance(ages))
    print('mode: ', mode(ages))
    print('median: ', median(ages))
    print('min age: ', min_age)
    print('max age: ', max_age)
    print('interval (max-min): ', max_age - min_age)

    sns.distplot(ages,
                 norm_hist=True,
                 kde=False,
                 bins=np.arange(min_age, max_age + 1))

    # with open('ages100.txt', mode='wt', encoding='utf-8') as file:
    #     file.write('\n'.join((str(i) for i in set(ages))))
    # sns.kdeplot(ages, shade=True)

    # pyplot.hist(ages, density=True)

    pyplot.show()
Example #19
def getModeWeek(dfDat):

    # find the most frequent week when max revenue occurs : 1 = week 1; 2 = week 2
    # from all merged 6w data
    dfDat['week'] = np.where((dfDat['nWeek'] % 2 == 0), 2, 1)
    datWeek = dfDat[['rid', 'sic', 'week']]
    # mode of timeOfDay for every rid+sic key
    grpWeekRev = datWeek.groupby(['rid', 'sic'])
    grpName = []

    modeWeek = 0
    dfModeWeek = pd.DataFrame({'rid': [], 'sic': [], 'weekMaxRev': []})
    for name, group in grpWeekRev:
        grpName.append(group)
    for jNum in range(0, len(grpName)):
        tmpArr = grpName[jNum]
        tmpArr = tmpArr.set_index(['rid', 'sic'])
        modeTmp = mode(tmpArr['week'])
        modeWeek = modeTmp[0][0]
        dfWeek = pd.DataFrame({
            'rid': [(tmpArr.index[0][0])],
            'sic': [(tmpArr.index[0][1])],
            'weekMaxRev': [modeWeek]
        })
        dfModeWeek = dfModeWeek.append(dfWeek, ignore_index=True)
    del tmpArr, modeTmp, modeWeek, dfWeek, grpWeekRev, grpName, datWeek
    return dfModeWeek
Example #20
def usdn_pdr_v_hops(df_dict):
    """Plot usdn energy vs hops."""
    try:
        if 'app' in df_dict:
            app_df = df_dict['app']
        else:
            raise Exception('ERROR: Correct df(s) not in dict!')
    except Exception:
        traceback.print_exc()
        sys.exit(0)

    # Get hops for each node. N.B. cols with NaN are always converted to float
    df = app_df[['src', 'hops']].groupby('src').agg(lambda x: mode(x)[0])
    # Calculate PRR
    df['pdr'] = app_df.groupby('src')['drpd'] \
                      .apply(lambda x: ratio(len(x), x.sum()))

    df = df.groupby('hops')['pdr']    \
           .apply(lambda x: x.mean()) \
           .reset_index()             \
           .set_index('hops')
    x = df.index.tolist()
    y = df['pdr'].tolist()
    cpplot.plot_bar(df,
                    'usdn_pdr_v_hops',
                    sim_dir,
                    x,
                    y,
                    xlabel='Hops',
                    ylabel='PDR (%)')
Example #21
def getModeTime(dfDat):

    # find the most frequent time of day and datetime when max revenue occurs
    # from all merged 6W data

    # mode of timeOfDay for every rid+sic key
    grpTimeOfDay = dfDat.groupby(['rid', 'sic'])
    grpName = []

    modeTime = 0
    dfModeTime = pd.DataFrame({'rid': [], 'sic': [], 'timeMaxRev': []})
    for name, group in grpTimeOfDay:
        grpName.append(group)
    for jNum in range(0, len(grpName)):
        tmpArr = grpName[jNum]
        tmpArr = tmpArr.set_index(['rid', 'sic'])
        modeTmp = mode(tmpArr['daytime'])
        modeTime = modeTmp[0][0]
        dfTimeOfDay = pd.DataFrame({
            'rid': [(tmpArr.index[0][0])],
            'sic': [(tmpArr.index[0][1])],
            'timeMaxRev': [modeTime]
        })
        dfModeTime = dfModeTime.append(dfTimeOfDay, ignore_index=True)
    del tmpArr, modeTmp, modeTime, dfTimeOfDay, grpTimeOfDay, grpName
    return dfModeTime
Example #22
def balanced_resample(data, labels):
    """Do a balanced resampling of data and labels, returning them
    See the test routine at the bottom for an example of behavior
    """
    most_common, num_required = mstats.mode(labels)
    possible_labels = np.unique(labels)

    data_resampled = []
    labels_resampled = []

    for possible_label in possible_labels:
        in_this_label = labels == possible_label

        data_buffered = np.array([])
        data_buffered = np.reshape(data_buffered, (0, data.shape[1]))
        labels_buffered = np.array([])

        while len(data_buffered) < num_required:
            data_buffered = np.vstack([data_buffered, data[in_this_label]])
            labels_buffered = np.hstack([labels_buffered, labels[in_this_label]])

        single_data_resampled, single_labels_resampled = utils.resample(
            data_buffered,
            labels_buffered,
            n_samples=int(num_required),
            replace=True
        )
        data_resampled.append(single_data_resampled)
        labels_resampled.append(single_labels_resampled)

    return np.vstack(data_resampled).astype(data.dtype), np.hstack(labels_resampled).astype(labels.dtype)
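Note (not part of the project code above): mstats.mode is used here only to find the majority class and its count, which then becomes the target size every class is resampled up to. A small sketch of that first step, assuming only numpy and scipy.

import numpy as np
from scipy.stats import mstats

labels = np.array([0, 0, 0, 0, 1, 1, 2])

# ModeResult unpacks into (most common value, its count): class 0 with 4 samples here.
most_common, num_required = mstats.mode(labels, axis=None)
most_common = float(np.atleast_1d(most_common)[0])
num_required = int(np.atleast_1d(num_required)[0])
print(most_common, num_required)  # 0.0 4 -> every class would be resampled up to 4 samples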
Example #23
def mode(x: pd.Series, w: int = 0) -> pd.Series:
    """
    Most common value in series over given window

    :param x: series: timeseries
    :param w: window: number of observations to use (defaults to length of series)
    :return: timeseries of mode value

    **Usage**

    Computes the `mode <https://en.wikipedia.org/wiki/Mode_(statistics)>`_ over a given window. For each window, this
    function will return the most common value of all elements in the window. If there are multiple values with the same
    frequency of occurrence, it will return the smallest value.

    If window is not provided, computes mode over the full series.

    **Examples**

    Generate price series and compute mode over :math:`22` observations

    >>> prices = generate_series(100)
    >>> mode(prices, 22)

    **See also**

    :func:`mean` :func:`median`
    """
    w = w or x.size
    assert x.index.is_monotonic_increasing, "series index must be monotonic increasing"
    return x.rolling(w, 0).apply(lambda y: stats.mode(y).mode, raw=True)
Example #24
def get_info_gain(data):
    info_gains = []
    g1 = []
    g2 = []
    for val in np.unique(data[:, 0]):

        m = mode(data[:, 0])
        m = float(m[1])
        n = len(data[:, 0])
        p = dict(Counter(data[:, 0]))
        m = 0
        for i in p:
            m += p[i] * i
        m = m / float(n)

        for i in range(len(data[:, 0])):
            if is_missing(data[i, 0]):
                print "yo"
                data[i, 0] = m

        g1 = data[data[:, 0] <= val][:, -1]
        g2 = data[data[:, 0] > val][:, -1]

        if g1.shape[0] != 0:
            ent1 = (float)(g1.shape[0]) * calc_info_gain(g1) / (data.shape[0])
        else:
            ent1 = 0
        if g2.shape[0] != 0:
            ent2 = (float)(g2.shape[0]) * calc_info_gain(g2) / (data.shape[0])
        else:
            ent2 = 0
        avg_ent = ent1 + ent2
        info_gains.append((avg_ent, val))

        return min(info_gains)
Example #25
 def _fix_cachedcurrent(self, ensoindices):
     """
 Private function to convert cached ENSO indices.
 This function is used to ensure that:
 * the frequency of the indices matches the frequency of the indicator.
 * the date range of the indices matches the date range of the indicator.
 * the shape of the indices matches the shape of the indicator.
     """
     _currentfreq = ensoindices._dates.freq
     _freq = self._dates.freq
     # Check the frequency, and convert if needed
     if _currentfreq > _freq:
         # To a lower frequency ('Q'/'A')
         if self.ndim == 2:
             conversionfunc = None
         else:
             conversionfunc = lambda x: mode(x)[0].squeeze()
         ensoindices = ensoindices.convert(_freq, func=conversionfunc)
     elif _currentfreq < _freq:
         # To a higher frequency (eg, 'D')
         ensoindices = backward_fill(ensoindices.convert(_freq))
     # Reset to the original frequency if needed...
     (start, end) = self._dates.flat[[0, -1]]
     if tuple(ensoindices._dates.flat[[0, -1]]) != (start, end):
         ensoindices = ts.adjust_endpoints(ensoindices,
                                           start_date=start, end_date=end)
     # Reset the shape of the indices
     if ensoindices.shape != self.shape:
         ensoindices.shape = self.shape
     return ensoindices
Example #26
    def get_gts_by_indiv(self, correct_for_odd_major = True):
        cp_2_thresh=1.0
        mode_label = int(mode(self.labels)[0][0])
        
        indiv_labels = np.array(self.labels)
        
        mu_args = np.argsort(self.all_uniq_mus) 
          
        ordered_labels = self.all_uniq_labels[mu_args]
        ordered_mus = self.all_uniq_mus[mu_args] 
        d_from_2 = np.absolute(ordered_mus-2.0)
        
        labels_to_gt = {}
        """
        if there is something that looks like a 2, use it to callibrate others
        assign 2 to the closest 1, then assign the rest as +-1 in the order
        make sure that you aren't assigning -1 genotypes
        then finally, consolidate w/ the mus 
        """
        if np.amin(d_from_2)<cp_2_thresh:
            idx = np.argmin(d_from_2)
            idx_cp = 2
        else:
            idx = 0
            idx_cp = round(ordered_mus[0])

        for i,l in enumerate(ordered_labels): 
            labels_to_gt[l] = idx_cp-(idx-i)
        
        ## ensure no -1s
        while min(labels_to_gt.values())<0:
            print "<0's detected..."
            new_labels_to_gt = {}
            for l, gt in labels_to_gt.iteritems():
                new_labels_to_gt[l] = gt+1
            labels_to_gt = new_labels_to_gt
       
        ##correct for odd major alleles out of HWE 
        if correct_for_odd_major and (labels_to_gt[mode_label] %2 == 1) and np.sum(indiv_labels==mode_label) >= .5*(indiv_labels.shape[0]):
            d=0
            if self.label_to_mu[mode_label]-labels_to_gt[mode_label]>0 or min(labels_to_gt.values())==0:
                d=1
            else:
                d=-1
            new_labels_to_gt = {}
            for l, gt in labels_to_gt.iteritems():
                new_labels_to_gt[l] = gt+d
            labels_to_gt = new_labels_to_gt
        
        gts_by_indiv = {}
        for i, indiv in enumerate(self.indivs):  
            gts_by_indiv[indiv] = int(labels_to_gt[self.labels[i]]) 
        
        new_labels_to_gt = {k:int(v) for k,v in labels_to_gt.iteritems()}
        labels_to_gt = new_labels_to_gt

        gt_to_labels = {v:k for k,v in labels_to_gt.iteritems()} 

        return gts_by_indiv, gt_to_labels, labels_to_gt
Example #27
def subpage_squash(packet_lists, min_duplicates=3):
    """Yields squashed subpages."""

    spdict = defaultdict(list)
    for pl in packet_lists:
        subpage = Subpage.from_packets(pl)
        spdict[(subpage.mrag.magazine, subpage.header.page,
                subpage.header.subpage)].append(subpage)

    for splist in tqdm(spdict.values(), unit=' Subpages'):
        if len(splist) >= min_duplicates:
            arr = mode(np.stack([sp[:] for sp in splist]),
                       axis=0)[0][0].astype(np.uint8)
            numbers = mode(np.stack(
                [np.clip(sp.numbers, -100, -1) for sp in splist]),
                           axis=0)[0][0].astype(np.int64)
            yield Subpage(arr, numbers)
Example #28
    def segment_majority_vote_indices(self, interval_size, em_iters):
        
        num_clusters = len(self.gmm_list)

        # Resegment data based on likelihood scoring
        likelihoods = self.gmm_list[0].score(self.X)
        for g in self.gmm_list[1:]:
            likelihoods = np.column_stack((likelihoods, g.score(self.X)))

        if num_clusters == 1:
            most_likely = np.zeros(len(self.X))
        else:
            most_likely = likelihoods.argmax(axis=1)

        # Across 2.5 secs of observations, vote on which cluster they should be associated with

        iter_training = {}
        
        for i in range(interval_size, self.N, interval_size):
            arr = np.array(most_likely[(range(i-interval_size, i))])
            max_gmm = int(stats.mode(arr)[0][0])
            iter_training.setdefault((self.gmm_list[max_gmm],max_gmm),[]).append((i-interval_size,i))

        arr = np.array(most_likely[(range((self.N/interval_size)*interval_size, self.N))])
        max_gmm = int(stats.mode(arr)[0][0])
        iter_training.setdefault((self.gmm_list[max_gmm], max_gmm),[]).\
                                  append((self.N/interval_size*interval_size, self.N))
        iter_bic_dict = {}
        iter_bic_list = []

        for gp, e_tuple_list in iter_training.iteritems():
            g = gp[0]
            p = gp[1]

            cluster_indices =  np.array(range(e_tuple_list[0][0], e_tuple_list[0][1],1), dtype=np.int32)
            for d in e_tuple_list[1:]:
                cluster_indices = np.concatenate((cluster_indices,\
                                                  np.array(range(d[0],d[1],1),\
                                                  dtype=np.int32)))

            g.train_on_subset(self.X, cluster_indices, max_em_iters=em_iters)
            
            iter_bic_list.append((g,cluster_indices))
            iter_bic_dict[p] = cluster_indices

        return iter_bic_dict, iter_bic_list, most_likely
Example #29
def row_squash(packet_iter, n_rows):

    for l_list in split_seq(packet_iter, n_rows):
        a = numpy.array([numpy.fromstring(l.to_bytes(), dtype=numpy.uint8) for l in l_list])
        best, counts = mode(a)
        best = best[0].astype(numpy.uint8)
        p = Packet.from_bytes(best)
        p._offset = l_list[0]._offset
        yield p
Example #30
    def predict(self, x):
        prediction_matrix = numpy.zeros((x.shape[0], self.n_classifiers))
        n = 0

        for classifier in self.classifiers:
            prediction = classifier.predict(x)
            prediction_matrix[:, n] = prediction
            n+=1
        return mode(prediction_matrix, 1)[0].reshape(1, -1)[0]
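Note (not part of the project code above): this predict() is a plain majority-vote ensemble: each column of prediction_matrix holds one classifier's labels, and the row-wise mode is the ensemble's answer. A compact sketch, assuming only numpy and scipy.

import numpy as np
from scipy.stats import mode

# One row per sample, one column per classifier.
prediction_matrix = np.array([[1, 1, 2],
                              [0, 2, 2],
                              [1, 1, 1]])

voted = mode(prediction_matrix, axis=1)        # row-wise majority vote
labels = np.atleast_2d(voted.mode).reshape(-1)
print(labels)  # [1 2 1]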
Example #31
def baseline(labels):
    """
    Baseline for testing, predict majority value for any input.
    Usually, the majority is positive feedback.
    """
    mode = mstats.mode(labels).mode[0]
    print('method: BASELINE, ACC: %.3f' %
            (accuracy([mode]*len(labels), labels)))

    print()
Example #32
 def tonique(self,percent,method):
     """
     Get the tonic frequency defined as the mode of the last frequencies array.
     These are selected by the percent argument. Two methods are possible: pdf or mode.
     
     Input :
     -----------
         percent (optional) : a percentage of the number of frames from the total size
         of the frequencies array to give the last frequencies. Default percent= 8
     
     Output :
     -----------
 
         M : the mode
         N : the mode converted inside an octave
         Final_Freqs : the last frequencies according to the percentage
     """
     self.percent = percent
     self.method = method
 
     L = len(self.freq)
     Nb_Frames = L*self.percent/100
     Final_Freqs = self.freq[(L-Nb_Frames):L]
 
     if self.method=="pdf":
         # Down to the same octave centered on the mode
         #Final_Freqs[Final_Freqs>mode(self.freq)[0]*2] = Final_Freqs[Final_Freqs>mode(self.freq)[0]*2]/2.
         #Final_Freqs[Final_Freqs<(mode(self.freq)[0]/2.)] = Final_Freqs[Final_Freqs<mode(self.freq)[0]/2.]*2
 
         self.final_pdf = gaussian_kde(Final_Freqs)
         lmax= numpy.argmax(self.final_pdf(self.x))+self.xmin
         #plt.plot(self.x,self.final_pdf(self.x))
         return self.final_pdf,lmax,Final_Freqs
 
     if self.method=="mode":
         M = mode(Final_Freqs)
         if M[0] > mode(self.freq)[0]*2:
             N = M[0]/2
         elif M[0] < mode(self.freq)[0]/2:
             N = M[0]*2
         else:
             N = M[0]
         return M[0],int(N.tolist()[0]),Final_Freqs
Example #33
 def get_recommendations(self, username):
     tweets = api.user_timeline(username)
     raw_text = [i.text for i in tweets]
     # clean text
     text = []
     for i in raw_text:
         i  = ' '.join(filter(lambda x: bool(wordnet.synsets(x)), i.split(' ')))
         if len(i) > 5:
             text.append(i)
     # find users like you
     user_score = int(mode([self.model.predict(self.vect.transform([i])) for i in text], axis=None)[0])
     for i in users.find():
         users.update({"_id": i["_id"]},
              {'$set': {'cluster_score': int(mode(self.model.predict(self.vect.transform(i['tweets'])), 
                         axis=None)[0]
                         )}
              })
     recs = [(i['user_id'], i['screen_name']) for i in users.find({'cluster_score': user_score})]
     return recs
Example #34
def mode_from_str_list(score_list):
    """Takes a string like x1,x2,... which comes out of the database and computes the mode
    """    
    scores = []
    for s in score_list.split(","): #contains NULLS:
        try:
            scores.append(int(s)) 
        except:
            pass
    return mode(scores)[0][0] 
Example #35
def df_numerical_summary(df):
    """
    automatically selects all the numeric columns in a dataframe and provides their summary statistics

    Parameters: 
    ___________

        df: 
            Dataframe

    Returns:
    ________

        Dataframe

    """
    ndf = df.select_dtypes(include=[np.number])
    dft = ndf.describe().T
    nunique = [n for n in ndf.apply(pd.Series.nunique)]
    nzeros = [((ndf[col] == 0).sum()) for col in ndf]
    kurt = [x for x in ndf.kurtosis()]
    skewness = [x for x in ndf.skew()]
    modes = [x for x in ndf.apply(lambda x: mode(x, axis=None)[0]).iloc[0]]
    ranges = DataFrame({
        "1%": ndf.quantile(0.01),
        "5%": ndf.quantile(0.05),
        "95%": ndf.quantile(0.95),
        "99%": ndf.quantile(0.99)
    })
    infodf = dft.assign(nunique=nunique,
                        mode=modes,
                        median=ndf.apply(np.median),
                        nzeros=nzeros,
                        kurtosis=kurt,
                        skewness=skewness,
                        iqr=dft['75%'] - dft['25%']).join(
                            na_count(ndf)).join(ranges)

    def Round(x):
        return np.round(x, 2)

    rnd = [
        'count', 'mean', 'mode', 'median', 'std', 'min', 'max', 'nunique',
        'nzeros', 'miss', 'kurtosis', 'skewness', '25%', '50%', '75%', '1%',
        '95%', '99%', '5%', 'iqr'
    ]

    infodf[rnd] = infodf[rnd].apply(Round)

    infodf = infodf[[
        'count', 'mean', 'mode', 'median', 'std', 'min', 'max', 'nunique',
        'nzeros', 'kurtosis', 'skewness', 'miss', 'miss_percent', 'iqr', '1%',
        '5%', '25%', '50%', '75%', '95%', '99%'
    ]]
    return infodf
Example #36
def row_squash(packet_iter, n_rows):

    for l_list in split_seq(packet_iter, n_rows):
        a = numpy.array([
            numpy.fromstring(l.to_bytes(), dtype=numpy.uint8) for l in l_list
        ])
        best, counts = mode(a)
        best = best[0].astype(numpy.uint8)
        p = Packet.from_bytes(best)
        p._offset = l_list[0]._offset
        yield p
Example #37
 def test_mode(self):
     a1 = [0,0,0,1,1,1,2,3,3,3,3,4,5,6,7]
     a2 = np.reshape(a1, (3,5))
     ma1 = ma.masked_where(ma.array(a1) > 2,a1)
     ma2 = ma.masked_where(a2 > 2, a2)
     assert_equal(mstats.mode(a1, axis=None), (3,4))
     assert_equal(mstats.mode(ma1, axis=None), (0,3))
     assert_equal(mstats.mode(a2, axis=None), (3,4))
     assert_equal(mstats.mode(ma2, axis=None), (0,3))
     assert_equal(mstats.mode(a2, axis=0), ([[0,0,0,1,1]],[[1,1,1,1,1]]))
     assert_equal(mstats.mode(ma2, axis=0), ([[0,0,0,1,1]],[[1,1,1,1,1]]))
     assert_equal(mstats.mode(a2, axis=-1), ([[0],[3],[3]], [[3],[3],[1]]))
     assert_equal(mstats.mode(ma2, axis=-1), ([[0],[1],[0]], [[3],[1],[0]]))
Example #38
 def test_mode(self):
     a1 = [0,0,0,1,1,1,2,3,3,3,3,4,5,6,7]
     a2 = np.reshape(a1, (3,5))
     ma1 = ma.masked_where(ma.array(a1)>2,a1)
     ma2 = ma.masked_where(a2>2, a2)
     assert_equal(mstats.mode(a1, axis=None), (3,4))
     assert_equal(mstats.mode(ma1, axis=None), (0,3))
     assert_equal(mstats.mode(a2, axis=None), (3,4))
     assert_equal(mstats.mode(ma2, axis=None), (0,3))
     assert_equal(mstats.mode(a2, axis=0), ([[0,0,0,1,1]],[[1,1,1,1,1]]))
     assert_equal(mstats.mode(ma2, axis=0), ([[0,0,0,1,1]],[[1,1,1,1,1]]))
     assert_equal(mstats.mode(a2, axis=-1), ([[0],[3],[3]], [[3],[3],[1]]))
     assert_equal(mstats.mode(ma2, axis=-1), ([[0],[1],[0]], [[3],[1],[0]]))
Example #39
    def segment_majority_vote(self):
        
        num_clusters = len(self.gmm_list)

        # Resegment data based on likelihood scoring
        likelihoods = self.gmm_list[0].score(self.X)
        for g in self.gmm_list[1:]:
            likelihoods = np.column_stack((likelihoods, g.score(self.X)))
        most_likely = likelihoods.argmax(axis=1)

        # Across 2.5 secs of observations, vote on which cluster they should be associated with

        iter_training = {}
        interval_size = 250

        for i in range(interval_size, self.N, interval_size):
            arr = np.array(most_likely[(range(i-interval_size, i))])
            max_gmm = int(stats.mode(arr)[0][0])
            iter_training.setdefault((self.gmm_list[max_gmm],max_gmm),[]).append(self.X[i-interval_size:i,:])
        
        arr = np.array(most_likely[(range((self.N/interval_size)*interval_size, self.N))])
        max_gmm = int(stats.mode(arr)[0][0])
        iter_training.setdefault((self.gmm_list[max_gmm], max_gmm),[]).append(self.X[(self.N/interval_size)*interval_size:self.N,:])
                
        iter_bic_dict = {}
        iter_bic_list = []
        cluster_count = 0
        for gp, data_list in iter_training.iteritems():
            g = gp[0]
            p = gp[1]
            cluster_data =  data_list[0]
            for d in data_list[1:]:
                cluster_data = np.concatenate((cluster_data, d))
            cluster_data = np.ascontiguousarray(cluster_data)
            
            g.train(cluster_data)
            iter_bic_list.append((g,cluster_data))
            iter_bic_dict[p] = cluster_data
            cluster_count += 1

        return iter_bic_dict, iter_bic_list, most_likely
Example #40
def transmode(freqlist,freqref=300):
     """Transpose all the frequencies by setting the _mode_ on a given reference frequency.

     Args:
        freqlist (numpy.array) : A list of frequencies to be transposed.
        freqref (int): The frequency reference to be transposed to. Default = 300.

     Return:
        transfreq (numpy.array): a list of the transposed frequencies.
     """
     transfreq = freqlist*float(freqref)/mode(freqlist)[0]
     return transfreq
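Note (not part of the project code above): the transposition simply rescales every frequency by freqref / mode(freqlist), so the most common frequency lands exactly on the reference. A short worked example, assuming numpy and scipy.

import numpy as np
from scipy.stats import mode

freqlist = np.array([150.0, 150.0, 150.0, 200.0, 225.0])
freqref = 300

m = float(np.atleast_1d(mode(freqlist, axis=None).mode)[0])  # 150.0
transfreq = freqlist * float(freqref) / m
print(transfreq)  # [300. 300. 300. 400. 450.] -- the mode now sits on freqref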
Example #41
def binner(x, y, w_sta, nbins, rang=None, ebar=False, per=None):
    from numpy import array, digitize, lexsort, linspace
    from numpy.ma import average, median

    ind = lexsort((y, x))
    xs, ys = x[ind], y[ind]

    if rang is None: mn, mx = min(xs), max(xs)
    else: mn, mx = rang

    bins = linspace(mn, mx, nbins + 1)
    x_cen = (bins[:-1] + bins[1:]) * 0.5
    bins = linspace(mn, mx, nbins)
    ibins = digitize(xs, bins)

    if w_sta == "median":
        y_sta = array(
            [median(ys[ibins == i]) for i in range(1, bins.size + 1)])
    elif w_sta == "mean":
        y_sta = array(
            [average(ys[ibins == i]) for i in range(1, bins.size + 1)])
    elif w_sta == "mode":
        y_sta = array(
            [mode(ys[ibins == i])[0] for i in range(1, bins.size + 1)])

    if ebar == False: return x_cen, y_sta
    elif ebar == True and per == None:
        myer = abs(
            array([
                scoreatpercentile(ys[ibins == i], 15.8)
                for i in range(1, bins.size + 1)
            ]) - y_sta)
        pyer = abs(
            array([
                scoreatpercentile(ys[ibins == i], 84.0)
                for i in range(1, bins.size + 1)
            ]) - y_sta)
        yer = array([myer, pyer])
        return x_cen, y_sta, yer

    elif ebar == True and per != None:
        myer = abs(
            array([
                scoreatpercentile(ys[ibins == i], per[0])
                for i in range(1, bins.size + 1)
            ]) - y_sta)
        pyer = abs(
            array([
                scoreatpercentile(ys[ibins == i], per[1])
                for i in range(1, bins.size + 1)
            ]) - y_sta)
        yer = array([myer, pyer])
        return x_cen, y_sta, yer
Example #42
    def is_var(self, indiv_id, g, force_not_mode = False):

        idx = g.indivs.index(indiv_id)
        
        if (not force_not_mode) and len(np.unique(self.labels))>2:
            return True
        else:
            idx = g.indivs.index(indiv_id)
            m = mode(self.labels)[0]
            if self.labels[idx] != m:
                return True
        
        return False
Example #43
def MCSpreds(probs1, probs2, probs3, probs4):
    preds1 = np.argmax(probs1, axis=1)
    preds2 = np.argmax(probs2, axis=1)
    preds3 = np.argmax(probs3, axis=1)
    preds4 = np.argmax(probs4, axis=1)
    
    print("preds1 uniques are: %s" %(np.unique(preds1)))
    
    combined_preds = np.concatenate((preds1, preds2, preds3, preds4), axis=1)
    
    mode_preds = mode(combined_preds, axis=1)
    
    return mode_preds
Example #44
def find_timestep(data, n_random_values=10):
    # Random position
    pos_ini = np.random.randint(0, data.shape[0], n_random_values, 'int64')
    pos_end = pos_ini + 1

    time = data.index.values
    time_ini = time[pos_ini]
    time_end = time[pos_end]

    time_difference = time_end - time_ini
    time_step = mode(time_difference)[0][0]

    return pd.to_timedelta(time_step)
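Note (not part of the project code above): the time step is inferred as the mode of consecutive timestamp differences, which makes the estimate robust to occasional gaps. A small sketch of the same idea on an explicit index, assuming numpy, pandas and scipy.

import numpy as np
import pandas as pd
from scipy.stats import mode

# Sampled every 15 minutes, with one irregular 60-minute gap.
index = pd.to_datetime(["2021-01-01 00:00", "2021-01-01 00:15", "2021-01-01 00:30",
                        "2021-01-01 01:30", "2021-01-01 01:45"])

diffs = np.diff(index.values).astype("int64")     # differences in nanoseconds
step = int(np.atleast_1d(mode(diffs, axis=None).mode)[0])
print(pd.to_timedelta(step))                      # 0 days 00:15:00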
Example #45
    def is_var(self, indiv_id, g, force_not_mode=False):

        idx = g.indivs.index(indiv_id)

        if (not force_not_mode) and len(np.unique(self.labels)) > 2:
            return True
        else:
            idx = g.indivs.index(indiv_id)
            m = mode(self.labels)[0]
            if self.labels[idx] != m:
                return True

        return False
Example #46
    def predict(self, te):
        te_data = te.data
        n_movs, n_cells, n_samples, n_trials = te_data.shape
        cell_predictions = -np.ones((n_movs, n_cells, n_trials)).astype(int)
        final_predictions = -np.ones((n_movs, n_trials)).astype(int)

        for i_s in range(n_movs):
            for i_r in range(n_trials):
                for i_n in range(n_cells):
                    r = te_data[i_s,i_n,:,i_r]
                    dists = [np.linalg.norm(r - self.templates[i_s_p,i_n,:])
                             for i_s_p in range(self.n_stim)]
                    cell_predictions[i_s,i_n,i_r] = np.argmin(dists)
                final_predictions[i_s,i_r] \
                        = mode(cell_predictions[i_s,:,i_r])[0][0].astype(int)
        return final_predictions
Example #47
def where2prd(train, test, smoteit = True):
  "WHERE2"
  t = discreteNums(train, map(lambda x: x.cells, train._rows))
  myTree = tdiv(t)
  testCase = test._rows
  rows, preds = [], []
  for tC in testCase:
    newRow = tC;
    loc = drop(tC, myTree)  # Drop a test case in the tree & see where it lands
    if not loc.kids:
      rows.extend(loc.rows)
    else:
      for k in loc.kids: rows.extend(k.rows)
    vals = [r.cells[-2] for r in rows]
    preds.append([mode([k for k in vals])[0].tolist()])  # \
                 # if median(vals) > 0 else preds.extend([0])
  return preds
Example #48
def common_value_imputer(data, add_binary=False):
    """
    A function for filling missing values in dataset with the most common value for each feature.
    :param data: dataset
    :param add_binary: add additional binary columns indicating which values were missing
    :return: dataset without missing values
    """
    X = np.array(data)
    mask = X != X

    for col in range(X.shape[1]):
        X[mask[:, col], col] = mode(X[~mask[:, col], col])[0][0]

    if add_binary:
        X = _add_missing_binary(X, mask)

    return X
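Note (not part of the project code above): the imputer fills each column's missing entries with that column's most frequent observed value; scikit-learn's SimpleImputer(strategy='most_frequent') is an off-the-shelf equivalent. A minimal sketch of the column-wise fill, assuming only numpy and scipy.

import numpy as np
from scipy.stats import mode

X = np.array([[1.0,    2.0],
              [np.nan, 2.0],
              [1.0,    np.nan],
              [3.0,    5.0]])

mask = np.isnan(X)
for col in range(X.shape[1]):
    # Most common value among the observed entries of this column.
    fill = float(np.atleast_1d(mode(X[~mask[:, col], col], axis=None).mode)[0])
    X[mask[:, col], col] = fill
print(X)  # column 0 NaN -> 1.0, column 1 NaN -> 2.0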
Example #49
 def transmode(self):
      """Transpose all the frequencies by setting the mode on a given reference frequency
 
      :params freqref : The frequency reference to be transposed to. Default = 300 ?
      ref : The note reference : mode or tonic. Default = mode
      : return the transposed frequencies
      """
 
      if self.transpositionref=="mode":
          interv_transpo = mode(self.freq)[0]/self.freqref
      if self.transpositionref=="tonic":
         T = float(self.tonique(self.percent,self.method)[1])
         print "Tonic :",T
         if T > self.freqref :
             interv_transpo = T/self.freqref
         if T < self.freqref :
             interv_transpo = self.freqref/T
      print "Intervalle de tranposition :",interv_transpo
      self.freqtransposed = self.freq / interv_transpo
      return self.freqtransposed
Example #50
def subpage_squash(packet_iter, minimum_dups=3, pages=All, yield_func=packets):
    subpages = defaultdict(list)
    for pl in paginate(packet_iter, pages=pages, yield_func=packet_lists, drop_empty=True):
        subpagekey = (pl[0].mrag.magazine, pl[0].header.page, pl[0].header.subpage)
        arr = numpy.zeros((42, 32), dtype=numpy.uint8)
        for p in pl:
            arr[:,p.mrag.row] = p._original_bytes
        subpages[subpagekey].append(arr)

    for arrlist in subpages.itervalues():
        if len(arrlist) >= minimum_dups:
            arr = mode(numpy.array(arrlist), axis=0)[0][0].astype(numpy.uint8)
            packets = []

            for i in range(32):
                if arr[:,i].any():
                    packets.append(Packet.from_bytes(arr[:,i]))

            for item in yield_func(packets):
                yield item
Example #51
 def test_indices_convert(self):
     "Test the conversion of ensoindices from one frequency to another."
     ensoi = self.ensoi
     series = ClimateSeries(np.random.rand(len(ensoi)),
                            dates=ensoi._dates,
                            ensoindicator = ensoi)
     series.set_ensoindices(minimum_size=5, reference_season='NDJ')
     control = ts.time_series([ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-1,-1,
                               -1,-1,-1,-1,-1, 0, 0,-1,-1,-1,-1,-1,
                               -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                                0, 0, 0, 0, 0, 0, 0, 0, 0, 0,+1,+1,
                               +1,+1,+1,+1,+1,+1,+1, 0, 0, 0, 0, 0,],
                               dates=ensoi.dates)
     assert_equal(series.ensoindices, control)
     # Conversion 'M' to 'D'
     dseries = series.convert('D')
     assert_equal(dseries.ensoindices,
                  ts.lib.backward_fill(control.convert('D')))
     # Conversion 'M' to 'A'
     aseries = series.convert('A', func=ma.mean)
     assert_equal(aseries.ensoindices,
                  mode(control.convert('A'), axis=1)[0].squeeze())
Example #52
def multi_window(sig,win):
    '''
    algorithm picking arrival times
    the maximum amplitudes correspond
    to sending and arriving times
    n-dimensional extension
    sig - (N,M) numpy array
    N - number of sonic tracks
    M - data points of oscilloscope
    win - 3-element list
    '''
    sig0 = sig - mode(sig,axis=1)[0] # remove shift in amplitude
    E = sig0**2
    N = E.shape[1]-win[2]-win[0]-1
    BTA = np.zeros((E.shape[0],N)) # before term average
    ATA = np.zeros((E.shape[0],N)) # after term average
    DTA = np.zeros((E.shape[0],N)) # delayed term average
    iterator = np.arange(N)
    for i in np.nditer(iterator):
        BTA[:,i] = np.mean(E[:,i:i+win[0]],axis=1)
        ATA[:,i] = np.mean(E[:,i+win[0]:i+win[0]+win[1]],axis=1)
        DTA[:,i] = np.mean(E[:,i+win[0]:i+win[0]+win[2]],axis=1)
    r = ATA/BTA + DTA/BTA
    return r/10
Example #53
File: cluster.py Project: mlespiau/mlas
 def execute(self):
     likelihoods = self.cluster_list[0].get_gmm().score(self.X)
     self.cluster_list[0].reset_data()
     for cluster in self.cluster_list[1:]:
         likelihoods = numpy.column_stack((likelihoods, cluster.get_gmm().score(self.X)))
         cluster.reset_data()
     if self.number_of_clusters == 1:
         self.most_likely = numpy.zeros(len(self.X))
     else:
         self.most_likely = likelihoods.argmax(axis=1)
     # Across 250 frames of observations
     # Vote on which cluster they should be associated with
     data_range = range(0, self.N, self.interval_size)
     if data_range[-1] < self.N:
         data_range.append(self.N)
     for i, v in enumerate(data_range[0:len(data_range)-1]):
         current_segment_indexes = range(data_range[i], data_range[i+1])
         current_segment_scores = numpy.array(self.most_likely[current_segment_indexes])
         # print(current_segment_data)
         most_likely_gmm_class = int(stats.mode(current_segment_scores)[0][0])
         print(most_likely_gmm_class)
         # print(self.X[current_segment_indexes,:])
         current_segment_data = self.X[current_segment_indexes,:]
         segment = Segment(
             data_range[i],
             data_range[i+1],
             current_segment_data,
             self.cluster_list[most_likely_gmm_class].get_name()
         )
         self.cluster_list[most_likely_gmm_class].add_segment(segment)
     new_cluster_list = []
     for cluster in self.cluster_list:
         if len(cluster.get_segments()) > 0:
             cluster.train_gmm()
             new_cluster_list.append(cluster)
     return new_cluster_list
Example #54
def test_frame(frame,desc):
    surf = cv2.SURF(400)

    #kp, descriptor = surf.detectAndCompute(cv2.cvtColor(frame,cv2.COLOR_BGR2GRAY), None)
    kp, descriptor = surf.detectAndCompute(frame, None)
   
    if descriptor is None:
        return -1 
    points = descriptor.shape[0]
    matches = np.zeros((points,1))
    best_poke_desc = np.zeros((points,151))

    best_poke_desc[:,0] = 151;

    for pt in range(0, points):
        d = descriptor[pt,:].reshape(1,128)
        for poke in range(1,151):
            best_poke_desc[pt,poke] =  np.min(sp.cdist(desc[poke],d,'Euclidean'))
        matches[pt] = np.argmin(best_poke_desc[pt,:])
    val, count =  mstats.mode(matches)
    if( count[0][0] < 4):
        return -1

    return val[0][0]