Example #1
def find_scores_Z(N,old_seeds,degrees):
	scores = []
	diffs = []
	timespents = []
	avgdegs = []
	for i in range(len(N)):
		seeds = old_seeds + [i]
		diffscore = 0.0
		timespentscore = 0.0
		for j in range(len(seeds)):
			seed = seeds[j]
			for other_seed in seeds[j:]:
				diffscore += float(abs(degrees[seed] - degrees[other_seed]))
			for other_seed in seeds:
				if seed == other_seed:
					continue
				timespentscore += N[seed][other_seed]
		diffs.append(diffscore)
		timespents.append(timespentscore)

		avgdegsum = 0
		for seed in seeds:
			avgdegsum += degrees[seed]
		avgdeg = avgdegsum/len(seeds)
		avgdegs.append(avgdeg)

	print timespents[76]
	print timespents[58]
	diffs = stats.zscore(diffs)
	timespents = stats.zscore(timespents)
	avgdegs = stats.zscore(avgdegs)

	for i in range(len(diffs)):
		scores.append(diffs[i] + timespents[i] - avgdegs[i])
	return scores
Example #2
def collaspe_fclusters(data=None, t=None, row_labels=None, col_labels=None,
			linkage='average', pdist='euclidean', standardize=3, log=False):
	"""a function to collaspe flat clusters by averaging the vectors within
	each flat clusters achieved from hierarchical clustering"""
	## preprocess data
	if log:
		data = np.log2(data + 1.0)
	if standardize == 1: # Standardize along the columns of data
		data = zscore(data, axis=0)
	elif standardize == 2: # Standardize along the rows of data
		data = zscore(data, axis=1)
	
	if row_labels is not None and col_labels is None: ## only get fclusters for rows
		d = dist.pdist(data, metric=pdist)
		axis = 1 ##!!! haven't checked whether this is correct yet
	elif row_labels is None and col_labels is not None: ## only get fclusters for cols
		d = dist.pdist(data.T, metric=pdist)
		axis = 0
	D = dist.squareform(d)
	Y = sch.linkage(D, method=linkage, metric=pdist)
	fclusters = sch.fcluster(Y, t, 'distance')
	fcluster_set = set(fclusters)
	data_cf = []
	for fc in fcluster_set:
		mask = np.where(fclusters==fc)
		data_t = data.T
		vector_avg = np.average(data_t[mask],axis=axis)
		data_cf.append(vector_avg)
	data_cf = np.array(data_cf).T
	return data_cf
Example #3
def main(argv):

    category_predictions = {}

    for cat in categories:

        file_reg = classifier_reg + "/" + cat + "/predictions"
        fo = open(file_reg)
        reg_predictions = [float(x) for x in fo.read().split('\n')[:-1]]
        reg_zscores = stats.zscore(reg_predictions)

        file_red = classifier_red + "/" + cat + "/predictions"
        fo = open(file_red)
        red_predictions = [float(x) for x in fo.read().split('\n')[:-1]]
        red_zscores = stats.zscore(red_predictions)

        max_zscore = []

        # Take zscore that has the greatest deviation out of the two classifiers
        for i in range(len(reg_predictions)):
            reg_greater = abs(reg_zscores[i]) >= abs(red_zscores[i])
            val = reg_zscores[i] if reg_greater else red_zscores[i]
            max_zscore.append(val)


        category_predictions[cat] = max_zscore

    results_list = output_predictions(categories, category_predictions)

    calculate_accuracies(results_list)
Example #4
def algorithm(w1,w2,w3,w4,G1,G2,G3,G4):
	try:
		cc=np.array([nx.average_clustering(G1,weight='weight'),nx.average_clustering(G2,weight='weight'),nx.average_clustering(G3,weight='weight'),nx.average_clustering(G4,weight='weight')])
		spl=np.array([nx.average_shortest_path_length(G1,weight='weight'),nx.average_shortest_path_length(G2,weight='weight'),nx.average_shortest_path_length(G3,weight='weight'),nx.average_shortest_path_length(G4,weight='weight')])
		nds=np.array([nx.number_of_nodes(G1),nx.number_of_nodes(G2),nx.number_of_nodes(G3),nx.number_of_nodes(G4)])
		edgs= np.array([nx.number_of_edges(G1),nx.number_of_edges(G2),nx.number_of_edges(G3),nx.number_of_edges(G4)])
		if valid(cc):
			cc=stats.zscore(cc)
		else:
			cc=np.array([.1,.1,.1,.1])
		cc= cc-min(cc)+.1
		if valid(spl):
			spl=stats.zscore(spl)
		else:
			spl=np.array([.1,.1,.1,.1])
		spl= spl-min(spl)+.1
		if valid(nds):
			nds=stats.zscore(nds)
		else:
			nds=np.array([.1,.1,.1,.1])
		nds = nds-min(nds)+.1
		if valid(edgs):
			edgs=stats.zscore(edgs)
		else:
			edgs=np.array([.1,.1,.1,.1])
		edgs=edgs-min(edgs)+.1
		r1=(w1*cc[0]+w2*spl[0]+w3*nds[0]+w4*edgs[0])*1000
		r2=(w1*cc[1]+w2*spl[1]+w3*nds[1]+w4*edgs[1])*1000
		r3=(w1*cc[2]+w2*spl[2]+w3*nds[2]+w4*edgs[2])*1000
		r4=(w1*cc[3]+w2*spl[3]+w3*nds[3]+w4*edgs[3])*1000
		d={'Player 1:': r1, 'Player 2:': r2,'Player 3:': r3, 'Player 4:': r4}
		rank = sorted(d.items(), key=lambda x: x[1], reverse=True)
		return ["USAU RANKINGS",str(rank[0][0])+ " " + str(int(rank[0][1])),str(rank[1][0])+" "+ str(int(rank[1][1])),str(rank[2][0])+" "+ str(int(rank[2][1])),str(rank[3][0])+" "+str(int(rank[3][1]))]
	except:
		return ["Unable to compute rankings!  Need data","Player 1","Player 2","Player 3","Player 4"]
Example #5
def correct_covariates(Dtrait, Dcov, variables):
    Dcomb = pd.merge(Dtrait.T, Dcov.T, left_index=True, right_index=True).T
    Dcorr = Dtrait.copy()
    traits = Dtrait.columns.values.tolist()
    print 'Correcting for %s' % variables
    for idx, i in enumerate(Dtrait.index):
        sys.stdout.write('\rTrait %d of %d' % (idx, Dtrait.shape[0]))
        sys.stdout.flush()
        if len(variables) == 1:
            rlm_model = sm.RLM(Dcomb.loc[i,:], zscore(array(Dcomb.loc[variables,:]).T))
        else:
            rlm_model = sm.RLM(Dcomb.loc[i,:], zscore(array(Dcomb.loc[variables,:]).T, axis=0))
        rlm_results = rlm_model.fit()
        Dcorr.loc[i,:] = rlm_results.resid
        """
        if idx > 1:
            f, axarr = subplots(3,2)
            axarr[0,0].scatter(Dtrait.loc['EIF1AY',:], Dtrait.loc['OSBP', :])
            axarr[0,1].scatter(Dcorr.loc['EIF1AY',:], Dcorr.loc['OSBP', :])
            axarr[1,0].hist([x for x in Dtrait.loc['EIF1AY',:] if not isnan(x)])
            axarr[1,1].hist([x for x in Dcorr.loc['EIF1AY',:] if not isnan(x)])
            axarr[2,0].hist([x for x in Dtrait.loc['OSBP',:] if not isnan(x)])
            axarr[2,1].hist([x for x in Dcorr.loc['OSBP',:] if not isnan(x)])
            f2, axarr2 = subplots(3,2)
            axarr2[0,0].scatter(Dcomb.loc['gender',:], Dcomb.loc['OSBP', :])
            axarr2[1,0].scatter(Dcomb.loc['age',:], Dcomb.loc['OSBP', :])
            axarr2[2,0].scatter(Dcomb.loc['site',:], Dcomb.loc['OSBP', :])
            axarr2[0,1].scatter(Dcomb.loc['gender',:], Dcomb.loc['EIF1AY', :])
            axarr2[1,1].scatter(Dcomb.loc['age',:], Dcomb.loc['EIF1AY', :])
            axarr2[2,1].scatter(Dcomb.loc['site',:], Dcomb.loc['EIF1AY', :])
            show()
            exit()
        """
    return Dcorr
Example #6
def predict_loo(transformed_data, args):
  print 'mysseg loo',
  sys.stdout.flush()

  (ndim, nsample , nsubjs) = transformed_data.shape

  tst_subj = args.loo
  win_size = args.winsize
  nseg = nsample - win_size
  # mysseg prediction
  trn_data = np.zeros((ndim*win_size, nseg))

  # the trn data also include the tst data, but will be subtracted when 
  # calculating A
  for m in range(nsubjs):
    for w in range(win_size):
      trn_data[w*ndim:(w+1)*ndim,:] += transformed_data[:,w:(w+nseg),m]

  tst_data = np.zeros((ndim*win_size, nseg))
  for w in range(win_size):
    tst_data[w*ndim:(w+1)*ndim,:] = transformed_data[:,w:(w+nseg),tst_subj]
    
  A =  stats.zscore((trn_data - tst_data),axis=0, ddof=1)
  B =  stats.zscore(tst_data,axis=0, ddof=1)
  corr_mtx = B.T.dot(A)

  for i in range(nseg):
    for j in range(nseg):
      if abs(i-j)<win_size and i != j :
        corr_mtx[i,j] = -np.inf

  rank = np.argmax(corr_mtx, axis=1)
  accu = sum(rank == range(nseg)) / float(nseg)

  return accu
Example #7
def hist_and_smooth_data(spike_data):

        max_spike_ts = 0
        for i in range(len(spike_data)):
                if np.amax(spike_data[i]) > max_spike_ts:
                        max_spike_ts = np.amax(spike_data[i])
        
        max_bin_num = int(np.ceil(max_spike_ts) / float(bin_size) * 1000)
        hist_data = np.zeros((len(spike_data),max_bin_num))
        hist_bins = np.zeros((len(spike_data),max_bin_num))
        for i in range(len(spike_data)):
                total_bin_range = np.arange(0,int(np.ceil(spike_data[i].max())),bin_size/1000.0)
                hist,bins = np.histogram(spike_data[i],bins=total_bin_range,range=(0,int(np.ceil(spike_data[i].max()))),normed=False,density=False)
                #pdb.set_trace()
                hist_data[i,0:len(hist)] = hist
                hist_bins[i,0:len(bins)] = bins

        #TODO fix so gaus divide by bin size and -> fr before smoothing
        #TODO make option for zscore and gaus together
        if zscore_bool and gaussian_bool:
                smoothed = stats.zscore(hist_data,axis=1)
                smoothed = ndimage.filters.gaussian_filter1d(smoothed,gauss_sigma,axis=1)
        elif zscore_bool:
                smoothed = stats.zscore(hist_data,axis=1)
        elif gaussian_bool:
                smoothed = ndimage.filters.gaussian_filter1d(hist_data,gauss_sigma,axis=1)
        else:
                smoothed = {}

        return_dict = {'hist_data':hist_data,'hist_bins':hist_bins,'smoothed':smoothed}
        return(return_dict)
Example #8
def plot_features_distribution(feature_set, 
                               feature_set_permutation, 
                               save_path, 
                               prename='features', 
                               n_features=90, 
                               n_bins=20):
    
    plt.figure()
    h_values_p, _ = np.histogram(feature_set_permutation.flatten(), 
                                 bins=np.arange(0, n_features+1))
    
    plt.hist(zscore(h_values_p), bins=n_bins)
    
    fname = "%s_features_set_permutation_distribution.png" % (prename)
    plt.savefig(os.path.join(save_path, 
                            fname))
    
    plt.figure()
    h_values_, _ = np.histogram(feature_set.flatten(), 
                                bins=np.arange(0, n_features+1))
    plt.plot(zscore(h_values_))
        
    
    fname = "%s_features_set_cross_validation.png" % (prename)
    plt.savefig(os.path.join(save_path, 
                            fname))

    plt.close('all')
Example #9
File: HW2.py  Project: jamesrichter/CS450
def knnClassifier(training_data, test_data, training_target, test_target, k=5):
   #normalize the data
   #calculate the z-score of the data
   #print training_data
   training_data = training_data
   new_training_data = stats.zscore(training_data.astype(int), axis=0)
   new_test_data = stats.zscore(test_data.astype(int), axis=0)
   #find the k nearest neighbors for each test data
   #print 'test', new_test_data
   predictions = []
   for test in new_test_data:
      #print test
      # find the euclidean distance between the test case and all training cases
      distances = []
      neighbors = []
      neighbor_predictions = []
      for train in new_training_data:
         #print train
         distances.append(np.linalg.norm(train-test))
      #print distances
      for i in range(k):
         neighb_i = distances.index(min(distances))
         neighbors.append(neighb_i)
         distances[neighb_i] = 1000000
      #print neighbors
      for neighb in neighbors:
         neighbor_predictions.append(training_target[neighb])
      predictions.append(stats.mode(neighbor_predictions)[0][0])
   return predictions
Example #10
def ExtractDataVer2(all_relevant_channels, marker_positions, target, ms_before, ms_after):
    target_idx = marker_positions[np.where(target == 1)[0]] - 1

    all_target_transpose = np.asarray(all_relevant_channels).T

    number_positive_of_samples = len(target_idx)
    before_trigger = int((ms_before * 1.0) / 5)
    after_trigger = int((ms_after * 1.0) / 5)

    all_target_data = extractTimeWindowFast(all_target_transpose, target_idx, before_trigger, after_trigger)

    non_target_idx = marker_positions[np.where(target == 0)[0]] - 1
    number_positive_of_samples = len(non_target_idx)

    all_non_target_data = extractTimeWindowFast(all_target_transpose, non_target_idx, before_trigger, after_trigger)




    # normalize the data over the time axis
    all_data = np.vstack((stats.zscore(all_target_data, axis=1).astype('float32'),
                          stats.zscore(all_non_target_data, axis=1).astype('float32')))
    all_tags = np.vstack((np.ones((all_target_data.shape[0], 1), dtype='int8'),
                          np.zeros((all_non_target_data.shape[0], 1), dtype='int8')))

    return all_data, all_tags
Example #11
def data_parser(theta,kappa,tt,ch,tt_ch):

    theta_r = np.array([[resample(theta.values.squeeze()[i,950:1440],50)] for i in range(0,theta.shape[0])])
    theta_r = zscore(theta_r.squeeze(),axis=None)

    kappa_r = np.array([[resample(kappa.values.squeeze()[i,950:1440],50)] for i in range(0,kappa.shape[0])])
    kappa_r = zscore(kappa_r.squeeze(),axis=None)

    kappa_df = pd.DataFrame(kappa_r)
    theta_df = pd.DataFrame(theta_r)

    both_df = pd.concat([theta_df,kappa_df],axis=1)

    if tt_ch == 'tt':
        # trial type
        clean = np.nan_to_num(tt) !=0
        tt_c = tt[clean.squeeze()].values
    else :
        # choice
        clean = np.nan_to_num(ch) !=0
        tt_c = ch[clean.squeeze()].values

    # tt_c = tt[tt.values !=0|3].values
    both = both_df.values
    # both_c = both[clean.squeeze(),:]
    both_c = both[clean.squeeze(),:]

    # keeping one hot vector for now (incase we want it later)
#     labs = np.eye(3)[tt_c.astype(int)-1]
    # y[np.arange(3), a] = 1
#     labs = labs.squeeze()

    return both_c, tt_c, clean
Example #12
def significantly_unenriched(xs, ys, zthresh=2., scale='linear'):
    assert scale in ['linear', 'log']
    if scale =='log':
        xs = np.log2(xs)
        ys = np.log2(ys)
    xs = stats.zscore(xs)
    ys = stats.zscore(ys)
    return [x < -zthresh or y < -zthresh for x, y in zip(xs, ys)]
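A small usage sketch for significantly_unenriched, with hypothetical values and zthresh lowered to 1.0 so the effect is visible with only four points (assumes numpy and scipy.stats are imported as in the example above):

import numpy as np

xs = np.array([5.0, 6.0, 5.5, 0.5])   # hypothetical enrichment values; the last point is depleted
ys = np.array([4.8, 6.2, 5.4, 0.4])
print(significantly_unenriched(xs, ys, zthresh=1.0))  # expected: [False, False, False, True]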
Example #13
 def simulate(self, beta0, beta, x):
     if(self.distr=='poisson'):
         y = np.random.poisson(self.lmb(beta0, beta, zscore(x)))
     if(self.distr=='normal'):
         y = np.random.normal(self.lmb(beta0, beta, zscore(x)))
     if(self.distr=='binomial'):
         y = np.random.binomial(1, self.lmb(beta0, beta, zscore(x)))
     return y
Example #14
 def _normalize_network(self, x, square=True):
     if square:
         norm_col = zscore(x, axis=0)
         return (norm_col + norm_col.T) / math.sqrt(2)
     else:
         norm_col = zscore(x, axis=0)
         norm_row = zscore(x, axis=1)
         return (norm_col + norm_row) / math.sqrt(2)
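The square=True branch can be sketched outside the class, since it only uses its array argument (hypothetical 4x4 matrix):

import math
import numpy as np
from scipy.stats import zscore

x = np.random.rand(4, 4)                      # hypothetical square network matrix
norm_col = zscore(x, axis=0)                  # z-score each column, as in the method
sym = (norm_col + norm_col.T) / math.sqrt(2)  # symmetrize; dividing by sqrt(2) keeps the scale comparable
print(np.allclose(sym, sym.T))                # True: the result is symmetric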
Example #15
def combineTT(data1,data2):
    iA,iB = findOLIndices(data1.geneList,data2.geneList)
    newGeneList = [data1.geneList[i] for i in iA]
    newSampleList = data1.cleanSampIDs + data2.cleanSampIDs
    newCnData = np.column_stack((data1.cnData[iA,:],data2.cnData[iB,:]))
    newRnaData = np.column_stack((ss.zscore(data1.rnaData[iA,:],1),ss.zscore(data2.rnaData[iB,:],1)))
    newData = combinedData(newGeneList,newSampleList,newRnaData,newCnData)
    return newData
Example #16
def five_second_z_score(start):
    for interval in range(len(matrix[start])/5000 + 1):
        j = 5000*interval
        # print interval, j
        if interval == len(matrix[start])/5000:
            z_score_five_sec.append(stats.zscore(matrix[start][j:]))
        else:
            z_score_five_sec.append(stats.zscore(matrix[start][j:j+5000]))
Example #17
File: pc.py  Project: pliz/gunfolds
def np_fisherZ(x,y,r):
    z = 0.5 * (np.log(1.0 + r) - np.log(1.0 - r))
    w = np.sqrt(len(x)) * z
    x_ = zscore(x)
    y_ = zscore(y)
    t2 = moment22(x_,y_)
    t = np.sqrt(t2)
    p = 2. * (1. - norm.cdf(np.abs(w), 0.0, t))
    return p
Example #18
 def simulate(self, beta0, beta, X):
     """Simulate data."""
     if self.distr == 'poisson':
         y = np.random.poisson(self.lmb(beta0, beta, zscore(X)))
     if self.distr == 'normal':
         y = np.random.normal(self.lmb(beta0, beta, zscore(X)))
     if self.distr == 'binomial':
         y = np.random.binomial(1, self.lmb(beta0, beta, zscore(X)))
     return y
Example #19
def euclid(v1,v2):
    '''Euclidean distance between two scalars or equally matched vectors

       USAGE: d = euclid(v1,v2)'''
    from scipy.stats import zscore
    v1 = zscore(v1.flatten())
    v2 = zscore(v2.flatten())
    d2 = np.sqrt(np.sum((v1-v2)**2))
    return d2
Example #20
def plot(pred, y):
    predm, ym = stats.zscore(pred, axis=0, ddof=1), stats.zscore(y, axis=0, ddof=1)
    # predm, ym = pp.normalize(pred), pp.normalize(y)
    print predm.shape, ym.shape
    times = range(0, len(pred))
    # print times.shape
    print len(times)
    plt.plot(times, predm, "r-", times, ym, "b-")
    plt.xlabel("Time (s)")
    plt.ylabel("BOLD Response")
    plt.savefig("../figure/lassoplot.jpg")
Example #21
def iClustergram(data=None, row_labels=None, col_labels=None,
			row_groups=None, col_groups=None,
			row_linkage='average', col_linkage='average', 
			row_pdist='euclidean', col_pdist='euclidean',
			standardize=None, log=False, 
			display_range=3, username='******', apikey='fmnoxd2t2u'):
	## preprocess data
	if log:
		data = np.log2(data + 1.0)

	if standardize == 1: # Standardize along the columns of data
		data = zscore(data, axis=0)
	elif standardize == 2: # Standardize along the rows of data
		data = zscore(data, axis=1)

	## cluster data:
	## compute pdist for rows
	d1 = dist.pdist(data, metric=row_pdist)
	D1 = dist.squareform(d1)
	Y1 = sch.linkage(D1, method=row_linkage, metric=row_pdist)
	Z1 = sch.dendrogram(Y1, orientation='right')
	idx1 = Z1['leaves']

	## compute pdist for cols
	d2 = dist.pdist(data.T, metric=col_pdist)
	D2 = dist.squareform(d2)
	Y2 = sch.linkage(D2, method=col_linkage, metric=col_pdist)
	Z2 = sch.dendrogram(Y2)
	idx2 = Z2['leaves']

	## transform the orders of data to clustered data
	data_clustered = data
	data_clustered = data_clustered[:,idx2]
	data_clustered = data_clustered[idx1,:]
	data_to_plot = data_clustered.tolist()

	## transform the orders of row and col labels
	new_row_labels = []
	new_col_labels = []
	for i in range(data.shape[0]):
		new_row_labels.append(row_labels[idx1[i]])
	for i in range(data.shape[1]):
		new_col_labels.append(col_labels[idx2[i]])
	## plot clustered data using plotly
	py = plotly.plotly(username, apikey)
	d = {}
	d['x'] = new_row_labels
	d['y'] = new_col_labels
	d['z'] = data_to_plot
	d['type'] = 'heatmap'
	py.plot([d])
	return
Example #22
def plot_fclusters(data=None, row_labels=None, col_labels=None,
			linkage='average', pdist='euclidean', standardize=3, log=False):
	"""a function to plot the relationship between thresholds and number of
	flat clusters achieved from hierarchical clustering, aims to find the optimal
	threshold for forming clusters"""
	## preprocess data
	if log:
		data = np.log2(data + 1.0)
	if standardize == 1: # Standardize along the columns of data
		data = zscore(data, axis=0)
	elif standardize == 2: # Standardize along the rows of data
		data = zscore(data, axis=1)

	fig = plt.figure()
	ax1 = fig.add_subplot(121)
	ax2 = fig.add_subplot(122)

	if row_labels is not None and col_labels is None: ## only get fclusters for rows
		d = dist.pdist(data, metric=pdist)
	elif row_labels is None and col_labels is not None: ## only get fclusters for cols
		d = dist.pdist(data.T, metric=pdist)
	D = dist.squareform(d)
	Y = sch.linkage(D, method=linkage, metric=pdist)
	space1 = np.linspace(d.min(), d.max(), num=5, endpoint=False)
	space2 = np.linspace(d.max(),1.,num=30,endpoint=True)
	thresholds = np.concatenate((space1,space2))
	num_clusters = []
	num_singles = []
	for t in thresholds:
		fclusters = sch.fcluster(Y, t,'distance')
		c = Counter(fclusters)
		num_cluster = len(c.keys())
		num_single = c.values().count(1)
		num_clusters.append(num_cluster)
		num_singles.append(num_single)
		print 'threshold=', t, 'clusters:', num_cluster, 'singles:',num_single
		if num_cluster < 290:
			print c
	ax1.plot(thresholds, num_clusters,label='# of flat clusters')
	ax1.plot(thresholds, num_singles,label='# of singles',c='r')
	ax1.plot(thresholds, np.array(num_clusters)-np.array(num_singles),label='# of non-singles',c='g')
	ax1.legend(loc='upper right')
	ax1.set_xlabel('threshold for forming flat clusters')

	ax2.plot(thresholds, num_clusters,label='# of flat clusters')
	ax2.plot(thresholds, num_singles,label='# of singles',c='r')
	ax2.plot(thresholds, np.array(num_clusters)-np.array(num_singles),label='# of non-singles',c='g')
	ax2.legend(loc='upper right')
	ax2.set_xlabel('threshold for forming flat clusters')
	ax2.set_yscale('log')
	plt.show()
	return
Example #23
def _divide_to_regions(info, add_stim=True):
    """Divide channels to regions by positions."""
    from scipy.stats import zscore
    picks = _pick_data_channels(info, exclude=[])
    chs_in_lobe = len(picks) // 4
    pos = np.array([ch['loc'][:3] for ch in info['chs']])
    x, y, z = pos.T

    frontal = picks[np.argsort(y[picks])[-chs_in_lobe:]]
    picks = np.setdiff1d(picks, frontal)

    occipital = picks[np.argsort(y[picks])[:chs_in_lobe]]
    picks = np.setdiff1d(picks, occipital)

    temporal = picks[np.argsort(z[picks])[:chs_in_lobe]]
    picks = np.setdiff1d(picks, temporal)

    lt, rt = _divide_side(temporal, x)
    lf, rf = _divide_side(frontal, x)
    lo, ro = _divide_side(occipital, x)
    lp, rp = _divide_side(picks, x)  # Parietal lobe from the remaining picks.

    # Because of the way the sides are divided, there may be outliers in the
    # temporal lobes. Here we switch the sides for these outliers. For other
    # lobes it is not a big problem because of the vicinity of the lobes.
    with np.errstate(invalid='ignore'):  # invalid division, greater compare
        zs = np.abs(zscore(x[rt]))
        outliers = np.array(rt)[np.where(zs > 2.)[0]]
    rt = list(np.setdiff1d(rt, outliers))

    with np.errstate(invalid='ignore'):  # invalid division, greater compare
        zs = np.abs(zscore(x[lt]))
        outliers = np.append(outliers, (np.array(lt)[np.where(zs > 2.)[0]]))
    lt = list(np.setdiff1d(lt, outliers))

    l_mean = np.mean(x[lt])
    r_mean = np.mean(x[rt])
    for outlier in outliers:
        if abs(l_mean - x[outlier]) < abs(r_mean - x[outlier]):
            lt.append(outlier)
        else:
            rt.append(outlier)

    if add_stim:
        stim_ch = _get_stim_channel(None, info, raise_error=False)
        if len(stim_ch) > 0:
            for region in [lf, rf, lo, ro, lp, rp, lt, rt]:
                region.append(info['ch_names'].index(stim_ch[0]))
    return {'Left-frontal': lf, 'Right-frontal': rf, 'Left-parietal': lp,
            'Right-parietal': rp, 'Left-occipital': lo, 'Right-occipital': ro,
            'Left-temporal': lt, 'Right-temporal': rt}
Example #24
def clustergram(data, rids, cids,
	row_linkage='average', col_linkage='average', 
	row_pdist='euclidean', col_pdist='euclidean',
	standardize=3, log=False):
	## preprocess data
	if log:
		data = np.log2(data + 1.0)

	if standardize == 1: # Standardize along the columns of data
		data = zscore(data, axis=0)
	elif standardize == 2: # Standardize along the rows of data
		data = zscore(data, axis=1)

	## perform hierarchical clustering for rows and cols
	## compute pdist for rows:
	d1 = dist.pdist(data, metric=row_pdist)
	D1 = dist.squareform(d1)
	Y1 = sch.linkage(D1, method=row_linkage, metric=row_pdist)
	Z1 = sch.dendrogram(Y1, orientation='right')
	idx1 = Z1['leaves']

	## compute pdist for cols
	d2 = dist.pdist(data.T, metric=col_pdist)
	D2 = dist.squareform(d2)
	Y2 = sch.linkage(D2, method=col_linkage, metric=col_pdist)
	Z2 = sch.dendrogram(Y2)
	idx2 = Z2['leaves']

	row_nodes = []
	rids = np.array(rids)[np.array(idx1)]
	for idx, rid in enumerate(rids):
		row_nodes.append({'sort': idx, 'name': rid})		


	col_nodes = []
	cids = np.array(cids)[np.array(idx2)]
	for idx, cid in enumerate(cids):
		col_nodes.append({'sort': idx, 'name': cid})	


	links = []
	for i in range(len(rids)):
		for j in range(len(cids)):
			links.append({'source': i, 'target': j, 'value': data[i,j]})
	
	json_data = {
		'row_nodes':row_nodes,
		'col_nodes':col_nodes,
		'links': links
				}
	return json_data
Example #25
 def trend_zscore(self,sym,date,window):
     slice = self.trends[sym][-window:]
     if slice[-1] == slice[-2]:
         z = self.zscores[sym][-1]
     else:
         z = zscore(slice)[-1]
     return z
Example #26
def find_outliers(X, threshold=3.0):
    """Find outliers based on Gaussian mixture

    Parameters
    ----------
    X : np.ndarray of float, shape (n_elements,)
        The scores for which to find outliers.
    threshold : float
        The value above which a feature is classified as outlier.

    Returns
    -------
    bad_idx : np.ndarray of int, shape (n_features)
        The outlier indices.
    """
    max_iter = 2
    my_mask = np.zeros(len(X), dtype=np.bool)
    X = np.abs(X)
    for _ in range(max_iter):
        X = np.ma.masked_array(X, my_mask)
        this_z = stats.zscore(X)
        local_bad = this_z > threshold
        my_mask = np.max([my_mask, local_bad], 0)
        if not np.any(local_bad):
            break

    bad_idx = np.where(my_mask)[0]
    return bad_idx
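A minimal usage sketch for find_outliers, assuming the function above is in scope together with numpy and scipy.stats, and using a hypothetical score vector with one extreme value appended:

import numpy as np

rng = np.random.RandomState(0)
scores = np.concatenate([rng.randn(100), [15.0]])  # hypothetical scores; the last value is extreme
print(find_outliers(scores, threshold=3.0))        # the extreme value at index 100 should be reported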
Example #27
def align(movie_data, options, args, lrh):
    print 'pPCA(scikit-learn)'
    nvoxel = movie_data.shape[0]
    nTR    = movie_data.shape[1]
    nsubjs = movie_data.shape[2]

    align_algo = args.align_algo
    nfeature   = args.nfeature

    # zscore the data
    bX = np.full((nsubjs*nvoxel, nTR), np.nan)

    for m in xrange(nsubjs):
        bX[m*nvoxel:(m+1)*nvoxel,:] = stats.zscore(movie_data[:, :, m].T, axis=0, ddof=1).T
    del movie_data

    U, s, VT = np.linalg.svd(bX, full_matrices=False)

    bW = np.zeros((nsubjs*nvoxel,nfeature))
    for m in xrange(nsubjs):
        bW[m*nvoxel:(m+1)*nvoxel,:] = U[m*nvoxel:(m+1)*nvoxel,:nfeature]

    niter = 10
    # initialization when first time run the algorithm
    np.savez_compressed(options['working_path']+align_algo+'_'+lrh+'_'+str(niter)+'.npz',\
                                  bW = bW,  niter=niter)
    return niter
Example #28
def _generate_noise_system(dimensions_tr,
                           ):
    """Generate the scanner noise

    Generate the noise that is typical of a scanner. This is comprised
    of two types of noise, Rician and Gaussian

    Parameters
    ----------
    dimensions_tr : n length array, int
        What are the dimensions of the volume you wish to insert
        noise into. This can be a volume of any size

    Returns
    ----------
        system_noise : multidimensional array, float
            Create a volume with system noise


        """

    # Generate the Rician noise
    noise_rician = stats.rice.rvs(1, 1, size=dimensions_tr)

    # Apply the gaussian noise
    noise_gaussian = np.random.normal(0, 1, size=dimensions_tr)

    # Combine these two noise types
    noise_system = noise_rician + noise_gaussian

    # Normalize
    noise_system = stats.zscore(noise_system)

    return noise_system
Example #29
def triplet_data_collection(data, tags, batch_size, select=3, outof=10):
    from scipy import stats
    stimuli_category_size = 30
    number_of_repetition = select
    magic_number = number_of_repetition * stimuli_category_size
    number_of_samples = data.shape[0]
    time_samples_dim_size = data.shape[2]
    channel_dim_size = data.shape[3]

    all_combination = _get_all_possible_combination(np.arange(number_of_samples), outof, select)
    shuffled_combination = np.random.permutation(all_combination)

    batch_data = np.zeros((magic_number, batch_size, time_samples_dim_size, channel_dim_size), dtype=np.float32)
    counter = 0
    for i in range(0,len(shuffled_combination), batch_size):



        batch_tags = np.zeros((batch_size, stimuli_category_size), dtype=np.int8)

        for single_combination in shuffled_combination[i:min(i +batch_size,len(shuffled_combination) )]:

            batch_data[:, counter, :, :] = np.vstack([data[item] for item in single_combination])
            batch_tags[counter] = np.mean(np.vstack([tags[item] for item in single_combination]), axis=0)
            counter += 1
            if counter == batch_size:
                input_dict = dict(
                    [["positive_item_input_{}".format(i), stats.zscore(batch_data[i], axis=1)] for i in
                     range(90)])

                input_dict['triplet_loss'] = batch_tags

                return input_dict
Example #30
def plot_transition_clustermap(data_array, gene_names, pseudotimes, n_clusters=10, gradient=False):
    if gradient:
        data_to_plot = zscore(np.gradient(data_array)[1].T, axis=0)
        scale = None
        metric = 'seuclidean'
        row_linkage = linkage(pdist(abs(data_to_plot), metric=metric), method='complete')
    else:
        data_to_plot = data_array.T
        scale = 0
        metric = 'correlation'
        row_linkage = linkage(pdist(data_to_plot, metric=metric), method='complete')
    
    assignments = fcluster(row_linkage, n_clusters, criterion='maxclust')
    cm = sns.clustermap(data_to_plot, col_cluster=False, standard_scale=scale, 
                        yticklabels=gene_names, row_linkage=row_linkage,
                        row_colors=[settings.STATE_COLORS[i] for i in assignments])
    r = np.arange(10, data_array.shape[0], data_array.shape[0]/10)
    plt.setp(cm.ax_heatmap.get_yticklabels(), fontsize=5)
    cm.ax_heatmap.set_xticks(r)
    cm.ax_heatmap.set_xticklabels(['%.1f' % x for x in pseudotimes[r]])
    cm.ax_heatmap.set_xlabel('Pseudotime')
    cm.ax_heatmap.set_ylabel('Gene')
    
    gene_clusters = defaultdict(list)
    for i, cl in enumerate(assignments):
        gene_clusters[settings.STATE_COLORS[cl]].append(gene_names[i])
    return gene_clusters
Example #31
        wine_ds[col] = np.log1p(wine_ds[col])

# In[75]:

wine_ds.skew()

# In[76]:

sns.pairplot(wine_ds)
plt.show()

# In[77]:

from scipy.stats import zscore

z_score = abs(zscore(wine_ds))
print(wine_ds.shape)

wine_ds_final = wine_ds.loc[(z_score < 3).all(axis=1)]
print(wine_ds_final.shape)

# In[78]:

#Separating target and input variables

df_x = wine_ds_final.drop(columns=['Class'])
y = wine_ds_final['Class']

# In[79]:

df_x
Example #32
np.random.seed(42)
n_splits = 10


# Read data
#print("Start loading data @ %.5f\n" % (time.time()-elapsed))
cwd = os.getcwd()
file_name_feature = cwd + "/../dataset/bank-additional-full_new_features.csv"
file_name_label = cwd + "/../dataset/bank-additional-full_new_labels.csv"
#print("End loading data @ %.5f\n" % (time.time()-elapsed))

#print("Start shuffling and sampling @ %.5f\n" % (time.time()-elapsed))
features, header_ele = readData(file_name_feature)
features = shuffle(features, random_state=41)[:5000]

features[:, :9] = zscore(features[:, :9])

label, label_names = loadLabels(file_name_label)
label = shuffle(label, random_state=41)[:5000]
#print("End shuffling and sampling @ %.5f\n" % (time.time()-elapsed))

#print("Start splitting dataset with 10-folds @ %.5f\n" % (time.time()-elapsed))
Kfold = StratifiedKFold(n_splits=n_splits)
#print("End splitting dataset with 10-folds @ %.5f\n" % (time.time()-elapsed))

accuracy_training_log = np.zeros(n_splits)
accuracy_testing_log = np.zeros(n_splits)
nlpd_t = np.zeros(n_splits)
nlpd_v = np.zeros(n_splits)

for i, (train_index, test_index) in enumerate(Kfold.split(features, label)):
Example #33
    })  # S marker size

# Set title
plt.title('plot between Garage Area and SalPrice')
plt.xlim(-200, 1600)
# Set x-axis label
plt.xlabel('GarageArea')
# Set y-axis label
plt.ylabel('SalePrice')
plt.show()

# Removing the anomalies using z-score
# if the data is more than 3 standard deviations away then it is considered as outlier
# if the data is less than -3 standard deviations away then it is considered as outlier
df = pd.read_csv('train.csv', sep=',', usecols=(62, 80))
z = np.abs(stats.zscore(df))
threshold = 3
print(np.where(z > 3))
modified_df = df[(z < 3).all(axis=1)]
print(df.shape)
print(modified_df.shape)

# Create Scatterplot of the dataframe after removing the anomalies in the data
sns.lmplot(
    'GarageArea',  # Horizontal axis
    'SalePrice',  # Vertical axis
    data=modified_df,  # Data source
    fit_reg=False,  # Don't fit a regression line
    scatter_kws={
        "marker": "o",  # Set marker style
        "s": 80
Example #34
import warnings

warnings.filterwarnings('ignore')

df_train = pd.read_csv(
    'C:/Users/Sushu/Documents/Python/ICP6/Python_Lesson6/train.csv')

df_train.describe()

var = 'GarageArea'
data = pd.concat([df_train['SalePrice'], df_train[var]], axis=1)
data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000))

data.shape

data_remana = np.abs(stats.zscore(data))

data_remana[:5, :5]

data1 = data[(data_remana < 3).all(axis=1)]

var = 'GarageArea'
data = pd.concat([df_train['SalePrice'], df_train[var]], axis=1)
data1.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000))

df = data[~data.SalePrice.isin(data[data_remana > 3].SalePrice)]

df.describe()

data1.shape
Example #35
def runPatterns(actmat,
                method='ica',
                nullhyp='mp',
                nshu=1000,
                percentile=99,
                tracywidom=False):
    '''
        INPUTS
        
            actmat:     activity matrix - numpy array (neurons, time bins) 
            
            nullhyp:    defines how to generate statistical threshold for assembly detection.
                            'bin' - bin shuffling, will shuffle time bins of each neuron independently
                            'circ' - circular shuffling, will shift time bins of each neuron independently
                                                                obs: maintains (virtually) autocorrelations
                            'mp' - Marcenko-Pastur distribution - analytical threshold
                            
            nshu:       defines how many shuffling controls will be done (n/a if nullhyp is 'mp')
            
            percentile: defines which percentile to be used use when shuffling methods are employed.
                                                                        (n/a if nullhyp is 'mp')
                                                                         
            tracywidom: determines if Tracy-Widom is used. See Peyrache et al 2010.
                                                    (n/a if nullhyp is NOT 'mp')
                                                    
        OUTPUTS
            
            patterns:     co-activation patterns (assemblies) - numpy array (assemblies, neurons)
            significance: object containing general information about significance tests 
            zactmat:      returns z-scored actmat
        
        '''

    nneurons = np.size(actmat, 0)
    nbins = np.size(actmat, 1)

    silentneurons = np.var(actmat, axis=1) == 0
    actmat_ = actmat[~silentneurons, :]

    # z-scoring activity matrix
    zactmat_ = stats.zscore(actmat_, axis=1)

    # running significance (estimating number of assemblies)
    significance = PCA()
    significance.fit(zactmat_.T)
    significance.nneurons = nneurons
    significance.nbins = nbins
    significance.nshu = nshu
    significance.percentile = percentile
    significance.tracywidom = tracywidom
    significance.nullhyp = nullhyp
    significance = runSignificance(zactmat_, significance)
    if np.isnan(significance.nassemblies):
        return

    if significance.nassemblies < 1:
        print('WARNING !')
        print('    no assembly detected!')
        patterns = []
    else:
        # extracting co-activation patterns
        patterns_ = extractPatterns(zactmat_, significance, method)
        if patterns_ is np.nan:
            return

        # putting eventual silent neurons back (their assembly weights are defined as zero)
        patterns = np.zeros((np.size(patterns_, 0), nneurons))
        patterns[:, ~silentneurons] = patterns_
    zactmat = np.copy(actmat)
    zactmat[~silentneurons, :] = zactmat_

    return patterns, significance, zactmat
Example #36
def demand_graphs(branch='A',
                  product_line='Electronic accessories',
                  date=datetime.date(2019, 3, 25)):
    pd.options.plotting.backend = "plotly"
    fig = make_subplots(rows=1,
                        cols=2,
                        shared_xaxes=False,
                        specs=[[{
                            "type": "xy"
                        }, {
                            "type": "xy"
                        }]],
                        subplot_titles=('Day vs Demand',
                                        'Predicted Values for a Week'))
    fig['layout'].update(xaxis_title='Day',
                         yaxis_title='Quantity',
                         paper_bgcolor='rgba(0,0,0,0)',
                         plot_bgcolor='rgba(0,0,0,0)')

    #Gets the earliest date in the database
    with connection.cursor() as crs:
        earliest_date_query = 'SELECT min(Date) FROM Sales'
        earliest_date = crs.execute(earliest_date_query).fetchval()

    df = pd.read_sql(
        demand_query % (earliest_date, earliest_date, date, branch,
                        product_line, earliest_date), connection)
    date_difference = date - earliest_date

    #fills in days where there are no sales
    s = df['Day'].tolist()
    for i in range(0, date_difference.days):
        if i not in s:
            df.loc[-1] = [i, 0]
            df.index = df.index + 1

    #drops data that has a zscore with an absolute value greater than or equal to 3 then sorts
    z_scores = np.abs(stats.zscore(df['Demand']))
    df = df[(z_scores < 3)]
    df = df.sort_values('Day')

    #makes a linear regression model then predicts the values and store them
    model = linear_model.LinearRegression()
    weight = np.ones(len(df)) * 10
    weight[-7:] *= 1.5
    x = df[['Day']]
    y = df[['Demand']]
    model.fit(x, y, weight)
    df['bestfit'] = model.predict(df[['Day']])

    #makes a prediction of the next weeks demand
    days_predicted = list(range(date_difference.days,
                                date_difference.days + 7))
    predicted_values = []
    total_predicted = 0
    for i in days_predicted:
        predicted = float(model.predict([[i]]))
        predicted_values.append(predicted)
        total_predicted += predicted
    predicted_values.append(total_predicted)

    #makes a scatter plot with a line for the linear regression
    fig.add_trace(
        go.Scatter(name='data points',
                   x=df['Day'],
                   y=df['Demand'].values,
                   mode='markers'), 1, 1)
    fig.add_trace(
        go.Scatter(name='regression line',
                   x=df['Day'],
                   y=df['bestfit'],
                   mode='lines'), 1, 1)

    #makes a bar chart for the predicted values for the week
    fig.add_trace(
        go.Bar(name='predicted values',
               x=days_predicted + ['total'],
               y=predicted_values), 1, 2)

    #finds the explained variance score, r2 score, and mean absolute error
    evs = sm.explained_variance_score(df['Demand'], df['bestfit'])
    r2 = sm.r2_score(df['Demand'], df['bestfit'])
    mae = sm.mean_absolute_error(df['Demand'], df['bestfit'])

    new_date = True
    with connection.cursor() as crs:
        crs.execute(check_prediction_query, (date, product_line, branch))
        if crs.fetchone() != None:
            new_date = False

    if new_date:
        with connection.cursor() as crs:
            crs.execute(update_prediction_log,
                        (branch, product_line, date, evs, r2, mae))

    return fig, evs, r2, mae
Example #37
    ix = obj1.meta.type.isin(['iPSC'])
    obj1.filter_samples(ix)

    obj2 = copy(ref_obj)
    ix = obj2.meta.type == 'ESC'
    obj2.filter_samples(ix)

    dend = plot_dendrogram([obj1, obj2], qn_method=quantile_norm, n_by_mad=n_gene_by_mad)
    dend['fig'].savefig(os.path.join(outdir, "cluster_ipsc_esc.png"), dpi=200)

    # 3. iPSC, ESC, Ruiz signature (only)
    the_obj = loader.MultipleBatchLoader([obj1, obj2])
    dat_r_z = pd.DataFrame(np.log2(the_obj.data + eps))
    dat_r_z = dat_r_z.reindex(gene_sign_ens.values).dropna()
    for r in dat_r_z.index:
        dat_r_z.loc[r] = zscore(dat_r_z.loc[r])

    dat_r_z.index = gene_sign_ens.index[gene_sign_ens.isin(dat_r_z.index)]

    cg = clustering.plot_clustermap(dat_r_z, show_gene_labels=True, cmap='RdBu_r')
    cg.gs.update(bottom=0.2)
    cg.savefig(os.path.join(outdir, "clustermap_ruiz_ipsc_esc_ztrans.png"), dpi=200)

    # 4. HipSci, iPSC, ESC, FB
    obj1 = copy(obj)
    ix = obj1.meta.type.isin(['iPSC', 'FB'])
    obj1.filter_samples(ix)

    dend = plot_dendrogram([obj1, ref_obj, hip_obj], qn_method=quantile_norm, n_by_mad=n_gene_by_mad)
    dend['fig'].savefig(os.path.join(outdir, "cluster_ipsc_esc_fb_with_hipsci%d.png" % n_hipsci), dpi=200)
Example #38
def Price_Main(data: pd.DataFrame):

    # Remove price and term outliers (out of 3 sigmas)
    data = data[((np.abs(stats.zscore(data.price)) < 2.5) & (np.abs(stats.zscore(data.term)) < 2.5) & (
                np.abs(stats.zscore(data.full_sq)) < 2.5))]


    # Fill NaN if it appears after merging
    data[['term']] = data[['term']].fillna(data[['term']].mean())

    # Fix year
    data = data[((data.yyyy_announce == 19) | (data.yyyy_announce == 20))]

    # Log Transformation
    data["longitude"] = np.log1p(data["longitude"])
    data["latitude"] = np.log1p(data["latitude"])
    data["full_sq"] = np.log1p(data["full_sq"])
    data["life_sq"] = np.log1p(data["life_sq"])
    data["kitchen_sq"] = np.log1p(data["kitchen_sq"])
    data["to_center"] = np.log1p(data["to_center"])
    data["price"] = np.log1p(data["price"])
    X = data[['life_sq', 'to_center', 'mm_announce', 'rooms', 'renovation', 'has_elevator', 'longitude', 'latitude', 'full_sq', 'kitchen_sq',
              'time_to_metro', 'floor_last', 'floor_first', 'clusters', 'is_rented', 'rent_quarter', 'rent_year']]

    y = data[['price']].values.ravel()
    print(X.shape, y.shape, flush=True)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

    # GBR model
    gbr_model = GradientBoostingRegressor(n_estimators=350, max_depth=8, verbose=1, random_state=42)

    print(10*'-', '> GBR Spb started fitting...')
    gbr_model.fit(X_train, y_train)

    gbr_preds = gbr_model.predict(X_test)
    print('Spb GBR R2_score: ', r2_score(y_test, gbr_preds), flush=True)
    print('Spb GBR RMSE : ', mean_squared_error(y_test, gbr_preds), flush=True)

    print('Train GBR on full spb dataset: ', flush=True)
    gbr_model.fit(X, y)

    dump(gbr_model, PATH_TO_PRICE_MODEL_GBR_D)
    print('GBR Spb model saved !', flush=True)

    # RANDOM FOREST REGRESSOR
    RF = RandomForestRegressor(n_estimators=300, verbose=1, n_jobs=-1)

    print(10*'-', '> Rf Spb started fitting...')
    RF.fit(X_train, y_train)

    rf_predicts = RF.predict(X_test)

    print('Spb RF R2_score: ', r2_score(y_test, rf_predicts), flush=True)
    print('Spb RF RMSE: ', mean_squared_error(y_test, rf_predicts), flush=True)

    print('Train RF on full spb dataset: ', flush=True)
    RF.fit(X, y)

    dump(RF, PATH_TO_PRICE_MODEL_RF_D)
    print('RF Spb model saved !', flush=True)


    # LGBM model
    lgbm_model = LGBMRegressor(objective='regression',
                               learning_rate=0.05,
                               n_estimators=1250, max_depth=7, min_child_samples=1, verbose=0)

    print(10*'-', '> LGBM Spb started fitting...')
    lgbm_model.fit(X_train, y_train)
    lgbm_preds = lgbm_model.predict(X_test)
    print('Spb RF R2_score: ', r2_score(y_test, lgbm_preds), flush=True)
    print('Spb LGBM RMSE: ', mean_squared_error(y_test, lgbm_preds), flush=True)

    print('Train LGBM on full spb dataset: ', flush=True)
    lgbm_model.fit(X, y)

    dump(lgbm_model, PATH_TO_PRICE_MODEL_LGBM_D)
    print('LGBM Spb model saved !', flush=True)
Example #39
import numpy as np
import pandas as pd
from scipy import stats


# load predict -ligKi matrix
ki_matrix = np.loadtxt("..//..//02_drug_model//03_predicted_ki_matrix.txt", delimiter=",")

# generate zscore matrix
z_score_matrix = []

# calculate zscore
counter = 1
for prot in ki_matrix:
    print(counter)
    z_score = stats.zscore(prot)
    z_score[z_score < 2] = 0
    z_score_matrix.append(z_score)

    counter += 1


z_score_matrix = np.array(z_score_matrix)
np.savetxt(".//protein_ki_zscore_2_matrix.txt", z_score_matrix, fmt = "%.5e", delimiter=",")




Example #40
#    result = [result1,result2,result3]
    
    fig = plt.figure()
    
    for j in range(len(result)):
        dataset = pd.concat([base,result[j]])
        n_comp = 2
        
        
        pca = PCA(n_components=n_comp)
        principalComponents = pca.fit_transform(dataset)
        columns = ['principal component '+str(i) for i in range(1,n_comp+1)]
        principalDf = pd.DataFrame(data = principalComponents , columns = columns)
        finalDf = pd.concat([principalDf, classe], axis = 1)
        finalDf = finalDf.dropna()
        z = np.abs(stats.zscore(finalDf.iloc[:,:2]))
        finalDf = finalDf[(z < 3).all(axis=1)]
        finalDf.reset_index(drop=True)
 
 
        loadings = pca.components_.T * np.sqrt(pca.explained_variance_)
        for k in range(n_comp):
            loadings_ = loadings[:,k]
#            smooth2(smoo, loadings_)
            smoothed = smooth2(smoo, loadings_)
            peaksf, _ = find_peaks(smoothed,distance=10) 
            
            plt.plot(xData,smoothed,label='L'+str(1+k)+' '+label[i]+' vs '+classe[0][dataset.index[-1]]+'(var: %d%%)' %(pca.explained_variance_ratio_[k]*100),color=colors[1:len(colors)][j],alpha=(n_comp-k)/(n_comp))
            plt.hlines(0, xmin=xData.min(),xmax=xData.max(),ls='dotted',linewidth=1)
            plt.xlabel('Raman Shift (cm$^{-1}$)', fontsize = 16)
            plt.ylabel('PCA Loadings', fontsize = 16)
Example #41
Using vectorized functions

When performance is paramount, you should avoid using .apply() and .map() because those constructs perform Python for-loops over the data stored in a pandas Series or DataFrame. By using vectorized functions instead, you can loop over the data at the same speed as compiled code (C, Fortran, etc.)! NumPy, SciPy and pandas come with a variety of vectorized functions (called Universal Functions or UFuncs in NumPy).

You can even write your own vectorized functions, but for now we will focus on the ones distributed by NumPy and pandas.

In this exercise you're going to import the zscore method from scipy.stats and use it to compute the deviation in voter turnout in Pennsylvania from the mean in fractions of the standard deviation. In statistics, the z-score is the number of standard deviations by which an observation is above the mean - so if it is negative, it means the observation is below the mean.

Instead of using .apply() as you did in the earlier exercises, the zscore UFunc will take a pandas Series as input and return a NumPy array. You will then assign the values of the NumPy array to a new column in the DataFrame. You will be working with the election DataFrame - it has been pre-loaded for you.

Import zscore from scipy.stats.
Call zscore with election['turnout'] as input.
Print the output of type(turnout_zscore). This has been done for you.
Assign turnout_zscore to a new column in election as 'turnout_zscore'.
Print the output of election.head(). This has been done for you, so hit 'Submit Answer' to view the result.
'''
# Import zscore from scipy.stats
from scipy.stats import zscore

# Call zscore with election['turnout'] as input: turnout_zscore
turnout_zscore = zscore(election['turnout'])

# Print the type of turnout_zscore
print(type(turnout_zscore))

# Assign turnout_zscore to a new column: election['turnout_zscore']
election['turnout_zscore'] = turnout_zscore

# Print the output of election.head()
print(election.head())
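For reference, zscore with its default ddof=0 matches the plain (x - mean) / std computation; a quick check with hypothetical turnout values:

import numpy as np
from scipy.stats import zscore

turnout = np.array([60.0, 72.5, 55.3, 68.9, 61.2])   # hypothetical turnout percentages
manual = (turnout - turnout.mean()) / turnout.std()  # population std (ddof=0), zscore's default
print(np.allclose(zscore(turnout), manual))          # True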
Example #42
def prepareForBoxplots(data, gender=True):
    """
    brings data into the correct format for generating a boxplot over all students from all years

    in this particular case we use year of university entrance as query_id, psu scores as features and notas as rank

    writes one dataset with protection_status "gender" and one with protection_status "highschool_type"
    """

    data = data[data['sem'] == 1]
    data = data[data['inactivo'] != 1]

    # drop all lines where values are missing
    data = data.dropna(
        subset=['nem', 'psu_mat', 'psu_len', 'psu_cie', 'notas_', 'uds_i_'])

    # drop all columns that are not needed
    if (gender):
        keep_cols = [
            'hombre', 'psu_mat', 'psu_len', 'psu_cie', 'nem', 'notas_',
            'uds_i_', 'uds_r_', 'uds_e_'
        ]
    else:
        keep_cols = [
            'highschool_type', 'psu_mat', 'psu_len', 'psu_cie', 'nem',
            'notas_', 'uds_i_', 'uds_r_', 'uds_e_'
        ]

    data = data[keep_cols]

    # replace NaNs with zeros
    data['uds_r_'].fillna(0)
    data['uds_e_'].fillna(0)

    # add new column for ranking scores
    data['score'] = np.zeros(data.shape[0])

    # calculate score based on grades and credits
    for idx, row in data.iterrows():
        grades = row.loc['notas_']
        credits_taken = row.loc['uds_i_']
        credits_failed = row.loc['uds_r_']
        credits_dropped = row.loc['uds_e_']

        score = grades * (credits_taken - credits_failed -
                          credits_dropped) / credits_taken
        data.loc[idx, 'score'] = score

    # don't need these columns anymore
    data = data.drop(columns=['notas_', 'uds_i_', 'uds_r_', 'uds_e_'])

    # zscore psu scores and normalize scores
    data['psu_mat'] = stats.zscore(data['psu_mat'])
    data['psu_len'] = stats.zscore(data['psu_len'])
    data['psu_cie'] = stats.zscore(data['psu_cie'])
    data['nem'] = stats.zscore(data['nem'])
    data['score'] = stats.zscore(data['score'])

    # rename protected column to prot_attr
    data.columns = [
        'prot\_attr', 'psu\_mat', 'psu\_len', 'psu\_cie', 'nem', 'score'
    ]

    return data
Example #43
def time_segment_matching(
    data,
    win_size=10,
):
    """
    Performs time segment matching experiment
    (code inspired from brainiak tutorials at
    https://brainiak.org/events/ohbm2018/brainiak_sample_tutorials/10-func-align.html)

    Parameters
    ----------
    data: array of shape (n_subjects, n_components, n_timeframes)
        Input shared responses
    Returns
    -------
    cv_score: np array of shape (n_subjects)
        Per-subject accuracy
    """
    # Pull out shape information
    n_subjs = len(data)
    (n_features, n_TR) = data[0].shape  # Voxel/feature by timepoint

    # How many segments are there (account for edges)
    n_seg = n_TR - win_size

    # mysseg prediction
    train_data = np.zeros((n_features * win_size, n_seg))

    # Concatenate the data across participants
    for ppt_counter in range(n_subjs):
        for window_counter in range(win_size):
            train_data[window_counter * n_features:(window_counter + 1) *
                       n_features, :, ] += data[
                           ppt_counter][:,
                                        window_counter:window_counter + n_seg]

    # Iterate through the participants, leaving one out
    accuracy = np.zeros(shape=n_subjs)
    for ppt_counter in range(n_subjs):

        # Preset
        test_data = np.zeros((n_features * win_size, n_seg))

        for window_counter in range(win_size):
            test_data[window_counter * n_features:(window_counter + 1) *
                      n_features, :, ] = data[ppt_counter][:, window_counter:(
                          window_counter + n_seg)]

        # Take this participant data away
        train_ppts = stats.zscore((train_data - test_data), axis=0, ddof=1)
        test_ppts = stats.zscore(test_data, axis=0, ddof=1)

        # Correlate the two data sets
        corr_mtx = test_ppts.T.dot(train_ppts)

        # If any segments have a correlation difference less than the window size and they aren't the same segments then set the value to negative infinity
        for seg_1 in range(n_seg):
            for seg_2 in range(n_seg):
                if abs(seg_1 - seg_2) < win_size and seg_1 != seg_2:
                    corr_mtx[seg_1, seg_2] = -np.inf

        # Find the segment with the max value
        rank = np.argmax(corr_mtx, axis=1)

        # Find the number of segments that were matched for this participant
        accuracy[ppt_counter] = sum(rank == range(n_seg)) / float(n_seg)

    return accuracy
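A quick usage sketch with hypothetical shared responses (5 subjects, 10 components, 200 timeframes); random data should score near chance, about 1/n_seg per subject:

import numpy as np
from scipy import stats

data = [np.random.randn(10, 200) for _ in range(5)]  # hypothetical input
accuracy = time_segment_matching(data, win_size=10)
print(accuracy)  # one leave-one-out matching accuracy per subject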
Example #44
def demo_rsHRF(input_file, mask_file, output_dir, para, p_jobs, file_type=".nii", mode="bids", wiener=False, temporal_mask=[]):
    # book-keeping w.r.t parameter values
    if 'localK' not in para or para['localK'] == None:
        if para['TR']<=2:
            para['localK'] = 1
        else:
            para['localK'] = 2
    # creating the output-directory if not already present
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)
    # for four-dimensional input
    if mode != 'time-series':
        if mode == 'bids' or mode == 'bids w/ atlas':
            name = input_file.filename.split('/')[-1].split('.')[0]
            v1 = spm_dep.spm.spm_vol(input_file.filename)
        else:
            name = input_file.split('/')[-1].split('.')[0]
            v1 = spm_dep.spm.spm_vol(input_file)
        if mask_file != None:
            if mode == 'bids':
                mask_name = mask_file.filename.split('/')[-1].split('.')[0]
                v = spm_dep.spm.spm_vol(mask_file.filename)
            else:
                mask_name = mask_file.split('/')[-1].split('.')[0]
                v = spm_dep.spm.spm_vol(mask_file)
            if file_type == ".nii" or file_type == ".nii.gz":
                brain = spm_dep.spm.spm_read_vols(v)
            else:
                brain = v.agg_data().flatten(order='F')
            if  ((file_type == ".nii" or file_type == ".nii.gz") and \
                    v1.header.get_data_shape()[:-1] != v.header.get_data_shape()) or \
                ((file_type == ".gii" or file_type == ".gii.gz") and \
                    v1.agg_data().shape[0]!= v.agg_data().shape[0]):
                raise ValueError ('Inconsistency in input-mask dimensions' + '\n\tinput_file == ' + name + file_type + '\n\tmask_file == ' + mask_name + file_type)
            else:
                if file_type == ".nii" or file_type == ".nii.gz" :
                    data   = v1.get_data()
                else:
                    data   = v1.agg_data()
        else:
            print('No atlas provided! Generating mask file...')
            if file_type == ".nii" or file_type == ".nii.gz" :
                data   = v1.get_data()
                brain  = np.nanvar(data.reshape(-1, data.shape[3]), -1, ddof=0)
            else:
                data   = v1.agg_data()
                brain  = np.nanvar(data, -1, ddof=0)
            print('Done')
        voxel_ind  = np.where(brain > 0)[0]
        mask_shape = data.shape[:-1]
        nobs       = data.shape[-1]
        data1      = np.reshape(data, (-1, nobs), order='F').T
        bold_sig = stats.zscore(data1[:, voxel_ind], ddof=1)
   # for time-series input
    else:
        name = input_file.split('/')[-1].split('.')[0]
        data1 = (np.loadtxt(input_file, delimiter=","))
        if data1.ndim == 1:
            data1 = np.expand_dims(data1, axis=1)
        nobs = data1.shape[0]
        bold_sig = stats.zscore(data1, ddof=1)
    if len(temporal_mask) > 0 and len(temporal_mask) != nobs:
            raise ValueError ('Inconsistency in temporal_mask dimensions.\n' + 'Size of mask: ' + str(len(temporal_mask)) + '\n' + 'Size of time-series: ' + str(nobs))
    bold_sig = np.nan_to_num(bold_sig)
    bold_sig_deconv = processing. \
                      rest_filter. \
                      rest_IdealFilter(bold_sig, para['TR'], para['passband_deconvolve'])
    bold_sig = processing. \
               rest_filter. \
               rest_IdealFilter(bold_sig, para['TR'], para['passband'])
    data_deconv  = np.zeros(bold_sig.shape)
    event_number = np.zeros((1, bold_sig.shape[1]))
    print('Retrieving HRF ...')
    #Estimate HRF for the fourier / hanning / gamma / cannon basis functions
    if not (para['estimation'] == 'sFIR' or para['estimation'] == 'FIR'):
        bf = basis_functions.basis_functions.get_basis_function(bold_sig.shape, para)
        beta_hrf, event_bold = utils.hrf_estimation.compute_hrf(bold_sig, para, temporal_mask, p_jobs, bf=bf)
        hrfa = np.dot(bf, beta_hrf[np.arange(0, bf.shape[1]), :])
    #Estimate HRF for FIR and sFIR
    else:
        beta_hrf, event_bold = utils.hrf_estimation.compute_hrf(bold_sig, para, temporal_mask, p_jobs)
        hrfa = beta_hrf[:-1,:]
    nvar = hrfa.shape[1]
    PARA = np.zeros((3, nvar))
    for voxel_id in range(nvar):
        hrf1 = hrfa[:, voxel_id]
        PARA[:, voxel_id] = \
            parameters.wgr_get_parameters(hrf1, para['TR'] / para['T'])
    print('Done')
    print('Deconvolving HRF ...')
    if para['T'] > 1:
        hrfa_TR = signal.resample_poly(hrfa, 1, para['T'])
    else:
        hrfa_TR = hrfa
    for voxel_id in range(nvar):
        hrf = hrfa_TR[:, voxel_id]
        if not wiener:
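            # Regularized inverse filtering in the frequency domain:
            # conj(H) * M / (|H|^2 + 0.1 * mean(|H|^2)), where H is the spectrum
            # of the zero-padded HRF and M the spectrum of the filtered BOLD signal.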
            H = np.fft.fft(
                np.append(hrf,
                          np.zeros((nobs - max(hrf.shape), 1))), axis=0)
            M = np.fft.fft(bold_sig_deconv[:, voxel_id])
            data_deconv[:, voxel_id] = \
                np.fft.ifft(H.conj() * M / (H * H.conj() + .1*np.mean((H * H.conj()))))
        else:
            data_deconv[:, voxel_id] = iterative_wiener_deconv.rsHRF_iterative_wiener_deconv(bold_sig_deconv[:, voxel_id], hrf)
        event_number[:, voxel_id] = np.amax(event_bold[voxel_id].shape)
    # setting the output-path
    if mode == 'bids' or mode == 'bids w/ atlas':
        try:
            sub_save_dir = os.path.join(
                output_dir, 'sub-' + input_file.subject,
                'session-' + input_file.session,
                input_file.modality
            )
        except AttributeError as e:
            sub_save_dir = os.path.join(
                output_dir, 'sub-' + input_file.subject,
                input_file.modality
            )
    else:
        sub_save_dir = output_dir
    if not os.path.isdir(sub_save_dir):
        os.makedirs(sub_save_dir, exist_ok=True)
    dic = {'para': para, 'hrfa': hrfa, 'event_bold': event_bold, 'PARA': PARA}
    ext = '_hrf.mat'
    if mode == "time-series":
        dic["event_number"] = event_number
        dic["data_deconv"]  = data_deconv
        ext = '_hrf_deconv.mat'
    sio.savemat(os.path.join(sub_save_dir, name + ext), dic)
    HRF_para_str = ['Height', 'Time2peak', 'FWHM']
    if mode != "time-series":
        mask_data = np.zeros(mask_shape).flatten(order='F')
        for i in range(3):
            fname = os.path.join(sub_save_dir,
                                 name + '_' + HRF_para_str[i])
            mask_data[voxel_ind] = PARA[i, :]
            mask_data = mask_data.reshape(mask_shape, order='F')
            spm_dep.spm.spm_write_vol(v1, mask_data, fname, file_type)
            mask_data = mask_data.flatten(order='F')
        fname = os.path.join(sub_save_dir, name + '_event_number.nii')
        mask_data[voxel_ind] = event_number
        mask_data = mask_data.reshape(mask_shape, order='F')
        spm_dep.spm.spm_write_vol(v1, mask_data, fname, file_type)
        mask_data = np.zeros(data.shape)
        dat3 = np.zeros(data.shape[:-1]).flatten(order='F')
        for i in range(nobs):
            fname = os.path.join(sub_save_dir, name + '_deconv')
            dat3[voxel_ind] = data_deconv[i, :]
            dat3 = dat3.reshape(data.shape[:-1], order='F')
            if file_type == ".nii" or file_type == ".nii.gz" :
                mask_data[:, :, :, i] = dat3
            else:
                mask_data[:, i] = dat3
            dat3 = dat3.flatten(order='F')
        spm_dep.spm.spm_write_vol(v1, mask_data, fname, file_type)
    pos = 0
    while pos < hrfa_TR.shape[1]:
        if np.any(hrfa_TR[:,pos]):
            break
        pos += 1
    event_plot = lil_matrix((1, nobs))
    if event_bold.size:
        event_plot[:, event_bold[pos]] = 1
    else:
        print("No Events Detected!")
        return 0
    event_plot = np.ravel(event_plot.toarray())
    plt.figure()
    plt.plot(para['TR'] * np.arange(1, np.amax(hrfa_TR[:, pos].shape) + 1),
             hrfa_TR[:, pos], linewidth=1)
    plt.xlabel('time (s)')
    plt.savefig(os.path.join(sub_save_dir, name + '_plot_1.png'))
    plt.figure()
    plt.plot(para['TR'] * np.arange(1, nobs + 1),
             np.nan_to_num(stats.zscore(bold_sig[:, pos], ddof=1)),
             linewidth=1)
    plt.plot(para['TR'] * np.arange(1, nobs + 1),
             np.nan_to_num(stats.zscore(data_deconv[:, pos], ddof=1)),
             color='r', linewidth=1)
    markerline, stemlines, baseline = \
        plt.stem(para['TR'] * np.arange(1, nobs + 1), event_plot)
    plt.setp(baseline, 'color', 'k', 'markersize', 1)
    plt.setp(stemlines, 'color', 'k')
    plt.setp(markerline, 'color', 'k', 'markersize', 3, 'marker', 'd')
    plt.legend(['BOLD', 'deconvolved', 'events'])
    plt.xlabel('time (s)')
    plt.savefig(os.path.join(sub_save_dir, name + '_plot_2.png'))
    print('Done')
    return 0
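For reference, the frequency-domain branch above can be written as a standalone helper. This is only a minimal sketch (a hypothetical fft_deconvolve, not part of the rsHRF package), assuming bold and hrf are 1-D NumPy arrays with len(hrf) <= len(bold):

import numpy as np

def fft_deconvolve(bold, hrf, lam=0.1):
    """Wiener-like deconvolution: conj(H) * B / (|H|^2 + lam * mean(|H|^2))."""
    n = len(bold)
    H = np.fft.fft(np.append(hrf, np.zeros(n - len(hrf))))  # zero-padded HRF spectrum
    B = np.fft.fft(bold)
    power = (H * H.conj()).real
    return np.fft.ifft(H.conj() * B / (power + lam * power.mean())).real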
"""
Created on Mon Nov 27 19:11:38 2017

@author: Jacob
"""
import numpy as np
from matplotlib.pyplot import (figure, imshow, bar, title, xticks, yticks, cm,
                               subplot, show)
from scipy.stats.kde import gaussian_kde
from toolbox_02450 import gausKernelDensity
from sklearn.neighbors import NearestNeighbors
from scipy import stats
import dataSetup

X = dataSetup.numbersData.values
X = stats.zscore(X)
N,M = X.shape

"""
# OUTLIER DETECTION
# Compute kernel density estimate
kde = gaussian_kde(X.ravel(), 'silverman')

scoresKDE = kde.evaluate(X.ravel())
idxKDE = scoresKDE.argsort()
scoresKDE.sort()

print('The index of the lowest density object: {0}'.format(idxKDE[0]))

# Plot kernel density estimate
figure()
示例#46
0
def main():
    """
    load data
    """
    train_set = pd.read_csv('../data/train.csv')
    test_set = pd.read_csv('../data/test.csv')

    # Score without OutlierRemover, with basic NaNRemover: 0.12416413124809748

    """
    Remove Outliers
    """

    # Manually identified outlier rows (superseded by the z-score rule below)
    outliers = [197, 523, 691, 854, 1182, 1298]
    print(outliers)

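    # Flag entries with |z| > 4 and drop any row that is extreme in more than
    # two numeric columns.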
    z = np.abs(zscore(train_set[get_numeric_columns(train_set)]))
    row, col = np.where(z > 4)
    df = pd.DataFrame({"row": row, "col": col})
    rows_count = df.groupby(['row']).count()

    outliers = rows_count[rows_count.col > 2].index
    print(outliers)
    train_set.drop(outliers, inplace=True)


    """
    fix salePrice skewness
    """
    train_set["SalePrice"] = np.log1p(train_set["SalePrice"])
    y_train_values = train_set["SalePrice"].values

    """
    prepare combined data.
    """
    train_set_id = train_set['Id']
    test_set_id = test_set['Id']

    train_set_rows = train_set.shape[0]
    test_set_rows = test_set.shape[0]

    train_set.drop('Id', axis=1, inplace=True)
    test_set.drop('Id', axis=1, inplace=True)
    train_set.drop('SalePrice', axis=1, inplace=True)

    combined_data = pd.concat((train_set, test_set))


    """
    create data transform pipeline
    """
    transform_pipeline = Pipeline(steps=[
        ('OutlierRemover', OutlierRemover()),
        ('NaNImputer', NaNImputer()),
        ('NaNRemover', NaNRemover()),
        ('AdditionalFeatureGenerator', AdditionalFeatureGenerator()),
        ('TypeTransformer', TypeTransformer()),
        ('ErrorImputer', ErrorImputer()),
        ('SkewFixer', SkewFixer()),
        ('Scaler', Scaler()),
        ('FeatureDropper', FeatureDropper()),
        ('Dummyfier', Dummyfier()),
    ])


    transformed_data = transform_pipeline.transform(combined_data)
    train_data = transformed_data[:train_set_rows]
    predict_data = transformed_data[train_set_rows:]

    transformed_data.to_csv('transformed_Data.csv', index=False)

    """
    try various regressors
    """

    rf_param = {
        # 'bootstrap': [True],
        'max_depth': [3, 4, 5],
        'min_samples_leaf': [3, 4, 5],
        'n_estimators': [5, 7, 10]
    }
    ls_param = {'alpha': [0.0003, 0.0004, 0.0005,
                          0.0006, 0.0007, 0.0008],
                'max_iter': [10000], "normalize": [False]}

    elnet_param = {'alpha': [0.0003, 0.0004, 0.0005],
                   'l1_ratio': [0.9, 0.95, 0.99, 1],
                   'max_iter': [10000]}

    ridge_param = {'alpha': [10, 10.1, 10.2, 10.3, 10.4, 10.5]}

    svr_param = {'gamma': [1e-08, 1e-09],
                 'C': [100000, 110000],
                 'epsilon': [1, 0.1, 0.01]
                 }

    rf = get_best_estimator(train_data, y_train_values, estimator=RandomForestRegressor(),
                            params=rf_param, n_jobs=4)
    elnet = get_best_estimator(train_data, y_train_values, estimator=ElasticNet(),
                               params=elnet_param, n_jobs=4)
    lso = get_best_estimator(train_data, y_train_values, estimator=Lasso(),
                             params=ls_param, n_jobs=4)

    rdg = get_best_estimator(train_data, y_train_values, estimator=Ridge(),
                             params=ridge_param, n_jobs=4)
    svr = get_best_estimator(train_data, y_train_values, estimator=SVR(),
                             params=svr_param, n_jobs=4)

    def cv_rmse(model):
        kfolds = KFold(n_splits=5, shuffle=True, random_state=42)
        rmse = np.sqrt(-cross_val_score(model, train_data, y_train_values,
                                        scoring="neg_mean_squared_error",
                                        cv=kfolds))
        return (rmse)
    """
    print("Randomforest  model rmse : ", cv_rmse(rf).mean())
    print("elastic model rmse : ", cv_rmse(elnet).mean())
    print("lasso model rmse : ", cv_rmse(lso).mean())
    print("ridge model rmse : ", cv_rmse(rdg).mean())
    print("svr model rmse : ", cv_rmse(svr).mean())
    """

    model = StackingRegressor(
        regressors=[rf, elnet, lso, rdg, svr],
        meta_regressor=Lasso(alpha=0.0005)
        # meta_regressor=SVR(kernel='rbf')
    )

    # Fit the model on our data
    model.fit(train_data, y_train_values)
    #print("StackingRegressor model rmse : ", cv_rmse(model).mean())

    # y_pred = model.predict(train_data)
    # print(sqrt(mean_squared_error(y_train_values, y_pred)))

    # Predict test set
    ensembled = np.expm1(model.predict(predict_data))

    # sns.scatterplot(np.expm1(rf.predict(train_data)), np.expm1(y_train_values))
    # plt.show()
    # ensembled = np.expm1(rf.predict(predict_data))

    """
    export submission data
    """
    submission = pd.DataFrame({
        "Id": test_set_id,
        "SalePrice": ensembled
    })
    submission.to_csv('submission_jiwon.csv', index=False)
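The row-filtering rule above (drop any row that is extreme in more than two numeric columns) can be packaged as a small helper. A hypothetical, self-contained sketch; names and defaults are illustrative, not from the original script:

import numpy as np
import pandas as pd
from scipy.stats import zscore

def drop_zscore_outliers(df, threshold=4.0, min_extreme_cols=3):
    """Drop rows whose |z-score| exceeds `threshold` in at least `min_extreme_cols` numeric columns."""
    numeric = df.select_dtypes(include=[np.number])
    z = np.abs(zscore(numeric.values))
    extreme_per_row = (z > threshold).sum(axis=1)
    return df.loc[extreme_per_row < min_extreme_cols]

# e.g. train_set = drop_zscore_outliers(train_set) reproduces the inline block above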
import pandas as pd
import seaborn as sns
import numpy as np
from scipy import stats

#%%
accidents = pd.read_csv(
    r'C:\Users\rcf004\Documents\Python Scripts\US_Accidents_Dec19.csv')
accidents.dropna(
    subset=['Start_Lng', 'Start_Lat', 'Severity', 'Visibility(mi)'],
    inplace=True)

#%%
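# Drop visibility outliers: keep rows whose |z-score| of Visibility(mi) is below 3.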
no_out = accidents[
    np.abs(stats.zscore(accidents['Visibility(mi)'].astype(int))) < 3]

#%%

df1 = no_out[no_out['Start_Time'].astype(str).str.contains('2016-')]
df2 = no_out[no_out['Start_Time'].astype(str).str.contains('2017-')]
df3 = no_out[no_out['Start_Time'].astype(str).str.contains('2018-')]
df4 = no_out[no_out['Start_Time'].astype(str).str.contains('2019-')]

#%%

a = 0.5
sz = (1, 10)
lw = 0
kwargs = {'marker': "."}
cmap = sns.cubehelix_palette(start=1.1,
                elif seq[nucpos:nucpos + 3] == "---":
                    nucpos += 3
                    continue
                else:
                    dnlist[codon] += 1
                    nucpos += 3
            except KeyError:
                nucpos += 3
                continue
        nucpos += 3

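# Per-codon excess of non-synonymous over synonymous substitutions; codons with
# a z-score above 3 are highlighted in orange in the scatter plot below.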
calclist = []
for i in range(len(dslist)):
    calclist.append(dnlist[i] - dslist[i])

zmin = stats.zscore(calclist)

plt.figure()
for i, z in enumerate(zmin):
    if z > 3:
        plt.scatter(i,z, color ='orange')
    else:
        plt.scatter(i, z, color='blue')
plt.ylabel("zscore")
plt.xlabel("codon num")

    
    
plt.savefig("scatter.png")
plt.close()
    date_bucket = np.hstack((date_bucket, np.asarray(tsd['timestamp'][i])))
date_bucket = np.unique(date_bucket)
date_bucket = date_bucket[date_bucket > datetime.date(2020, 1, 1)]
interpolated_time_series = []
for i in range(len(tsd)):
    series = pd.DataFrame(tsd['data'][i], tsd['timestamp'][i])
    series = series[~series.index.duplicated(keep='first')]
    series = series[series.index > datetime.date(2020, 1, 1)]
    series = series.reindex(
        date_bucket,
        fill_value=0).sort_index().mask(series == 0).interpolate()
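    # Align each series to the shared date grid, treat zero readings as missing,
    # and interpolate the gaps before computing z-scores.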
    if (series.isna().sum()[0] > len(series) / 2
            or series.var()[0] < variance_threshold):
        continue
    elif series.isna().sum()[0] > 0:
        series = series.fillna(0)
    series['zscore'] = st.zscore(series)
    interpolated_time_series.append({
        "path": tsd['path'][i],
        "node": tsd['node'][i],
        "slot": tsd['slot'][i],
        "port": tsd['port'][i],
        "pm": tsd['pm'][i],
        "raw_data": np.asarray(series[0]),
        "z-score": np.asarray(series['zscore']),
        "timestamp": np.asarray(series.index)
    })
    print(i)
f = open("vodafone_data_oct30_not_pm_filtered_interpolated.pkl", "wb")
pickle.dump(pd.DataFrame(interpolated_time_series), f)
f.close()
    ## comparison to shortest path analysis
    ALL = set(INH_PLC_list)
    HIT = PET_TG & SPL_TG
    pval = stats.hypergeom.sf(len(HIT), len(ALL), len(PET_TG), len(SPL_TG))
    print("Petrinet & shortest path", pval, [entz_dic[entz] for entz in HIT])

    ## get gold standard from CTD database
    gene_df = pd.read_table('./CTD/CTD_genes_related_to_atrophy.txt', sep='\t')

    ## get z-score of inferenece score in CTD
    infScore_ds = gene_df['InferenceScore']
    scoreList = list(infScore_ds.dropna())

    scoreDic = {}
    zScoreList = stats.zscore(scoreList)
    for ii in range(len(scoreList)):
        scoreDic[scoreList[ii]] = zScoreList[ii]

    GS_dic = {}
    for index, row in gene_df.iterrows():
        if pd.isnull(row['DirectEvidence']) == False:
            if str(row['GeneID']) == '3479':  # use IGF1R instead of IGF1
                GS_dic['3480'] = 100
            else:
                GS_dic[str(row['GeneID'])] = 100

    for index, row in gene_df.iterrows():
        if pd.isnull(row['DirectEvidence']) == True:
            if str(row['GeneID']) in GS_dic:
                continue
示例#51
0
import numpy as np
import pandas as pd
from copy import deepcopy
from scipy import stats
from mpl_toolkits.mplot3d import Axes3D
get_ipython().magic('matplotlib inline')
#%matplotlib inline
from matplotlib import pyplot as plt
plt.rcParams['figure.figsize'] = (16, 9)
plt.style.use('ggplot')

# Importing the dataset
data = pd.read_csv('C:\\Users\\himverma\\AnacondaProjects\\KMeans\\xclaraOriginal.csv')
print("Input Data and Shape")
print(data.shape)
data.head()

# Getting the values and plotting it
f1 = data['V1'].values
#f2 = data['V2'].values
#X = np.array(list(zip(f1, f2)))
X = np.array(f1)
stats.zscore(X)  # note: zscore returns a standardized copy; the result is never assigned or used here
fit = stats.norm.pdf(X, np.mean(X), np.std(X)) 
plt.plot(X,fit,'-o')
plt.hist(X, 30, density=True)  # 'normed' was removed in Matplotlib 3.x; 'density' is the replacement
plt.show()
print(X.mean())
示例#52
0
# "standard representation" for the course, is the number of classes, C:
C = len(classNames)


# Add offset attribute
X = np.concatenate((np.ones((X.shape[0],1)),X),1)
#attributeNames = [u'Offset']+attributeNames
M = M+1


#attributeNames = ('Offset', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash', 'Magnesium', 'Total phenols', 'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins', 'Color intensity', 'Hue', 'OD280/OD315 of diluted wines', 'Proline')
#attributeNames = ('Offset', 'Ash', 'Magnesium', 'Color intensity', 'Hue' 'Proline')
attributeNames = ('Offset', 'Malic acid', 'Ash', 'Alcalinity of ash', 'Magnesium', 'Total phenols', 'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins', 'Color intensity', 'Hue', 'OD280/OD315 of diluted wines', 'Proline')

# Standardize the data to zero mean and unit standard deviation
X_standarized = zscore(X, ddof=1)  # do the standardization



############################################
#Use exercise 8.1.1
###########################################


## Crossvalidation
# Create crossvalidation partition for evaluation
K = 10
CV = model_selection.KFold(K, shuffle=True)
#CV = model_selection.KFold(K, shuffle=False)

# Values of lambda
示例#53
0
    def create_model(self, train_X, train_y, val_X, val_y):
        """
        Args:
            train_X (pandas dataframe)
            train_y (pandas dataframe)
        Returns:
            ExtraTreesRegressor
        """
        # Build the model
        train_X = train_X[self.feature_columns]
        train_X = stats.zscore(train_X)
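        # Keras LSTMs expect 3-D input (samples, timesteps, features); each row
        # is fed as a single-timestep sequence.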
        train_X = train_X.reshape(
            (train_X.shape[0], 1, train_X.shape[1]))
        val_X = val_X[self.feature_columns]
        val_X = stats.zscore(val_X)
        val_X = val_X.reshape((val_X.shape[0], 1, val_X.shape[1]))

        model = Sequential()
        model.add(LSTM(512, input_shape=(train_X.shape[1], train_X.shape[2])))
        model.add(BatchNormalization())
        model.add(Dropout(.2))

        model.add(Dense(256))
        model.add(PReLU())
        model.add(BatchNormalization())
        model.add(Dropout(.1))

        model.add(Dense(256))
        model.add(PReLU())
        model.add(BatchNormalization())
        model.add(Dropout(.1))

        model.add(Dense(128))
        model.add(PReLU())
        model.add(BatchNormalization())
        model.add(Dropout(.05))

        model.add(Dense(64))
        model.add(PReLU())
        model.add(BatchNormalization())
        model.add(Dropout(.05))

        model.add(Dense(32))
        model.add(PReLU())
        model.add(BatchNormalization())
        model.add(Dropout(.05))

        model.add(Dense(16))
        model.add(PReLU())
        model.add(BatchNormalization())
        model.add(Dropout(.05))

        model.add(Dense(1))

        # Compile the network
        model.compile(loss='mse', optimizer=optimizers.Adam(0.001),
                      metrics=['mse'])

        callbacks = [
            EarlyStopping(monitor='val_loss', patience=10, verbose=0),
            ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=7,
                              verbose=1, epsilon=1e-4, mode='min')
        ]

        model.fit(x=train_X, y=train_y, epochs=80,
                  validation_data=(val_X, val_y), callbacks=callbacks)  # 'callbacks' is already a list
示例#54
0
def impute_markers(model_path,
                   data_path,
                   *,
                   save_path=None,
                   start_frame=None,
                   n_frames=None,
                   stride=1,
                   markers_to_fix=None,
                   error_diff_thresh=.25,
                   model=None):
    """Imputes the position of missing markers.

    :param model_path: Path to model to use for prediction.
    :param data_path: Path to marker and bad_frames data. Can be hdf5 or
                      mat -v7.3.
    :param save_path: Path to .mat file where predictions will be saved.
    :param start_frame: Frame at which to begin imputation.
    :param n_frames: Number of frames to impute.
    :param stride: stride length between frames for faster imputation.
    :param markers_to_fix: Markers for which to override suspicious MoCap
                           measurements
    :param error_diff_thresh: Z-scored difference threshold marking suspicious
                              frames
    :param model: Model to be used in prediction. Overrides model_path.
    :return: preds
    """
    # Check data extensions
    filename, file_extension = os.path.splitext(data_path)
    accepted_extensions = {'.h5', '.hdf5', '.mat'}
    if file_extension not in accepted_extensions:
        raise ValueError('Improper extension: hdf5 or \
                         mat -v7.3 file required.')

    # Load data
    print('Loading data')
    f = h5py.File(data_path, 'r')
    if file_extension in {'.h5', '.hdf5'}:
        markers = np.array(f['markers'][:]).T
        marker_means = np.array(f['marker_means'][:]).T
        marker_stds = np.array(f['marker_stds'][:]).T
        bad_frames = np.array(f['bad_frames'][:]).T
    else:
        # Get the markers data from the struct
        dset = 'markers_aligned_preproc'
        marker_names = list(f[dset].keys())
        n_frames_tot = f[dset][marker_names[0]][:].T.shape[0]
        n_dims = f[dset][marker_names[0]][:].T.shape[1]

        markers = np.zeros((n_frames_tot, len(marker_names) * n_dims))
        for i in range(len(marker_names)):
            marker = f[dset][marker_names[i]][:].T
            for j in range(n_dims):
                markers[:, i * n_dims + j] = marker[:, j]

        # Z-score the marker data
        marker_means = np.mean(markers, axis=0)
        marker_means = marker_means[None, ...]
        marker_stds = np.std(markers, axis=0)
        marker_stds = marker_stds[None, ...]
        print(marker_means)
        print(marker_stds)
        markers = stats.zscore(markers)

        # Get the bad_frames data from the cell
        dset = 'bad_frames_agg'
        n_markers = f[dset][:].shape[0]
        bad_frames = np.zeros((markers.shape[0], n_markers))
        for i in range(n_markers):
            reference = f[dset][i][0]
            bad_frames[np.squeeze(f[reference][:]).astype('int32') - 1, i] = 1

    # Set number of frames to impute
    if n_frames is None:
        n_frames = markers.shape[0]
    if start_frame is None:
        start_frame = 0
    print('Predicting %d frames starting at frame %d.' %
          (n_frames, start_frame))

    # Exceptions
    if n_frames > markers.shape[0]:
        raise ValueError("Improper n_frames to predict: likely asked to " +
                         "predict a greater number of frames than were " +
                         "available.")
    if (n_frames + start_frame) > markers.shape[0]:
        raise ValueError('start_frame + n_frames exceeds matrix dimensions.')
    if n_frames < 0:
        raise ValueError("Improper n_frames to predict: likely too few input" +
                         " frames.")
    if n_frames == 0:
        raise ValueError("Improper n_frames to predict: likely asked to " +
                         "predict zero frames.")

    markers = markers[start_frame:(start_frame + n_frames):stride, :]
    bad_frames = bad_frames[start_frame:(start_frame + n_frames):stride, :]

    # Load model
    if model is None:
        print('Loading model')
        model = load_model(model_path)

    # Check how many outputs the model has, and how many members if returning
    # member data.
    n_outputs = len(model.output_shape)
    if n_outputs == 2:
        return_member_data = True
    else:
        return_member_data = False
        member_predsF = [None]
        member_predsR = [None]

    # Set Markers to fix
    if markers_to_fix is None:
        markers_to_fix = np.zeros((markers.shape[1])) > 1
        # TODO(Skeleton): Automate this by including the skeleton.
        markers_to_fix[30:36] = True
        markers_to_fix[42:] = True

    # If the model can return the member predictions, do so.
    if return_member_data:
        # Forward predict
        print('Imputing markers: forward pass')
        predsF, bad_framesF, member_predsF = \
            predict_markers(model, markers, bad_frames,
                            markers_to_fix=markers_to_fix,
                            error_diff_thresh=error_diff_thresh,
                            return_member_data=return_member_data)
        # Reverse Predict
        print('Imputing markers: reverse pass')
        predsR, bad_framesR, member_predsR = \
            predict_markers(model, markers[::-1, :], bad_frames[::-1, :],
                            markers_to_fix=markers_to_fix,
                            error_diff_thresh=error_diff_thresh,
                            return_member_data=return_member_data)
    else:
        # Forward predict
        print('Imputing markers: forward pass')
        predsF, bad_framesF = \
            predict_markers(model, markers, bad_frames,
                            markers_to_fix=markers_to_fix,
                            error_diff_thresh=error_diff_thresh,
                            return_member_data=return_member_data)
        # Reverse Predict
        print('Imputing markers: reverse pass')
        predsR, bad_framesR = \
            predict_markers(model, markers[::-1, :], bad_frames[::-1, :],
                            markers_to_fix=markers_to_fix,
                            error_diff_thresh=error_diff_thresh,
                            return_member_data=return_member_data)

    # Convert to real world coordinates
    markers_world = np.zeros((markers.shape))
    predsF_world = np.zeros((predsF.shape))
    predsR_world = np.zeros((predsR.shape))
    for i in range(markers_world.shape[1]):
        markers_world[:, i] = \
            markers[:, i]*marker_stds[0, i] + marker_means[0, i]
        predsF_world[:, i] = \
            predsF[:, i]*marker_stds[0, i] + marker_means[0, i]
        predsR_world[:, i] = \
            predsR[:, i]*marker_stds[0, i] + marker_means[0, i]

    predsR_world = predsR_world[::-1, :]
    bad_framesR = bad_framesR[::-1, :]

    # This is not necessarily all of the error frames from
    # multiple_predict_recording_with_replacement, but if they overlap,
    # we would just take the weighted average.
    for i in range(bad_frames.shape[1]):
        bad_frames[:, i] = np.any(bad_framesF[:, (i * 3):(i * 3) + 3]
                                  & bad_framesR[:, (i * 3):(i * 3) + 3],
                                  axis=1)

    # Compute the weighted average of the forward and reverse predictions using
    # a logistic function
    print('Computing weighted average')
    preds_world = np.zeros(predsF_world.shape)
    for i in range(bad_frames.shape[1] * 3):
        is_bad = bad_frames[:, np.floor(i / 3).astype('int32')]
        CC = measure.label(is_bad, background=0)
        num_CC = len(np.unique(CC)) - 1
        preds_world[:, i] = predsF_world[:, i]
        for j in range(num_CC):
            length_CC = np.sum(CC == (j + 1))
            x_0 = np.round(length_CC / 2)
            k = 1
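            # Logistic blend across each run of bad frames: weightR rises from
            # ~0 to ~1 around the midpoint x_0 with steepness k, so the reverse
            # pass dominates the end of the gap and the forward pass the start.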
            weightR = sigmoid(np.arange(length_CC), x_0, k)
            weightF = 1 - weightR
            preds_world[CC == (j+1), i] = \
                (predsF_world[CC == (j+1), i]*weightF) +\
                (predsR_world[CC == (j+1), i]*weightR)

    # Save predictions to a matlab file.
    if save_path is not None:
        s = 'Saving to %s' % (save_path)
        print(s)
        savemat(
            save_path, {
                'preds': preds_world,
                'markers': markers_world,
                'badFrames': bad_frames,
                'member_predsF': member_predsF,
                'member_predsR': member_predsR
            })

    return preds_world
示例#55
0
def prepare_confounders(
    sm,
    confounders=tuple(),
    hcp_confounders=False,
    hcp_confounder_software_version=True,
    squared_confounders=False,
    impute0=True,
    #headmotion_features=('movement_AbsoluteRMS_mean',
    #                     'movement_RelativeRMS_mean')
):
    """Prepare the confounder matrix.

    Parameters
    ----------
    sm : pd.DataFrame (n_samples, n_features)
        behavioral data matrix
    confounders : tuple of str
        column-names in ``sm`` to be used as confounders. If some are
        not found a warning is issued and the code will continue without the
        missing ones.
    hcp_confounders : bool
        if ``True`` 'Weight', 'Height', 'BPSystolic', 'BPDiastolic', 'HbA1C'
        as well as the cubic roots of 'FS_BrainSeg_Vol', 'FS_IntraCranial_Vol'
        are included as confounders
    hcp_confounder_software_version : bool
        if ``True`` and ``hcp_confounders`` is also ``True``, then the feature
        'fMRI_3T_ReconVrs' (encoded as a dummy variable) is used as confounder
    squared_confounders : bool
        if ``True`` the squares of all confounders (except software version, if
        used) are used as additional confounders
    impute0 : bool
        if True, missing confound values are imputed with 0 (after an
        inverse normal transformation)

    Returns
    -------
    confounders : np.ndarray (n_samples, n_features)
        confounder data matrix, if impute0 is ``False`` it can have ``NaN``s

    Raises
    ------
    ValueError
        if confounders couldn't be found in ``sm``
    """

    _confounders = [f for f in confounders if f in sm]
    if len(_confounders) != len(confounders):
        missing_confounders = [f for f in confounders if f not in sm]
        raise ValueError('Confounders not found: '
                         '{}'.format(missing_confounders))
    confounders_matrix = sm[_confounders].values

    if hcp_confounders:
        sm_confounders = sm[[
            'Weight',
            'Height',
            'BPSystolic',
            'BPDiastolic',
            'HbA1C',
        ]].values
        fs_confounders = sm[['FS_BrainSeg_Vol',
                             'FS_IntraCranial_Vol']].values**(1. / 3)
        confounders_matrix = np.hstack(
            [confounders_matrix, sm_confounders, fs_confounders])

    if squared_confounders:
        confounders_matrix = np.hstack(
            [confounders_matrix, confounders_matrix**2])

    if hcp_confounders and hcp_confounder_software_version:
        # software reconstruction version
        reconvrs = sm['fMRI_3T_ReconVrs'].values
        used_reconvrss = np.unique(reconvrs)
        print('used fMRI 3T reconstruction software versions are:',
              used_reconvrss)
        assert set(used_reconvrss.tolist()) == {'r177', 'r177 r227', 'r227'}

        # dummy-coding: r177 -> 0, r227 -> 1, "r177 r227" -> 1
        reconvrs = np.where(reconvrs == 'r177', 0, 1).reshape(-1, 1)

        confounders_matrix = np.hstack([confounders_matrix, reconvrs])

    if confounders_matrix.shape[1] > 0:

        # inverse normal transform (this also results in mean 0)
        confounders_matrix = \
            rank_based_inverse_normal_trafo(confounders_matrix)

        if impute0:
            # impute 0 for missing values
            print('{:.2f}% of values in confounders missing, imputing 0 for '
                  'these'.format(100 *
                                 (1 - np.isfinite(confounders_matrix).mean())))

            confounders_matrix[~np.isfinite(confounders_matrix)] = 0

        else:
            print('{:.2f}% of values in confounders missing'.format(
                100 * (1 - np.isfinite(confounders_matrix).mean())))

        # normalise
        confounders_matrix = zscore(confounders_matrix, nan_policy='omit')

    return confounders_matrix
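rank_based_inverse_normal_trafo is defined elsewhere in the package; a minimal Blom-style sketch of such a transform (a hypothetical stand-in that operates column-wise on a 2-D array and leaves NaNs in place) could look like this:

import numpy as np
from scipy import stats

def rank_based_inverse_normal(x, c=3.0 / 8):
    """Blom-style rank-based inverse normal transform, applied column-wise; NaNs stay NaN."""
    x = np.asarray(x, dtype=float)
    out = np.full_like(x, np.nan)
    for j in range(x.shape[1]):
        col = x[:, j]
        ok = np.isfinite(col)
        ranks = stats.rankdata(col[ok])
        out[ok, j] = stats.norm.ppf((ranks - c) / (ok.sum() - 2 * c + 1))
    return out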
示例#56
0
    def train_price_model(self, data: pd.DataFrame):

        df = data
        df = df[((np.abs(stats.zscore(df.price)) < 2.8) &
                 (np.abs(stats.zscore(df.term)) < 2.8) &
                 (np.abs(stats.zscore(df.full_sq)) < 2.8))]

        # !!!!!!!! ADD 'was_opened'
        # Fix year: only 2019
        df = df[(df.yyyy_announce.isin([19, 20]))]
        df = df[[
            'price', 'to_center', 'full_sq', 'kitchen_sq', 'life_sq', 'rooms',
            'is_apartment', 'renovation', 'has_elevator', 'time_to_metro',
            'floor_first', 'floor_last', 'is_rented', 'rent_quarter',
            'rent_year', 'mm_announce', 'yyyy_announce', 'clusters'
        ]]
        # Save leaved columns to variable
        columns = list(df.columns)

        # Log transformation

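        # log1p compresses the heavy right tails of the area and price columns;
        # predictions made on the log scale need np.expm1 to return to prices.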
        df["full_sq"] = np.log1p(df["full_sq"])
        df["life_sq"] = np.log1p(df["life_sq"])
        df["kitchen_sq"] = np.log1p(df["kitchen_sq"])
        df["price"] = np.log1p(df["price"])
        df["to_center"] = np.log1p(df["to_center"])

        # Create features - predictors
        X = df.drop(['price'], axis=1)

        # Target feature
        y = df[['price']].values.ravel()

        # Split for train and test
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=1)

        # Define Gradient Boosting Machine model
        lgbm_model = LGBMRegressor(objective='regression',
                                   learning_rate=0.07,
                                   n_estimators=1250,
                                   max_depth=10,
                                   min_child_samples=1,
                                   verbose=0)
        # RF = RandomForestRegressor(n_estimators=300, verbose=1, n_jobs=-1)
        # Train GBR on train dataset
        lgbm_model.fit(X_train, y_train)
        lgbm_preds = lgbm_model.predict(X_test)
        print('The R2_score of the Gradient boost is',
              r2_score(y_test, lgbm_preds),
              flush=True)
        print('RMSE is: \n',
              mean_squared_error(y_test, lgbm_preds),
              flush=True)

        # Train GBR on full dataset
        lgbm_model.fit(X, y)
        return lgbm_model, columns
示例#57
0
File: lab4.py  Project: h11128/project
# # Preliminary data analysis
# ## missing value

# In[6]:

null_data = seeds[seeds.isnull().any(axis=1)]
display(null_data)

# No missing value here
#
# ## outliers

# In[7]:
"""" clean outliers here by compute Z-score of each value in the column, if abs of Z score bigger than 3 than delete this row"""
data = seeds[(np.abs(stats.zscore(seeds)) < 3).all(axis=1)]
data.info()

# Only 2 rows were deleted, so dropping the outlier rows directly is an easy way to handle outliers.
#
# ## Correlation

# In[8]:

plt.figure(figsize=(20, 7))
sns.heatmap(data.corr(), cmap='BrBG', annot=True)
plt.title('Variables Correlation', fontsize=18)
plt.show()

# As can be seen, asym is the attribute least correlated with the others.
#
plt.savefig('KDE_MINIMUM_PAYMENTS.png')
data['MINIMUM_PAYMENTS'] = data['MINIMUM_PAYMENTS'].fillna(
    data['MINIMUM_PAYMENTS'].median())

pd.isnull(data).sum()

#%%
'''
Outliers Treatment
'''
# calculate z-score
from scipy import stats
# drop string feature and features with meaningful range
data1 = data.drop(columns=['CUST_ID', 'TENURE'])

z_score = pd.DataFrame(np.abs(stats.zscore(data1)), columns=data1.columns)

# Find out features with more than 2% outliers (absolute z-score >3)
z_score3 = []
over3_index = []
for i in z_score.columns:
    indexs = z_score.index[z_score[i] > 3].tolist()
    ans = i, "{:.3f}".format(len(indexs) / len(z_score)), indexs
    z_score3.append(ans)
    if len(indexs) / len(z_score) > 0.02:
        over3_index.append(i)

# remove 'BALANCE' and 'CASH_ADVANCE' since they are regarded as highly
# discriminative features
del over3_index[0]
del over3_index[1]
示例#59
0
            int(team["L"]),
            "is_world_series_winner":
            team["WSWin"],
            "attendance":
            float(team["attendance"]),
            "avg_salary":
            float(round(average(players_in_team, "salary"), 2)),
            "batting_avg":
            float(round(average(map(batting_average, players_in_team)), 3)),
            "era":
            float(round(average(pitchers_in_team, "ERA"), 3))
        })

grouped_by_year = group_by(flat_franchise_year, "year")
for year, records in grouped_by_year.iteritems():
    z_scores_salary = np.round(stats.zscore(pluck("avg_salary", records)), 2)
    z_scores_wins = np.round(stats.zscore(pluck("wins", records)), 2)
    z_scores_batting_avg = np.round(
        stats.zscore(pluck("batting_avg", records)), 2)
    z_scores_losses = np.round(stats.zscore(pluck("losses", records)), 2)
    z_scores_attendance = np.round(stats.zscore(pluck("attendance", records)),
                                   2)
    z_scores_era = np.round(stats.zscore(pluck("era", records)), 2)
    for i, record in enumerate(records):
        record["z_avg_salary"] = z_scores_salary[i]
        record["z_wins"] = z_scores_wins[i]
        record["z_batting_avg"] = z_scores_batting_avg[i]
        record["z_losses"] = z_scores_losses[i]
        record["z_attendance"] = z_scores_attendance[i]
        record["z_era"] = z_scores_era[i]
def removeOutliersZScore(data):
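    # Keep rows whose income z-score magnitude is below 12, a deliberately
    # permissive cut that only removes the most extreme outliers.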
    outlierColumns = data[[IncomeColumn]].copy()
    z = numpy.abs(stats.zscore(outlierColumns))

    newData = data[(z < 12).all(axis=1)]
    return newData