def moments(self):
    """Calculate covariance and correlation matrices, trait, genotypic and ontogenetic means"""
    zs = np.array([ind["z"] for ind in self.pop])
    xs = np.array([ind["x"] for ind in self.pop])
    ys = np.array([ind["y"] for ind in self.pop])
    bs = np.array([ind["b"] for ind in self.pop])
    ymean = ys.mean(axis=0)
    zmean = zs.mean(axis=0)
    xmean = xs.mean(axis=0)
    bmean = bs.mean(axis=0)
    phenotypic = np.cov(zs, rowvar=0, bias=1)
    genetic = np.cov(xs, rowvar=0, bias=1)
    heritability = genetic[np.diag_indices_from(genetic)] / phenotypic[np.diag_indices_from(phenotypic)]
    # the bias argument is deprecated and has no effect in np.corrcoef, so it is dropped here
    corr_phenotypic = np.corrcoef(zs, rowvar=0)
    corr_genetic = np.corrcoef(xs, rowvar=0)
    avgP = avg_ratio(corr_phenotypic, self.modules)
    avgG = avg_ratio(corr_genetic, self.modules)
    return {
        "y.mean": ymean,
        "b.mean": bmean,
        "z.mean": zmean,
        "x.mean": xmean,
        "P": phenotypic,
        "G": genetic,
        "h2": heritability,
        "avgP": avgP,
        "avgG": avgG,
        "corrP": corr_phenotypic,
        "corrG": corr_genetic,
    }
def corr_xy(x, y, similar_type=ECoreCorrType.E_CORE_TYPE_PEARS, **kwargs):
    """
    Public helper that computes the correlation coefficient between two iterable sequences.
    :param x: iterable sequence
    :param y: iterable sequence
    :param similar_type: ECoreCorrType, defaults to ECoreCorrType.E_CORE_TYPE_PEARS
    :return: the correlation coefficient between x and y
    """
    if similar_type == ECoreCorrType.E_CORE_TYPE_PEARS:
        # Pearson correlation coefficient
        return np.corrcoef(x, y)[0][1]
    elif similar_type == ECoreCorrType.E_CORE_TYPE_SPERM:
        # Spearman correlation coefficient, using the custom spearmanr that skips the p_value
        return spearmanr(x, y)[0][1]
    elif similar_type == ECoreCorrType.E_CORE_TYPE_SIGN:
        # +/- sign correlation: take np.sign of the sequences, then np.corrcoef
        sign_x = np.sign(x)
        sign_y = np.sign(y)
        return np.corrcoef(sign_x, sign_y)[0][1]
    elif similar_type == ECoreCorrType.E_CORE_TYPE_ROLLING:
        # pop the 'window' parameter, defaulting to g_rolling_corr_window
        window = kwargs.pop('window', g_rolling_corr_window)
        # the rolling correlation requires the iterables to be pd.Series
        if not isinstance(x, pd.Series):
            x = pd.Series(x)
        if not isinstance(y, pd.Series):
            y = pd.Series(y)
        return rolling_corr(x, y, window=window)
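# --- Illustrative usage sketch for corr_xy (not from the original source) ---
# Assumes corr_xy, ECoreCorrType, rolling_corr and g_rolling_corr_window are importable
# from the module the function above belongs to; the data below are made up.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
x = pd.Series(rng.normal(size=250)).cumsum()
y = 0.8 * x + pd.Series(rng.normal(size=250)).cumsum() * 0.2

print(corr_xy(x, y))                                                  # Pearson (default)
print(corr_xy(x, y, similar_type=ECoreCorrType.E_CORE_TYPE_SIGN))     # +/- sign correlation
print(corr_xy(x, y, similar_type=ECoreCorrType.E_CORE_TYPE_ROLLING,
              window=60))                                             # rolling correlation, explicit window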
def correlate(self, signal): """ Correlate records against one or many one-dimensional arrays. Parameters ---------- signal : array-like One or more signals to correlate against. """ s = asarray(signal) if s.ndim == 1: if size(s) != self.shape[-1]: raise ValueError("Length of signal '%g' does not match record length '%g'" % (size(s), self.shape[-1])) return self.map(lambda x: corrcoef(x, s)[0, 1], index=[1]) elif s.ndim == 2: if s.shape[1] != self.shape[-1]: raise ValueError("Length of signal '%g' does not match record length '%g'" % (s.shape[1], self.shape[-1])) newindex = arange(0, s.shape[0]) return self.map(lambda x: array([corrcoef(x, y)[0, 1] for y in s]), index=newindex) else: raise Exception('Signal to correlate with must have 1 or 2 dimensions')
def simple_cv(valence_regressors, arousal_regressors, valence_movie_matrices, arousal_movie_matrices,
              valence_labels_movies, arousal_labels_movies, threshold, valence_movie_t, arousal_movie_t):
    n_train_matrices = 21
    n_valid_matrices = 6
    n_test_matrices = 3
    valence_labels = join_vectors(valence_labels_movies)
    arousal_labels = join_vectors(arousal_labels_movies)
    print(len(valence_labels), len(arousal_labels))
    processes = []
    n_valence_features, n_arousal_features = threshold_n_features(threshold, valence_movie_t, arousal_movie_t)
    valence_predictions, arousal_predictions = np.array([], dtype='float'), np.array([], dtype='float')
    for i in range(0, 10):
        valence_test_predictions, arousal_test_predictions = fold_training(
            valence_predictions, arousal_predictions, i, valence_regressors, arousal_regressors,
            valence_movie_matrices, arousal_movie_matrices, valence_labels_movies, arousal_labels_movies,
            n_test_matrices, n_train_matrices, n_valid_matrices, n_valence_features, n_arousal_features)
        valence_predictions = np.append(valence_predictions, valence_test_predictions)
        arousal_predictions = np.append(arousal_predictions, arousal_test_predictions)
    print(math.sqrt(mean_squared_error(valence_labels, valence_predictions)),
          np.corrcoef(valence_labels, valence_predictions)[0][1])
    print(math.sqrt(mean_squared_error(arousal_labels, arousal_predictions)),
          np.corrcoef(arousal_labels, arousal_predictions)[0][1])
def on_epoch_end(self, epoch, logs=None): if self.currentEpoch % self.freq == 0: self.results["epochs"].append(self.currentEpoch) # add the epoch's number evaluation = "prediction (r^2)" resultsText = "" if self.M is not None: yhatKeras = self.model.predict(self.M) yhatKeras += self.modelEpsilon # for numerical stability rSQ = np.corrcoef( self.y, yhatKeras, rowvar=0)[1,0]**2 # 0.1569 self.results["train_accuracy"].append(rSQ) resultsText += "Training " +evaluation +":" + str(rSQ) + " / " if self.M_validation is not None: yhatKeras = self.model.predict(self.M_validation) yhatKeras += self.modelEpsilon # for numerical stability rSQ = np.corrcoef( self.y_validation, yhatKeras, rowvar=0)[1,0]**2 # 0.1569 self.results["test_accuracy"].append(rSQ) resultsText += "Test " +evaluation +":" + str(rSQ) print(resultsText, flush = True) self.currentEpoch += 1
def plotetc(x, y, stat, season):
    cc_all = np.corrcoef(x, y['All'])[0][1]
    cc_opt = np.corrcoef(x, y['Optimal'])[0][1]
    cc_b1 = np.corrcoef(x, y['b1'])[0][1]
    cc_b2 = np.corrcoef(x, y['b2'])[0][1]
    print("Correlation coefficients for scores with {0} NAO during {1}".format(stat, season))
    print("Optimal\tb1\tb2\tAll")
    print("{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}\n".format(cc_opt, cc_b1, cc_b2, cc_all))
    # matplotlib.rcParams['axes.grid'] = True
    # matplotlib.rcParams['legend.fancybox'] = True
    # matplotlib.rcParams['figure.figsize'] = 18, 9
    # matplotlib.rcParams['savefig.dpi'] = 300
    # # Set figure name and number for pdf plotting
    # pdfName = '{0}_{1}.pdf'.format(stat, season)
    # pp1 = PdfPages(os.path.join('/Users/andrew/Google Drive/Work/MeltModel/Output/', pdfName))
    # fig1 = plt.figure(1)
    # ax1 = fig1.add_subplot(111)
    # ax1.plot(x, y['Optimal'], 'ok', label='Optimum')
    # ax1.plot(x, y['All'], 'or', label='All')
    # ax1.plot(x, y['b1'], 'og', label='b1')
    # ax1.plot(x, y['b2'], 'ob', label='b2')
    # ax1.set_xlabel("NAO")
    # ax1.set_xlim((-3,3))
    # ax1.set_ylabel("Score")
    #
    # #ax2 = ax1.twinx()
    # #ax2.plot(x, y['AdjOptimal'], 'ok', label='Adjusted')
    # #ax2.set_ylabel("Adjusted Score")
    # plt.title(stat)
    # plt.legend(loc='upper left')
    # pp1.savefig(bbox_inches='tight')
    # pp1.close()
    # plt.close()
    return 0
def make_figures(): wave1 = make_wave(0) wave2 = make_wave(offset=1) thinkplot.preplot(2) wave1.segment(duration=0.01).plot(label='wave1') wave2.segment(duration=0.01).plot(label='wave2') numpy.corrcoef(wave1.ys, wave2.ys) thinkplot.save(root='autocorr1', xlabel='time (s)', ylabel='amplitude') offsets = numpy.linspace(0, PI2, 101) corrs = [] for offset in offsets: wave2 = make_wave(offset) corr = numpy.corrcoef(wave1.ys, wave2.ys)[0, 1] corrs.append(corr) thinkplot.plot(offsets, corrs) thinkplot.save(root='autocorr2', xlabel='offset (radians)', ylabel='correlation', xlim=[0, PI2])
def corr(x, y, reps=10**4, prng=None): """ Simulate permutation p-value for Spearman correlation coefficient Parameters ---------- x : array-like y : array-like reps : int prng : RandomState instance or None, optional (default=None) If RandomState instance, prng is the pseudorandom number generator; If None, the pseudorandom number generator is the RandomState instance used by `np.random`. Returns ------- tuple Returns test statistic, left-sided p-value, right-sided p-value, two-sided p-value, simulated distribution """ if prng is None: prng = RandomState() tst = np.corrcoef(x, y)[0, 1] sims = [np.corrcoef(prng.permutation(x), y)[0, 1] for i in range(reps)] left_pv = np.sum(sims <= tst)/reps right_pv = np.sum(sims >= tst)/reps two_sided_pv = np.sum(np.abs(sims) >= np.abs(tst))/reps return tst, left_pv, right_pv, two_sided_pv, sims
def main():
    # change these ranges...
    Chloro_range = range(1, 6)
    LAI_range = range(1, 10)
    TGI_data = np.empty([len(Chloro_range), len(LAI_range)])
    G_R_data = np.empty([len(Chloro_range), len(LAI_range)])
    VARI_data = np.empty([len(Chloro_range), len(LAI_range)])
    for chloro in Chloro_range:
        for LAI in LAI_range:
            TGI, G_R_ratio, VARI = SimulatePlant(1.5, 10 * chloro, 8, 0, 0.01, .009, 1, 10 * LAI,
                                                 0.01, 30, 0, 10, 0, pyprosail.Planophile)
            TGI_data[chloro - 1][LAI - 1] = TGI
            G_R_data[chloro - 1][LAI - 1] = G_R_ratio
            VARI_data[chloro - 1][LAI - 1] = VARI
    print("TGI:")
    print(TGI_data)
    # in these, going down is increasing chlorophyll, and going to the right is increasing LAI
    print("G/R Index")
    print(G_R_data)
    print("VARI")
    print(VARI_data)
    #print("Chloro - TGI Corr:", np.corrcoef(Chloro_range, TGI_data[:,3])[1][0])
    #print("Chloro - Green/Red Ratio Corr:", np.corrcoef(Chloro_range, G_R_data[:,3])[1][0])
    #print("LAI - TGI Corr:", np.corrcoef(LAI_range, TGI_data[3])[1][0])
    #print("LAI - Green/Red Ratio Corr:", np.corrcoef(LAI_range, G_R_data[3])[1][0])
    print("LAI - VARI Corr:", np.corrcoef(LAI_range, VARI_data[3])[1][0])
    print("Chloro - VARI Corr:", np.corrcoef(Chloro_range, VARI_data[:, 3])[1][0])
    D3Plot(VARI_data, "VARI", Chloro_range, LAI_range)
    D3Plot(TGI_data, "TGI", Chloro_range, LAI_range)
    D3Plot(G_R_data, "G/R", Chloro_range, LAI_range)
def main(): if len(sys.argv) < 2: print("Usage: ./bootstrap.py <project_dir>") sys.exit(-1) project_dir = sys.argv[1] project = Project(join(project_dir, "project.json")) # For each bootstraped model for (bootstrap_number, bootstrap) in enumerate(project.bootstraps): boot_dir = os.path.abspath(join(project_dir, "bootstrap{}-{}".format(bootstrap_number, type(bootstrap.base_model).__name__))) os.makedirs(boot_dir, exist_ok=True) # Interior f, ax = plot_covariance_matrix(np.corrcoef(bootstrap.internals, rowvar=0), ["fx", "fy", "ppx", "ppy", "ps"]) savefigure(f, join(boot_dir, "covariance-interior")) # For each camera for (cam_number, cam) in enumerate(bootstrap.extract_cameras()): # Scatter and distribution plots f, ax = plot_scatter(cam[:,0], cam[:, 1]) savefigure(f, join(boot_dir, "cam{}-xy".format(cam_number))) f, ax = plot_distribution(cam[:,2]) ax.set_xlabel("Z") savefigure(f, join(boot_dir, "cam{}-z".format(cam_number))) # X, Y, Z and angles covariances matrices S = np.corrcoef(cam, rowvar=0) f, ax = plot_covariance_matrix(S[:3, :3], ["X", "Y", "Z"]) savefigure(f, join(boot_dir, "covariance-cam{}-pos".format(cam_number))) f, ax = plot_covariance_matrix(S[3:, 3:], [r"$\Omega$", "$\phi$", r"$\kappa$"]) savefigure(f, join(boot_dir, "covariance-cam{}-angles".format(cam_number)))
def calc_correlation(self, Xs):
    # iterate to the last design matrix in Xs and print its correlation matrix
    for X in Xs:
        pass
    print(np.corrcoef(X.T))
    # np.savetxt("correlations.csv", np.corrcoef(X.T), delimiter=",")
def corr(x, y, reps=10**4, seed=None): r""" Simulate permutation p-value for Spearman correlation coefficient Parameters ---------- x : array-like y : array-like reps : int seed : RandomState instance or {None, int, RandomState instance} If None, the pseudorandom number generator is the RandomState instance used by `np.random`; If int, seed is the seed used by the random number generator; If RandomState instance, seed is the pseudorandom number generator Returns ------- tuple Returns test statistic, left-sided p-value, right-sided p-value, two-sided p-value, simulated distribution """ prng = get_prng(seed) tst = np.corrcoef(x, y)[0, 1] sims = [np.corrcoef(prng.permutation(x), y)[0, 1] for i in range(reps)] left_pv = np.sum(sims <= tst) / reps right_pv = np.sum(sims >= tst) / reps two_sided_pv = np.min([1, 2 * np.min([left_pv, right_pv])]) return tst, left_pv, right_pv, two_sided_pv, sims
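# --- Illustrative usage sketch for the permutation test above (not from the original source) ---
# corr is the function defined just above; the data below are made up. The statistic inside corr
# is a plain Pearson correlation of its inputs, so passing ranks (scipy.stats.rankdata) is what
# turns it into a Spearman-style test.
import numpy as np
from scipy.stats import rankdata

rng = np.random.RandomState(0)
x = rng.normal(size=200)
y = x + rng.normal(scale=2.0, size=200)

tst, left_pv, right_pv, two_sided_pv, sims = corr(x, y, reps=2000, seed=42)
print("statistic:", tst, "two-sided p-value:", two_sided_pv)

tst_rank, *_ = corr(rankdata(x), rankdata(y), reps=2000, seed=42)
print("rank-based statistic:", tst_rank)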
def main():
    # Define matrix dimensions
    Nobs = 1000    # Number of observations
    Nvars = 50000  # Number of variables
    Ncomp = 100    # Number of components
    # Simulated true sources
    S_true = np.random.logistic(0, 1, (Ncomp, Nvars))
    # Simulated true mixing
    A_true = np.random.normal(0, 1, (Nobs, Ncomp))
    # X = AS
    X = np.dot(A_true, S_true)
    # add some noise
    X = X + np.random.normal(0, 1, X.shape)
    # apply ICA on X and ask for Ncomp components
    model = ica1(Ncomp)
    start = time.time()
    A, S = model.fit(X)
    total = time.time() - start
    print('total time: {}'.format(total))
    # compare if our estimates are accurate:
    # correlate A with A_true (and S with S_true) and take the best absolute match per component
    aCorr = np.abs(np.corrcoef(A.T, A_true.T)[:Ncomp, Ncomp:]).max(axis=0).mean()
    sCorr = np.abs(np.corrcoef(S, S_true)[:Ncomp, Ncomp:]).max(axis=0).mean()
    print("Accuracy of estimated sources: %.2f" % sCorr)
    print("Accuracy of estimated mixing: %.2f" % aCorr)
def corr_matrix(df, similar_type=ECoreCorrType.E_CORE_TYPE_PEARS, **kwargs):
    """
    Unlike corr_xy this is not a pairwise computation: apart from the correlation type the only
    input is a single matrix, which must be a pd.DataFrame or np.array.
    :param df: pd.DataFrame or np.array; it is called df because it is converted to a pd.DataFrame internally
    :param similar_type: ECoreCorrType, defaults to ECoreCorrType.E_CORE_TYPE_PEARS
    :return: pd.DataFrame object
    """
    if isinstance(df, np.ndarray):
        # convert np.ndarray to a DataFrame so everything is handled uniformly
        df = pd.DataFrame(df)
    if not isinstance(df, pd.DataFrame):
        raise TypeError('df must be a pd.DataFrame object!!!')

    # FIXME ECoreCorrType.E_CORE_TYPE_PEARS.value should not be supported here; only strict
    #       equality against the ECoreCorrType members should be accepted
    if similar_type == ECoreCorrType.E_CORE_TYPE_PEARS or similar_type == ECoreCorrType.E_CORE_TYPE_PEARS.value:
        # Pearson correlation coefficient
        corr = np.corrcoef(df.T)
    elif similar_type == ECoreCorrType.E_CORE_TYPE_SPERM or similar_type == ECoreCorrType.E_CORE_TYPE_SPERM.value:
        # Spearman correlation coefficient, using the custom spearmanr that skips the p_value
        corr = spearmanr(df)
    elif similar_type == ECoreCorrType.E_CORE_TYPE_SIGN or similar_type == ECoreCorrType.E_CORE_TYPE_SIGN.value:
        # +/- sign correlation: take np.sign of the values, then np.corrcoef
        corr = np.corrcoef(np.sign(df.T))
    elif similar_type == ECoreCorrType.E_CORE_TYPE_ROLLING or similar_type == ECoreCorrType.E_CORE_TYPE_ROLLING.value:
        # pop the 'window' parameter, defaulting to g_rolling_corr_window
        window = kwargs.pop('window', g_rolling_corr_window)
        corr = rolling_corr(df, window=window)
    else:
        # fall back to the default np.corrcoef(df.T)
        corr = np.corrcoef(df.T)
    # wrap the result in a pd.DataFrame, using df.columns for both the row and column index
    corr = pd.DataFrame(corr, index=df.columns, columns=df.columns)
    return corr
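# --- Illustrative usage sketch for corr_matrix (not from the original source) ---
# Assumes the same module-level ECoreCorrType and helpers as corr_xy above; the DataFrame is made up.
import numpy as np
import pandas as pd

rng = np.random.RandomState(1)
df = pd.DataFrame(rng.normal(size=(250, 3)).cumsum(axis=0), columns=['a', 'b', 'c'])

print(corr_matrix(df))                                                # Pearson (default), 3x3 DataFrame
print(corr_matrix(df, similar_type=ECoreCorrType.E_CORE_TYPE_SIGN))   # +/- sign correlation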
def Sensitivity_printImpactResults(finalResults): # Performs numerical analysis on sensitivity trials resultsFile = "Results\\Impact\\Impact_Correlation.txt" with open(resultsFile, 'w') as f: writer = csv.writer(f, delimiter = '\n', quoting=csv.QUOTE_NONE, quotechar='', escapechar='\\') for subResult in finalResults: plots = { 1: "Depression", 2: "Concealment", 3: "Discrimination", 4: "Support", 5: "Policy Score" } xArr = subResult[0] label = subResult[-1] yArrCorrelation_1 = np.corrcoef(xArr, subResult[1])[0][1] yArrCorrelation_2 = np.corrcoef(xArr, subResult[2])[0][1] depressCorrelate = "{} vs. Depression Correlation: {}".\ format(label, yArrCorrelation_1) concealCorrelate = "{} vs. Concealment Correlation: {}".\ format(label, yArrCorrelation_2) row = [depressCorrelate, concealCorrelate] writer.writerow(row) for plot in plots: Sensitivity_plotGraphs(xArr, subResult[plot], label, plots[plot], "impact")
def correlation(self): keys_a = set(self.gdp.keys()) keys_b = set(self.complaint_allstate.keys()) intersection = keys_a & keys_b corr_dict = {} ax= [] ay = [] for v in intersection: y = self.gdp[v].values() x = self.complaint_allstate[v].values() ax.append(x) ay.append(y) ''' if(len(x) != len(y)): continue else: corr_dict.update({v:np.corrcoef(x,y)[0,1]})''' if len(ax) != len(ay): if(len(ax)> len(ay)): ay = ay[:len(ax)] else: ax = ax[:len(ay)] print len(flatten(ax)),len(flatten(ay)) print np.corrcoef(flatten(ax)[:735],flatten(ay))[0,1] corrdict = OrderedDict(sorted(corr_dict.items(), key=itemgetter(1))) #print corrdict '''
def learnStructure(dataP, dataS, Pp, Ps, TAN= True): tempMatrix = [[0 for i in range(len(dataP))] for j in range(len(dataP))] for i in range(len(dataP)): for j in range(i+1, len(dataP)): temp = 0.0 if np.corrcoef(dataP[i], dataP[j])[0][1] != 1.0: temp += Pp * math.log(1-((np.corrcoef(dataP[i], dataP[j])[0][1])**2)) if np.corrcoef(dataS[i], dataS[j])[0][1] != 1.0: temp += Ps * math.log(1-((np.corrcoef(dataS[i], dataS[j])[0][1])**2)) temp *= (0.5) tempMatrix[i][j] = temp #tempMatrix[j][i] = temp MaxG = nx.DiGraph() if TAN: G = nx.from_scipy_sparse_matrix(minimum_spanning_tree(csr_matrix(tempMatrix))) adjList = G.adj i = 0 notReturnable = {} MaxG = getDirectedTree(adjList, notReturnable, MaxG, i) else: G = nx.Graph(np.asmatrix(tempMatrix)) adjList = sorted([(u,v,d['weight']) for (u,v,d) in G.edges(data=True)], key=lambda x:x[2]) i = 2 MaxG = getDirectedGraph(adjList, MaxG, i) return MaxG
def test_corrcoef2(self): # Test that _corrcoef2 returns the same result that np.corrcoef would n, m = tuple(np.random.randint(2, 5, size=2)) mean = np.random.uniform(-1, 1, size=m) cov = np.random.uniform(0, 1./m, size=(m, m)) cov = (cov + cov.T) / 2 cov.flat[::m + 1] = 1.0 X1 = np.random.multivariate_normal(mean, cov, size=n) X2 = np.random.multivariate_normal(mean, cov, size=n) expected = np.corrcoef(X1, X2, rowvar=True)[:n, n:] np.testing.assert_almost_equal( _corrcoef2(X1, X2, axis=1), expected, decimal=9 ) expected = np.corrcoef(X1, X2, rowvar=False)[:m, m:] np.testing.assert_almost_equal( _corrcoef2(X1, X2, axis=0), expected, decimal=9, ) with self.assertRaises(ValueError): _corrcoef2(X1, X2, axis=10)
def PrintResults(all_ground_truth,all_b1_output,all_b2_output,all_b3_output,all_b4_output,all_combined_output): print 'Error on baseline 1: ', numpy.std(all_ground_truth - all_b1_output,axis=0), \ numpy.mean(numpy.std(all_ground_truth - all_b1_output,axis=0)) correlation_matrix = numpy.corrcoef(all_ground_truth.T,all_b1_output.T) print 'cur_rho: ', correlation_matrix[0,3], correlation_matrix[1,4], correlation_matrix[2,5], \ (correlation_matrix[0,3]+correlation_matrix[1,4]+correlation_matrix[2,5])/3 print 'Error on baseline 2: ', numpy.std(all_ground_truth - all_b2_output,axis=0), \ numpy.mean(numpy.std(all_ground_truth - all_b2_output,axis=0)) correlation_matrix = numpy.corrcoef(all_ground_truth.T,all_b2_output.T) print 'cur_rho: ', correlation_matrix[0,3], correlation_matrix[1,4], correlation_matrix[2,5], \ (correlation_matrix[0,3]+correlation_matrix[1,4]+correlation_matrix[2,5])/3 print 'Error on baseline 3: ', numpy.std(all_ground_truth - all_b3_output,axis=0), \ numpy.mean(numpy.std(all_ground_truth - all_b3_output,axis=0)) correlation_matrix = numpy.corrcoef(all_ground_truth.T,all_b3_output.T) print 'cur_rho: ', correlation_matrix[0,3], correlation_matrix[1,4], correlation_matrix[2,5], \ (correlation_matrix[0,3]+correlation_matrix[1,4]+correlation_matrix[2,5])/3 print 'Error on baseline 4: ', numpy.std(all_ground_truth - all_b4_output,axis=0), \ numpy.mean(numpy.std(all_ground_truth - all_b4_output,axis=0)) correlation_matrix = numpy.corrcoef(all_ground_truth.T,all_b4_output.T) print 'cur_rho: ', correlation_matrix[0,3], correlation_matrix[1,4], correlation_matrix[2,5], \ (correlation_matrix[0,3]+correlation_matrix[1,4]+correlation_matrix[2,5])/3 print 'Error on combined: ', numpy.std(all_ground_truth - all_combined_output,axis=0), \ numpy.mean(numpy.std(all_ground_truth - all_combined_output,axis=0)) correlation_matrix = numpy.corrcoef(all_ground_truth.T,all_combined_output.T) print 'cur_rho: ', correlation_matrix[0,3], correlation_matrix[1,4], correlation_matrix[2,5], \ (correlation_matrix[0,3]+correlation_matrix[1,4]+correlation_matrix[2,5])/3
def covandcoef(compare_data): hx = [] hy = [] ox = [] oy = [] tx = [] ty = [] for i in compare_data: hx.append(i[4]) hy.append(i[7]) for i in range(0,7): ox.append(compare_data[i][4]) oy.append(compare_data[i][7]) for i in range(0,89): tx.append(compare_data[i][4]) ty.append(compare_data[i][7]) X = np.vstack((hx,hy)) Z = np.vstack((ox,oy)) Y = np.vstack((tx,ty)) return [[np.cov(X)[0][1],np.corrcoef(X)[0][1]],[np.cov(Y)[0][1],np.corrcoef(Y)[0][1]],[np.cov(Z)[0][1],np.corrcoef(Z)[0][1]]]
def correl(): for eof in [ 1, 2 ]: cook=[] glue=[] for model in models: fmod = '{0}/run1/dtred/{0}.space{1}.txt'.format(model,eof) fobs = '../../sst-data/detrend/ersst.space{0}.txt'.format(eof) eof_mod = np.loadtxt(fmod) print fmod eof_obs = np.loadtxt(fobs) #print eof_mod[0:40] idm = np.where(eof_mod == 999.) ido = np.where(eof_obs == 999.) eof_mod = np.delete(eof_mod, idm) eof_obs = np.delete(eof_obs, idm) cook.append([ model, np.corrcoef(eof_mod, eof_obs)[0, 1]] ) fmodpc = '{0}/run1/dtred/PC{1}.{0}.txt'.format(model,eof) fobspc = '../../sst-data/detrend/PC{0}.annual.txt'.format(eof) pc_mod = np.loadtxt(fmodpc) pc_obs = np.loadtxt(fobspc) #print pc_mod.shape, pc_obs.shape glue.append([model, np.corrcoef(pc_mod, pc_obs)[0, 1]] ) # --- Writing spatial correlation from models and Observation - EOF npcook = np.array(cook) np.savetxt('eof{0}.ar4.correl.txt'.format(eof), npcook, fmt= '%s %6s') # --- Writing time correlation from models and Observation - PC npglue = np.array(glue) np.savetxt('pc{0}.ar4.correl.txt'.format(eof), npglue, fmt= '%s %6s')
def test_nancorr_pearson(self): targ0 = np.corrcoef(self.arr_float_2d, self.arr_float1_2d)[0, 1] targ1 = np.corrcoef(self.arr_float_2d.flat, self.arr_float1_2d.flat)[0, 1] self.check_nancorr_nancov_2d(nanops.nancorr, targ0, targ1, method="pearson") targ0 = np.corrcoef(self.arr_float_1d, self.arr_float1_1d)[0, 1] targ1 = np.corrcoef(self.arr_float_1d.flat, self.arr_float1_1d.flat)[0, 1] self.check_nancorr_nancov_1d(nanops.nancorr, targ0, targ1, method="pearson")
def correlate(name, p_index, c_index, d_mean, d_sd): object = bpy.data.objects[name] uvs = getUVs(object, p_index) distances = getDistancesPerParticle(model.CONNECTION_RESULTS[c_index]['d']) uvs2 = [] delays = [] for index, ds in enumerate(distances): samples = [] for i in range(1): delay_mm = max(delayModel_delayDistribLogNormal(d_mean, d_sd), 0.1) uvs2.append(list(uvs[index,:])) delays.append(max(ds * delay_mm, 0.1)) #samples.append( max(ds * delay_mm, 0.1) ) #delays.append(np.mean(samples)) delays = np.array(delays) uvs2 = np.array(uvs2) print(len(uvs2)) corr_dist_x = np.corrcoef(uvs[:,0], distances) corr_dist_y = np.corrcoef(uvs[:,1], distances) corr_dela_x = np.corrcoef(uvs2[:,0], delays) corr_dela_y = np.corrcoef(uvs2[:,1], delays) print('Correlation x with distance: %f' % corr_dist_x[0][1]) print('Correlation x with delay: %f' % corr_dela_x[0][1]) print('Correlation y with distance: %f' % corr_dist_y[0][1]) print('Correlation y with delay: %f' % corr_dela_y[0][1])
def traverseplot(Xin,Yin,Field,name): string,nodeind,leaf,label=TraverseTree(regTree,Xin,Field) nband=Yin.shape[1] k=0 for j in leaf: Ytemp=Yin[nodeind[j],:] Xtemp=Xin[nodeind[j],:] Yptemp=regTreeModel.predict(Xtemp) fitmodel=fitModelList[k] if predind.ndim==1: Ypnewtemp=fitmodel.predict(Xtemp[:,predind.astype(int)]) else: Ypnewtemp=fitmodel.predict(Xtemp[:,predind[k,:].astype(int)]) rmse,rmse_band=RMSECal(Yptemp,Ytemp) rmsenew,rmse_bandnew=RMSECal(Ypnewtemp,Ytemp) n=nband f, axarr = plt.subplots(int(np.ceil(n/2)), 2,figsize=(10,12)) for i in range(n): pj=int(np.ceil(i/2)) pi=int(i%2) axarr[pj, pi].plot(Yptemp[:,i],Ytemp[:,i],'.') axarr[pj, pi].plot(Ypnewtemp[:,i],Ytemp[:,i],'.r') axarr[pj, pi].set_title('cluster %s,\n cc=%.3f -> %.3f, r=%.3f -> %.3f'\ %(i,np.corrcoef(Yptemp[:,i],Ytemp[:,i])[0,1],np.corrcoef(Ypnewtemp[:,i],Ytemp[:,i])[0,1], rmse_band[i],rmse_bandnew[i])) plotFun.plot121line(axarr[pj, pi]) f.tight_layout() f.suptitle(string[j],fontsize=8) f.subplots_adjust(top=0.9) plt.savefig(savedir+name+"_node%i"%j) plt.close() k=k+1
def test_c_within_and_c_between():
    # mocking the correlation values store
    test_c_values_store = {"test_network_1": {"test_roi_1": ((0, 0, 0), (0, 0, 1)),
                                              "test_roi_2": ((0, 1, 0), (0, 1, 1))},
                           "test_network_2": {"test_roi_3": ((1, 0, 0), (1, 0, 1)),
                                              "test_roi_4": ((1, 1, 0), (1, 1, 1))}}
    data = np.zeros((2, 2, 2, 3))
    data[0, 0, 0] = [1, 2, 3]
    data[0, 0, 1] = [1, 2, 3]
    data[0, 1, 0] = [-1, -2, -3]
    data[0, 1, 1] = [-1, -2, -3]
    data[1, 0, 0] = [5, 4, 37]
    data[1, 0, 1] = [5, 4, 37]
    data[1, 1, 0] = [-3, -244, -1]
    data[1, 1, 1] = [-3, -244, -1]
    actual = connectivity_utils.c_within(data, test_c_values_store)
    # expected values are explicitly calculated according to the rules explained in the paper
    expected = {'test_network_1': (np.corrcoef([1, 2, 3], [-1, -2, -3])[1, 0],),
                'test_network_2': (np.corrcoef([5, 4, 37], [-3, -244, -1])[1, 0],)}
    # compare actual against expected (the original compared expected with itself)
    assert_almost_equal(actual['test_network_1'], expected['test_network_1'])
    assert_almost_equal(actual['test_network_2'], expected['test_network_2'])
    actual = connectivity_utils.c_between(data, test_c_values_store)
    # expected values are explicitly calculated according to the rules explained in the paper
    expected = [np.corrcoef([1, 2, 3], [5, 4, 37])[1, 0], np.corrcoef([1, 2, 3], [-3, -244, -1])[1, 0],
                np.corrcoef([-1, -2, -3], [5, 4, 37])[1, 0], np.corrcoef([-1, -2, -3], [-3, -244, -1])[1, 0]]
    assert_almost_equal(np.sort(expected), np.sort(actual['test_network_1-test_network_2']))
def testSVM(linkSet, patterns=None):
    cont, ncont, vectors = [], [], []
    print("\nTesting\n")
    classifier = None
    # pickled classifiers must be opened in binary mode
    if patterns is None:
        classifier = pickle.load(open("svm-classifier", "rb"))
    else:
        classifier = pickle.load(open("svm-classifier2", "rb"))
    for link in linkSet:
        vec = getFeatures(link, patterns)
        vectors += [vec]
        result = classifier.predict(vec)
        if result == 1.0:
            if link.endswith(".htm"):
                cont += [link + 'l']
            else:
                cont += [link]
        else:
            ncont += [link]
    cont = [link for link in cont if checkBoilerplate(link)]
    #ones, zeros = clusterSimilar(cont)
    #cont = ones
    #ncont += zeros
    print("\nCorrelation Matrix\n")
    print(numpy.corrcoef(numpy.transpose(numpy.array(vectors))))
    return sorted(cont, key=lambda x: len(x)), sorted(ncont, key=lambda x: len(x))
def _call(self, dataset):
    """Computes the average correlation in similarity structure across chunks."""
    chunks_attr = self.chunks_attr
    nchunks = len(np.unique(dataset.sa[chunks_attr]))
    if nchunks < 2:
        raise RuntimeError("This measure calculates similarity consistency across "
                           "chunks and is not meaningful for datasets with only "
                           "one chunk.")
    # calc neur sim b/w targ_comp targets per subject
    neur_sim = {}
    for s in np.unique(dataset.sa[chunks_attr]):
        ds_s = dataset[dataset.sa.chunks == s]
        neur_sim[s + '1'] = 1 - np.corrcoef(ds_s[ds_s.sa.targets == self.targ_comp1[0]],
                                            ds_s[ds_s.sa.targets == self.targ_comp1[1]])[0][1]
        neur_sim[s + '2'] = 1 - np.corrcoef(ds_s[ds_s.sa.targets == self.targ_comp2[0]],
                                            ds_s[ds_s.sa.targets == self.targ_comp2[1]])[0][1]
    # combine xSs_behavs
    xSs_behav = {}
    for s in self.xSs_behav1:
        xSs_behav[s + '1'] = self.xSs_behav1[s]
    for s in self.xSs_behav2:
        xSs_behav[s + '2'] = self.xSs_behav2[s]
    # create dsets where cols are neural sim and mt sim for correlations
    behav_neur = np.array([[xSs_behav[s], neur_sim[s]] for s in neur_sim])
    # correlate behav with neur sim b/w subjects; rank first if the spearman metric is requested
    if self.comparison_metric == 'spearman':
        xSs_corr = pearsonr(rankdata(behav_neur[:, 0]), rankdata(behav_neur[:, 1]))
    else:
        xSs_corr = pearsonr(behav_neur[:, 0], behav_neur[:, 1])
    # returns fisher z transformed r coeff; could change to be p value if wanted...
    return Dataset(np.array([np.arctanh(xSs_corr[0])]))
def _region_features_for(histone, dna, region): pixels0 = histone[region].ravel() pixels1 = dna[region].ravel() bin0 = pixels0 > histone.mean() bin1 = pixels1 > dna.mean() overlap = [np.corrcoef(pixels0, pixels1)[0, 1], (bin0 & bin1).mean(), (bin0 | bin1).mean()] spi = mh.sobel(histone, just_filter=1) sp = spi[mh.erode(region)] sdi = mh.sobel(dna, just_filter=1) sd = sdi[mh.erode(region)] sobels = [ np.dot(sp, sp) / len(sp), np.abs(sp).mean(), np.dot(sd, sd) / len(sd), np.abs(sd).mean(), np.corrcoef(sp, sd)[0, 1], np.corrcoef(sp, sd)[0, 1] ** 2, sp.std(), sd.std(), ] return np.concatenate( [ [region.sum()], haralick(histone * region, ignore_zeros=True).mean(0), haralick(dna * region, ignore_zeros=True).mean(0), overlap, sobels, haralick(mh.stretch(sdi * region), ignore_zeros=True).mean(0), haralick(mh.stretch(spi * region), ignore_zeros=True).mean(0), ] )
def test_simulate_density(self): # generate a rings object both from an atomic and density model and # ensure the correlations match num_shots = 100 num_phi = 1024 nq = 100 # number of q vectors q_values = [1.0, 2.0] # atomic model traj = mdtraj.load(ref_file('pentagon.pdb')) r1 = xray.Rings.simulate(traj, 1, q_values, num_phi, num_shots) # density model grid_dimensions = [151,] * 3 grid_spacing = 1.0 # Angstroms grid = structure.atomic_to_density(traj, grid_dimensions, grid_spacing) r2 = xray.Rings.simulate_density(grid, grid_spacing, num_shots, q_values, num_phi) # compute correlations & ensure match c1 = r1.correlate_intra(1.0, 1.0) c2 = r2.correlate_intra(1.0, 1.0) R = np.corrcoef(c1, c2)[0,1] assert R > 0.95 c1 = r1.correlate_intra(2.0, 2.0) c2 = r2.correlate_intra(2.0, 2.0) R = np.corrcoef(c1, c2)[0,1] assert R > 0.95
def test():
    data = SimData(400, 4, 15)
    cor = np.nan_to_num(np.corrcoef(data.answers, rowvar=0))  # pearson metric
    cor = np.nan_to_num(np.corrcoef(cor))
    label1 = kmeans2(cor, 6, minit='points', iter=100)[1]  # hack: number of components
    label2 = kmeans(cor, 6, True)
    xs, ys = mds(cor, euclid=True)
    plt.subplot(1, 2, 1)
    plt.title('kmeans2 ' + str(adjusted_rand_score(data.item_concept, label1)))
    plot_clustering(
        range(cor.shape[0]), xs, ys,
        labels=label1,
        shapes=data.item_concept,
    )
    plt.subplot(1, 2, 2)
    plt.title('Kmeans ' + str(adjusted_rand_score(data.item_concept, label2)))
    plot_clustering(
        range(cor.shape[0]), xs, ys,
        labels=label2,
        shapes=data.item_concept,
    )
    plt.show()
def __init__(self, bip, target, thr_dis=75, thr_corr=0.89, type_cor="global", drop_outliers=False, whisk=1.8): if isinstance(bip, ClassicBip.ClassicBip): __type__ = "Classic" self.bip = bip elif isinstance(bip, CanonicalBip.CanonicalBip): __type__ = "Canonical" self.bip = bip else: raise ValueError('Undefined biplotpy class') if isinstance(target, pandas.core.series.Series): if isinstance( list(set([type(el) for el in target]))[0], (int, float)): self.y = numpy.array(target) elif isinstance(target, numpy.ndarray): self.y = target else: raise ValueError('Nor ndarray numpy nor series pandas type') if isinstance(thr_dis, (float, int)) == False: raise ValueError('Nor ndarray numpy nor series pandas type') elif thr_dis > 100: raise ValueError('thr_dis must be between 25 and 100') elif thr_dis < 0: raise ValueError('thr_dis must be positive') if __type__ == "Classic": Project = bip.RowCoord.dot(bip.ColCoord.T) C = bip.ColCoord elif __type__ == "Canonical": Project = bip.Ind_Coord.dot(bip.Var_Coord.T) C = bip.Var_Coord # Positive rescalation of projections v_min = numpy.array( [abs(el) if el < 0 else el for el in Project.min(axis=0)]) for i, proj in enumerate(Project.T): Project[:, i] = proj + v_min[i] classes = numpy.unique(target) def get_outliers(d, whis): q1 = numpy.percentile(d, 25) q3 = numpy.percentile(d, 75) iq = q3 - q1 hi_val = q3 + whis * iq wisk_hi = numpy.compress(d <= hi_val, d) if len(wisk_hi) == 0 or numpy.max(wisk_hi) < q3: wisk_hi = q3 else: wisk_hi = max(wisk_hi) # get low extreme lo_val = q1 - whis * iq wisk_lo = numpy.compress(d >= lo_val, d) if len(wisk_lo) == 0 or numpy.min(wisk_lo) > q1: wisk_lo = q1 else: wisk_lo = min(wisk_lo) return list(numpy.where(d > wisk_hi)[0]) + list( numpy.where(d < wisk_lo)[0]) if drop_outliers == True: outliers = [] for var in Project.T: outliers = outliers + get_outliers(var, whisk) outliers = list(set(outliers)) self.outliers_ind = outliers perc_drop = len(outliers) * 100 / bip.data.shape[0] Project = numpy.delete(Project, (outliers), axis=0) target = numpy.delete(target, (outliers), axis=0) if perc_drop > 5.5: warnings.warn(( "You're dropping %s of the data. 
Try to increase 'whisk'" % perc_drop)) # Tracking class index IND = [] for cl in classes: ind_class = [] for i, el in enumerate(target): if el == cl: ind_class.append(i) IND.append(ind_class) # Number of combinations num_c = int(len(classes) * (len(classes) - 1) / 2) Disc = numpy.zeros((bip.data.shape[1], num_c)) comb = numpy.array(list(itertools.combinations(classes, r=2))) # Disc vectors for i, cmb in enumerate(comb): Disc[:, i] = abs(Project[IND[cmb[0]]].mean(axis=0) - Project[IND[cmb[1]]].mean(axis=0)) # Drop correlated variables POS = [] for v in Disc.T: for i, el in enumerate(v): if el > numpy.percentile(v, thr_dis): POS.append(i) POS = list(set(POS)) if type_cor == "global": Corr_matr = numpy.tril(numpy.corrcoef(bip.data[:, POS].T), -1) elif type_cor == "coord": Corr_matr = numpy.tril(numpy.corrcoef(C[POS, :]), -1) elif type_cor == "discr": Corr_matr = numpy.tril(numpy.corrcoef(Disc[POS, :]), -1) else: raise ValueError('type_cor must be "global", "coord" or "discr"') self.Corr_matr = Corr_matr ### Correlation threshold (23/01/2018) #pos_corr = numpy.where(Corr_matr > thr_corr) #disc_vect = Disc.sum(axis = 1) #self.disc_vect = disc_vect #del_el = [] #if pos_corr: # for i in range(len(pos_corr[0])): # ind = [pos_corr[0][i],pos_corr[1][i]] # ind_del = [] # if ((ind[0] in POS) and (ind[1] in POS)): # a = numpy.array([disc_vect[ind[0]],disc_vect[ind[1]]]) # ind_del.append(POS.index(pos_corr[ numpy.argwhere(a.min() == a)[0][0] ][0])) ### Correlation threshold (01/02/2018) pos_corr = numpy.where(Corr_matr > thr_corr) disc_vect = Disc[POS, :].sum(axis=1) ind_del = [] if pos_corr: for i in range(len(pos_corr[0])): if disc_vect[pos_corr[0][i]] > disc_vect[pos_corr[1][i]]: ind_del.append(pos_corr[1][i]) else: ind_del.append(pos_corr[0][i]) ind_del = list(set(ind_del)) if ind_del: POS = [el for i, el in enumerate(POS) if i not in ind_del] self.var_sel = list(numpy.array(bip.col_names)[POS])
y = np.asarray(y) vt = np.asarray(vt) vmax = np.asarray(vmax) pc = np.asarray(pc) rmax = np.asarray(rmax) r35 = np.asarray(r35) dpc = np.asarray(dpc) r35_holland1980 = np.asarray(r35_holland1980) AL = np.asarray(AL) # R35 is actually delta R35 r35 = r35 - rmax r35_holland1980 = r35_holland1980 - rmax # Correlations xcorrelation = np.corrcoef(r35, x) ycorrelation = np.corrcoef(r35, y) timecorrelation = np.corrcoef(r35, time) vmaxcorrelation = np.corrcoef(r35, vmax) dpccorrelation = np.corrcoef(r35, dpc) vtcorrelation = np.corrcoef(r35, vt) rmaxcorrelation = np.corrcoef(r35, rmax) #============================================================================== # Part 2: necessity -> other relations result in large error + confidence bounds around coeffcients #============================================================================== if plot_rels == 1: plt.close('all') r35_holland1980a = r35_holland1980[~np.isnan(r35_holland1980)] r35a = r35[~np.isnan(r35_holland1980)] vmaxa = vmax[~np.isnan(r35_holland1980)]
#!/usr/bin/env python # A light statistical warm-up in Python # Task: Implement these functions # (without using the numpy built-ins) # * my_mean # * my_var # * my_cov # * my_cor # So that this file can be run without error ### YOUR CODE HERE # Do not edit below this line import numpy as np x, y = (np.random.randn(100) for _ in range(2)) def equal(a, b): np.testing.assert_allclose(a, b) equal(my_mean(x), np.mean(x)) equal(my_var(x), np.var(x)) equal(my_cov(x, y), np.cov(x, y)) equal(my_cor(x, y), np.corrcoef(x, y))
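# --- One possible solution sketch for the warm-up above (hypothetical, not the author's answer) ---
# The checks imply different conventions: np.var divides by n (population variance), while
# np.cov / np.corrcoef divide by n - 1 and return full 2x2 matrices, so my_cov and my_cor
# return 2x2 nested lists here to match those shapes.

def my_mean(a):
    return sum(a) / len(a)

def my_var(a):
    m = my_mean(a)
    return sum((v - m) ** 2 for v in a) / len(a)            # ddof = 0, like np.var

def _sample_cov(a, b):
    ma, mb = my_mean(a), my_mean(b)
    return sum((u - ma) * (v - mb) for u, v in zip(a, b)) / (len(a) - 1)   # ddof = 1, like np.cov

def my_cov(a, b):
    # 2x2 covariance matrix, matching the shape of np.cov(a, b)
    return [[_sample_cov(a, a), _sample_cov(a, b)],
            [_sample_cov(b, a), _sample_cov(b, b)]]

def my_cor(a, b):
    # 2x2 correlation matrix, matching the shape of np.corrcoef(a, b)
    c = my_cov(a, b)
    sa, sb = c[0][0] ** 0.5, c[1][1] ** 0.5
    return [[1.0, c[0][1] / (sa * sb)],
            [c[1][0] / (sa * sb), 1.0]]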
compute_all_hippocampus('D:\\Data\\Registration_Philip3_Hippocampus') ave_data = np.load('ave_patches.npy') # matrix_r = np.corrcoef(ave_data) # matrix_r_abs = np.abs(matrix_r) hipp_data = np.load('hipp_label_patches.npy') data_list = np.where(np.sum(ave_data, axis=1) > 10000) hipp_list = np.where(np.sum(hipp_data, axis=1) > 0.1) print(np.sum(hipp_data)) for seed in hipp_list[0]: if seed not in data_list[0]: print("Not all hipp in ave_list") end_nodes = ave_data[data_list] start_nodes = hipp_data[hipp_list] matrix_r = np.corrcoef(end_nodes) matrix_r_abs = np.abs(matrix_r) s = np.zeros(len(matrix_r_abs)) Y = np.zeros(len(matrix_r_abs)) for i in range(len(matrix_r_abs)): for j in range(len(matrix_r_abs)): if matrix_r_abs[i, j] > 0.3 and i != j: s[i] = s[i] + 1 for i in range(len(matrix_r_abs)): for j in range(len(matrix_r_abs)): if matrix_r_abs[i, j] > 0.3 and i != j: Y[i] = Y[i] + pow((matrix_r_abs[i, j] / s[i]), 2) plt.plot(Y) df = pd.DataFrame(matrix_r_abs) print(df) # sns.heatmap(df, annot=True)
def main(): # Read settings ---------------------------------------------------- # Brain data brain_dir = '/home/share/data/fmri_shared/datasets/Deeprecon/fmriprep' subjects_list = {'TH': 'TH_ImageNetTest_volume_native.h5'} rois_list = { 'VC': 'ROI_VC = 1', } # Image features features_dir = '/home/ho/Documents/brain-decoding-examples/python/feature-prediction/data/features/ImageNetTest' network = 'caffe/VGG_ILSVRC_19_layers' features_list = [ 'conv1_1', 'conv1_2', 'conv2_1', 'conv2_2', 'conv3_1', 'conv3_2', 'conv3_3', 'conv3_4', 'conv4_1', 'conv4_2', 'conv4_3', 'conv4_4', 'conv5_1', 'conv5_2', 'conv5_3', 'conv5_4', 'fc6', 'fc7', 'fc8' ][::-1] features_list = ['fc6', 'fc7', 'fc8'][::-1] target_subject = 'AM' Lambda = 0.1 data_rep = 5 # Model parameters gpu_device = 1 # Results directory results_dir_root = './NCconverter_results' # Converter models nc_models_dir_root = os.path.join(results_dir_root, 'pytorch_converter_training', 'model') selected_converter_type = 'conv5' # Misc settings analysis_basename = os.path.splitext(os.path.basename(__file__))[0] # Pretrained model metadata pre_results_dir_root = '/home/share/data/contents_shared/ImageNetTraining/derivatives/feature_decoders' pre_analysis_basename = 'deeprecon_fmriprep_rep5_500voxel_allunits_fastl2lir_alpha100' pre_models_dir_root = os.path.join(pre_results_dir_root, pre_analysis_basename) # Load data -------------------------------------------------------- print('----------------------------------------') print('Loading data') data_brain = { sbj: bdpy.BData(os.path.join(brain_dir, dat_file)) for sbj, dat_file in subjects_list.items() } data_features = Features(os.path.join(features_dir, network)) # Initialize directories ------------------------------------------- makedir_ifnot(results_dir_root) makedir_ifnot('tmp') # Analysis loop ---------------------------------------------------- print('----------------------------------------') print('Analysis loop') for sbj, roi, feat in product(subjects_list, rois_list, features_list): print('--------------------') print('Subject: %s' % sbj) print('ROI: %s' % roi) # Distributed computation setup # ----------------------------- subject_name = sbj + '2' + target_subject + '_' + str( data_rep * 20) + 'p' + '_lambda' + str(Lambda) analysis_id = analysis_basename + '-' + subject_name + '-' + roi + '-' + feat results_dir_prediction = os.path.join(results_dir_root, analysis_basename, 'decoded_features', network, feat, subject_name, roi) results_dir_accuracy = os.path.join(results_dir_root, analysis_basename, 'prediction_accuracy', network, feat, subject_name, roi) if os.path.exists(results_dir_prediction): print('%s is already done. Skipped.' % analysis_id) continue dist = DistComp(lockdir='tmp', comp_id=analysis_id) if dist.islocked_lock(): print('%s is already running. Skipped.' 
% analysis_id) continue # Preparing data # -------------- print('Preparing data') start_time = time() # Brain data x = data_brain[sbj].select(rois_list[roi]) # Brain data x_labels = data_brain[sbj].select( 'image_index') # Image labels in the brain data # Target features and image labels (file names) y = data_features.get_features(feat) y_labels = data_features.index image_names = data_features.labels # Get test data x_test = x x_test_labels = x_labels y_test = y y_test_labels = y_labels # Averaging brain data x_test_labels_unique = np.unique(x_test_labels) x_test_averaged = np.vstack([ np.mean(x_test[(x_test_labels == lb).flatten(), :], axis=0) for lb in x_test_labels_unique ]) print('Total elapsed time (data preparation): %f' % (time() - start_time)) # Convert x_test_averaged nc_models_dir = os.path.join(nc_models_dir_root, subject_name, roi, 'model') x_test_averaged = test_ncconverter(nc_models_dir, x_test_averaged, gpu_device) # Prediction # ---------- print('Prediction') start_time = time() y_pred = test_fastl2lir_div( os.path.join(pre_models_dir_root, network, feat, target_subject, roi, 'model'), x_test_averaged) print('Total elapsed time (prediction): %f' % (time() - start_time)) # Calculate prediction accuracy # ----------------------------- print('Prediction accuracy') start_time = time() y_pred_2d = y_pred.reshape([y_pred.shape[0], -1]) y_true_2d = y.reshape([y.shape[0], -1]) y_true_2d = get_refdata(y_true_2d, y_labels, x_test_labels_unique) n_units = y_true_2d.shape[1] accuracy = np.array([ np.corrcoef(y_pred_2d[:, i].flatten(), y_true_2d[:, i].flatten())[0, 1] for i in range(n_units) ]) accuracy = accuracy.reshape((1, ) + y_pred.shape[1:]) print('Mean prediction accuracy: {}'.format(np.mean(accuracy))) print('Total elapsed time (prediction accuracy): %f' % (time() - start_time)) # Save results # ------------ print('Saving results') makedir_ifnot(results_dir_prediction) makedir_ifnot(results_dir_accuracy) start_time = time() # Predicted features for i, lb in enumerate(x_test_labels_unique): # Predicted features feat = np.array([y_pred[i, ] ]) # To make feat shape 1 x M x N x ... image_filename = image_names[ int(lb) - 1] # Image labels are one-based image indexes # Save file name save_file = os.path.join(results_dir_prediction, '%s.mat' % image_filename) # Save hdf5storage.savemat(save_file, {u'feat': feat}, format='7.3', oned_as='column', store_python_metadata=True) print('Saved %s' % results_dir_prediction) # Prediction accuracy save_file = os.path.join(results_dir_accuracy, 'accuracy.mat') hdf5storage.savemat(save_file, {u'accuracy': accuracy}, format='7.3', oned_as='column', store_python_metadata=True) print('Saved %s' % save_file) print('Elapsed time (saving results): %f' % (time() - start_time)) dist.unlock() print('%s finished.' % analysis_basename)
img1 = cv2.imread('/home/sinadabiri/Dropbox/Images/cell1.tif', 0)
img2 = cv2.imread('/home/sinadabiri/Dropbox/Images/cell2.tif', 0)

row, col = img1.shape
# row1, col1 = img1.shape
# row2, col2 = img2.shape
print("height =", row, "width =", col)
# centerRow1, centerCol1 = row1/2, col1/2
# centerRow2, centerCol2 = row2/2, col2/2
width = col - 1
i = 0
j = 0
overLapCorrCoef = np.zeros((col), np.uint8)
overLapCorrCoef = np.corrcoef(img1[:, 116:1:-1], img2[:, 1:116], rowvar=False)[116, :]
# overLapCorrCoef = np.corrcoef(img1[:,116:1:-1], img2[:,1:116], rowvar=True)
# overLapCorrCoef = np.corrcoef(img1[:,116], img2[:,1], rowvar=True)
print(overLapCorrCoef)
print(np.size(overLapCorrCoef))
# print("image 1 ", img1[:,116], "image 2 ", img2[:,1])
# for i in range(col):
#     if i < col:
#         overLapCorrCoef[:,i] = np.corrcoef(img1[:,width], img2[:,i], rowvar=False)[1,0]
#         print(overLapCorrCoef[i])
#
#         width = width - 1
#     else:
#         break
def scores(key, paths, config): values = [mapreduce.OutputCollector(p) for p in paths] try: values = [item.load() for item in values] except Exception as e: print(e) return None y_true_splits = [item["y_true"].ravel() for item in values] y_pred_splits = [item["y_pred"].ravel() for item in values] y_true = np.concatenate(y_true_splits) y_pred = np.concatenate(y_pred_splits) prob_pred_splits = [item["prob_pred"].ravel() for item in values] prob_pred = np.concatenate(prob_pred_splits) # Prediction performances p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None) auc = roc_auc_score(y_true, prob_pred) # balanced accuracy (recall_mean) bacc_splits = [ recall_score(y_true_splits[f], y_pred_splits[f], average=None).mean() for f in range(len(y_true_splits)) ] auc_splits = [ roc_auc_score(y_true_splits[f], prob_pred_splits[f]) for f in range(len(y_true_splits)) ] print("bacc all - mean(bacc) %.3f" % (r.mean() - np.mean(bacc_splits))) # P-values success = r * s success = success.astype('int') prob_class1 = np.count_nonzero(y_true) / float(len(y_true)) pvalue_recall0_true_prob = binom_test(success[0], s[0], 1 - prob_class1, alternative='greater') pvalue_recall1_true_prob = binom_test(success[1], s[1], prob_class1, alternative='greater') pvalue_recall0_unknwon_prob = binom_test(success[0], s[0], 0.5, alternative='greater') pvalue_recall1_unknown_prob = binom_test(success[1], s[1], 0.5, alternative='greater') pvalue_bacc = binom_test(success[0] + success[1], s[0] + s[1], p=0.5, alternative='greater') # Beta's measures of similarity betas = np.hstack([item["beta"][:, penalty_start:].T for item in values]).T # Correlation R = np.corrcoef(betas) R = R[np.triu_indices_from(R, 1)] # Fisher z-transformation / average z_bar = np.mean(1. / 2. * np.log((1 + R) / (1 - R))) # bracktransform r_bar = (np.exp(2 * z_bar) - 1) / (np.exp(2 * z_bar) + 1) # threshold betas to compute fleiss_kappa and DICE try: betas_t = np.vstack([ array_utils.arr_threshold_from_norm2_ratio(betas[i, :], .99)[0] for i in range(betas.shape[0]) ]) # Compute fleiss kappa statistics beta_signed = np.sign(betas_t) table = np.zeros((beta_signed.shape[1], 3)) table[:, 0] = np.sum(beta_signed == 0, 0) table[:, 1] = np.sum(beta_signed == 1, 0) table[:, 2] = np.sum(beta_signed == -1, 0) fleiss_kappa_stat = fleiss_kappa(table) # Paire-wise Dice coeficient ij = [[i, j] for i in range(betas.shape[0]) for j in range(i + 1, betas.shape[0])] dices = list() for idx in ij: A, B = beta_signed[idx[0], :], beta_signed[idx[1], :] dices.append( float(np.sum((A == B)[(A != 0) & (B != 0)])) / (np.sum(A != 0) + np.sum(B != 0))) dice_bar = np.mean(dices) except: dice_bar = fleiss_kappa_stat = 0 # Proportion of selection within the support accross the CV support_count = (betas_t != 0).sum(axis=0) support_count = support_count[support_count > 0] support_prop = support_count / betas_t.shape[0] scores = OrderedDict() scores['key'] = key scores['recall_0'] = r[0] scores['recall_1'] = r[1] scores['bacc'] = r.mean() scores['bacc_se'] = np.std(bacc_splits) / np.sqrt(len(bacc_splits)) scores["auc"] = auc scores['auc_se'] = np.std(auc_splits) / np.sqrt(len(auc_splits)) scores['pvalue_recall0_true_prob_one_sided'] = pvalue_recall0_true_prob scores['pvalue_recall1_true_prob_one_sided'] = pvalue_recall1_true_prob scores[ 'pvalue_recall0_unknwon_prob_one_sided'] = pvalue_recall0_unknwon_prob scores[ 'pvalue_recall1_unknown_prob_one_sided'] = pvalue_recall1_unknown_prob scores['pvalue_bacc_mean'] = pvalue_bacc scores['prop_non_zeros_mean'] = 
float(np.count_nonzero(betas_t)) / \ float(np.prod(betas.shape)) scores['beta_r_bar'] = r_bar scores['beta_fleiss_kappa'] = fleiss_kappa_stat scores['beta_dice_bar'] = dice_bar scores['beta_dice'] = str(dices) scores['beta_r'] = str(R) scores['beta_support_prop_select_mean'] = support_prop.mean() scores['beta_support_prop_select_sd'] = support_prop.std() return scores
# Read all the cleaned data files.
cleaned_train = pd.read_csv("Cleaned_train.csv")
cleaned_test = pd.read_csv("Cleaned_test.csv")

# Keep numerical features
num_features = cleaned_train.select_dtypes(include=np.number)
correl = num_features.corr()

# SalePrice correlation matrix
k = 11
plt.figure(figsize=(10, 10))
sns.set_style(style='white')
figtext_args, figtext_kwargs = add_fignum(
    "Fig 8. Correlation Matrix Heatmap of Sale Price")
cols = correl.nlargest(k, 'SalePrice')['SalePrice'].index
cm = np.corrcoef(cleaned_train[cols].values.T)
sns.set(font_scale=1.25)
plt.title("Correlation Heatmap of Sale Price with 10 most related variables\n", weight='bold')
# np.bool was removed in recent NumPy releases, so use the builtin bool here
mask = np.triu(np.ones_like(cm, dtype=bool))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
hm = sns.heatmap(cm, mask=mask, cmap=cmap, cbar=True, annot=True, square=True,
                 fmt='.2f', annot_kws={'size': 10},
                 yticklabels=cols.values, xticklabels=cols.values)
#%% same_bins = np.linspace(0, 12, 50) bins, pdf = PDF(cube[12].data.flatten() * 0.304, same_bins) bins_con, pdf_con = PDF(cube_con[13].data.flatten() * 0.304, same_bins) #bins_old,pdf_old=PDF(data_old[13].flatten()*0.304,same_bins) bins_3ord, pdf_3ord = PDF(cube_3ord[13].data.flatten() * 0.304, same_bins) bins_2m, pdf_2m = PDF(cube_2m[13].data.flatten() * 0.304, same_bins) #bins_nh,pdf_nh=PDF(cube_nh.data.flatten(),same_bins) sat_bins, sat_pdf = PDF(grid_z1.flatten() / 1000., same_bins) plt.figure() plt.plot(bins, pdf, label='ALL_ICE_PROC R=%1.2f' % np.corrcoef(pdf[:], sat_pdf[:])[0, 1]) #plt.plot(bins_con, pdf_con,label='con R=%1.2f'%np.corrcoef(pdf_con[:],sat_pdf[:])[0,1]) #plt.plot(bins_old, pdf_old,label='old R=%1.2f'%np.corrcoef(pdf_old[20:],sat_pdf[20:])[0,1]) #plt.plot(bins_nh, pdf_nh,label='no hallet R=%1.2f'%np.corrcoef(pdf_nh[20:],sat_pdf[20:])[0,1]) plt.plot(bins_2m, pdf_2m, label='2_ORD_MORE R=%1.2f' % np.corrcoef(pdf_2m[:], sat_pdf[:])[0, 1]) plt.plot(bins_3ord, pdf_3ord, label='3_ORD_LESS R=%1.2f' % np.corrcoef(pdf_3ord[:], sat_pdf[:])[0, 1]) plt.plot(sat_bins, sat_pdf, label='satellite') plt.legend() plt.title('Cloud top height') plt.ylabel('Normalized PDF') plt.xlabel('Cloud top height $Km$')
if 'spark' in pref[1]: children.append( famStruct[l.strip('\n\r')][2] ) else: children.append(l.strip('\n\r')) f = np.load(gnpFn, 'rb') print("Keys: %s" % list(f.keys()), file=sys.stderr) gnp = f.get('GN').astype(float) print("reading parents npz:", file=sys.stderr) print("gnp.shape", gnp.shape, file=sys.stderr) f.close() f = np.load(gncFn, 'rb') print("Keys: %s" % list(f.keys()), file=sys.stderr) gnc = f.get('GN').astype(float) print("reading children npz:", file=sys.stderr) print("gnc.shape", gnc.shape, file=sys.stderr) f.close() prs = gnp chn = gnc corr = np.corrcoef(chn) id = np.where(corr > 0.95) l = len(id[0]) idd = set([id[0][i] for i in range(l) if id[0][i] != id[1][i]]) for i in idd: print(children[i])
def correlation(predictions, targets): ranked_preds = predictions.rank(pct=True, method="first") return np.corrcoef(ranked_preds, targets)[0, 1]
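# --- Illustrative usage of the rank-then-Pearson scoring function above (not from the original source) ---
# predictions must be a pd.Series (so .rank works); the data below are made up.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
targets = pd.Series(rng.normal(size=1000))
predictions = pd.Series(0.1 * targets + rng.normal(size=1000))

# Ranking with pct=True maps the predictions into (0, 1] before correlating, so the score
# depends only on their ordering relative to the targets.
print(correlation(predictions, targets))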
def main(): print(MODEL_FILE) print("Loading data...") # The training data is used to train your model how to predict the targets. #training_data = read_csv("numerai_training_data.csv") # The tournament data is the data that Numerai uses to evaluate your model. #tournament_data = read_csv("numerai_tournament_data.csv") contest = str(233) directory = 'F:\\Numerai\\numerai' + contest + '\\' print("Loading data...") # The training data is used to train your model how to predict the targets. training_data = pd.read_csv(directory + "numerai_training_data.csv").set_index("id") # The tournament data is the data that Numerai uses to evaluate your model. tournament_data = pd.read_csv( directory + "numerai_tournament_data.csv").set_index("id") #MODEL_FILE = directory + "example_model.xgb" feature_names = [ f for f in training_data.columns if f.startswith("feature") ] print(f"Loaded {len(feature_names)} features") # This is the model that generates the included example predictions file. # Taking too long? Set learning_rate=0.1 and n_estimators=200 to make this run faster. # Remember to delete example_model.xgb if you change any of the parameters below. model = XGBRegressor(max_depth=5, learning_rate=0.01, n_estimators=2000, n_jobs=-1, colsample_bytree=0.1) if MODEL_FILE.is_file(): print("Loading pre-trained model...") model.load_model(MODEL_FILE) else: print("Training model...") model.fit(training_data[feature_names], training_data[TARGET_NAME]) print("Training model... {MODEL_FILE}") model.save_model("F:\\Numerai\\numerai233\\example_model.xgb") # Generate predictions on both training and tournament data print("Generating predictions...") training_data[PREDICTION_NAME] = model.predict( training_data[feature_names]) tournament_data[PREDICTION_NAME] = model.predict( tournament_data[feature_names]) # Check the per-era correlations on the training set (in sample) train_correlations = training_data.groupby("era").apply(score) print( f"On training the correlation has mean {train_correlations.mean()} and std {train_correlations.std()}" ) print( f"On training the average per-era payout is {payout(train_correlations).mean()}" ) """Validation Metrics""" # Check the per-era correlations on the validation set (out of sample) validation_data = tournament_data[tournament_data.data_type == "validation"] validation_correlations = validation_data.groupby("era").apply(score) print( f"On validation the correlation has mean {validation_correlations.mean()} and " f"std {validation_correlations.std()}") print( f"On validation the average per-era payout is {payout(validation_correlations).mean()}" ) # Check the "sharpe" ratio on the validation set validation_sharpe = validation_correlations.mean( ) / validation_correlations.std() print(f"Validation Sharpe: {validation_sharpe}") print("checking max drawdown...") rolling_max = (validation_correlations + 1).cumprod().rolling( window=100, min_periods=1).max() daily_value = (validation_correlations + 1).cumprod() max_drawdown = -(rolling_max - daily_value).max() print(f"max drawdown: {max_drawdown}") # Check the feature exposure of your validation predictions feature_exposures = validation_data[feature_names].apply( lambda d: correlation(validation_data[PREDICTION_NAME], d), axis=0) max_feature_exposure = np.max(np.abs(feature_exposures)) print(f"Max Feature Exposure: {max_feature_exposure}") # Check feature neutral mean print("Calculating feature neutral mean...") feature_neutral_mean = get_feature_neutral_mean(validation_data) print(f"Feature Neutral Mean is 
{feature_neutral_mean}") # Load example preds to get MMC metrics example_preds = pd.read_csv( "F:\\Numerai\\numerai233\\example_predictions_target_kazutsugi.csv" ).set_index("id")["prediction_kazutsugi"] validation_example_preds = example_preds.loc[validation_data.index] validation_data["ExamplePreds"] = validation_example_preds print("calculating MMC stats...") # MMC over validation mmc_scores = [] corr_scores = [] for _, x in validation_data.groupby("era"): series = neutralize_series(pd.Series(unif(x[PREDICTION_NAME])), pd.Series(unif(x["ExamplePreds"]))) mmc_scores.append(np.cov(series, x[TARGET_NAME])[0, 1] / (0.29**2)) corr_scores.append( correlation(unif(x[PREDICTION_NAME]), x[TARGET_NAME])) val_mmc_mean = np.mean(mmc_scores) val_mmc_std = np.std(mmc_scores) val_mmc_sharpe = val_mmc_mean / val_mmc_std corr_plus_mmcs = [c + m for c, m in zip(corr_scores, mmc_scores)] corr_plus_mmc_sharpe = np.mean(corr_plus_mmcs) / np.std(corr_plus_mmcs) corr_plus_mmc_mean = np.mean(corr_plus_mmcs) corr_plus_mmc_sharpe_diff = corr_plus_mmc_sharpe - validation_sharpe print(f"MMC Mean: {val_mmc_mean}\n" f"Corr Plus MMC Sharpe:{corr_plus_mmc_sharpe}\n" f"Corr Plus MMC Diff:{corr_plus_mmc_sharpe_diff}") # Check correlation with example predictions corr_with_example_preds = np.corrcoef( validation_example_preds.rank(pct=True, method="first"), validation_data[PREDICTION_NAME].rank(pct=True, method="first"))[0, 1] print(f"Corr with example preds: {corr_with_example_preds}") # Save predictions as a CSV and upload to https://numer.ai tournament_data[PREDICTION_NAME].to_csv("F:\\Numerai\\numerai233\\" + TOURNAMENT_NAME + "_submission.csv")
def plot_correlations(out_dir, ref_dir): """ plots correlation and L2-norm values between reference and output seismograms """ print('comparing seismograms') print(' reference directory: %s' % ref_dir) print(' output directory : %s\n' % out_dir) # checks if directory exists if not os.path.isdir(ref_dir): print("Please check if directory exists: ", ref_dir) sys.exit(1) if not os.path.isdir(out_dir): print("Please check if directory exists: ", out_dir) sys.exit(1) # seismogram file ending ## global version: ending = '.sem.ascii' # MX*.sem.ascii, .. ## cartesian version: ending = '.sem*' # .semd, .semv, .sema, .semp, .. # gets seismograms files = glob.glob(out_dir + '/*' + ending) if len(files) == 0: print("no seismogram files with ending ", ending, " found") print("Please check directory: ", out_dir) sys.exit(1) files.sort() corr_min = 1.0 err_max = 0.0 shift_max = 0.0 # gets time step size from first file syn_file = files[0] print(" time step: reading from first file ", syn_file) syn_time = np.loadtxt(syn_file)[:, 0] dt = syn_time[1] - syn_time[0] print(" time step: size = ", dt) # warning if dt <= 0.0: print("warning: invalid time step size for file ", files[0]) # determines window length if USE_SUB_WINDOW_CORR: # moving window print(" using correlations in moving sub-windows") print(" minimum period: ", TMIN) # checks if dt <= 0.0: # use no moving window window_length = len(syn_time) - 1 else: # window length for minimum period window_length = int(TMIN / dt) print(" moving window length: ", window_length) print("") print("comparing ", len(files), "seismograms") print("") # outputs table header print("|%-30s| %13s| %13s| %13s|" % ('file name', 'corr', 'err', 'time shift')) # counter n = 0 for f in files: # build reference and synthetics file names # specfem file: **network**.**station**.**comp**.sem.ascii fname = os.path.basename(f) names = str.split(fname, ".") # trace net = names[0] sta = names[1] cha = names[2] # filenames # old format #fname_old = sta + '.' + net + '.' 
+ cha + '.sem.ascii' #ref_file = ref_dir + '/' + fname_old #syn_file = out_dir + '/' + fname_old # new format ref_file = ref_dir + '/' + fname syn_file = out_dir + '/' + fname # makes sure files are both available if not os.path.isfile(ref_file): print(" file " + ref_file + " not found") continue if not os.path.isfile(syn_file): print(" file " + syn_file + " not found") continue # numpy: reads in file data ref0 = np.loadtxt(ref_file)[:, 1] syn0 = np.loadtxt(syn_file)[:, 1] #debug #print(" seismogram: ", fname, "vs", fname_old," lengths: ",len(ref0),len(syn0)) # cuts common length length = min(len(ref0), len(syn0)) if length <= 1: continue # length warning if len(ref0) != len(syn0): print( "** warning: mismatch of file length in both files syn/ref = %d / %d" % (len(syn0), len(ref0))) #print("** warning: using smaller length %d" % length) # time step size in reference file ref_time = np.loadtxt(ref_file)[:, 0] dt_ref = ref_time[1] - ref_time[0] # mismatch warning if abs(dt - dt_ref) / dt > 1.e-5: print( "** warning: mismatch of time step size in both files syn/ref = %e / %e" % (dt, dt_ref)) #print("** warning: using time step size %e from file %s" %(dt,syn_file)) #debug #print("common length: ",length) ref = ref0[0:length] syn = syn0[0:length] # least square test norm = np.linalg.norm sqrt = np.sqrt # normalized by power in reference solution fac_norm = norm(ref) # or normalized by power in (ref*syn) #fac_norm = sqrt(norm(ref)*norm(syn)) if fac_norm > 0.0: err = norm(ref - syn) / fac_norm else: err = norm(ref - syn) #debug #print('norm syn = %e norm ref = %e' % (norm(syn),fac_norm)) # correlation test # total length if fac_norm > 0.0: corr_mat = np.corrcoef(ref, syn) else: if norm(ref - syn) > 0.0: corr_mat = np.cov(ref - syn) else: # both zero traces print("** warning: comparing zero traces") corr_mat = 1.0 corr = np.min(corr_mat) # time shift if fac_norm > 0.0: # shift (in s) by cross correlation shift = get_cross_correlation_timeshift(ref, syn, dt) else: # no correlation with zero trace shift = 0.0 # correlation in moving window if USE_SUB_WINDOW_CORR: # moves window through seismogram for i in range(0, length - window_length): # windowed signals x = ref[i:i + window_length] y = syn[i:i + window_length] # correlations corr_win = np.corrcoef(x, y) corr_w = np.min(corr_win) corr = min(corr, corr_w) # cross-correlation array shift_w = get_cross_correlation_timeshift(x, y, dt) if abs(shift) < abs(shift_w): shift = shift_w # statistics corr_min = min(corr, corr_min) err_max = max(err, err_max) if abs(shift_max) < abs(shift): shift_max = shift # info string info = "" if corr < TOL_CORR: info += " poor correlation" if err > TOL_ERR: info += " poor match" if abs(shift) > TOL_SHIFT: info += " significant shift" # print results to screen print("|%-30s| %13.5f| %13.5le| %13.5le| %s" % (fname, corr, err, shift, info)) # counter n += 1 # check if any comparison done if n == 0: # values indicating failure corr_min = 0.0 err_max = 1.e9 shift_max = 1.e9 # print min(coor) max(err) print( "|---------------------------------------------------------------------------|" ) print("|%30s| %13.5f| %13.5le| %13.5le|" % ('min/max', corr_min, err_max, shift_max)) # output summary print("\nsummary:") print("%d seismograms compared\n" % n) if n == 0: print("\nno seismograms found for comparison!\n\n") print("correlations: values 1.0 perfect, < %.1f poor correlation" % TOL_CORR) if corr_min < TOL_CORR: print(" poor correlation seismograms found") else: print(" no poor correlations found") print("") print("L2-error : 
values 0.0 perfect, > %.2f poor match" % TOL_ERR) if err_max > TOL_ERR: print(" poor matching seismograms found") else: print(" no poor matches found") print("") print("Time shift : values 0.0 perfect, > %.2f significant shift" % TOL_SHIFT) if abs(shift_max) > TOL_SHIFT: print(" significant time shift in seismograms found") else: print(" no significant time shifts found") print("")
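# The comparison above relies on a helper get_cross_correlation_timeshift(ref, syn, dt)
# that is not shown in this excerpt. A minimal sketch of one possible implementation
# (an assumption, not necessarily the original): take the lag at the peak of the full
# cross-correlation between two equal-length traces and convert it to seconds via dt.
import numpy as np

def cross_correlation_timeshift_sketch(ref, syn, dt):
    """Time shift (in s) of syn relative to ref at the cross-correlation maximum."""
    ref = np.asarray(ref, dtype=float)
    syn = np.asarray(syn, dtype=float)
    cc = np.correlate(syn, ref, mode='full')   # index len(ref)-1 corresponds to zero lag
    lag = int(np.argmax(cc)) - (len(ref) - 1)
    return lag * dt

# quick self-check: a trace delayed by 10 samples should give a shift of about 10*dt
_dt = 0.01
_t = np.arange(0.0, 4.0, _dt)
_ref = np.sin(2.0 * np.pi * _t)
_syn = np.roll(_ref, 10)
print(cross_correlation_timeshift_sketch(_ref, _syn, _dt))   # roughly 0.10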
xi = soln[0]
sig = soln[1]
mparams = (q, nu_S, nu_W, kappa, muW, muS, lambd, chiS, eta, xi, mu,
           gamma, yvect, rho, sig)
Hhist, yhist, Ahist, Chist, bhist, Uhist, zhist = runsim(T, epshist, mparams)
HrsSlept = np.zeros(ndays)
for d in range(0, ndays):
    HrsSlept[d] = np.sum(Ahist[d:d + q - 1]) / pph
HrsMean = np.mean(HrsSlept)
HrsStd = np.std(HrsSlept)
HrsAuto = np.corrcoef(HrsSlept[0:ndays - 1], HrsSlept[1:ndays])
HrsAuto = HrsAuto[0, 1]
print('xi: ', xi)
print('sigma: ', sig)
print('average hours of sleep per day: ', HrsMean)
print('st dev of hours of sleep per day: ', HrsStd)
print('autocorrelation of sleep per day: ', HrsAuto)
print(' ')
data = (HrsMean, HrsStd)
pkl.dump(data, open(name + '.pkl', 'wb'))

# Simulate with no shocks to find SS
sig = 0.
ndays = 5
T = ndays * q  # number of periods to simulate
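# Minimal sketch of the lag-1 autocorrelation used for HrsSlept above, on synthetic
# data (the AR(1)-style series here is an assumption, not the sleep model): np.corrcoef
# of the series against itself shifted by one day.
import numpy as np

def lag1_autocorr(series):
    series = np.asarray(series, dtype=float)
    return np.corrcoef(series[:-1], series[1:])[0, 1]

rng = np.random.default_rng(0)
x = np.zeros(200)
for t in range(1, 200):
    x[t] = 0.7 * x[t - 1] + rng.normal()      # persistent daily series
print(lag1_autocorr(x))                       # close to the persistence of 0.7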
def scaling(self): data = self.raw train = data.drop(['測站', '測項', '日期'], axis=1) train = np.array(train.replace('NR', 0), dtype=np.float32) # Transform to 18 factors x #train_data train_norm = np.zeros((18, 1)) for i in range(240): single_day = train[i * 18:(i + 1) * 18, :] train_norm = np.append(train_norm, single_day, axis=1) train_norm = np.delete(train_norm, 0, axis=1) # Insert pm2.5, pm10 square and pm2.5 ^ 3 train_norm = np.insert(train_norm, len(train_norm), train_norm[9, :]**2, axis=0) train_norm = np.insert(train_norm, len(train_norm), train_norm[8, :]**2, axis=0) train_norm = np.insert(train_norm, len(train_norm), train_norm[8, :] * train_norm[9, :], axis=0) train_norm = np.insert(train_norm, len(train_norm), train_norm[5, :]**2, axis=0) train_norm = np.insert(train_norm, len(train_norm), train_norm[6, :]**2, axis=0) train_norm = np.insert(train_norm, len(train_norm), train_norm[7, :]**2, axis=0) train_norm = np.insert(train_norm, len(train_norm), train_norm[12, :]**2, axis=0) # Extract labels self.__label = [] for mon in range(12): for hr in range(471): self.__label.append(train_norm[9, (mon * 480) + hr + 9]) # Standardization self.mu = train_norm.mean(axis=1) self.std = train_norm.std(axis=1) for j in range(train_norm.shape[0]): if self.std[j] != 0: train_norm[j, ] = (train_norm[j, ] - self.mu[j]) / self.std[j] # Shrink unrelevent feature into 0.0 by PCC to pm2.5 cor_mat = np.corrcoef(train_norm)[9, :] print(cor_mat) d = [] for i in range(len(cor_mat)): if abs(cor_mat[i]) < 0.2: d.append(i) train_norm = np.delete(train_norm, d, axis=0) # Extract features self.__data = [] for mon in range(12): for hr in range(471): feature = train_norm[:, (mon * 480) + hr:(mon * 480) + hr + 9] self.__data.append(feature) return self, d
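# Minimal sketch of the feature-screening step in scaling() above, with assumed shapes:
# rows of `data` are features, row `target_row` is pm2.5, and any feature whose
# absolute Pearson correlation with the target falls below `threshold` is dropped.
import numpy as np

def drop_weak_features(data, target_row=9, threshold=0.2):
    corr_to_target = np.corrcoef(data)[target_row, :]
    weak = [i for i in range(data.shape[0]) if abs(corr_to_target[i]) < threshold]
    return np.delete(data, weak, axis=0), weak

# usage on random data: most rows correlate weakly with row 9 and are removed
rng = np.random.default_rng(1)
demo = rng.normal(size=(18, 500))
kept, dropped = drop_weak_features(demo)
print(kept.shape, len(dropped))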
y_pred = np.sign(np.inner(w, phi)) if y_pred == 0: y_pred = 1 if y != y_pred: # update w_s and A_s Wlast = copy.deepcopy(W) w = w + y * np.dot(np.linalg.inv(np.kron(A, np.eye(d))), phi) W = np.reshape(w, (d, K), order='F') if t >= EPOCH: A = np.linalg.inv((1.0 / (K + 1)) * (np.eye(K) + np.ones((K, K)))) s += 1 print("Task relatedness Matrix for CMTL:\n", np.linalg.inv(A)) print('\n') print("Learned task weights correlation for CMTL:\n", np.corrcoef([w[0:10], w[10:20], w[20:30]])) print("\n") print("True weights correlation for CMTL:\n", np.corrcoef([w1, w2, w3])) nIncorrect = 0 #Testing Accuracy: for dataSetNumber in range(100): ypred_1 = np.sign(np.dot(w[0:10], x_1[:, dataSetNumber])) if ypred_1 != y_1[dataSetNumber]: nIncorrect += 1 ypred_2 = np.sign(np.dot(w[10:20], x_2[:, dataSetNumber])) if ypred_2 != y_2[dataSetNumber]: nIncorrect += 1 ypred_3 = np.sign(np.dot(w[20:30], x_3[:, dataSetNumber]))
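# Minimal sketch (hypothetical weight vectors, not the CMTL training output): stacking
# per-task weights as rows of np.corrcoef gives the pairwise weight correlations that
# the prints above report for the learned and true weights.
import numpy as np

rng = np.random.default_rng(2)
w1 = rng.normal(size=50)
w2 = w1 + 0.3 * rng.normal(size=50)      # a related task
w3 = rng.normal(size=50)                 # an unrelated task
R = np.corrcoef([w1, w2, w3])            # 3x3 matrix of task-weight correlations
print(R[0, 1], R[0, 2])                  # large for the related pair, small otherwise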
def delete_mixtures(params, nb_cpu, nb_gpu, use_gpu): templates = load_data(params, 'templates') data_file = params.data_file N_e = params.getint('data', 'N_e') N_total = params.nb_channels N_t = params.getint('detection', 'N_t') template_shift = params.getint('detection', 'template_shift') cc_merge = params.getfloat('clustering', 'cc_merge') x, N_tm = templates.shape nb_temp = N_tm // 2 merged = [nb_temp, 0] mixtures = [] to_remove = [] overlap = get_overlaps(params, extension='-mixtures', erase=True, normalize=False, maxoverlap=False, verbose=False, half=True, use_gpu=use_gpu, nb_cpu=nb_cpu, nb_gpu=nb_gpu) filename = params.get('data', 'file_out_suff') + '.overlap-mixtures.hdf5' result = [] norm_templates = load_data(params, 'norm-templates') templates = load_data(params, 'templates') result = load_data(params, 'clusters') best_elec = load_data(params, 'electrodes') limits = load_data(params, 'limits') nodes, edges = get_nodes_and_edges(params) inv_nodes = numpy.zeros(N_total, dtype=numpy.int32) inv_nodes[nodes] = numpy.argsort(nodes) distances = numpy.zeros((nb_temp, nb_temp), dtype=numpy.float32) over_x = overlap.get('over_x')[:] over_y = overlap.get('over_y')[:] over_data = overlap.get('over_data')[:] over_shape = overlap.get('over_shape')[:] overlap.close() overlap = scipy.sparse.csr_matrix((over_data, (over_x, over_y)), shape=over_shape) for i in xrange(nb_temp - 1): distances[i, i + 1:] = numpy.argmax( overlap[i * nb_temp + i + 1:(i + 1) * nb_temp].toarray(), 1) distances[i + 1:, i] = distances[i, i + 1:] all_temp = numpy.arange(comm.rank, nb_temp, comm.size) overlap_0 = overlap[:, N_t].toarray().reshape(nb_temp, nb_temp) sorted_temp = numpy.argsort( norm_templates[:nb_temp])[::-1][comm.rank::comm.size] M = numpy.zeros((2, 2), dtype=numpy.float32) V = numpy.zeros((2, 1), dtype=numpy.float32) to_explore = xrange(comm.rank, len(sorted_temp), comm.size) if comm.rank == 0: to_explore = get_tqdm_progressbar(to_explore) for count, k in enumerate(to_explore): k = sorted_temp[k] electrodes = numpy.take(inv_nodes, edges[nodes[best_elec[k]]]) overlap_k = overlap[k * nb_temp:(k + 1) * nb_temp].tolil() is_in_area = numpy.in1d(best_elec, electrodes) all_idx = numpy.arange(len(best_elec))[is_in_area] been_found = False for i in all_idx: if not been_found: overlap_i = overlap[i * nb_temp:(i + 1) * nb_temp].tolil() M[0, 0] = overlap_0[i, i] V[0, 0] = overlap_k[i, distances[k, i]] for j in all_idx[i + 1:]: M[1, 1] = overlap_0[j, j] M[1, 0] = overlap_i[j, distances[k, i] - distances[k, j]] M[0, 1] = M[1, 0] V[1, 0] = overlap_k[j, distances[k, j]] try: [a1, a2] = numpy.dot(scipy.linalg.inv(M), V) except Exception: [a1, a2] = [0, 0] a1_lim = limits[i] a2_lim = limits[j] is_a1 = (a1_lim[0] <= a1) and (a1 <= a1_lim[1]) is_a2 = (a2_lim[0] <= a2) and (a2 <= a2_lim[1]) if is_a1 and is_a2: new_template = ( a1 * templates[:, i].toarray() + a2 * templates[:, j].toarray()).ravel() similarity = numpy.corrcoef( templates[:, k].toarray().ravel(), new_template)[0, 1] if similarity > cc_merge: if k not in mixtures: mixtures += [k] been_found = True break #print "Template", k, 'is sum of (%d, %g) and (%d,%g)' %(i, a1, j, a2) #print mixtures to_remove = numpy.unique(numpy.array(mixtures, dtype=numpy.int32)) to_remove = all_gather_array(to_remove, comm, 0, dtype='int32') if len(to_remove) > 0: slice_templates(params, to_remove) slice_clusters(params, result, to_remove=to_remove) comm.Barrier() if comm.rank == 0: os.remove(filename) return [nb_temp, len(to_remove)]
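# Simplified sketch of the mixture test above (not the original implementation): fit
# template_k as a1*t_i + a2*t_j by least squares instead of the overlap-based 2x2
# solve, then flag a mixture when the Pearson correlation between the fit and
# template_k exceeds cc_merge.
import numpy as np

def is_mixture(template_k, t_i, t_j, cc_merge=0.95):
    A = np.stack([t_i, t_j], axis=1)                        # (n_samples, 2)
    coeffs, *_ = np.linalg.lstsq(A, template_k, rcond=None)
    fitted = A @ coeffs
    similarity = np.corrcoef(template_k, fitted)[0, 1]
    return similarity > cc_merge, coeffs

# usage: a noisy sum of two templates is flagged as a mixture
rng = np.random.default_rng(3)
t_i, t_j = rng.normal(size=(2, 300))
t_k = 0.8 * t_i + 1.2 * t_j + 0.01 * rng.normal(size=300)
print(is_mixture(t_k, t_i, t_j))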
y_pred = sess.run(real_logits, feed_dict={X: pred})
Prob_pred = sess.run(tf.sigmoid(y_pred))

# Check if the Cov and mean are good
np.set_printoptions(suppress=True)
Mean_pred = np.mean(np.transpose(pred), axis=1)
Mean_X = np.mean(np.transpose(X_batch), axis=1)
Cov_pred = np.around(np.cov(np.transpose(pred)), decimals=3)
#print(np.around(np.cov(np.transpose(pred)), decimals=2))
Cov_X = np.around(np.cov(np.transpose(X_batch)), decimals=3)
#print(np.around(np.cov(np.transpose(X_batch)), decimals=2))
Corr_pred = np.around(np.corrcoef(np.transpose(pred)), decimals=3)
Corr_X = np.around(np.corrcoef(np.transpose(X_batch)), decimals=3)

# plot the loss
plt.figure(num=0, figsize=(7, 5))
plot_loss(d_loss_list, g_loss_list)

plt.figure(num=1, figsize=(7, 5))
D0 = pd.DataFrame(np.transpose((X_batch[:, 0], pred[:, 0])))
D0.plot.density()
plt.xlim((-25, 25))
plt.title('return series of stock 1')

plt.figure(num=2, figsize=(7, 5))
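# Minimal sketch of the same sanity check with synthetic stand-ins for X_batch and
# pred (the GAN tensors are not reproduced here): compare sample means and correlation
# matrices of a real and a generated batch, transposing because np.corrcoef treats
# rows as variables.
import numpy as np

rng = np.random.default_rng(4)
cov = np.array([[1.0, 0.6], [0.6, 2.0]])
real = rng.multivariate_normal(mean=[0.0, 0.0], cov=cov, size=2000)
fake = rng.multivariate_normal(mean=[0.0, 0.0], cov=cov, size=2000)

print(np.around(real.mean(axis=0), 3), np.around(fake.mean(axis=0), 3))
print(np.around(np.corrcoef(real.T), 3))
print(np.around(np.corrcoef(fake.T), 3))   # both close to the same 2x2 correlation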
def correlation(x, y):
    return np.corrcoef(x, y)[0, 1]
def adjacency_correlation(signals):
    '''Faster version of adjacency matrix with correlation metric'''
    signals = np.reshape(signals, (signals.shape[0], -1))
    return np.abs(np.nan_to_num(np.corrcoef(signals)))
score_x = numpy.random.normal(171.77, 5.54, n)
score_y = numpy.random.normal(62.49, 7.89, n)

# add a little noise
score_x.sort()
score_x = numpy.around(score_x + numpy.random.normal(scale=3.0, size=n), 2)
score_y.sort()
score_y = numpy.around(score_y + numpy.random.normal(size=n), 2)

# maximum
print "Max x: " + str(numpy.max(score_x)) + " y: " + str(numpy.max(score_y))
# minimum
print "Min x: " + str(numpy.min(score_x)) + " y: " + str(numpy.min(score_y))
# mean
print "Avg x: " + str(numpy.mean(score_x)) + " y: " + str(numpy.mean(score_y))
# first quartile
print "1Q x:" + str(stats.scoreatpercentile(score_x, 25)) + " y: " + str(
    stats.scoreatpercentile(score_y, 25))
# median
print "Med x: " + str(numpy.median(score_x)) + " y: " + str(
    numpy.median(score_y))
# third quartile
print "3Q x:" + str(stats.scoreatpercentile(score_x, 75)) + " y: " + str(
    stats.scoreatpercentile(score_y, 75))
# variance
print "Var x: " + str(numpy.var(score_x)) + " y: " + str(numpy.var(score_y))
# standard deviation
print "S.D. x: " + str(numpy.std(score_x)) + " y:" + str(numpy.std(score_y))
# correlation coefficient
cor = numpy.corrcoef(score_x, score_y)
print "Correlation Coefficient : " + str(cor[0, 1])
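# Worked check on made-up numbers (not the generated scores above): the printed
# coefficient equals cov(x, y) / (std(x) * std(y)) with population (ddof=0) statistics,
# which is exactly what numpy.corrcoef returns in the off-diagonal entry.
import numpy

x = numpy.array([171.2, 168.5, 175.1, 180.3, 169.9, 174.4])
y = numpy.array([60.1, 58.7, 66.0, 71.5, 61.2, 65.8])
manual = ((x - x.mean()) * (y - y.mean())).mean() / (numpy.std(x) * numpy.std(y))
print(manual, numpy.corrcoef(x, y)[0, 1])   # the two values agree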
def crossvalidate(self, X, Y, zDim_list=np.linspace(0, 10, 11), n_folds=10, verbose=True, rand_seed=None): N, D = X.shape # make sure z dims are integers z_list = zDim_list.astype(int) # create k-fold iterator if verbose: print('Crossvalidating pCCA model to choose # of dims...') cv_kfold = ms.KFold(n_splits=n_folds, shuffle=True, random_state=rand_seed) # iterate through train/test splits i = 0 LLs = np.zeros([n_folds, len(z_list)]) for train_idx, test_idx in cv_kfold.split(X): if verbose: print(' Fold ', i + 1, ' of ', n_folds, '...') X_train, X_test = X[train_idx], X[test_idx] Y_train, Y_test = Y[train_idx], Y[test_idx] # iterate through each zDim for j in range(len(z_list)): tmp = prob_cca() tmp.train_maxLL(X_train, Y_train, z_list[j]) z, curr_LL = tmp.estep(X_test, Y_test) LLs[i, j] = curr_LL i = i + 1 sum_LLs = LLs.sum(axis=0) # find the best # of z dimensions and train CCA model max_idx = np.argmax(sum_LLs) zDim = z_list[max_idx] self.train_maxLL(X, Y, zDim) # cross-validate to get canonical correlations if verbose: print('Crossvalidating pCCA model to compute canon corrs...') zx, zy = np.zeros((2, N, zDim)) for train_idx, test_idx in cv_kfold.split(X): X_train, X_test = X[train_idx], X[test_idx] Y_train, Y_test = Y[train_idx], Y[test_idx] tmp = prob_cca() tmp.train_maxLL(X_train, Y_train, zDim) z, curr_LL = tmp.estep(X_test, Y_test) zx[test_idx, :] = z['zx_mu'] zy[test_idx, :] = z['zy_mu'] cv_rho = np.zeros(zDim) for i in range(zDim): tmp = np.corrcoef(zx[:, i], zy[:, i]) cv_rho[i] = tmp[0, 1] self.params['cv_rho'] = cv_rho return sum_LLs, z_list, sum_LLs[max_idx], z_list[max_idx]
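# Minimal sketch (hypothetical latents, not pCCA output): the cross-validated canonical
# correlations at the end of crossvalidate() are per-column Pearson correlations
# between the two held-out latent estimates zx and zy.
import numpy as np

rng = np.random.default_rng(6)
zx = rng.normal(size=(500, 3))
zy = 0.8 * zx + 0.2 * rng.normal(size=(500, 3))
cv_rho = np.array([np.corrcoef(zx[:, i], zy[:, i])[0, 1] for i in range(zx.shape[1])])
print(cv_rho)   # each entry close to 0.97 for this construction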
def evaluation(img_akaze, img_circle): ##################################### PROGRAMMATIC ANALYSIS OF CHECK-POINTS ######################################### # load the image and convert it to grayscale img_perfect = cv2.imread("team_id_2_comparison.png") coordinates = [[29.5, 250.0], [38.0, 385.0], [160.97999572753906, 417.5], [114.12354278564453, 338.9093322753906], [88.5, 259.0], [158.53448486328125, 202.6724090576172], [187.5, 38.5], [261.2481384277344, 121.8302230834961], [270.5, 243.0], [291.4565124511719, 422.2826232910156], [387.043701171875, 360.78155517578125], [343.0, 274.5], [362.0, 166.5]] feature_list = [636, 395, 1046, 500, 1605] # programmatic checkpoints circle_radius = 8 check_list = [] check_counter = 0 for i in coordinates: i[0] = int(i[0]) i[1] = int(i[1]) roi = img_circle[i[1] - (3 * circle_radius):i[1] + (3 * circle_radius), i[0] - (3 * circle_radius):i[0] + (3 * circle_radius)] roi = roi.reshape(int(roi.size / 3), 3) if [255, 255, 255] in roi.tolist(): check_list.append(1) check_counter += 1 else: check_list.append(0) check_result = ((check_counter / check_list.__len__()) * 100) print("Programmatic Analysis Result = ") print(check_result) ####################################### ANALYSIS USING FEATURE MATCHING ################################################## # load the image and convert it to grayscale gray1 = cv2.cvtColor(img_perfect, cv2.COLOR_BGR2GRAY) gray2 = cv2.cvtColor(img_akaze, cv2.COLOR_BGR2GRAY) # initialize the AKAZE descriptor, then detect keypoints and extract # local invariant descriptors from the image sift = cv2.xfeatures2d.SIFT_create() surf = cv2.xfeatures2d.SURF_create() akaze = cv2.AKAZE_create() brisk = cv2.BRISK_create() orb = cv2.ORB_create() (akazekps1, akazedescs1) = akaze.detectAndCompute(gray1, None) (akazekps2, akazedescs2) = akaze.detectAndCompute(gray2, None) (siftkps1, siftdescs1) = sift.detectAndCompute(gray1, None) (siftkps2, siftdescs2) = sift.detectAndCompute(gray2, None) (surfkps1, surfdescs1) = surf.detectAndCompute(gray1, None) (surfkps2, surfdescs2) = surf.detectAndCompute(gray2, None) (briskkps1, briskdescs1) = brisk.detectAndCompute(gray1, None) (briskkps2, briskdescs2) = brisk.detectAndCompute(gray2, None) (orbkps1, orbdescs1) = orb.detectAndCompute(gray1, None) (orbkps2, orbdescs2) = orb.detectAndCompute(gray2, None) #print("No of KeyPoints:") #print("akazekeypoints1: {}, akazedescriptors1: {}".format(len(akazekps1), akazedescs1.shape)) #print("akazekeypoints2: {}, akazedescriptors2: {}".format(len(akazekps2), akazedescs2.shape)) #print("siftkeypoints1: {}, siftdescriptors1: {}".format(len(siftkps1), siftdescs1.shape)) #print("siftkeypoints2: {}, siftdescriptors2: {}".format(len(siftkps2), siftdescs2.shape)) #print("surfkeypoints1: {}, surfdescriptors1: {}".format(len(surfkps1), surfdescs1.shape)) #print("surfkeypoints2: {}, surfdescriptors2: {}".format(len(surfkps2), surfdescs2.shape)) #print("briskkeypoints1: {}, briskdescriptors1: {}".format(len(briskkps1), briskdescs1.shape)) #print("briskkeypoints2: {}, briskdescriptors2: {}".format(len(briskkps2), briskdescs2.shape)) #print("orbkeypoints1: {}, orbdescriptors1: {}".format(len(orbkps1), orbdescs1.shape)) #print("orbkeypoints2: {}, orbdescriptors2: {}".format(len(orbkps2), orbdescs2.shape)) # Match the fezatures bfakaze = cv2.BFMatcher(cv2.NORM_HAMMING) bf = cv2.BFMatcher(cv2.NORM_L2) akazematches = bfakaze.knnMatch(akazedescs1, akazedescs2, k=2) siftmatches = bf.knnMatch(siftdescs1, siftdescs2, k=2) surfmatches = bf.knnMatch(surfdescs1, surfdescs2, k=2) briskmatches = 
bf.knnMatch(briskdescs1, briskdescs2, k=2) orbmatches = bf.knnMatch(orbdescs1, orbdescs2, k=2) # Apply ratio test on AKAZE matches goodakaze = [] for m, n in akazematches: if m.distance < 0.9 * n.distance: goodakaze.append([m]) im3akaze = cv2.drawMatchesKnn(img_perfect, akazekps1, img_akaze, akazekps2, goodakaze[:100], None, flags=2) cv2.imshow("AKAZE matching", im3akaze) goodakaze = np.asarray(goodakaze) print("akaze") similarity_akaze = (goodakaze.shape[0] / feature_list[0]) * 100 print(similarity_akaze) # Apply ratio test on SIFT matches goodsift = [] for m, n in siftmatches: if m.distance < 0.9 * n.distance: goodsift.append([m]) im3sift = cv2.drawMatchesKnn(img_perfect, siftkps1, img_akaze, siftkps2, goodsift[:], None, flags=2) cv2.imshow("SIFT matching", im3sift) goodsift = np.asarray(goodsift) print("sift") similarity_sift = (goodsift.shape[0] / feature_list[1]) * 100 print(similarity_sift) # Apply ratio test on SURF matches goodsurf = [] for m, n in surfmatches: if m.distance < 0.9 * n.distance: goodsurf.append([m]) im3surf = cv2.drawMatchesKnn(img_perfect, surfkps1, img_akaze, surfkps2, goodsurf[:], None, flags=2) cv2.imshow("SURF matching", im3surf) goodsurf = np.asarray(goodsurf) print("surf") similarity_surf = (goodsurf.shape[0] / feature_list[2]) * 100 print(similarity_surf) # Apply ratio test on ORB matches goodorb = [] for m, n in orbmatches: if m.distance < 0.9 * n.distance: goodorb.append([m]) im3orb = cv2.drawMatchesKnn(img_perfect, orbkps1, img_akaze, orbkps2, goodorb[:], None, flags=2) cv2.imshow("ORB matching", im3orb) goodorb = np.asarray(goodorb) print("orb") similarity_orb = (goodorb.shape[0] / feature_list[3]) * 100 print(similarity_orb) # Apply ratio test on BRISK matches goodbrisk = [] for m, n in briskmatches: if m.distance < 0.9 * n.distance: goodbrisk.append([m]) im3brisk = cv2.drawMatchesKnn(img_perfect, briskkps1, img_akaze, briskkps2, goodbrisk[:], None, flags=2) cv2.imshow("BRISK matching", im3brisk) goodbrisk = np.asarray(goodbrisk) print("brisk") similarity_brisk = (goodbrisk.shape[0] / feature_list[4]) * 100 print(similarity_brisk) features_result = (similarity_akaze + similarity_brisk + similarity_orb + similarity_sift + similarity_surf) / 5 print("Overall similarity using features: ") print() ######################################### HOG CORRELATION ############################################### bin_n = 16 #img = cv2.imread("not_perfect_trajectory2.png") gx = cv2.Sobel(img_perfect, cv2.CV_32F, 1, 0) gy = cv2.Sobel(img_perfect, cv2.CV_32F, 0, 1) mag, ang = cv2.cartToPolar(gx, gy) # quantizing binvalues in (0...16) bins = np.int32(bin_n * ang / (2 * np.pi)) # Divide to 4 sub-squares bin_cells = bins[:10, :10], bins[10:, :10], bins[:10, 10:], bins[10:, 10:] mag_cells = mag[:10, :10], mag[10:, :10], mag[:10, 10:], mag[10:, 10:] hists = [ np.bincount(b.ravel(), m.ravel(), bin_n) for b, m in zip(bin_cells, mag_cells) ] hist1 = np.hstack(hists) rows, cols, _ = img_akaze.shape M = cv2.getRotationMatrix2D((cols / 2, rows / 2), 0, 1) img_akaze = cv2.warpAffine(img_akaze, M, (cols, rows)) gx = cv2.Sobel(img_akaze, cv2.CV_32F, 1, 0) gy = cv2.Sobel(img_akaze, cv2.CV_32F, 0, 1) mag, ang = cv2.cartToPolar(gx, gy) # quantizing binvalues in (0...16) bins = np.int32(bin_n * ang / (2 * np.pi)) # Divide to 4 sub-squares bin_cells = bins[:10, :10], bins[10:, :10], bins[:10, 10:], bins[10:, 10:] mag_cells = mag[:10, :10], mag[10:, :10], mag[:10, 10:], mag[10:, 10:] hists = [ np.bincount(b.ravel(), m.ravel(), bin_n) for b, m in zip(bin_cells, mag_cells) ] hist2 = 
np.hstack(hists) hog_result = np.corrcoef(hist1, hist2)[0][1] * 100 print("HOG CORRELATION RESULT = ") print(hog_result) return (check_result, features_result, hog_result) cv2.imshow("image_akaze", img_akaze) cv2.imshow("img_circle", img_circle) cv2.waitKey(0)
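# Minimal sketch of the corrected HOG comparison on synthetic histograms (not the
# images above): np.corrcoef takes the two histograms as separate arguments, and the
# off-diagonal entry [0, 1] is the correlation to report.
import numpy as np

rng = np.random.default_rng(7)
hist_a = rng.random(64)
hist_b = hist_a + 0.05 * rng.random(64)            # a slightly perturbed copy
hog_similarity = np.corrcoef(hist_a, hist_b)[0, 1] * 100
print(hog_similarity)                              # close to 100 for near-identical histograms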
if os.path.exists('hahow_courses.json'):
    with open('hahow_courses.json', 'r', encoding='utf-8') as f:
        courses = json.load(f)
else:
    courses = crawl()
print('hahow has %d courses in total' % len(courses))

# extract programming-category courses
#programming_classes = [c for c in courses if '55de81ac9d1fa51000f94770' in c['categories']]

# collect pre-order price / launch price / student count for programming courses and show summary statistics
pre_order_prices = list()
prices = list()
tickets = list()
lengths = list()
for c in courses:
    if '55de81ac9d1fa51000f94770' in c['categories']:
        pre_order_prices.append(c['preOrderedPrice'])
        prices.append(c['price'])
        tickets.append(c['numSoldTickets'])
        lengths.append(c['totalVideoLengthInSeconds'])
print('average pre-order price:', np.mean(pre_order_prices))
print('average launch price:', np.mean(prices))
print('average number of students:', np.mean(tickets))
print('average course length in minutes:', np.mean(lengths) / 60)

# print(np.corrcoef([tickets, pre_order_prices, prices, length]))
corrcoef = np.corrcoef([tickets, pre_order_prices, prices, lengths])
print('correlation between pre-order price and student count: ', corrcoef[0, 1])
print('correlation between launch price and student count: ', corrcoef[0, 2])
print('correlation between course length and student count: ', corrcoef[0, 3])
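# Minimal sketch on made-up numbers (not the crawled hahow data): np.corrcoef on a list
# of equal-length sequences returns the full pairwise matrix, and row 0 holds the
# correlations of the first variable (ticket counts) with each of the others, as read
# off above.
import numpy as np

tickets = [120, 340, 80, 560, 210, 95]
pre_order_prices = [1200, 900, 1600, 800, 1100, 1500]
prices = [1800, 1500, 2200, 1400, 1700, 2100]
lengths = [5400, 7200, 3600, 9000, 6000, 4200]

corrcoef = np.corrcoef([tickets, pre_order_prices, prices, lengths])
print(corrcoef[0, 1], corrcoef[0, 2], corrcoef[0, 3])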
def pearson_corr(self, all_auto_array):
    pearson_result = np.corrcoef(all_auto_array[:, 1000:])
    print("pearson result is completely calculated")
    # return the correlation matrix for downstream use
    return pearson_result
X.shape

# In[11]:

X1 = X

# In[12]:

SVD = TruncatedSVD(n_components=10)
decomposed_matrix = SVD.fit_transform(X)
decomposed_matrix.shape

# In[13]:

correlation_matrix = np.corrcoef(decomposed_matrix)
correlation_matrix.shape

# In[14]:

X.index[99]

# In[15]:

i = "6117036094"
product_names = list(X.index)
product_ID = product_names.index(i)
product_ID

# In[16]:
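# Minimal sketch of the lookup that typically follows (a hypothetical helper, not a
# cell from the notebook): take the correlation-matrix row for one product and keep
# the other products whose correlation exceeds a threshold.
import numpy as np

def similar_products(correlation_matrix, product_names, product_id, threshold=0.65):
    row = correlation_matrix[product_names.index(product_id)]
    return [name for name, c in zip(product_names, row)
            if c > threshold and name != product_id]

rng = np.random.default_rng(8)
demo_matrix = np.corrcoef(rng.normal(size=(5, 10)))    # stand-in for correlation_matrix
demo_names = ["A", "B", "C", "D", "6117036094"]
print(similar_products(demo_matrix, demo_names, "6117036094"))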
# -------------------------------------------------------------------
# 1. calculate the squared difference between the two vectors
sq_diff_ab = np.square(mean_vector_amp - mean_vector_bp)
sse_ab = np.sum(sq_diff_ab)
norm_ab = np.sqrt(sse_ab)
print('the L2-Norm is %.2f' % norm_ab)

# 2. threshold and ratio
counter = 0
threshold = 0.01
print('the threshold is %.2f%%' % (threshold * 100))
for i in range(num_elements):
    diff = np.abs(mean_vector_amp[0][i] - mean_vector_bp[0][i])
    if diff <= threshold:
        counter += 1
ratio = float(counter) / num_elements
print('the ratio is %.2f%%' % (ratio * 100))

# 3. calculate the correlation between the two vectors
cocoef_matrix = np.corrcoef(mean_array_amp, mean_array_bp)
cocoef = cocoef_matrix[0, 1]
print('the correlation coefficient is %0.3f' % cocoef)

# 4. Kruskal-Wallis test for a median difference between the two distributions
H, pvalue = kruskalwallis(mean_vector_amp[0], mean_vector_bp[0])
print('the p-value is %.2f' % pvalue)
if pvalue > 0.05:
    print("accept null hypothesis: no significant difference between two groups")
# -------------------------------------------------------------------
def average_pearson_score(x):
    if isinstance(x, DataFrame):
        x = x.values
    rho = corrcoef(x, rowvar=0)
    return mean(abs(rho[triu_indices_from(rho, 1)]))
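# Usage sketch for average_pearson_score on random data (the DataFrame here is an
# assumption): the score is the mean absolute off-diagonal correlation across columns.
import numpy as np
import pandas as pd

rng = np.random.default_rng(9)
df = pd.DataFrame(rng.normal(size=(200, 4)), columns=list("abcd"))
rho = np.corrcoef(df.values, rowvar=0)
print(np.mean(np.abs(rho[np.triu_indices_from(rho, 1)])))   # near 0 for independent columns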
def pearson_correlation(y_true, y_pred):
    return np.corrcoef(y_true, y_pred)[0][1]
def evaluate(self, y_true, y_pred, silent=False, auxiliary_metrics=False, detailed_report=True, high_always_good=False): """ Evaluate predictions. Args: silent (bool): Should we print which metric is being used as well as performance. auxiliary_metrics (bool): Should we compute other (problem_type specific) metrics in addition to the default metric? detailed_report (bool): Should we computed more-detailed versions of the auxiliary_metrics? (requires auxiliary_metrics=True). high_always_good (bool): If True, this means higher values of returned metric are ALWAYS superior (so metrics like MSE should be returned negated) Returns single performance-value if auxiliary_metrics=False. Otherwise returns dict where keys = metrics, values = performance along each metric. """ assert isinstance(y_true, (np.ndarray, pd.Series)) assert isinstance(y_pred, (np.ndarray, pd.Series)) # TODO: Enable DataFrame for y_pred_proba # TODO: Consider removing _remove_missing_labels, this creates an inconsistency between how .score, .score_debug, and .evaluate compute scores. y_true, y_pred = self._remove_missing_labels(y_true, y_pred) performance = self.eval_metric(y_true, y_pred) metric = self.eval_metric.name if not high_always_good: performance = performance * self.eval_metric._sign # flip negative once again back to positive (so higher is no longer necessarily better) if not silent: logger.log(20, f"Evaluation: {metric} on test data: {performance}") if not auxiliary_metrics: return performance # Otherwise compute auxiliary metrics: auxiliary_metrics = [] if self.problem_type == REGRESSION: # Adding regression metrics pearson_corr = lambda x, y: corrcoef(x, y)[0][1] pearson_corr.__name__ = 'pearson_correlation' auxiliary_metrics += [ mean_absolute_error, explained_variance_score, r2_score, pearson_corr, mean_squared_error, median_absolute_error, # max_error ] else: # Adding classification metrics auxiliary_metrics += [accuracy_score, balanced_accuracy_score, matthews_corrcoef] if self.problem_type == BINARY: # binary-specific metrics # def auc_score(y_true, y_pred): # TODO: this requires y_pred to be probability-scores # fpr, tpr, _ = roc_curve(y_true, y_pred, pos_label) # return auc(fpr, tpr) f1micro_score = lambda y_true, y_pred: f1_score(y_true, y_pred, average='micro') f1micro_score.__name__ = f1_score.__name__ auxiliary_metrics += [f1micro_score] # TODO: add auc? # elif self.problem_type == MULTICLASS: # multiclass metrics # auxiliary_metrics += [] # TODO: No multi-class specific metrics for now. Include top-5, top-10 accuracy here. 
performance_dict = OrderedDict({metric: performance}) for metric_function in auxiliary_metrics: if isinstance(metric_function, tuple): metric_function, metric_kwargs = metric_function else: metric_kwargs = None metric_name = metric_function.__name__ if metric_name not in performance_dict: try: # only compute auxiliary metrics which do not error (y_pred = class-probabilities may cause some metrics to error) if metric_kwargs: performance_dict[metric_name] = metric_function(y_true, y_pred, **metric_kwargs) else: performance_dict[metric_name] = metric_function(y_true, y_pred) except ValueError: pass if not silent: logger.log(20, "Evaluations on test data:") logger.log(20, json.dumps(performance_dict, indent=4)) if detailed_report and (self.problem_type != REGRESSION): # Construct confusion matrix try: performance_dict['confusion_matrix'] = confusion_matrix(y_true, y_pred, labels=self.label_cleaner.ordered_class_labels, output_format='pandas_dataframe') except ValueError: pass # One final set of metrics to report cl_metric = lambda y_true, y_pred: classification_report(y_true, y_pred, output_dict=True) metric_name = 'classification_report' if metric_name not in performance_dict: try: # only compute auxiliary metrics which do not error (y_pred = class-probabilities may cause some metrics to error) performance_dict[metric_name] = cl_metric(y_true, y_pred) except ValueError: pass if not silent and metric_name in performance_dict: logger.log(20, "Detailed (per-class) classification report:") logger.log(20, json.dumps(performance_dict[metric_name], indent=4)) return performance_dict
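# Minimal sketch on toy vectors (not AutoGluon's pipeline): the pearson_corr auxiliary
# metric above is just the off-diagonal of np.corrcoef between labels and predictions,
# reported alongside the usual regression error metrics.
import numpy as np
from sklearn.metrics import mean_absolute_error, r2_score

y_true = np.array([3.0, -0.5, 2.0, 7.0, 4.2])
y_pred = np.array([2.5, 0.0, 2.1, 7.8, 4.0])
performance = {
    'mean_absolute_error': mean_absolute_error(y_true, y_pred),
    'r2_score': r2_score(y_true, y_pred),
    'pearson_correlation': np.corrcoef(y_true, y_pred)[0][1],
}
print(performance)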