def CalcCorrelation(percentage, N, index):
    """Correlate per-query spread statistics with the TREC evaluation score.

    Calls ``CreateTempResFile(percentage, N)`` and
    ``getTrecEval(measure, index)`` first, then reads the module-level
    ``QueriesRes`` and ``Qterms`` mappings. Four predictors are correlated
    with each query's ``trecScore``: ``std``, ``MAD``, and each of them
    divided by the square root of the number of whitespace-separated terms
    of the query.

    Args:
        percentage: Forwarded to ``CreateTempResFile``.
        N: Forwarded to ``CreateTempResFile``.
        index: Forwarded to ``getTrecEval``.

    Returns:
        The tuple ``(std_p, std_s, std_n_p, std_n_s, mad_p, mad_s, mad_n_p,
        mad_n_s)``: ``_p`` is the Pearson and ``_s`` the Spearman
        coefficient, ``_n`` marks the length-normalised predictor.
    """
    CreateTempResFile(percentage, N)
    getTrecEval(measure, index)

    # One snapshot guarantees that every series lists the queries in the same
    # order; ``items()`` exists on Python 2 and 3, unlike ``iteritems()``.
    queries = list(QueriesRes.items())
    y = [res.trecScore for _, res in queries]
    # Square root of each query's term count, used for the normalised series.
    norms = [math.sqrt(len(Qterms[qnr].split())) for qnr, _ in queries]

    def correlate(values):
        """Return (Pearson r, Spearman rho) of ``values`` against ``y``."""
        return pearsonr(values, y)[0], spearmanr(values, y)[0]

    std = [res.std for _, res in queries]
    mad = [res.MAD for _, res in queries]

    std_p, std_s = correlate(std)
    std_n_p, std_n_s = correlate([v / n for v, n in zip(std, norms)])
    mad_p, mad_s = correlate(mad)
    mad_n_p, mad_n_s = correlate([v / n for v, n in zip(mad, norms)])

    results = (std_p, std_s, std_n_p, std_n_s,
               mad_p, mad_s, mad_n_p, mad_n_s)

    if debug:
        # Single-argument print calls behave the same on Python 2 and 3.
        print("N %s ---- Percentage %s" % (N, percentage))
        labels = (
            "std pearson ",
            "std spearman ",
            "std norm pearson ",
            "std norm spearman",
            "MAD pearson ",
            "MAD spearman ",
            "MAD norm pearson ",
            "MAD norm spearman",
        )
        for label, value in zip(labels, results):
            print("%s %s" % (label, value))

    return results
def plotSpecific(A, y):
    """Plot Spearman correlations between feature column 5 and build history.

    Two figures are shown one after the other:

    1. For change thresholds 0.00 .. 0.99, the correlation between
       "feature value > threshold" and the second series returned by
       ``getPrevNext(y, 10)`` and ``getPrevNext(y, 80)``.
    2. For window sizes 1 .. 249, the correlation between
       "feature value > 0.0" and the first series returned by
       ``getPrevNext(y, window)``.

    Args:
        A: 2-D array; only column 5 is read.
        y: Series handed to ``getPrevNext``.
    """
    feature_values = A[:, 5]
    feature_name = featureList[5]

    # ---- Figure 1: correlation as a function of the change threshold ----
    thresholds = np.arange(100) * 0.01
    _, next_10 = getPrevNext(y, 10)
    _, next_80 = getPrevNext(y, 80)

    corr_10 = []
    corr_80 = []
    for threshold in thresholds:
        changed = (feature_values > threshold).astype(int)
        # The leading entries are skipped to match the window size.
        corr_10.append(spearmanr(changed[10:], next_10[10:])[0])
        corr_80.append(spearmanr(changed[80:], next_80[80:])[0])

    line_10, = plt.plot(thresholds, corr_10, 'b')
    line_80, = plt.plot(thresholds, corr_80, 'r')
    plt.legend([line_10, line_80], ['Next 10 Builds', 'Next 80 Builds'], loc=1)
    plt.xlim([0, 1])
    plt.ylabel('Correlation')
    plt.xlabel('Change Threshold')
    plt.title('Spearman: ' + feature_name + ' vs Builds')
    plt.show()

    # ---- Figure 2: correlation as a function of the look-back window ----
    windows = np.arange(1, 250)
    changed = (feature_values > 0.0).astype(int)
    window_corr = []
    for window in windows:
        previous, _ = getPrevNext(y, window)
        window_corr.append(spearmanr(changed[window:], previous[window:])[0])

    plt.plot(windows, window_corr)
    plt.xlim([1, 250])
    plt.ylabel('Correlation')
    plt.xlabel('Previous Builds')
    plt.title('Spearman: ' + feature_name + ' at (0.0) vs Previous n Builds')
    plt.show()
def test_similarity_2(model, vocab):
    """Test the model for similarity.

    Method: get correlation between model similarity and similarity of items
    in the test set. This method is using data from Ruts et al. (2004).

    Args:
        model: Object exposing ``similarity(a, b)``.
        vocab: Set of items known to the model; pairs with an item outside
            this set are skipped and the missing items recorded.

    Returns:
        Dict keyed by category plus ``"overall"``. Each category entry holds
        ``"skipped"`` (set of items missing from ``vocab``),
        ``"pairs_tested"``, ``"pearsonr"`` and ``"spearmanr"``; the
        ``"overall"`` entry holds the last three computed over all categories
        together.
    """
    d = ruts_etal_similarity.get_similarity_dict()
    results = {category: {"skipped": set()} for category in d}
    pred_overall = []
    actual_overall = []
    for category in d:
        predicted_values = []
        actual_values = []
        for pair, score in d[category].items():
            if set(pair).issubset(vocab):
                predicted_values.append(model.similarity(*pair))
                actual_values.append(score)
            else:
                results[category]["skipped"].update(set(pair) - vocab)
        pred_overall += predicted_values
        actual_overall += actual_values
        results[category]["pairs_tested"] = len(predicted_values)
        results[category]["pearsonr"] = pearsonr(predicted_values, actual_values)
        results[category]["spearmanr"] = spearmanr(predicted_values, actual_values)
    results["overall"] = dict()
    # Count the pairs of all categories, not only those of the last one.
    results["overall"]["pairs_tested"] = len(pred_overall)
    results["overall"]["pearsonr"] = pearsonr(pred_overall, actual_overall)
    results["overall"]["spearmanr"] = spearmanr(pred_overall, actual_overall)
    return results
def calc_auc_on_flat_results(all_y_train, all_scores_train, all_test_real_tags, all_test_score_tags):
    """Score flattened train/test predictions with ROC AUC and Spearman rho.

    When any of the metric calls raises ``ValueError`` the function prints a
    classification report for train and test instead, and the two ``*_auc``
    return values are those report strings rather than numbers.

    Args:
        all_y_train: True training labels.
        all_scores_train: Predicted training scores.
        all_test_real_tags: True test labels.
        all_test_score_tags: Predicted test scores.

    Returns:
        ``(train_auc, test_auc, train_rho, test_rho)``.
    """
    try:
        test_auc = metrics.roc_auc_score(all_test_real_tags, all_test_score_tags)
        test_rho = stats.spearmanr(all_test_real_tags, all_test_score_tags)[0]
        train_auc = metrics.roc_auc_score(all_y_train, all_scores_train)
        train_rho = stats.spearmanr(all_y_train, all_scores_train)[0]

        print("summary-----------------------")
        summary = (
            ("test_auc", test_auc),
            ("test_rho", test_rho),
            ("train_auc", train_auc),
            ("train_rho", train_rho),
        )
        for label, value in summary:
            print(label + ": " + str(value))
    except ValueError:
        # Fallback path: report per-class metrics instead of a single AUC.
        print("train classification_report")
        train_auc = metrics.classification_report(all_y_train, all_scores_train)
        for report_line in train_auc.split("\n"):
            print(report_line)

        print("test classification_report")
        test_auc = metrics.classification_report(all_test_real_tags, all_test_score_tags)
        for report_line in test_auc.split("\n"):
            print(report_line)

        train_rho = stats.spearmanr(all_y_train, all_scores_train)[0]
        test_rho = stats.spearmanr(all_test_real_tags, all_test_score_tags)[0]
    return train_auc, test_auc, train_rho, test_rho
def CalcCorrelation(percentage, N, index):
    """Correlate per-query spread statistics with the TREC evaluation score.

    Calls ``CreateTempResFile(percentage, N)`` and
    ``getTrecEval(measure, index)`` first, then reads the module-level
    ``QueriesRes`` and ``Qterms`` mappings. Four predictors are correlated
    with each query's ``trecScore``: ``std``, ``MAD``, and each of them
    divided by the square root of the number of whitespace-separated terms
    of the query.

    Args:
        percentage: Forwarded to ``CreateTempResFile``.
        N: Forwarded to ``CreateTempResFile``.
        index: Forwarded to ``getTrecEval``.

    Returns:
        The tuple ``(std_p, std_s, std_n_p, std_n_s, mad_p, mad_s, mad_n_p,
        mad_n_s)``: ``_p`` is the Pearson and ``_s`` the Spearman
        coefficient, ``_n`` marks the length-normalised predictor.
    """
    CreateTempResFile(percentage, N)
    getTrecEval(measure, index)

    # One snapshot guarantees that every series lists the queries in the same
    # order; ``items()`` exists on Python 2 and 3, unlike ``iteritems()``.
    queries = list(QueriesRes.items())
    y = [res.trecScore for _, res in queries]
    # Square root of each query's term count, used for the normalised series.
    norms = [math.sqrt(len(Qterms[qnr].split())) for qnr, _ in queries]

    def correlate(values):
        """Return (Pearson r, Spearman rho) of ``values`` against ``y``."""
        return pearsonr(values, y)[0], spearmanr(values, y)[0]

    std = [res.std for _, res in queries]
    mad = [res.MAD for _, res in queries]

    std_p, std_s = correlate(std)
    std_n_p, std_n_s = correlate([v / n for v, n in zip(std, norms)])
    mad_p, mad_s = correlate(mad)
    mad_n_p, mad_n_s = correlate([v / n for v, n in zip(mad, norms)])

    results = (std_p, std_s, std_n_p, std_n_s,
               mad_p, mad_s, mad_n_p, mad_n_s)

    if debug:
        # Single-argument print calls behave the same on Python 2 and 3.
        print("N %s ---- Percentage %s" % (N, percentage))
        labels = (
            "std pearson ",
            "std spearman ",
            "std norm pearson ",
            "std norm spearman",
            "MAD pearson ",
            "MAD spearman ",
            "MAD norm pearson ",
            "MAD norm spearman",
        )
        for label, value in zip(labels, results):
            print("%s %s" % (label, value))

    return results
def eval(m, tok, task_name):
    """Evaluate a similarity model on a tab-separated term-pair file.

    Every non-blank line of ``task_name`` must hold ``term_a<TAB>term_b<TAB>score``.
    The Spearman correlation between the model's similarities and the scores
    is printed; nothing is returned.

    NOTE: the name shadows the builtin ``eval``; it is kept because callers
    use it.

    Args:
        m: The model. With ``tok`` it is passed to ``get_simlarity_bert``;
            without it, it is passed to ``get_simlarity`` together with the
            vector dimension inferred from it.
        tok: Tokenizer, or ``None`` to use the static-embedding path.
        task_name: Path of the UTF-8 evaluation file.
    """
    with open(task_name, "r", encoding="utf-8") as f:
        # Blank lines (e.g. a trailing newline) carry no pair and are skipped.
        rows = [line.strip().split("\t") for line in f if line.strip()]

    ys = [float(row[2]) for row in rows]
    first_terms = [row[0] for row in rows]
    second_terms = [row[1] for row in rows]

    if tok is not None:
        preds_cls, preds_mean = get_simlarity_bert(first_terms, second_terms, m, tok)
        c_cls, p_cls = spearmanr(preds_cls, ys)
        print(task_name, "CLS", c_cls, p_cls)
        c_mean, p_mean = spearmanr(preds_mean, ys)
        print(task_name, "MEAN", c_mean, p_mean)
        return

    try:
        # Mapping of term -> vector: use the length of any stored vector.
        # ``next(iter(...))`` is needed because dict views cannot be indexed
        # on Python 3.
        dim = next(iter(m.values())).shape[0]
    except Exception:
        # Not such a mapping (or empty): use ``vector_size`` when the model
        # exposes it, otherwise fall back to 300.
        dim = getattr(m, "vector_size", 300)

    preds = get_simlarity(first_terms, second_terms, m, dim)
    c, p = spearmanr(preds, ys)
    print(task_name, c, p)
def correlation(human_df, vectors, corpus):
    """Correlate human similarity judgements with predicted facet similarities.

    For every row of ``human_df`` the prediction is the first value of the
    row's ``Facet`` column in the frame returned by
    ``get_facet_sims_of_books(vectors, corpus, ID_A, ID_B)``.

    Args:
        human_df: DataFrame with the columns ``ID_A``, ``ID_B``, ``Facet``
            and ``Similarity``.
        vectors: Forwarded to ``get_facet_sims_of_books``.
        corpus: Forwarded to ``get_facet_sims_of_books``.

    Returns:
        ``(complete_correlation, facet_correlation)``: the Spearman result
        over all rows, and a dict mapping each facet to the Spearman result
        over that facet's rows.
    """
    human_scores = []
    predicted_scores = []
    human_by_facet = defaultdict(list)
    predicted_by_facet = defaultdict(list)

    for _, judgement in human_df.iterrows():
        sims = get_facet_sims_of_books(vectors, corpus, judgement['ID_A'], judgement['ID_B'])
        human_score = judgement["Similarity"]
        facet = judgement["Facet"]
        predicted_score = sims[facet].values[0]

        human_by_facet[facet].append(human_score)
        predicted_by_facet[facet].append(predicted_score)
        human_scores.append(human_score)
        predicted_scores.append(predicted_score)

    # noinspection PyTypeChecker
    complete_correlation = stats.spearmanr(human_scores, predicted_scores)
    # noinspection PyTypeChecker
    facet_correlation = {
        facet: stats.spearmanr(human_by_facet[facet], predicted_by_facet[facet])
        for facet in human_by_facet
    }
    return complete_correlation, facet_correlation
def gen_input_goatools(filename, infos, exps, cutoff, q_info, q_exp, gos):
    """Select entries correlated with a query and run GO enrichment on them.

    Each ``(info, exp)`` pair other than the query is compared with ``q_exp``
    by Spearman correlation. If ``filename`` contains ``"positive"`` the pair
    is kept when the coefficient is ``>= cutoff``; otherwise, if it contains
    ``"negative"``, when it is ``<= cutoff``. The ids returned by
    ``get_pro_id`` for the kept entries are written to ``filename``, the
    script at ``args.goatools_path`` is run on that file, and the first
    column of every output row whose third column is ``"e"`` (after the
    header row starting with ``"GO"``) is collected. Both temporary files
    are removed afterwards.

    Args:
        filename: Path of the temporary study file; its name selects the mode.
        infos: Descriptions of the entries, parallel to ``exps``.
        exps: Value series of the entries.
        cutoff: Correlation threshold.
        q_info: Description of the query entry, which is excluded.
        q_exp: Value series of the query entry.
        gos: Mapping of id -> GO list; an id matches when it occurs in ``info``.

    Returns:
        ``(datas, ids, pro_gos, enrichs, ccs)``: the kept series, their
        descriptions, their GO lists (``"NA"`` when no id matched), the
        enriched terms with ``"."`` removed, and the coefficients formatted
        with five decimals.
    """
    # "positive" takes precedence when both words occur in the file name.
    positive = "positive" in filename
    negative = (not positive) and ("negative" in filename)

    datas = []
    ids = []
    pro_gos = []
    ccs = []
    with open(filename, "w") as out:
        for info, exp in zip(infos, exps):
            if info != q_info and (positive or negative):
                # Compute the coefficient once and reuse it below.
                cc = float(spearmanr(exp, q_exp)[0])
                keep = (cc >= cutoff) if positive else (cc <= cutoff)
                if not keep:
                    continue
                pro_id = get_pro_id(info)
                if pro_id is not None:
                    out.write(pro_id + "\n")
                # GO list of the first known id contained in the description.
                pro_gos.append(next(
                    (go_list for known_id, go_list in gos.items()
                     if known_id in info),
                    "NA"))
                datas.append(exp)
                ids.append(info)
                ccs.append("{0:.5f}".format(cc))

    go_filename = filename + "_go"
    with open(go_filename, "w") as out_go:
        call(["python3", args.goatools_path, "--pval=0.05", "--indent",
              "--obo=" + args.obo_file, filename, args.population_file,
              args.go_association], stdout=out_go)

    enrichs = []
    with open(go_filename, "r") as fh:
        start = False
        for row in csv.reader(fh, delimiter='\t'):
            if not row:
                # An empty line has no columns to inspect.
                continue
            if start:
                if row[2] == "e":
                    enrichs.append(row[0].replace(".", ""))
            if row[0] == "GO":
                start = True

    os.remove(filename)
    os.remove(go_filename)
    return datas, ids, pro_gos, enrichs, ccs
def plot_correlation(regr, name, X, y, loo):
    """Cross-validate a regressor and plot predicted against true values.

    The estimator is refitted on every training split produced by
    ``loo.split(X)`` and predicts the matching test split. R-squared, Pearson
    and Spearman statistics of the collected predictions are printed and
    appended to the figure title. The figure shows the predictions as a
    scatter plus a least-squares line through them.

    Args:
        regr: Estimator exposing ``fit`` and ``predict``.
        name: Title prefix.
        X: Feature array, indexable by the split indices.
        y: Target array, indexable by the split indices.
        loo: Splitter exposing ``split(X)``.
    """
    # Fit on the full data first; every fold below refits the same estimator.
    regr.fit(X, y)

    y_pred, y_true = [], []
    for train_index, test_index in loo.split(X):
        regr.fit(X[train_index], y[train_index])
        y_pred.append(np.squeeze(regr.predict(X[test_index])))
        y_true.append(np.squeeze(y[test_index]))
    y_pred = np.array(y_pred)
    y_true = np.array(y_true)

    # Each statistic is computed once and reused for printing and the title.
    r2_value = metrics.r2_score(y_true, y_pred)
    pearson_result = pearsonr(y_true, y_pred)
    spearman_result = spearmanr(y_true, y_pred)
    print('R-squared:', r2_value)
    print('Pearson:', pearson_result)
    print(spearman_result, '\n')

    r2 = round(r2_value, 4)
    pearson = round(pearson_result[0], 4)
    spearman = round(spearman_result[0], 4)

    trace_1 = go.Scatter(
        x=y_true,
        y=y_pred,
        mode='markers',
        name='Scatter',
        marker=dict(size=12, opacity=0.5)
    )

    # Separate model for the trend line so the caller's estimator name is
    # not rebound.
    trend = linear_model.LinearRegression()
    trend.fit(y_true.reshape(-1, 1), y_pred.reshape(-1, 1))
    trend_pred = trend.predict(y_true.reshape(-1, 1))
    trace_2 = go.Scatter(
        x=y_true,
        y=np.squeeze(trend_pred),
        name='Regression',
        mode='lines',
        line=dict(width=4)
    )

    name += 'R-squared: ' + str(r2) + \
            ', Pearson: ' + str(pearson) + \
            ', Spearman: ' + str(spearman)
    layout = go.Layout(
        title=name,
        width=650,
        yaxis=dict(title='Predicted'),
        xaxis=dict(title='Breteau index'),
        font=dict(size=16)
    )
    fig = go.Figure(data=[trace_1, trace_2], layout=layout)
    iplot(fig)
def spearman(set_1, set_2, onlyfound=False):
    """Return the Spearman rank correlation coefficient of two sequences.

    Args:
        set_1: First sequence of values.
        set_2: Second sequence of values, paired with ``set_1`` by position.
        onlyfound: When true, positions where either value equals ``-1`` are
            dropped before correlating.

    Returns:
        The correlation coefficient (first element of ``spearmanr``'s result).
    """
    if not onlyfound:
        return spearmanr(set_1, set_2)[0]

    found_pairs = [
        (left, right)
        for left, right in zip(set_1, set_2)
        if left != -1 and right != -1
    ]
    kept_1 = [left for left, _ in found_pairs]
    kept_2 = [right for _, right in found_pairs]
    return spearmanr(kept_1, kept_2)[0]
def outputResults(out1_epsilon, out2_epsilon, kernel, train_lt, test_lt):
    """Write the epsilon-SVR predictions to disk and print their correlations.

    Args:
        out1_epsilon: Predictions for the training set; written to
            ``TRAINPREDICTIONSEPSILONFILENAME``.
        out2_epsilon: Predictions for the validation set; written to
            ``VALIDATIONPREDICTIONSEPSILONFILENAME``.
        kernel: Not used by this function.
        train_lt: Training labels.
        test_lt: Validation labels.
    """
    # Output the results to the appropriate output files
    writeFloatList(out1_epsilon, TRAINPREDICTIONSEPSILONFILENAME)
    writeFloatList(out2_epsilon, VALIDATIONPREDICTIONSEPSILONFILENAME)

    # Single-argument print calls behave the same on Python 2 and 3.
    print("Pearson correlation between training labels and predictions, epsilon SVR:")
    print(pearsonr(train_lt, out1_epsilon))
    print("Spearman correlation between training labels and predictions, epsilon SVR:")
    print(spearmanr(train_lt, out1_epsilon))
    print("Pearson correlation between validation labels and predictions, epsilon SVR:")
    print(pearsonr(test_lt, out2_epsilon))
    print("Spearman correlation between validation labels and predictions, epsilon SVR:")
    print(spearmanr(test_lt, out2_epsilon))
def find_spearman_score(y, pred_y):
    """Return the Spearman correlation between targets and predictions.

    Args:
        y: True values, 1-D or 2-D.
        pred_y: Predicted values with the same layout as ``y``.

    Returns:
        For 2-D ``y``: the mean of the column-wise coefficients, ignoring
        columns whose coefficient is NaN, or NaN when every column is NaN.
        Otherwise: the coefficient of ``y`` against ``pred_y``.
    """
    if np.ndim(y) != 2:
        return stats.spearmanr(y, pred_y)[0]

    column_corrs = [
        stats.spearmanr(y[:, i], pred_y[:, i])[0]
        for i in range(pred_y.shape[1])
    ]
    valid_corrs = [corr for corr in column_corrs if not np.isnan(corr)]
    if not valid_corrs:
        # No usable column: report NaN instead of dividing by zero.
        return float("nan")
    return sum(valid_corrs) / len(valid_corrs)
def main(kdts_path, drop_zeros):
    """Print correlations between kernel distances and non-determinism level.

    Args:
        kdts_path: Path of the pickled kernel-distance data.
        drop_zeros: When true, zero distances are removed from every distance
            list except the first one.
    """
    # Read in kdts data.
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(kdts_path, "rb") as infile:
        slice_idx_to_data = pkl.load(infile)

    # Reduce to flat lists of distances, one list per non-determinism level
    gk = ('wlst', 'logical_time', 5)
    slice_indices = sorted(slice_idx_to_data.keys())
    nd_fraction_labels = [0, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    flat_dists_seq = get_distances_seq(slice_idx_to_data, slice_indices, gk)

    if drop_zeros:
        # Index 0 is deliberately left untouched.
        for level in range(1, len(flat_dists_seq)):
            flat_dists_seq[level] = [
                dist for dist in flat_dists_seq[level] if dist != 0
            ]

    # Associate each kernel distance with the non-determinism fraction of the
    # runs its generating graphs represent
    labelled_dists = [
        (fraction, dist)
        for level, fraction in enumerate(nd_fraction_labels)
        for dist in flat_dists_seq[level]
    ]
    nd_fraction_seq = [fraction for fraction, _ in labelled_dists]
    dist_seq = [dist for _, dist in labelled_dists]

    pearson_r, pearson_p = pearsonr(nd_fraction_seq, dist_seq)
    spearman_r, spearman_p = spearmanr(nd_fraction_seq, dist_seq)
    print("Kernel distance vs. % ND --> Pearson-R = {}, p = {}".format(
        pearson_r, pearson_p))
    print("Kernel distance vs. % ND --> Spearman-R = {}, p = {}".format(
        spearman_r, spearman_p))

    # Same analysis on one summary statistic per non-determinism level
    all_stats_seq = get_stats_seq(flat_dists_seq)
    for stat in ["mean", "median", "max", "variance"]:
        stats_seq = [level_stats[stat] for level_stats in all_stats_seq]
        pearson_r, pearson_p = pearsonr(nd_fraction_labels, stats_seq)
        spearman_r, spearman_p = spearmanr(nd_fraction_labels, stats_seq)
        print("Kernel distance {} vs. % ND --> Pearson-R = {}, p = {}".format(
            stat, pearson_r, pearson_p))
        print("Kernel distance {} vs. % ND --> Spearman-R = {}, p = {}".format(
            stat, spearman_r, spearman_p))
def calc_auc_on_joined_results(Cross_validation, y_trains, y_train_preds, y_tests, y_test_preds):
    """Join the results of all folds and score them with AUC and Spearman rho.

    The labels and predictions of the first ``Cross_validation`` folds are
    concatenated. When a metric call raises ``ValueError`` the function
    prints classification reports instead, and the two ``*_auc`` return
    values are those report strings rather than numbers.

    Args:
        Cross_validation: Number of folds to join.
        y_trains: Per-fold true training labels.
        y_train_preds: Per-fold training predictions.
        y_tests: Per-fold true test labels.
        y_test_preds: Per-fold test predictions.

    Returns:
        ``(all_y_train, all_predictions_train, all_test_real_tags,
        all_test_pred_tags, train_auc, test_auc, train_rho, test_rho)``;
        ``all_y_train`` is a 1-D array, the other joined series are lists.
    """
    folds = range(Cross_validation)

    # Concatenate the fold labels. Adding a list to an ndarray with ``+``
    # would add element-wise instead of joining.
    all_y_train = np.array(
        [label for i in folds for label in np.ravel(y_trains[i])])
    all_predictions_train = [pred for i in folds for pred in y_train_preds[i]]
    all_test_real_tags = [tag for i in folds for tag in y_tests[i]]
    all_test_pred_tags = [pred for i in folds for pred in y_test_preds[i]]

    try:
        train_auc = metrics.roc_auc_score(all_y_train, all_predictions_train)
        test_auc = metrics.roc_auc_score(all_test_real_tags, all_test_pred_tags)
        train_rho, pval_train = stats.spearmanr(
            all_y_train, np.array(all_predictions_train))
        test_rho, p_value = stats.spearmanr(
            all_test_real_tags, np.array(all_test_pred_tags))
    except ValueError:
        # Fallback path: report per-class metrics instead of a single AUC.
        print("train classification_report")
        train_auc = metrics.classification_report(all_y_train, all_predictions_train)
        for row in train_auc.split("\n"):
            print(row)
        print("test classification_report")
        test_auc = metrics.classification_report(all_test_real_tags, all_test_pred_tags)
        for row in test_auc.split("\n"):
            print(row)
        train_rho, pval_train = stats.spearmanr(
            all_y_train, np.array(all_predictions_train))
        test_rho, p_value = stats.spearmanr(
            all_test_real_tags, np.array(all_test_pred_tags))

    return all_y_train, all_predictions_train, all_test_real_tags, all_test_pred_tags,\
        train_auc, test_auc, train_rho, test_rho
def getStatistics(A, y):
    """Print correlation statistics between every feature column and ``y``.

    For each column of ``A`` the report contains the Pearson correlation with
    ``y``, per change threshold the number of samples above the threshold,
    the Spearman correlation of that indicator with ``y`` and two conditional
    failure ratios, and per window size the correlations with the two series
    returned by ``getPrevNext(y, window)``.

    Args:
        A: 2-D feature array, one column per entry of ``featureList``.
        y: Outcome array; non-zero entries are counted as failures.
    """
    window_sizes = [2, 3, 5, 10]
    thresholds = [0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.2, 0.5]
    banner = '#' * 150

    for feature in range(A.shape[1]):
        print('\n')
        print(banner)
        print(featureList[feature])
        samples = A[:, feature]
        print('M vs Out ' + str(pearsonr(samples, y)))

        for threshold in thresholds:
            changed = (A[:, feature] > threshold).astype(int)
            n_changed = (changed == 1).sum()
            print('Changes over Threshold {}: {}'.format(threshold, n_changed))
            if n_changed > 0:
                print('Ch ({}) vs Out : {}'.format(threshold, spearmanr(changed, y)))
                # Samples that changed and also failed.
                fails_if_change = sum(
                    1 for i in range(len(changed))
                    if changed[i] == 1 and y[i] != 0
                )
                n_failed = (y != 0).sum()
                print('P(fail | change): {}/{} = {}'.format(
                    fails_if_change, n_changed, fails_if_change / n_changed))
                print('P(change | fail): {}/{} = {}'.format(
                    fails_if_change, n_failed, fails_if_change / n_failed))

        for window in window_sizes:
            before, after = getPrevNext(y, window)
            print('M vs Bef ({}): {}'.format(
                window, pearsonr(samples[window:], before[window:])))
            print('M vs Nxt ({}): {}'.format(
                window, pearsonr(samples[window:], after[window:])))
            for threshold in thresholds:
                changed = (A[:, feature] > threshold).astype(int)
                if (changed == 1).sum() > 0:
                    print('Ch ({}) vs Bef ({}): {}'.format(
                        threshold, window,
                        spearmanr(changed[window:], before[window:])))
                    print('Ch ({}) vs Nxt ({}): {}'.format(
                        threshold, window,
                        spearmanr(changed[window:], after[window:])))
        print(banner)
def npccf(x, y, method="spearmanr", min_lag=-10, max_lag=10):
    """
    Nonparametric cross correlation of two equally long time series.

    For every lag in [min_lag, max_lag] the rank correlation
    r(lag) = corr(x[t-lag], y[t]) is computed.

    Parameters
    ----------
    x: time series
    y: time series
    method: "spearmanr" or "kendalltau"
    min_lag : int, default -10
    max_lag : int, default 10

    Returns
    ----------
    a dictionary with keys "corrs" (correlation coefficient for each lag),
    "lags" (the lags themselves), "lb" (lower bound, -1/sqrt(n) repeated per
    lag) and "ub" (upper bound, 1/sqrt(n) repeated per lag).
    """
    n1 = len(x)
    n2 = len(y)
    assert (n1 == n2), "The length of time series x and time series y must be equal!"
    assert (min_lag <= max_lag), "min_lag must less than or equal to max_lag!"

    lags = range(min_lag, (max_lag + 1))
    nlags = max_lag - min_lag + 1
    corrs = np.empty(nlags)

    if method == "spearmanr":
        rank_corr = spearmanr
    elif method == "kendalltau":
        rank_corr = kendalltau
    else:
        raise ValueError("The method %s is not supported." % method)

    for position, lag in enumerate(lags):
        if lag < 0:
            # Negative lag: drop the head of x and the tail of y.
            x_part, y_part = x[(-lag):], y[:lag]
        elif lag > 0:
            # Positive lag: drop the tail of x and the head of y.
            x_part, y_part = x[:(-lag)], y[lag:]
        else:
            x_part, y_part = x, y
        corrs[position] = rank_corr(x_part, y_part)[0]

    bound = 1 / np.sqrt(n1)
    return {
        "corrs": corrs,
        "lags": lags,
        "lb": np.repeat(-bound, nlags),
        "ub": np.repeat(bound, nlags)
    }
def optimize_dist(nf, optimize=1):
    """Correlate pairwise stimulus distances with classification accuracies.

    For every target pair in the module-level ``pair_list2`` the samples of
    both targets are selected from ``ds`` and their distance is computed with
    ``distance_funcs``. The distances are then Spearman-correlated with
    ``val_accs`` (``optimize == 1``) or ``test_accs`` (``optimize == 0``).

    Relies on the names ``ds``, ``pair_list2``, ``distance_funcs``, ``dist``,
    ``val_accs`` and ``test_accs`` from the enclosing scope.

    Args:
        nf: Number of features. Currently unused: the feature-selection step
            that consumed it is disabled.
        optimize: ``1`` to return a value to minimise, ``0`` to return the
            evaluation result.

    Returns:
        ``1 - corr`` when ``optimize == 1``; ``(corr, pval, dist_vec)`` when
        ``optimize == 0``, where ``dist_vec`` holds one distance per pair.

    Raises:
        ValueError: If ``optimize`` is neither 0 nor 1.
    """
    if optimize not in (0, 1):
        raise ValueError("optimize must be 0 or 1, got %r" % (optimize,))

    dist_vec = []  # one distance per target pair
    for pair in pair_list2:
        ds_stim1 = ds[ds.targets == pair[0]]
        ds_stim2 = ds[ds.targets == pair[1]]
        dist_vec.append(
            distance_funcs(np.mean(ds_stim1, axis=0),
                           np.mean(ds_stim2, axis=0),
                           ds.samples, ds_stim1, ds_stim2, dist))

    if optimize == 1:
        corr_test = spearmanr(val_accs, dist_vec)
    else:
        corr_test = spearmanr(test_accs, dist_vec)
    corr = corr_test[0]
    pval = corr_test[1]

    if optimize == 1:
        # An optimiser minimises, so a high correlation gives a low value.
        return 1 - corr
    return corr, pval, dist_vec
def checarspearman_paralelo():
    """Print and return Spearman correlations between paired exam tables.

    Each pair of module-level tables is compared at the time points H4, H8,
    H12 and H24, correlating the same column of both tables after converting
    it to float.

    NOTE(review): results with p > 0.05 are the ones flagged with "***";
    confirm that marking the non-significant rows is intended.

    Returns:
        List with one ``spearmanr`` result per pair and time point, in
        printing order.
    """
    exam_pairs = [
        [rbc_gel, rbc_amb],
        [muc_amb, muc_gel],
        [caoxd_amb, caoxd_gel],
        [hya_amb, hya_gel],
        [bac_amb, bac_gel],
        [pat_amb, pat_gel],
        [wbc_amb, wbc_gel],
        [epi_amb, epi_gel],
        [tri_amb, tri_gel],
        [uri_amb, uri_gel],
        [yea_amb, yea_gel],
        [amo_amb, amo_gel],
    ]
    time_points = ("H4", "H8", "H12", "H24")
    report = []

    # Report header
    print(___l)
    print()
    print(" RELATÓRIO DE CORRELAÇÕES (SPEARMAN) PARALELO")
    print()
    print(___l)

    for first_exam, second_exam in exam_pairs:
        print()
        print("RESULTADOS DE", list(Series(first_exam["EXAME"]))[0], ":")
        for time_point in time_points:
            result = spearmanr(np.array(first_exam[time_point].map(float)),
                               np.array(second_exam[time_point].map(float)))
            report.append(result)
            flag = "***" if result[1] > 0.05 else ""
            print("Entre", time_point, "e", time_point, ":", result[0],
                  "(Spearman ρ), e ", result[1], "(valor de p)", flag)
    print(___l)
    return report
def correlations_ground_truth():
    """Correlate PageRank values with the article counts and pickle the result.

    PageRank is computed on the loaded network for the damping factors 0.8
    and 0.9 and stored as vertex property ``page_rank_<damping>``. For the
    articles listed in ``TMP + 'article_counts.tsv'`` the PageRank values are
    correlated with the counts (Pearson, Spearman, Kendall). All results are
    written to ``HOME + 'output/correlations/correlations_pagerank.obj'``.
    """
    print('ground truth')
    #load network
    wikipedia = load_graph("output/weightedpagerank/wikipedianetwork_hyp_engineering.xml.gz")
    #read counts with zeros
    article_counts = pd.read_csv(TMP+'article_counts.tsv', sep='\t')

    # Counts and vertex ids do not depend on the damping factor, so the
    # table is scanned only once.
    counts = []
    target_ids = []
    for _, row in article_counts.iterrows():
        counts.append(float(row['counts']))
        target_ids.append(int(row['target_article_id']))

    cor = {}
    for damping in [0.8, 0.9]:
        page_rank = pagerank(wikipedia, damping=damping)
        wikipedia.vertex_properties['page_rank_'+str(damping)] = page_rank
        page_rank_values = [page_rank[wikipedia.vertex(target_id)]
                            for target_id in target_ids]

        correlations_values = {}
        # Single-argument print calls behave the same on Python 2 and 3.
        print('pearson')
        p = pearsonr(page_rank_values, counts)
        print(p)
        correlations_values['pearson'] = p
        print('spearmanr')
        s = spearmanr(page_rank_values, counts)
        print(s)
        correlations_values['spearmanr'] = s
        print('kendalltau')
        k = kendalltau(page_rank_values, counts)
        print(k)
        correlations_values['kendalltau'] = k
        cor['page_rank_'+str(damping)] = correlations_values

    write_pickle(HOME+'output/correlations/correlations_pagerank.obj', cor)
def correlations_weighted_unweighted(labels):
    """Correlate weighted with unweighted PageRank values and pickle the result.

    For every label and every damping factor in (0.8, 0.85, 0.9) the vertex
    property ``<label>_page_rank_weighted_<damping>`` of the weighted network
    is correlated (Pearson, Spearman, Kendall) with the vertex property
    ``page_rank<damping>`` of the unweighted network.

    NOTE(review): the unweighted key has no underscore before the damping
    factor; confirm this matches the property names stored in the unweighted
    graph file.

    Args:
        labels: Weighting labels; joined with ``_`` they form the suffix of
            the weighted graph file and of the output file.
    """
    print('weighted vs unweighted')
    name = '_'.join(labels)
    # load the weighted network
    wikipedia = load_graph("output/weightedpagerank/wikipedianetwork_hyp_engineering_"+name+".xml.gz")
    # load the unweighted network
    wikipedia_u = load_graph("output/weightedpagerank/wikipedianetwork_sem_sim_distinct_links.xml.gz")

    correlations_weighted_pagerank = {}
    for label in labels:
        for damping in [0.8, 0.85, 0.9]:
            correlations_values = {}
            key_weighted = label+"_page_rank_weighted_"+str(damping)
            pagerank_weighted = wikipedia.vertex_properties[key_weighted]
            key_unweighted = "page_rank"+str(damping)
            pagerank_unweighted = wikipedia_u.vertex_properties[key_unweighted]

            # Single-argument print calls behave the same on Python 2 and 3.
            print('pearson')
            p = pearsonr(pagerank_weighted.a, pagerank_unweighted.a)
            print(p)
            correlations_values['pearson'] = p
            print('spearmanr')
            s = spearmanr(pagerank_weighted.a, pagerank_unweighted.a)
            print(s)
            correlations_values['spearmanr'] = s
            print('kendalltau')
            k = kendalltau(pagerank_weighted.a, pagerank_unweighted.a)
            print(k)
            correlations_values['kendalltau'] = k
            correlations_weighted_pagerank[label+str(damping)] = correlations_values

    write_pickle(HOME+'output/correlations/correlations_pagerank_weightedvsunweighted'+name+'.obj', correlations_weighted_pagerank)
def control_followers(db, ids, feats, data, repins, dataset):
    """Print feature/repin Spearman p-values per follower-count group.

    The pins are split into groups by follower count using fixed boundaries.
    For every feature and group, ``spearmanr`` is run between the feature
    column and the repin counts of that group.

    NOTE(review): the value stored and printed is element ``[1]`` of the
    ``spearmanr`` result, i.e. the p-value, although the array is named
    ``corrs``; confirm that the p-value is what is wanted.

    Args:
        db: Object exposing ``get_pins_info(ids)``; element ``[2]`` of each
            pin's info is used as the follower count.
        ids: Pin ids, in the row order of ``data`` and ``repins``.
        feats: Feature names, one per column of ``data``.
        data: 2-D array of feature values.
        repins: 1-D array of repin counts.
        dataset: Not used by this function.
    """
    divs = [1, 200, 600, 1700, 4700, 13000, 1000000]
    pins = db.get_pins_info(ids)
    followers = np.asarray([pins[pid][2] for pid in ids])

    groups = []
    for low, high in zip(divs[:-1], divs[1:]):
        # Row indices of the pins whose follower count is in [low, high).
        g = np.nonzero((followers >= low) & (followers < high))[0]
        groups.append(g)
        print("%d < followers < %d (%d)\t" % (low, high, len(g)))

    corrs = np.ones((len(feats), len(groups)), float)
    for i in range(len(feats)):
        print("Feature:", feats[i])
        for j in range(len(groups)):
            _data = data[groups[j], i]
            _repins = repins[groups[j]]
            corrs[i, j] = spearmanr(_data, _repins)[1]
            # Keep all groups of one feature on a single output line.
            print(corrs[i, j], end=" ")
def mono_bin(Y, X, n = 20):
    """Bin ``X`` into quantile buckets with a monotonic mean of ``Y``.

    Starting with ``n`` buckets, the bucket count is lowered by one until the
    Spearman correlation between the bucket means of ``X`` and ``Y`` has an
    absolute value of at least 1 (or is NaN). Weight of evidence and
    information value are then computed per bucket, and the per-bucket table
    is printed.

    Args:
        Y: Series of outcomes; its sum is used as the "good" count.
        X: Series of the variable to bin.
        n: Initial number of buckets.

    Returns:
        ``(iv, cut, woe)``: the information value, the cut points (starting
        with ``-inf`` and ending with ``inf``, rounded to 4 decimals) and the
        bucket WOE values rounded to 3 decimals, ordered by bucket minimum.
    """
    good = Y.sum()
    bad = Y.count() - good

    rho = 0
    while np.abs(rho) < 1:
        binned = pd.DataFrame({
            "X": X,
            "Y": Y,
            "Bucket": pd.qcut(X.rank(method='first'), n),
        })
        buckets = binned.groupby('Bucket', as_index = True)
        bucket_means = buckets.mean()
        rho, _ = stats.spearmanr(bucket_means.X, bucket_means.Y)
        # After the loop ``n`` is one less than the bucket count in use.
        n = n - 1

    table = pd.DataFrame(buckets.X.min(), columns = ['min'])
    table['min'] = buckets.min().X
    table['max'] = buckets.max().X
    table['sum'] = buckets.sum().Y
    table['total'] = buckets.count().Y
    table['rate'] = buckets.mean().Y
    odds = table['rate'] / (1 - table['rate'])
    table['woe'] = np.log(odds / (good / bad))
    table['goodattribute'] = table['sum'] / good
    table['badattribute'] = (table['total'] - table['sum']) / bad
    iv = ((table['goodattribute'] - table['badattribute']) * table['woe']).sum()

    ordered = (table.sort_values(by='min')).reset_index(drop=True)
    print("=" * 60)
    print(ordered)

    inner_cuts = [round(X.quantile(i / (n + 1)), 4) for i in range(1, n + 1)]
    cut = [float('-inf')] + inner_cuts + [float('inf')]
    woe = list(ordered['woe'].round(3))
    return iv,cut,woe
def get_performance_simple(d, cutoff=CUTOFF_AFFINITY_LOG):
    """Summarise prediction quality for measured vs. predicted affinities.

    Args:
        d: Mapping with ``'y'`` (measured affinities) and ``'ypred'``
            (predicted affinities). The original note gives the unit as
            log10(ic50 nM).
        cutoff: Measurements below this value are labelled as binders.

    Returns:
        ``(pearson, aroc, rmsd)``: the Pearson coefficient of measured vs.
        predicted values, the first element of ``ROC(pred, labels)`` and the
        result of ``get_rmsd(meas, pred)``.
    """
    meas = d['y']
    pred = d['ypred']
    labels = [x < cutoff for x in meas]  # 1 = binder; 0 = nonbinder

    # Get AUC:
    auc_model = ROC(pred, labels)
    # Get Pearson's correlation:
    cor_pearson = pearsonr(meas, pred)
    rmsd = get_rmsd(meas, pred)
    return (cor_pearson[0], auc_model[0], rmsd)
def get_bigrams_for_feature(word_feature, neus, capacity):
    """Select bigram features by rank correlation with the target values.

    The ``capacity`` most frequent bigrams are taken from ``word_feature``.
    Each one is scored by the Spearman rank correlation between its counts
    and ``neus``; bigrams with a NaN coefficient are dropped. The remaining
    bigrams are ordered by decreasing absolute coefficient and cut to the
    module-level ``bigram_capacity``.

    Args:
        word_feature: Object exposing ``get_top_bigrams(capacity)`` and
            ``get_feauture_by_bigram(bigram)``.
        neus: Target values, one per entry of a bigram's count series.
        capacity: Number of most frequent bigrams to consider.

    Returns:
        List of the selected bigrams, strongest correlation first.
    """
    top_bigrams = word_feature.get_top_bigrams(capacity)
    # %-formatted single-argument prints behave the same on Python 2 and 3.
    print('most frequent bigrams:  %s' % (top_bigrams,))

    candicate_map = dict()
    for bigram in top_bigrams:
        bg_counts = word_feature.get_feauture_by_bigram(bigram)
        p = spearmanr(bg_counts, neus)
        if not math.isnan(p[0]):
            candicate_map[bigram] = p

    # The sort is stable, so equally strong bigrams keep the mapping order.
    ranked_bigrams = sorted(
        candicate_map,
        key=lambda candidate: abs(candicate_map[candidate][0]),
        reverse=True)
    selected_bigrams = ranked_bigrams[:bigram_capacity]

    print('======== selected bigrams for feature ========')
    for selected_bigram in selected_bigrams:
        print('%s : %s' % (selected_bigram, candicate_map.get(selected_bigram)))
    return selected_bigrams
def get_unigrams_for_feature(word_feature, neus, capacity):
    """Select unigram features by rank correlation with the target values.

    The ``capacity`` most frequent unigrams are taken from ``word_feature``.
    Each one is scored by the Spearman rank correlation between its counts
    and ``neus``; unigrams with a NaN coefficient are dropped. The remaining
    unigrams are ordered by decreasing absolute coefficient and cut to the
    module-level ``unigram_capacity``.

    Args:
        word_feature: Object exposing ``get_top_unigrams(capacity)`` and
            ``get_feauture_by_unigram(unigram)``.
        neus: Target values, one per entry of a unigram's count series.
        capacity: Number of most frequent unigrams to consider.

    Returns:
        List of the selected unigrams, strongest correlation first.
    """
    top_unigrams = word_feature.get_top_unigrams(capacity)
    # %-formatted single-argument prints behave the same on Python 2 and 3.
    print('most frequent unigrams:  %s' % (top_unigrams,))

    candicate_map = dict()
    for unigram in top_unigrams:
        w_counts = word_feature.get_feauture_by_unigram(unigram)
        p = spearmanr(w_counts, neus)
        if not math.isnan(p[0]):
            candicate_map[unigram] = p

    # The sort is stable, so equally strong unigrams keep the mapping order.
    ranked_unigrams = sorted(
        candicate_map,
        key=lambda candidate: abs(candicate_map[candidate][0]),
        reverse=True)
    selected_unigrams = ranked_unigrams[:unigram_capacity]

    print('======== selected unigrams for feature ========')
    for selected_unigram in selected_unigrams:
        print('%s : %s' % (selected_unigram, candicate_map.get(selected_unigram)))
    return selected_unigrams
def get_metrics(self, y, yhat, name):
    """Return error and correlation metrics for one model's predictions.

    Args:
        y: Observed values, either 1-D or a single column.
        yhat: Predicted values with the same layout as ``y``.
        name: Model name stored under ``"model"``.

    Returns:
        Dict with the keys ``"lat"``, ``"lon"`` (copied from the instance),
        ``"model"``, ``"mse"`` (from ``self.compute_mse``), ``"pearson"``,
        ``"kendall"`` and ``"spearman"``.
    """
    import numpy as np

    mse = self.compute_mse(y, yhat)

    # Flatten so the correlations see plain vectors, whether the inputs are
    # 1-D or column arrays.
    y_flat = np.ravel(y)
    yhat_flat = np.ravel(yhat)
    # The coefficient may come back as a scalar or a one-element array;
    # ``ravel(...)[0]`` extracts the number in both cases.
    pearson = np.ravel(pearsonr(y_flat, yhat_flat)[0])[0]
    kendall = kendalltau(y_flat, yhat_flat)[0]
    spearman = spearmanr(y_flat, yhat_flat)[0]
    return {"lat": self.lat,
            "lon": self.lon,
            "model": name,
            "mse": mse,
            "pearson": pearson,
            "kendall": kendall,
            "spearman": spearman}
def correlationBetweenAllFeaturesAndMOS(self):
    """Show a bar chart of PLCC and SROCC between column 0 and every feature.

    Column 0 of ``self.datafortrain`` (presumably the MOS, going by the
    method name) is correlated with each remaining column using Pearson
    (PLCC) and Spearman (SROCC). The chart has one pair of bars per feature,
    labelled ``f1``, ``f2``, ...
    """
    n_features = self.datafortrain.shape[1] - 1
    reference = self.datafortrain[:, 0]

    PLCC = []
    SROCC = []
    for i in range(n_features):
        feature = self.datafortrain[:, i + 1]
        PLCC.append(statstool.pearsonr(reference, feature)[0])
        SROCC.append(statstool.spearmanr(reference, feature)[0])

    # One bar group per feature; the count follows the data, not a constant.
    ind = np.arange(n_features)  # the x locations for the groups
    width = 0.35  # the width of the bars
    fig, ax = plt.subplots()

    ax.set_xlabel(u'特征序号', fontsize=18)  # "feature index"
    ax.set_ylabel(u'相关系数', fontsize=18)  # "correlation coefficient"
    rects1 = ax.bar(ind, tuple(PLCC), width, color='r')
    rects2 = ax.bar(ind + width, tuple(SROCC), width, color='b')
    ax.set_xticks(ind)
    ax.set_xticklabels(['f%d' % (k + 1) for k in range(n_features)])
    ax.legend((rects1[0], rects2[0]), ('PLCC', 'SROCC'))
    plt.legend(loc='center right')
    plt.show()
def test_goodness(model, vocab):
    """Tests the model on its ability to create a goodness ranking for a category.

    Method: get spearman (rank) correlation between the predicted and the
    actual ranking. This method is using data from De Deyne et al. (2008).

    Args:
        model: Object exposing ``similarity(a, b)``.
        vocab: Set of items known to the model. Only categories and
            exemplars in this set are evaluated.

    Returns:
        Dict keyed by category plus ``"overall"``. Evaluated categories hold
        ``"spearman"``, ``"kendall"`` and ``"num_items"``; categories outside
        ``vocab`` keep an empty dict. ``"overall"`` holds ``"avg_spearman"``
        and ``"avg_kendall"``, the means of the absolute coefficients over
        the evaluated categories (NaN when no category was evaluated).
    """
    d = dedeyne_etal_goodness.get_goodness_rankings()
    results = {category: dict() for category in d}
    categories = set(d.keys()) & vocab
    for category in categories:
        exemplars = set(d[category]) & vocab

        # Predicted rank: position when sorted by decreasing similarity.
        by_similarity = sorted(
            ((model.similarity(category, ex), ex) for ex in exemplars),
            reverse=True)
        predicted_rank = {
            ex: rank for rank, (_, ex) in enumerate(by_similarity)
        }
        # Actual rank: position of the first occurrence in the gold list.
        actual_rank = {}
        for rank, ex in enumerate(d[category]):
            actual_rank.setdefault(ex, rank)

        predicted_ranking = []
        actual_ranking = []
        for exemplar in exemplars:
            actual_ranking.append(actual_rank[exemplar])
            predicted_ranking.append(predicted_rank[exemplar])

        results[category]["spearman"] = spearmanr(predicted_ranking, actual_ranking)
        results[category]["kendall"] = kendalltau(predicted_ranking, actual_ranking)
        results[category]["num_items"] = len(exemplars)

    if categories:
        avg_spearman = float(sum(abs(results[cat]["spearman"][0]) for cat in categories)) / len(categories)
        avg_kendall = float(sum(abs(results[cat]["kendall"][0]) for cat in categories)) / len(categories)
    else:
        # Nothing to average; avoid dividing by zero.
        avg_spearman = float("nan")
        avg_kendall = float("nan")

    results["overall"] = dict()
    results["overall"]["avg_spearman"] = avg_spearman
    results["overall"]["avg_kendall"] = avg_kendall
    return results
def spearman_with_errors(x, y, yerr, Nmc=1000, plotflag=False, verbose=False):
    """Monte-Carlo estimate of the Spearman correlation under errors in ``y``.

    In each of ``Nmc`` iterations ``y`` is resampled from a normal
    distribution centred on ``y`` with scale ``yerr`` and correlated with
    ``x``. Summary statistics of the simulated coefficients are printed.

    Args:
        x: First variable.
        y: Second variable, same length as ``x``.
        yerr: Standard deviation(s) of ``y`` used for resampling.
        Nmc: Number of Monte-Carlo iterations.
        plotflag: When true, histograms of the simulated coefficients and of
            log10 of the p-values are drawn.
        verbose: Not used by this function.

    Returns:
        ``(rhosim, psim)``: float32 arrays with the simulated coefficients
        and p-values.
    """
    rhosim = np.zeros(Nmc, 'f')
    psim = np.zeros(Nmc, 'f')
    for i in range(Nmc):
        ysim = np.random.normal(y, scale=yerr, size=len(y))
        rhosim[i], psim[i] = spearmanr(x, ysim)

    cave = np.mean(rhosim)
    cstd = np.std(rhosim)
    q1 = 50 - 34  # median minus one std (16th percentile)
    lower = np.percentile(rhosim, q1)
    q2 = 50 + 34  # median plus one std (84th percentile)
    upper = np.percentile(rhosim, q2)

    # %-formatted single-argument prints behave the same on Python 2 and 3.
    print('mean (median) = %5.2f (%5.2f), std = %5.2f' % (
        cave, np.median(rhosim), cstd))
    print('confidence interval from sorted list of MC fit values:')
    print('lower = %5.2f (%5.2f), upper = %5.2f (%5.2f)' % (
        lower, cave - cstd, upper, cave + cstd))
    k, pnorm = normaltest(rhosim)
    print('probability that distribution of slopes is normal = %5.2f' % (pnorm))

    if plotflag:
        plt.figure(figsize=(10, 4))
        plt.subplot(1, 2, 1)
        # ``density`` replaces the ``normed`` keyword removed from Matplotlib.
        plt.hist(rhosim, bins=10, density=True)
        plt.xlabel(r'$Spearman \ \rho $')
        plt.axvline(x=cave, ls='-', color='k')
        plt.axvline(x=lower, ls='--', color='k')
        plt.axvline(x=upper, ls='--', color='k')
        plt.subplot(1, 2, 2)
        plt.hist(np.log10(psim), bins=10, density=True)
        plt.xlabel(r'$\log_{10}(p \ value)$')
    return rhosim, psim
def third_order_poly_fit_plot (x, y, outname, yerror):
    """Fit a cubic polynomial to ``(x, y)`` and save an annotated plot.

    The figure shows the input points with error bars and the fitted curve.
    The title reports the squared residuals, and a text box lists the fitted
    parameters, their errors and the Pearson/Spearman correlation between
    fitted and input values.

    Args:
        x: x values of the data points.
        y: y values of the data points.
        outname: Path the figure is saved to.
        yerror: Error bar sizes for ``y``.
    """
    def cubic(t, p1, p2, p3, p4):
        return p1 + p2 * t + p3 * t**2 + p4 * t**3

    def as_text(values):
        # Two-decimal rendering of every entry.
        return ["%3.2f" % value for value in values]

    xdata = np.array(x)
    ydata = np.array(y)
    ymedian = np.median(y)
    x_dense = np.arange(1, max(x), 0.001)

    popt, pcov, infodict, mesg, ier = curve_fit(
        cubic, xdata, ydata, p0=(1, 1, 1, 1), full_output=1)
    y_dense = [cubic(t, *popt) for t in x_dense]

    # Fit quality figures
    residuals = sum(infodict['fvec']**2)
    perr = np.sqrt(np.diag(pcov))
    perr_percent = [np.fabs(err / param) for err, param in zip(perr, popt)]
    avg_percent_error = np.mean(perr_percent)
    total_percent_error = sum(perr_percent)
    weighted_perr = sum(
        [pct * np.fabs(param) for pct, param in zip(perr_percent, popt)])
    fitted_at_x = [cubic(t, *popt) for t in x]
    prsn = pearsonr(fitted_at_x, y)[0]
    sprmn = spearmanr(fitted_at_x, y)[0]

    fig, ax = plt.subplots()
    plt.errorbar(x, y, marker='x', yerr=yerror, ls="None")
    plt.plot(x_dense, y_dense)
    plt.axis([.5, 9.5, 0, max( max(y), max(y_dense) ) + 1])
    ax.set_xticklabels(['', '80S', 'poly2', 'poly3', 'poly4', 'poly5',
                        'poly6', 'poly7', 'poly8', 'cyto'])
    plt.legend(['Input', 'Third order polynomial'])
    plt.title("Sq. Resid.: %5.4f; Res/Median: %5.4f" % (residuals, residuals/ymedian))
    plt.text(.75, 1, "Parms %s\nerrors %s\n%% error %s\nmean %%: %3.2f sum %%: %3.2f weighted %%: %3.2f pearson: %3.2f spearman: %3.2f" % (as_text(popt), as_text(perr), as_text(perr_percent), avg_percent_error, total_percent_error, weighted_perr, prsn, sprmn), fontsize=8)
    plt.savefig(outname)
    plt.close(fig)
def compare_models(m1, tl1, m2, tl2):
    """Measure how well two models agree on pairwise tag distances.

    For every pair of tags present in both ``tl1`` and ``tl2`` the value of
    ``pairwise_distances(..., metric='cosine')`` between the tags' rows is
    computed in each model; the two resulting series are rank-correlated.

    Returns:
        Dict with ``"correlation"`` and ``"significance"`` from ``spearmanr``
        and ``"differences"``: ``(abs difference, "tagA tagB")`` tuples
        sorted in descending order.
    """
    def cosine(u, v):
        return float(pairwise_distances(u, v, metric='cosine'))

    # Only tags known to both models can be compared.
    shared_tags = set(tl1) & set(tl2)
    rows_m1 = {tag: row for row, tag in enumerate(tl1)}
    rows_m2 = {tag: row for row, tag in enumerate(tl2)}

    m1_values, m2_values, differences = [], [], []
    for a, b in combinations(shared_tags, 2):
        first = cosine(m1[rows_m1[a]], m1[rows_m1[b]])
        second = cosine(m2[rows_m2[a]], m2[rows_m2[b]])
        m1_values.append(first)
        m2_values.append(second)
        differences.append((abs(first - second), a + ' ' + b))

    correlation, sig = spearmanr(m1_values, m2_values)
    differences.sort(reverse=True)
    return {
        "correlation": correlation,
        "significance": sig,
        "differences": differences
    }
def evaluate_word2vec(model, measure='men'):
    """Score ``model`` against the similarity data set named ``measure``.

    Returns ``None`` when ``measure`` is not a key of ``__resource__``.
    Otherwise returns a dict with the Spearman ``"correlation"`` between the
    reference scores and ``model.similarity``, its square (``"explained"``),
    the ``"significance"``, the number of ``"test_pairs"`` and the
    ``"predictions"`` keyed by word pair.
    """
    try:
        sim_dict = __resource__[measure]
    except KeyError:
        return None

    # Keep only the pairs whose words are both in the model vocabulary.
    sim_words = {word for pair in sim_dict for word in pair}
    usable_words = sim_words.intersection(model.vocab.keys())
    usable_pairs = {pair for pair in sim_dict if set(pair) <= usable_words}

    actual_values = [sim_dict[(a, b)] for a, b in usable_pairs]
    predicted_values = [model.similarity(a, b) for a, b in usable_pairs]

    correlation, sig = spearmanr(actual_values, predicted_values)
    return {
        "correlation": correlation,
        "explained": correlation * correlation,
        "significance": sig,
        "test_pairs": len(usable_pairs),
        "predictions": dict(zip(usable_pairs, predicted_values))
    }
def get_unigrams_for_feature(word_feature, neus, capacity):
    """Select unigram features by Spearman rank correlation.

    The ``capacity`` most frequent unigrams are taken from ``word_feature``;
    each one's counts are rank-correlated with ``neus``.  Unigrams whose
    coefficient is NaN are dropped, the rest are ordered by decreasing
    absolute coefficient (ties keep their original order) and at most
    ``unigram_capacity`` of them are returned.

    Args:
        word_feature: Object providing ``get_top_unigrams`` and
            ``get_feauture_by_unigram``.
        neus: Target values correlated against each unigram's counts.
        capacity: How many frequent unigrams to consider.

    Returns:
        List of selected unigrams, strongest correlation first.
    """
    top_unigrams = word_feature.get_top_unigrams(capacity)
    print('most frequent unigrams:  %s' % (top_unigrams,))

    candicate_map = dict()
    for unigram in top_unigrams:
        w_counts = word_feature.get_feauture_by_unigram(unigram)
        p = spearmanr(w_counts, neus)
        if not math.isnan(p[0]):
            candicate_map[unigram] = p

    # A stable descending sort replaces the hand-written insertion sort.
    ranked = sorted(candicate_map,
                    key=lambda unigram: abs(candicate_map[unigram][0]),
                    reverse=True)
    # NOTE(review): the cap comes from the module-level ``unigram_capacity``,
    # not from the ``capacity`` argument -- confirm this is intended.
    selected_unigrams = ranked[:max(unigram_capacity, 0)] if ranked else []

    print('======== selected unigrams for feature ========')
    for selected_unigram in selected_unigrams:
        print('%s : %s' % (selected_unigram,
                           candicate_map.get(selected_unigram)))
    return selected_unigrams
def evaluate(self, embs, data):
    """Rank-correlate embedding dot products with reference similarities.

    Each item of ``data`` is ``((x, y), sim)``.  Both words are lower-cased.
    When both are known to ``embs`` and their dot product is not NaN the
    pair counts as found; otherwise, unless ``self.ignore_oov`` is set, the
    pair is scored 0.

    Returns:
        ``(rho, found_pairs, details)``.  When two or fewer pairs were
        scored the result is ``(-1, found_pairs, [])``.
    """
    details = []
    results = []
    cnt_found_pairs_total = 0
    for (x, y), sim in data:
        x = x.lower()
        y = y.lower()

        # Compute the dot product once; None marks "not usable".
        v = None
        if embs.has_word(x) and embs.has_word(y):
            v = embs.get_vector(x).dot(embs.get_vector(y))
            if math.isnan(v):
                v = None

        if v is not None:
            results.append((v, sim))
            cnt_found_pairs_total += 1
            details.append([x, y, float(v), float(sim)])
        elif not self.ignore_oov:
            # Out-of-vocabulary (or NaN) pairs are scored as 0.
            results.append((0, sim))
            details.append([x, y, str(0), str(sim)])

    if len(results) <= 2:
        return -1, cnt_found_pairs_total, []
    actual, expected = zip(*results)
    return spearmanr(actual, expected)[0], cnt_found_pairs_total, details
def calculateMetrics(obs, mod, obsStart, obsEnd, modStart, modEnd, obsStep, modStep, modTimes):
    """Align observed and modelled series and compute agreement metrics.

    The series are matched with ``matchSeriesMonth`` when either step is
    greater than 1 and with ``matchSeriesDay`` when both steps are at most 1.
    Only positions where neither series is NaN are evaluated.

    Returns:
        ``(R, AC, KGE, CC, Alpha, Beta, NS, RMSE, Bias, numPoints)``, or
        ``np.zeros((10))`` when the series, before or after NaN removal, do
        not hold more than ``timeSize`` values.
    """
    use_monthly = obsStep > 1 or modStep > 1
    use_daily = obsStep <= 1 and modStep <= 1
    if use_monthly:
        obs, mod, plotTimes = matchSeriesMonth(obs, mod, obsStart, obsEnd,
                                               modStart, modEnd, modTimes)
    if use_daily:
        obs, mod, plotTimes = matchSeriesDay(obs, mod, obsStart, obsEnd,
                                             modStart, modEnd, modTimes)

    # Positions where both series hold a real value.
    valid = np.logical_not(np.isnan(obs)) & np.logical_not(np.isnan(mod))

    if not (len(obs) > timeSize and len(mod) > timeSize):
        return np.zeros((10))

    obs_valid = obs[valid]
    mod_valid = mod[valid]
    if not (len(obs_valid) > timeSize and len(mod_valid) > timeSize):
        return np.zeros((10))

    R = spearmanr(obs_valid, mod_valid)[0]
    NS = nashSutcliffe(obs_valid, mod_valid)
    RMSE = rmse(obs_valid, mod_valid)
    Bias, numPoints = bias(obs_valid, mod_valid)
    KGE, CC, Alpha, Beta = kge(obs_valid, mod_valid)
    AC = anomalyCorrelation(obs_valid, mod_valid)
    return R, AC, KGE, CC, Alpha, Beta, NS, RMSE, Bias, numPoints
def spearman_with_errors(x,y,yerr,Nmc=1000,plotflag=False,verbose=False):
    """Monte Carlo estimate of the Spearman correlation when ``y`` has errors.

    ``y`` is redrawn ``Nmc`` times from a normal distribution centred on
    ``y`` with scale ``yerr``; every realisation is correlated with ``x``
    using ``spearmanr``.  A summary of the simulated coefficients is printed.

    Args:
        x: Independent values.
        y: Dependent values (centres of the normal draws).
        yerr: Scale passed to ``np.random.normal``.
        Nmc: Number of Monte Carlo realisations.
        plotflag: When true, draw histograms of rho and of log10(p).
        verbose: Accepted for interface compatibility; not used.

    Returns:
        Tuple ``(rhosim, psim)`` of float32 arrays of length ``Nmc`` holding
        the simulated correlation coefficients and p-values.
    """
    rhosim = np.zeros(Nmc, 'f')
    psim = np.zeros(Nmc, 'f')
    for i in range(Nmc):
        ysim = np.random.normal(y, scale=yerr, size=len(y))
        rhosim[i], psim[i] = spearmanr(x, ysim)

    cave = np.mean(rhosim)
    cstd = np.std(rhosim)
    q1 = 50 - 34  # median minus one sigma (16th percentile)
    lower = np.percentile(rhosim, q1)
    q2 = 50 + 34  # median plus one sigma (84th percentile)
    upper = np.percentile(rhosim, q2)

    # Single-argument print calls produce identical text on Python 2 and 3.
    print('mean (median) = %5.2f (%5.2f), std = %5.2f' % (
        cave, np.median(rhosim), cstd))
    print('confidence interval from sorted list of MC fit values:')
    print('lower = %5.2f (%5.2f), upper = %5.2f (%5.2f)' % (
        lower, cave - cstd, upper, cave + cstd))
    k, pnorm = normaltest(rhosim)
    print('probability that distribution of slopes is normal = %5.2f' % (pnorm))

    if plotflag:
        plt.figure(figsize=(10, 4))
        plt.subplot(1, 2, 1)
        # ``density`` replaces the ``normed`` keyword removed from matplotlib.
        plt.hist(rhosim, bins=10, density=True)
        plt.xlabel(r'$Spearman \ \rho $')
        plt.axvline(x=cave, ls='-', color='k')
        plt.axvline(x=lower, ls='--', color='k')
        plt.axvline(x=upper, ls='--', color='k')
        plt.subplot(1, 2, 2)
        plt.hist(np.log10(psim), bins=10, density=True)
        plt.xlabel(r'$\log_{10}(p \ value)$')
    return rhosim, psim
def get_ranking_correlations(grp):
    """Spearman correlation of content rankings for every pair of platforms.

    ``grp`` is pivoted to one row per platform and one column per content
    item (missing cells become 0).  Each unordered pair from ``platforms``
    whose members both appear in the pivot index is correlated.

    Returns:
        DataFrame with columns ``platform1``, ``platform2`` and ``value``.
    """
    table = pd.pivot_table(grp,
                           index=self.platform_col,
                           values='value',
                           columns=self.content_col).fillna(0)
    firsts = []
    seconds = []
    rhos = []
    for position, first in enumerate(platforms):
        if first not in table.index:
            continue
        for second in platforms[position + 1:]:
            if second not in table.index:
                continue
            rho = spearmanr(table.loc[first].values,
                            table.loc[second].values)[0]
            firsts.append(first)
            seconds.append(second)
            rhos.append(rho)
    return pd.DataFrame({
        'platform1': firsts,
        'platform2': seconds,
        'value': rhos
    })
def mono_bin(Y, X, n=20):
    """Bin ``X`` into quantile buckets whose mean ``Y`` is monotonic in ``X``.

    Starting from ``n`` quantile buckets, the bucket count is reduced by one
    until the Spearman correlation between per-bucket mean ``X`` and mean
    ``Y`` has absolute value 1.  Weight of evidence (WoE) and information
    value (IV) are then computed for the final buckets.

    Args:
        Y: pandas Series of 0/1 outcomes; its sum is treated as "good".
        X: pandas Series of the numeric variable to bin.
        n: Initial number of quantile buckets.

    Returns:
        ``(d4, iv, cut, woe)``: the per-bucket summary sorted by ``min``,
        the information value, the cut points (bounded by -inf and +inf)
        and the per-bucket WoE rounded to 3 decimals.
    """
    r = 0
    good = Y.sum()
    bad = Y.count() - good
    # Rank correlations over a handful of buckets take discrete values, so a
    # tiny tolerance only guards against round-off in a perfect +/-1.
    while np.abs(r) < 1 - 1e-9:
        # pd.qcut picks bucket edges so that buckets are equally populated.
        d1 = pd.DataFrame({"X": X, "Y": Y, "Bucket": pd.qcut(X, n)})
        d2 = d1.groupby('Bucket', as_index=True)
        # Rank correlation between bucket means measures monotonicity.
        r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
        n = n - 1

    # After the loop ``n`` is one less than the number of buckets used.
    d3 = pd.DataFrame({'min': d2.min().X})
    d3['max'] = d2.max().X
    d3['sum'] = d2.sum().Y
    d3['total'] = d2.count().Y
    d3['rate'] = d2.mean().Y
    d3['woe'] = np.log((d3['rate'] / (1 - d3['rate'])) / (good / bad))
    d3['goodattribute'] = d3['sum'] / good
    d3['badattribute'] = (d3['total'] - d3['sum']) / bad
    iv = ((d3['goodattribute'] - d3['badattribute']) * d3['woe']).sum()
    # ``sort_index(by=...)`` no longer exists in pandas; sort_values is the
    # supported spelling.
    d4 = d3.sort_values(by='min')
    print("=" * 60)
    print(d4)

    cut = [float('-inf')]
    for i in range(1, n + 1):
        # float() keeps the quantile fractional under Python 2 as well.
        qua = X.quantile(i / float(n + 1))
        cut.append(round(qua, 4))
    cut.append(float('inf'))
    woe = list(d4['woe'].round(3))
    return d4, iv, cut, woe
def per_class_scatter(fold_dict, metadata, stat_name, md_name, md_dict,
                      param_val=0.1, smooth=True, avg=False,
                      axis_label_size=14, legend_title_size=14,
                      legend_label_size=12, title_size=16, save_file=None):
    """Scatter a per-class statistic against a per-class metadata value.

    Only the entries stored under epoch key ``'5'`` whose parameter value
    equals ``param_val`` are used.  Pearson and Spearman correlations of the
    collected points are printed, and the scatter plot (log x axis) is saved
    under ``FIG_DIR`` when ``save_file`` is given.
    """
    points = []
    for epoch_num in sorted(fold_dict):
        if epoch_num != '5':
            continue
        folds = fold_dict[epoch_num]
        for fold in folds:
            for param_val_i, stat_list in folds[fold]:
                if param_val_i != param_val:
                    continue
                for cls_idx, stat_val in enumerate(stat_list):
                    cls = metadata['idx_map'][str(cls_idx)]
                    points.append((md_dict[str(fold)][cls][md_name], stat_val))

    print('Num points: {}'.format(len(points)))
    # Row 0 holds the metadata values, row 1 the statistic.
    arr = np.array(points).T
    cc, pv = pearsonr(*arr)
    sc, sv = spearmanr(*arr)
    print('Pearson Correlation: {:.3f}, {:.5f}'.format(cc, pv))
    print('Spearman Correlation: {:.3f}, {:.5f}'.format(sc, sv))

    plt.figure()
    plt.scatter(*arr, edgecolors='black')
    plt.xscale('log')
    heading = '{} vs. {} for 35 Classes'.format(stat_name,
                                                metric_title_dict[md_name])
    plt.title(heading, fontsize=title_size)
    plt.xlabel('{}'.format(metric_axis_dict[md_name]), fontsize=axis_label_size)
    plt.ylabel('{}'.format(stat_name), fontsize=axis_label_size)

    if save_file is None:
        return
    fig_path = os.path.join(FIG_DIR, save_file)
    plt.savefig(fig_path, bbox_inches="tight", format='png', dpi=300)
def per_peak_scatter(fold_dict, metadata, param_name, md_name, md_dict, smooth=True, avg=False):
    """Scatter, per class, the parameter value at which the statistic peaks.

    For epoch key ``'5'`` and every fold except the last one in sorted
    order, the entries are sorted by parameter value and, for each class,
    the parameter with the highest statistic is paired with the class's
    ``md_name`` metadata.  Pearson and Spearman correlations are printed and
    a scatter plot with a log x axis is drawn.
    """
    points = []
    for epoch_num in sorted(fold_dict):
        if epoch_num != '5':
            continue
        folds = fold_dict[epoch_num]
        for fold in sorted(folds)[:-1]:
            ordered = sorted(folds[fold], key=lambda entry: entry[0])
            columns = list(zip(*ordered))
            param_arr = np.array(columns[0])
            stat_arr = np.array(columns[1])
            # Index of the best-scoring entry, taken along axis 0.
            best_params = param_arr[np.argmax(stat_arr, axis=0)]
            for cls_idx, best_param in enumerate(best_params):
                cls = metadata['idx_map'][str(cls_idx)]
                points.append((md_dict[str(fold)][cls][md_name], best_param))

    print('Num points: {}'.format(len(points)))
    arr = np.array(points).T
    cc, pv = pearsonr(*arr)
    sc, sv = spearmanr(*arr)
    print('Pearson Correlation: {:.3f}, {:.5f}'.format(cc, pv))
    print('Spearman Correlation: {:.3f}, {:.5f}'.format(sc, sv))

    plt.figure()
    plt.scatter(*arr)
    plt.xscale('log')
    plt.title('{} vs. {} for 35 Classes'.format(param_name, md_name))
    plt.xlabel('{}'.format(md_name))
    plt.ylabel('{}'.format(param_name))
def compare_localness_lenses():
    """Compare editor-based and source-based localness per project/country.

    For each of ``EDITOR_COUNTS`` and ``SOURCE_COUNTS`` the rows are grouped
    by ``(project, article_country)``; localness is the share of the count
    whose ``other_country`` equals ``article_country``.  Keys with more than
    100 total counts in both lenses are correlated, and the results are
    printed.
    """
    # ``scipy.stats.stats`` is a deprecated alias of the public namespace.
    from scipy.stats import pearsonr, spearmanr

    # key -> [(editor share, editor total), (source share, source total)]
    localness = collections.defaultdict(lambda: [(0, 0), (0, 0)])
    for (i, path) in enumerate([EDITOR_COUNTS, SOURCE_COUNTS]):
        totals = collections.defaultdict(int)
        # Named ``local_counts`` so the builtin ``locals`` is not shadowed.
        local_counts = collections.defaultdict(int)
        for row in sg_open_csvr(path):
            c = int(row['count'])
            key = (row['project'], row['article_country'])
            totals[key] += c
            if row['article_country'] == row['other_country']:
                local_counts[key] += c
        # NOTE(review): keys without any local count keep the (0, 0)
        # default and are therefore filtered out below -- confirm intended.
        for key in local_counts:
            localness[key][i] = (1.0 * local_counts[key] / totals[key],
                                 totals[key])

    X = []
    Y = []
    for ((editor_p, editor_n), (source_p, source_n)) in localness.values():
        if editor_n > 100 and source_n > 100:
            X.append(editor_p)
            Y.append(source_p)

    # Single-argument prints give the same text on Python 2 and 3.
    print('n =  %s' % (len(X),))
    print('spearman %s' % (spearmanr(X, Y),))
    print('pearson %s' % (pearsonr(X, Y),))
    higher = sum(1 for (x, y) in zip(X, Y) if y > x)
    print('num where source locality is higher:  %s' % (higher,))
def evaluate_model(matrix, feature_names, measure='men'):
    """Score a feature matrix against the similarity data set ``measure``.

    Returns ``None`` when ``measure`` is not a key of ``__resource__``.
    Otherwise each usable word pair is scored with
    ``pairwise_distances(..., metric='cosine')`` on the words' rows of
    ``matrix`` and the scores are rank-correlated with the reference values.
    The returned dict holds ``"correlation"``, ``"explained"`` (its square),
    ``"significance"``, ``"test_pairs"`` and ``"predictions"``.
    """
    try:
        sim_dict = __resource__[measure]
    except KeyError:
        return None

    def cosine(u, v):
        return float(pairwise_distances(u, v, metric='cosine'))

    # A pair is usable only when both of its words are feature names.
    sim_words = {word for pair in sim_dict for word in pair}
    usable_words = sim_words.intersection(feature_names)
    usable_pairs = {pair for pair in sim_dict if set(pair) <= usable_words}

    row_of = {name: row for row, name in enumerate(feature_names)}
    actual_values = [sim_dict[(a, b)] for a, b in usable_pairs]
    predicted_values = [cosine(matrix[row_of[a]], matrix[row_of[b]])
                        for a, b in usable_pairs]

    correlation, sig = spearmanr(actual_values, predicted_values)
    return {
        "correlation": correlation,
        "explained": correlation * correlation,
        "significance": sig,
        "test_pairs": len(usable_pairs),
        "predictions": dict(zip(usable_pairs, predicted_values))
    }
def correlation(X, Y):
    """Correlate ``X`` with ``Y`` over the positions where ``X`` is not None.

    The minimum and maximum of the retained ``X`` values are printed.

    Returns:
        Tuple ``(spearmanr result, pearsonr result)``.

    Raises:
        IndexError: if ``Y`` is shorter than a retained position of ``X``.
        ValueError: if no element of ``X`` is retained (``min`` of an empty
            sequence).
    """
    # Identity test rather than ``!= None``; Y is indexed (not zipped) so a
    # too-short Y still raises instead of being silently truncated.
    kept = [(value, Y[i]) for i, value in enumerate(X) if value is not None]
    x1 = [pair[0] for pair in kept]
    x2 = [pair[1] for pair in kept]
    print(min(x1), max(x1))
    return spearmanr(x1, x2), pearsonr(x1, x2)
def dist_to_string(self, judgements):
    """Render context vectors and similarities for word-pair judgements.

    Each non-blank judgement is ``"word1,word2,human_score"``.  For both
    words the highest-valued context entries (at most 10, and the same
    number for both words) are listed, followed by the cosine similarity of
    the two value lists.  The last line reports the Spearman correlation
    between the cosine similarities and the human scores.

    Returns:
        All report lines joined with newlines.
    """
    def strongest(context, limit):
        # Highest value first; ties broken by ascending key.
        return sorted(context.items(), key=lambda kv: (-kv[1], kv[0]))[:limit]

    def describe(word, entries):
        parts = ['%s: %i' % (self.word2idx[k], v) for k, v in entries]
        return word + " " + " ".join(parts)

    report = []
    human_scores = []
    cosine_scores = []
    for judgement in judgements:
        if not judgement.strip():
            continue
        fields = judgement.split(",")
        word_1 = fields[0]
        id_1 = self.get_word_id(word_1)
        word_2 = fields[1]
        id_2 = self.get_word_id(word_2)
        human_scores.append(float(fields[2]))

        context_1 = self.distributional_model[id_1]
        context_2 = self.distributional_model[id_2]
        # Both words are cut to the same length, never more than 10.
        limit = min(10, len(context_1), len(context_2))
        top_1 = strongest(context_1, limit)
        top_2 = strongest(context_2, limit)
        report.append(describe(word_1, top_1))
        report.append(describe(word_2, top_2))

        values_1 = [value for _, value in top_1]
        values_2 = [value for _, value in top_2]
        similarity = 1 - spatial.distance.cosine(values_1, values_2)
        cosine_scores.append(similarity)
        report.append(word_1 + "," + word_2 + ":" + str(similarity))

    rho = spearmanr(cosine_scores, human_scores)[0]
    report.append("correlation:" + str(rho))
    return "\n".join(report)
def generate_statistics(self, data_x, data_e, data_t, name, session_dict, save=True):
    """Compute and print evaluation statistics for one data split.

    Calls ``predict_concordance_index`` and ``median_predict_time``,
    integrates the scores returned by ``get_scores`` over a 100-point time
    grid and prints loss, CI, IBS and IBLL for the split ``name``.

    Side effects:
        ``name == 'Train'`` stores the median predictions in
        ``self.predicted_time_train``; ``'Valid'`` stores the cost in
        ``self.val_loss``; ``'Test'`` stores ``self.ctd``, ``self.ibs`` and
        ``self.nbll``.
    """
    (ci, cost, _rae, _ranking, _gen, _reg, _disc, _layer_one_recon,
     _t_reg, _t_mse) = self.predict_concordance_index(
        x=data_x, e=data_e, t=data_t, outcomes=session_dict['outcomes'])
    observed_idx = self.extract_observed_death(
        name=name, observed_e=data_e, observed_t=data_t, save=save)
    median_predicted_time = self.median_predict_time(session_dict)

    if name == 'Train':
        self.predicted_time_train = median_predicted_time
    elif name == 'Valid':
        self.val_loss = cost

    observed_empirical = data_t[observed_idx]
    observed_predicted = median_predicted_time[observed_idx]
    # Both results below are computed but not used further in this method.
    observed_ci = concordance_index(
        event_times=observed_empirical,
        predicted_scores=np.nan_to_num(observed_predicted),
        event_observed=data_e[observed_idx])
    corr = spearmanr(observed_empirical, observed_predicted)

    # Integrated scores: weight each grid value by the step from the
    # previous grid point (the first step is measured from 0.0).
    time_grid = np.linspace(data_t.min(), data_t.max(), 100)
    previous = np.array([0.0] + time_grid[:-1].tolist())
    ds = np.array(time_grid - previous)
    bs, bll = get_scores(y_train=self.train_t,
                         delta_train=self.train_e,
                         y_test=data_t,
                         delta_test=data_e,
                         pred_train=self.predicted_time_train,
                         pred_test=median_predicted_time,
                         time_grid=time_grid,
                         surv_residual=False,
                         cens_residual=False)
    ibs = sum(bs * ds) / (time_grid.max() - time_grid.min())
    ibll = sum(bll * ds) / (time_grid.max() - time_grid.min())

    results = "{} || loss: {}, CI: {}, IBS: {}, IBLL: {}".format(
        name, np.round(cost, 4), np.round(ci, 4), np.round(ibs, 4),
        np.round(ibll, 4))
    print(results)

    if name == 'Test':
        self.ctd = ci
        self.ibs = ibs
        self.nbll = ibll
def get_bigrams_for_feature(word_feature, neus, capacity):
    """Select bigram features by Spearman rank correlation.

    The ``capacity`` most frequent bigrams are taken from ``word_feature``;
    each one's counts are rank-correlated with ``neus``.  Bigrams whose
    coefficient is NaN are dropped, the rest are ordered by decreasing
    absolute coefficient (ties keep their original order) and at most
    ``bigram_capacity`` of them are returned.

    Args:
        word_feature: Object providing ``get_top_bigrams`` and
            ``get_feauture_by_bigram``.
        neus: Target values correlated against each bigram's counts.
        capacity: How many frequent bigrams to consider.

    Returns:
        List of selected bigrams, strongest correlation first.
    """
    top_bigrams = word_feature.get_top_bigrams(capacity)
    print('most frequent bigrams:  %s' % (top_bigrams,))

    candicate_map = dict()
    for bigram in top_bigrams:
        bg_counts = word_feature.get_feauture_by_bigram(bigram)
        p = spearmanr(bg_counts, neus)
        if not math.isnan(p[0]):
            candicate_map[bigram] = p

    # A stable descending sort replaces the hand-written insertion sort.
    ranked = sorted(candicate_map,
                    key=lambda bigram: abs(candicate_map[bigram][0]),
                    reverse=True)
    # NOTE(review): the cap comes from the module-level ``bigram_capacity``,
    # not from the ``capacity`` argument -- confirm this is intended.
    selected_bigrams = ranked[:max(bigram_capacity, 0)] if ranked else []

    print('======== selected bigrams for feature ========')
    for selected_bigram in selected_bigrams:
        print('%s : %s' % (selected_bigram,
                           candicate_map.get(selected_bigram)))
    return selected_bigrams
def correlations_speed(cur, variable1, variable2, table):
    """Scatter-plot two selected variables and print their correlations.

    ``variable1`` and ``variable2`` are fetched from ``table`` through
    ``select``; after the plot is shown, the Pearson, Spearman and Kendall
    results are printed in that order.
    """
    x = select(cur, variable1, table)
    y = select(cur, variable2, table)

    # Scatter plot
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)
    ax.set_xlabel("Gap")
    ax.set_ylabel("Sentiment magnitude")
    fig.suptitle('Correlation funding gap and sentiment magnitude')
    plt.scatter(x, y)
    plt.show()

    # Each measure returns its coefficient together with a p-value.
    measures = (
        ("Pearson: ", pearsonr),
        ("Spearman: ", spearmanr),
        ("Kendall: ", kendalltau),
    )
    for label, measure in measures:
        print(label, measure(x, y))
def byGene(geneSpanFN, wigDir1, wigDir2, chrom, strand, outFN, simulation = False):
    '''Correlate binned wig signal of two data sets gene by gene.

    hela must be 2nd wigDir2 cuz strand flip: ``wigDir1`` is loaded for
    ``strand`` and ``wigDir2`` for the opposite strand.

    For each gene in ``geneSpanFN`` on ``chrom``/``strand`` the span is cut
    into windows of 10 positions advanced by 2; window averages of both data
    sets are correlated (Pearson and Spearman) and one tab-separated line per
    gene is written to ``outFN``.  With ``simulation`` the span is shuffled
    by bin (``mixSpanByBin``) independently for each data set.
    '''
    strand = str(strand)  # undo autocast
    print('loading wigs')
    oppStrand = bioLibCG.switchStrand(strand)
    coord_value1 = cgWig.loadSingleWig(wigDir1, chrom, strand, 'ALL')
    coord_value2 = cgWig.loadSingleWig(wigDir2, chrom, oppStrand, 'ALL')

    print('calculating bin values')
    frameLength = 10
    skipAmount = 2
    # Context managers close both files even if a gene fails to process.
    with open(geneSpanFN, 'r') as f, open(outFN, 'w') as fOut:
        for line in f:
            ls = line.strip().split('\t')
            sChrom, sStrand = ls[1], ls[2]
            if sChrom != chrom or sStrand != strand:
                continue
            geneName = ls[0]
            geneStarts = [int(x) for x in ls[3].split(',')]
            geneEnds = [int(x) for x in ls[4].split(',')]
            # list() so the pairs are a real sequence on Python 3 as well.
            spanPairs = list(zip(geneStarts, geneEnds))

            theSpan = fullSpanFromPairs(spanPairs)
            spanLength = len(theSpan)

            binAvgs1 = []
            binAvgs2 = []
            for theBinAvg, theCoord_Val in [(binAvgs1, coord_value1),
                                            (binAvgs2, coord_value2)]:
                # mix up bins if simulation
                if simulation:
                    newSpan = mixSpanByBin(theSpan, frameLength)
                else:
                    newSpan = theSpan
                # Every window start whose window still fits in the span.
                for i in range(0, spanLength - frameLength + 1, skipAmount):
                    binNums = newSpan[i:(i + frameLength)]
                    theBinAvg.append(binAvg(theCoord_Val, binNums))

            # NOTE(review): the original built lists without (0, 0) pairs
            # "for correlation" but never used them; the correlations have
            # always been computed on the unfiltered bins, which is kept.
            dataLoad = float(sum(binAvgs1) + sum(binAvgs2)) / 2

            pcc = pStats.pearsonr(binAvgs1, binAvgs2)
            scc, pVal = pStats.spearmanr(binAvgs1, binAvgs2)

            outString = [geneName,
                         pcc[0],
                         ','.join([str(x) for x in binAvgs1]),
                         ','.join([str(x) for x in binAvgs2]),
                         '%s:%s:%s' % (sChrom, sStrand, theSpan[0]),
                         dataLoad,
                         scc]
            fOut.write('\t'.join([str(x) for x in outString]) + '\n')
def evaluate(representation, data):
    """Return the Spearman correlation of predicted and reference scores.

    ``data`` yields ``((x, y), sim)`` items; every pair is scored with
    ``representation.similarity(x, y)`` and compared against ``sim``.
    """
    scored = [(representation.similarity(x, y), sim) for (x, y), sim in data]
    actual, expected = zip(*scored)
    rho = spearmanr(actual, expected)[0]
    return rho
def spearmanc(sites, data, ages, deltas = False):
    """Per-site Spearman correlation with ``ages`` plus reference scores.

    Values come from ``getvd`` when ``deltas`` is set, else from ``getvf``.
    Sites whose values are all identical are skipped.  Reference
    correlations use every permutation of ``ages`` when ``deltas`` is set,
    otherwise the output of ``gen_random_ages()``.

    Returns:
        Dict mapping site to ``{'score', 'pval', 'rscore', 'data'}`` where
        ``rscore`` and ``data`` are JSON strings.
    """
    if deltas:
        extract = getvd
        random_ages = list(permutations(ages))
    else:
        extract = getvf
        random_ages = gen_random_ages()

    res = {}
    for s in sites:
        values = extract(data, s)
        if len(set(values)) == 1:
            # A constant series cannot be rank-correlated.
            continue
        res[s] = {}
        score, pval = spearmanr(values, ages)
        res[s]['score'] = score
        res[s]['pval'] = pval
        reference = [spearmanr(values, rage)[0] for rage in random_ages]
        res[s]['rscore'] = json.dumps(reference)
        res[s]['data'] = json.dumps(values)
    return res
def innerQuery(name, dest, count, doprint,stopwords,trecoutput,sorm,normalized): with open(dest, "w") as f: f.write("<parameters>\n") f.write("<index>/home/ginger/Documents/IR/Project2-qpp/" + name + "-index</index>\n") f.write("<runID>2016</runID>\n") f.write("<trecFormat>true</trecFormat>\n") f.write("<stemmer>Krovetz</stemmer>\n") f.write("<count>" + str(count) + "</count>\n") f.write("<baseline>okapi,k1:1.2,b:0.75,k3:7</baseline>\n") counter1, min1, max1, wcount1 = makeQueries("../../IR2016/queries/topics.301-350", dest, f,doprint) counter2, min2, max2, wcount2 = makeQueries("../../IR2016/queries/topics.351-400", dest, f,doprint) counter3, min3, max3, wcount3 = makeQueries("../../IR2016/queries/topics.401-450", dest, f,doprint) f.write(stopwords) f.write("</parameters>\n") wordcounts = wcount1 wordcounts.extend(wcount2) wordcounts.extend(wcount3) if doprint: counter = sum([counter1, counter2, counter3]) minval = min([min1, min2, min3]) maxval = max([max1, max2, max3]) print "queries,150,", 1. * counter / 150, ",", minval, ",", maxval #trecoutput.write( "name,k,recip,p10") #do indri stuff qrelsfile = "../qrels/qrels.txt" namefile = str(name) + str(count) resfile = "../queries/results/" + namefile + ".txt" queryfile = "../queries/" + namefile + ".txt" if sorm: fun = mad namefile = "MAD" + namefile else: fun = np.std os.system( "IndriRunQuery " + queryfile + " > " + resfile) correlations0 = "Pearsons " correlations1 = "Spearman " for cut in np.arange(0.1,1,0.2): newresfile = "../queries/results/" + namefile + "." + str(cut) + ".txt" trecfile = "../trec/" + namefile + "." 
+ str(cut) + ".txt" sds = applyCut(resfile,newresfile,count,cut,fun,normalized) os.system( "../trec_eval -q " + qrelsfile + " " + newresfile + " > " + trecfile) p10s = parseTrec_Eval(trecfile, name + "," + str(count),trecoutput) correlations0 += " & " + str(round(pearsonr(sds,p10s)[0],4)) correlations1 += " & " + str(round(spearmanr(sds, p10s)[0],4)) lineending = " \\\\ \\hline" print '''\\begin{table}[h!] \\centering \\begin{tabular}{|l|l|l|l|l|l|} \\hline''' print name + " " + str(count ) + " & $\sigma_{0.1\%}$ & $\sigma_{0.3\%}$ & $\sigma_{0.5\%}$ & $\sigma_{0.7\%}$ & $\sigma_{0.9\%}$" + lineending print correlations0 + lineending print correlations1 + lineending print '''\\end{tabular} \\caption{$\\sigma_{\\%}$ correlations for ''' + namefile + '''}
def generate_graphs(team_alias):
    """Plot half-court crossing times and points per possession for a team.

    Left panel: histogram of the crossing times from ``get_data`` (80 bins
    over 0-8 s) annotated with mean, standard deviation and possession
    count.  Right panel: average result per non-empty time bin, annotated
    with the Pearson and Spearman correlation of the raw data.  The figure
    is saved to ``csvs/<team_alias>/graphs.png``.
    """
    fig = plt.figure(figsize=(15, 6))
    ax = fig.add_subplot(121)
    times_to_cross, results = get_data(team_alias)
    ax.hist(times_to_cross, bins=8 * 10, range=(0, 8), facecolor="green")
    plt.xlabel("Time to cross halfcourt (s)")
    plt.ylabel("Normalized frequency")
    plt.title("Histogram of time to cross halfcourt for %s" % (team_alias))
    mean = np.mean(times_to_cross)
    std = np.std(times_to_cross)
    n = len(times_to_cross)
    ax.text(
        0.05, 0.85,
        "Mean: {:.2f} \nStd Dev: {:.2f} \n# Possessions: {: d}".format(mean, std, n),
        transform=ax.transAxes
    )

    ax2 = fig.add_subplot(122)
    points, times = np.histogram(times_to_cross, bins=8 * 10,
                                 range=(0.00, 8.00), weights=results)
    num_crosses, times = np.histogram(times_to_cross, bins=8 * 10,
                                      range=(0.00, 8.00))
    # Empty bins would divide by zero; drop them together with their edge.
    to_remove = [index for index, ncross in enumerate(num_crosses)
                 if abs(ncross) == 0]
    points = np.delete(points, to_remove)
    num_crosses = np.delete(num_crosses, to_remove)
    times = np.delete(times, to_remove)
    avg_points = points / num_crosses
    # ``times`` holds bin edges (one more than bins); drop the final edge.
    times = np.delete(times, -1)
    ax2.scatter(times, avg_points)

    pcorr_all, p_val = pearsonr(times_to_cross, results)
    spcorr_all, sp_val = spearmanr(times_to_cross, results)
    # Built from separate lines so no continuation backslash or source
    # indentation leaks into the rendered label.
    summary = "\n".join([
        "Pearson Corr: {:.2f}",
        "Pearson 2t_val: {:.2f}",
        "Spearman Corr: {:.2f}",
        "Spearman 2t_val: {:.2f}",
    ]).format(pcorr_all, p_val, spcorr_all, sp_val)
    ax2.text(0.05, 0.85, summary, transform=ax2.transAxes)
    plt.xlabel("Time to cross halfcourt (s)")
    plt.ylabel("Average points per possession")
    plt.title("Avg points vs time to cross halfcourt for %s" % (team_alias))
    fig.savefig("csvs/%s/graphs.png" % (team_alias))
def spearman_boot(x,y,N=5000,cont_int=68.):
    """Bootstrap the Spearman correlation of ``x`` and ``y``.

    ``N`` resamples of ``len(x)`` points are drawn with replacement and
    correlated with ``spearmanr``.

    Args:
        x, y: Arrays supporting integer-array indexing.
        N: Number of bootstrap resamples.
        cont_int: Accepted for interface compatibility; not used.

    Returns:
        ``(median rho, median p-value)`` over the resamples.
    """
    boot_rho = zeros(N, 'f')
    boot_p = zeros(N, 'f')
    n_points = len(x)
    for i in range(N):
        # The three-argument call matches numpy's randint(low, high, size),
        # whose ``high`` is exclusive: it must be len(x), otherwise the last
        # data point could never be drawn.
        indices = randint(0, n_points, n_points)
        xboot = x[indices]
        yboot = y[indices]
        boot_rho[i], boot_p[i] = spearmanr(xboot, yboot)
    return scoreatpercentile(boot_rho, per=50), scoreatpercentile(boot_p, per=50)
def get_correlations(self, x, y):
    """Correlate fields ``x`` and ``y`` of every row in ``results/all``.

    Returns:
        List of ``(row.key, spearmanr result, pearsonr result)`` tuples, one
        per row of the ``results/all`` view of ``self.db``.
    """
    # ``scipy.stats.stats`` is a deprecated alias of the public namespace.
    from scipy.stats import pearsonr, spearmanr

    correlations = []
    for row in self.db.view('results/all').rows:
        values = row['value']
        x_values = values.get(x)
        y_values = values.get(y)
        correlations.append((row.key,
                             spearmanr(x_values, y_values),
                             pearsonr(x_values, y_values)))
    return correlations
def final_corr():
    """Correlate cluster entropy with cluster z-score and plot the result.

    Reads ``entropy_list_redo.txt`` and ``cluster_zscore.txt`` (lines of
    ``cluster value``), correlates the clusters with ids in ``[0, 150000)``
    that appear in both files, prints the Spearman and Pearson results and
    saves the scatter plot to ``entropy_citingscores_redo.png``.
    """
    clust_zscore = dict()
    clust_entropy = dict()
    # Context managers close both files even if a line fails to parse.
    with open("cluster_zscore.txt", "r") as zfile, \
            open("entropy_list_redo.txt", "r") as efile:
        for each in efile:
            line = each.split()
            clust_entropy[int(line[0])] = float(line[1])
        for each in zfile:
            line = each.split()
            clust_zscore[int(line[0])] = float(line[1])

    x = []
    y = []
    for i in range(150000):
        # ``in`` replaces dict.has_key, which only exists on Python 2.
        if i not in clust_zscore or i not in clust_entropy:
            continue
        x.append(clust_entropy[i])
        y.append(clust_zscore[i])

    correlation, pvalue = spearmanr(x, y)
    print("spearman " + str(correlation))
    print(pvalue)
    correlation, pvalue = pearsonr(x, y)
    print("pearson " + str(correlation))

    plt.scatter(x, y)
    plt.title('Entropy of a Cluster vs. citing distance (redo)')
    plt.xlabel('Entropy')
    plt.ylabel('Citing Distance z scores')
    plt.savefig('entropy_citingscores_redo.png')
    plt.show()
def evaluate(representation, data):
    """Spearman correlation over the pairs the representation can score.

    ``data`` yields ``((x, y), sim)`` items.  Pairs for which
    ``representation.similarity(x, y)`` returns ``None`` are skipped.  The
    number of scored pairs out of ``len(data)`` is printed.

    Returns:
        Spearman rho between predicted and reference similarities.

    Raises:
        ValueError: if no pair could be scored.
    """
    results = []
    for (x, y), sim in data:
        # Query the representation once per pair and reuse the score.
        score = representation.similarity(x, y)
        if score is not None:
            results.append((score, sim))
    if not results:
        raise ValueError("no pair in data could be scored")
    seen_num = len(results)
    actual, expected = zip(*results)
    print ("seen/total: " + str(seen_num) + "/" + str(len(data)))
    return spearmanr(actual, expected)[0]
def draw_plot(ax, x, y, color, x_axis, y_axis, title):
    """Draw a scatter plot of ``x`` against ``y`` annotated with statistics.

    The Pearson coefficient, Spearman rho and the value of
    ``mean_abs_error(x, y)`` are added to the axes as text, and an x = y
    line spanning the range of ``x`` is drawn.

    Returns:
        ``[pearson coefficient, spearman rho, mean absolute error]``.
    """
    scatterplot.draw_actual_plot(ax, x, y, color, x_axis, y_axis, title,
                                 size=40)
    coeff, _ = pearsonr(x, y)
    rho, _ = spearmanr(x, y)
    mae = mean_abs_error(x, y)

    annotations = {"PCC": coeff, "Rho": rho, "MAE": mae}
    conv.add_text_dict(ax, annotations)

    lowest, highest = min(x), max(x)
    scatterplot.add_x_y_line(ax, min_val=lowest, max_val=highest)
    return [coeff, rho, mae]
def _pileup_depths(samfile, chrom, start, end):
    """Return the pileup depth of ``samfile`` at each position in [start, end)."""
    depths = np.zeros(end - start)
    for column in samfile.pileup(reference=chrom, start=start, end=end):
        if start <= column.pos < end:
            try:
                depths[column.pos - start] = column.n
            except Exception:
                # Debugging hook kept from the original implementation.
                pdb.set_trace()
    return depths


def compute_worker_bam(obj, chr_tbp):
    """Correlate read depth of two BAM files over each feature of a chromosome.

    Features are read from ``obj.feature/<chr_tbp>``; every feature is
    widened by ``obj.flank`` on both sides and the per-position depth of
    ``obj.file_a`` and ``obj.file_b`` is compared according to
    ``obj.corr_type`` (``"cross"``/``"auto"``, ``"spearmanr"`` or
    ``"pearsonr"``).  One tab-separated line per feature (columns 4-5 of the
    input followed by the correlation values) is written to
    ``obj.tmp_path/<chr_tbp>``.
    """
    print(chr_tbp)
    file_a = pysam.Samfile(obj.file_a, 'rb')
    file_b = pysam.Samfile(obj.file_b, 'rb')
    feature_out_path = obj.tmp_path + "/" + chr_tbp
    try:
        with open(obj.feature + "/" + chr_tbp) as feature_data, \
                open(feature_out_path, 'w') as feature_out:
            corr = 0
            for line in feature_data:
                sline = line.strip().split()
                start = int(sline[1]) - 1 - obj.flank
                end = int(sline[2]) + obj.flank
                vals1 = _pileup_depths(file_a, chr_tbp, start, end)
                vals2 = _pileup_depths(file_b, chr_tbp, start, end)

                if obj.corr_type == "cross" or obj.corr_type == "auto":
                    corr = ss.fftconvolve(vals1, vals2, 'same')
                elif obj.corr_type == "spearmanr":
                    corr = [stats.spearmanr(vals1, vals2)[0]]
                elif obj.corr_type == "pearsonr":
                    corr = [stats.pearsonr(vals1, vals2)[0]]
                # A list (not map()) so the concatenation also works on
                # Python 3, where map returns an iterator.
                fields = sline[3:5] + [str(value) for value in corr]
                feature_out.write("\t".join(fields) + "\n")
    finally:
        # The alignment files were previously never closed.
        file_a.close()
        file_b.close()
def apply_stats(data,runTTest):
    """Build a per-peak statistics table from ``getPeakList(data)``.

    For each peak the 'p' and 'q' series are compared with Pearson and
    Spearman correlation, summarised by geometric mean, mean and sample
    standard deviation and, when ``runTTest == 'y'``, tested against zero
    with a one-sample t-test (otherwise the t-test columns hold ``'-'``).
    Peaks whose statistics cannot be computed are skipped.

    Returns:
        pandas DataFrame with one row per processed peak.
    """
    peakList = getPeakList(data)
    rows = []
    colNames = ['Fatty Acid Type',            #1
                'Peak Name',                  #2
                'Pearson Coefficient',        #3
                'Pearson P Value',            #4
                'Spearman Coefficient',       #5
                'Spearman P Value',           #6
                'P Geometric Mean (%)',       #7
                'Q Geometric Mean (ug/ml)',   #8
                'P Mean (%)',                 #9
                'P Stdev',                    #10
                'Q Mean (ug/ml)',             #11
                'Q Stdev',                    #12
                'P T-test',                   #13
                'P T-test P value',           #14
                'Q T-test',                   #15
                'Q T-test P value',           #16
                'Common Name']                #17
    for entry in peakList:
        try:
            pearson = pearsonr(entry['p'], entry['q'])
            spearman = spearmanr(entry['p'], entry['q'])
            if runTTest == 'y':
                ttestP = ttest_1samp(entry['p'], 0)
                ttestQ = ttest_1samp(entry['q'], 0)
            else:
                ttestP = ('-', '-')
                ttestQ = ('-', '-')
            row = [entry['FAtype'],                #1
                   entry['peakName'],              #2
                   pearson[0],                     #3
                   pearson[1],                     #4
                   spearman[0],                    #5
                   spearman[1],                    #6
                   gmean(entry['p']),              #7
                   gmean(entry['q']),              #8
                   np.mean(entry['p']),            #9
                   np.std(entry['p'], ddof=1),     #10
                   np.mean(entry['q']),            #11
                   np.std(entry['q'], ddof=1),     #12
                   ttestP[0],                      #13
                   ttestP[1],                      #14
                   ttestQ[0],                      #15
                   ttestQ[1],                      #16
                   entry['common']]                #17
        except Exception:
            # Best effort: skip this peak, but no longer swallow
            # KeyboardInterrupt/SystemExit as the bare ``except`` did.
            continue
        rows.append(row)
    return pd.DataFrame(rows, columns=colNames)