def test_spearmanr(self):
    """Check ``mstats.spearmanr`` against reference values of Spearman's rho.

    Each sample is evaluated twice: once as plain lists and once with a
    trailing NaN pair that ``ma.fix_invalid`` masks, which must leave the
    coefficient unchanged.  The field names of the result are checked last.
    """
    small_x = [5.05, 6.75, 3.21, 2.66]
    small_y = [1.65, 2.64, 2.64, 6.95]
    rho = mstats.spearmanr(small_x, small_y)[0]
    assert_almost_equal(rho, -0.6324555)

    # Same sample with an invalid pair appended and masked out.
    x = ma.fix_invalid(small_x + [np.nan])
    y = ma.fix_invalid(small_y + [np.nan])
    rho = mstats.spearmanr(x, y)[0]
    assert_almost_equal(rho, -0.6324555)

    large_x = [2.0, 47.4, 42.0, 10.8, 60.1, 1.7, 64.0, 63.1,
               1.0, 1.4, 7.9, 0.3, 3.9, 0.3, 6.7]
    large_y = [22.6, 8.3, 44.4, 11.9, 24.6, 0.6, 5.7, 41.6,
               0.0, 0.6, 6.7, 3.8, 1.0, 1.2, 1.4]
    rho = mstats.spearmanr(large_x, large_y)[0]
    assert_almost_equal(rho, 0.6887299)

    x = ma.fix_invalid(large_x + [np.nan])
    y = ma.fix_invalid(large_y + [np.nan])
    rho = mstats.spearmanr(x, y)[0]
    assert_almost_equal(rho, 0.6887299)

    # test for namedtuple attributes
    res = mstats.spearmanr(x, y)
    check_named_results(res, ('correlation', 'pvalue'), ma=True)
def test_spearmanr(self): "Tests some computations of Spearman's rho" (x, y) = ([5.05, 6.75, 3.21, 2.66], [1.65, 2.64, 2.64, 6.95]) assert_almost_equal(mstats.spearmanr(x, y)[0], -0.6324555) (x, y) = ([5.05, 6.75, 3.21, 2.66, np.nan], [1.65, 2.64, 2.64, 6.95, np.nan]) (x, y) = (ma.fix_invalid(x), ma.fix_invalid(y)) assert_almost_equal(mstats.spearmanr(x, y)[0], -0.6324555) # x = [ 2.0, 47.4, 42.0, 10.8, 60.1, 1.7, 64.0, 63.1, 1.0, 1.4, 7.9, 0.3, 3.9, 0.3, 6.7 ] y = [ 22.6, 08.3, 44.4, 11.9, 24.6, 0.6, 5.7, 41.6, 0.0, 0.6, 6.7, 3.8, 1.0, 1.2, 1.4 ] assert_almost_equal(mstats.spearmanr(x, y)[0], 0.6887299) x = [ 2.0, 47.4, 42.0, 10.8, 60.1, 1.7, 64.0, 63.1, 1.0, 1.4, 7.9, 0.3, 3.9, 0.3, 6.7, np.nan ] y = [ 22.6, 08.3, 44.4, 11.9, 24.6, 0.6, 5.7, 41.6, 0.0, 0.6, 6.7, 3.8, 1.0, 1.2, 1.4, np.nan ] (x, y) = (ma.fix_invalid(x), ma.fix_invalid(y)) assert_almost_equal(mstats.spearmanr(x, y)[0], 0.6887299)
def test_spearmanr(self): # Tests some computations of Spearman's rho (x, y) = ([5.05, 6.75, 3.21, 2.66], [1.65, 2.64, 2.64, 6.95]) assert_almost_equal(mstats.spearmanr(x, y)[0], -0.6324555) (x, y) = ([5.05, 6.75, 3.21, 2.66, np.nan], [1.65, 2.64, 2.64, 6.95, np.nan]) (x, y) = (ma.fix_invalid(x), ma.fix_invalid(y)) assert_almost_equal(mstats.spearmanr(x, y)[0], -0.6324555) x = [2.0, 47.4, 42.0, 10.8, 60.1, 1.7, 64.0, 63.1, 1.0, 1.4, 7.9, 0.3, 3.9, 0.3, 6.7] y = [22.6, 8.3, 44.4, 11.9, 24.6, 0.6, 5.7, 41.6, 0.0, 0.6, 6.7, 3.8, 1.0, 1.2, 1.4] assert_almost_equal(mstats.spearmanr(x, y)[0], 0.6887299) x = [2.0, 47.4, 42.0, 10.8, 60.1, 1.7, 64.0, 63.1, 1.0, 1.4, 7.9, 0.3, 3.9, 0.3, 6.7, np.nan] y = [22.6, 8.3, 44.4, 11.9, 24.6, 0.6, 5.7, 41.6, 0.0, 0.6, 6.7, 3.8, 1.0, 1.2, 1.4, np.nan] (x, y) = (ma.fix_invalid(x), ma.fix_invalid(y)) assert_almost_equal(mstats.spearmanr(x, y)[0], 0.6887299)
def all_pairs_spearman(M):
    """Return the pairwise Spearman correlations between the rows of ``M``.

    Parameters
    ----------
    M : sequence of sequences
        ``len(M)`` rows; every pair of rows is passed to ``mstats.spearmanr``.

    Returns
    -------
    C : numpy.ndarray, shape ``(len(M), len(M))``
        ``C[i, j]`` holds rho for rows ``i`` and ``j`` when ``i < j``.  Only
        the strict upper triangle is filled; the diagonal and the lower
        triangle stay at 0.
    """
    n = len(M)
    C = np.zeros((n, n))
    # ``range`` (not ``xrange``) so the function runs on Python 2 and 3.
    for i in range(n):
        for j in range(i + 1, n):
            C[i, j] = mstats.spearmanr(M[i], M[j])[0]
    return C
def evalsemrel(l1, l2, pairsfile, l1colnum, l2colnum, l2gpmffile, l1l2methpmffile):
    '''Evaluate semantic relatedness.

    l1 and l2 = languages s and t in p(t|s)
    pairsfile = translation pairs in l1 and l2, one pair per line, UTF-8;
                l1 words in column <l1colnum>, l2 words in column <l2colnum>
                (both 1-based)
    l2gpmffile = gold pmf over l2 words, for l2 words (including those in pairsfile)
    l1l2methpmffile = method-induced pmf over l2 words, for l1 words
                      (including those in pairsfile)

    For every pair one line "<jsd>\\t<rho>\\t<pval>" is printed, followed by a
    final line with the mean divergence and mean rho.  Nothing is returned.
    '''
    # Local import: io.open decodes UTF-8 identically on Python 2 and 3,
    # replacing the Python-2-only ``line.decode('utf-8')``.
    import io

    l2gpmf = L1L2PMF(l2, l2, l2gpmffile)
    l1l2pmf = L1L2PMF(l1, l2, l1l2methpmffile)
    jsds, rhos = [], []
    # The context manager closes the file, which the original never did.
    with io.open(pairsfile, encoding='utf-8') as pairs:
        for line in pairs:
            pair = line.rstrip().split()
            w1, w2 = pair[l1colnum - 1], pair[l2colnum - 1]
            gpmf = l2gpmf.pmf[w2]  # gold pmf
            mpmf = l1l2pmf.pmf[w1]  # method pmf
            # Align both pmfs on the union of their supports; a word missing
            # from one side contributes probability 0.0 there.
            vecs = [(gpmf[x2], mpmf[x2]) for x2 in gpmf if x2 in mpmf]
            vecs.extend([(gpmf[x2], 0.0) for x2 in gpmf if x2 not in mpmf])
            vecs.extend([(0.0, mpmf[x2]) for x2 in mpmf if x2 not in gpmf])
            gvec, mvec = zip(*vecs)
            jsd = MyUtils.jsd(gvec, mvec, base=2)
            rho, pval = mstats.spearmanr(gvec, mvec, use_ties=True)
            jsds.append(jsd)
            rhos.append(rho)
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("%f\t%f\t%f" % (jsd, rho, pval))
    if jsds:
        # Skip the summary for an empty pairs file instead of dividing by zero.
        print("\t\t\t%f\t%f" % (sum(jsds) / len(jsds), sum(rhos) / len(rhos)))
def evalsemrel(l1, l2, pairsfile, l1colnum, l2colnum, l2gpmffile, l1l2methpmffile):
    '''Evaluate semantic relatedness.

    l1 and l2 = languages s and t in p(t|s)
    pairsfile = translation pairs in l1 and l2, one pair per line, UTF-8;
                l1 words in column <l1colnum>, l2 words in column <l2colnum>
                (both 1-based)
    l2gpmffile = gold pmf over l2 words, for l2 words (including those in pairsfile)
    l1l2methpmffile = method-induced pmf over l2 words, for l1 words
                      (including those in pairsfile)

    Prints "<jsd>\\t<rho>\\t<pval>" per pair and, at the end, the mean
    divergence and mean rho.  Returns nothing.
    '''
    # Local import: io.open yields decoded text on both Python 2 and 3, so the
    # Python-2-only ``line.decode('utf-8')`` is no longer needed.
    import io

    l2gpmf = L1L2PMF(l2, l2, l2gpmffile)
    l1l2pmf = L1L2PMF(l1, l2, l1l2methpmffile)
    jsds, rhos = [], []
    # ``with`` guarantees the pairs file is closed (it was leaked before).
    with io.open(pairsfile, encoding='utf-8') as handle:
        for line in handle:
            pair = line.rstrip().split()
            w1 = pair[l1colnum - 1]
            w2 = pair[l2colnum - 1]
            gpmf = l2gpmf.pmf[w2]  # gold pmf
            mpmf = l1l2pmf.pmf[w1]  # method pmf
            # Shared support first, then the words only one side knows,
            # padded with 0.0 on the other side.
            vecs = [(gpmf[x2], mpmf[x2]) for x2 in gpmf if x2 in mpmf]
            vecs.extend([(gpmf[x2], 0.0) for x2 in gpmf if x2 not in mpmf])
            vecs.extend([(0.0, mpmf[x2]) for x2 in mpmf if x2 not in gpmf])
            gvec, mvec = zip(*vecs)
            jsd = MyUtils.jsd(gvec, mvec, base=2)
            rho, pval = mstats.spearmanr(gvec, mvec, use_ties=True)
            jsds.append(jsd)
            rhos.append(rho)
            # print(<one string>) is valid and equivalent on Python 2 and 3.
            print("%f\t%f\t%f" % (jsd, rho, pval))
    if jsds:
        # No pairs read means no averages; avoid ZeroDivisionError.
        print("\t\t\t%f\t%f" % (sum(jsds) / len(jsds), sum(rhos) / len(rhos)))
def compute(self, x, y):
    """Return Spearman's rho and its p-value for two equally sized samples.

    The result is a dict with the keys ``"SPEARMAN"`` (coefficient) and
    ``"SPEARMAN_PV"`` (p-value), both taken from ``mstats.spearmanr``.
    An ``AssertionError`` is raised when ``x`` and ``y`` differ in size.
    """
    assert np.size(x) == np.size(y)
    result = mstats.spearmanr(x, y)
    rho, pv = result
    return dict(SPEARMAN=rho, SPEARMAN_PV=pv)
def m_scagnostics(self):
    """Derive the scagnostics measures from ``self._mst_graph``.

    Reads the edge weights of ``self._mst_graph`` and the 2-D points in
    ``self.projects`` and stores six attributes on ``self``:
    ``clumpy_measure``, ``sparse_measure``, ``stringy_measure``,
    ``spearmanr_measure``, ``outlying_measure`` and ``skewed_measure``.
    """
    mst = self._mst_graph
    edge_list = [[u, v] for u, v in mst.edges]
    weights = sorted(mst[u][v]['weight'] for u, v in edge_list)

    quant10 = np.quantile(weights, 0.1)
    quant25 = np.quantile(weights, 0.25)
    quant50 = np.quantile(weights, 0.5)
    quant75 = np.quantile(weights, 0.75)
    quant90 = np.quantile(weights, 0.9)

    # Edges longer than Q3 + 1.5 * IQR count as "long".
    crit = quant75 + 1.5 * (quant75 - quant25)
    longEdgesSum = np.sum([w for w in weights if w > crit])

    clumpy_values = []
    for u, v in edge_list:
        # Remove one edge from a fresh copy and look at the smaller of the
        # resulting components.
        pruned = mst.copy()
        pruned.remove_edge(u, v)
        smaller = min(connected_component_subgraphs(pruned), key=len)
        if len(smaller.edges) > 0 and mst[u][v]['weight'] > 0:
            longest = max([pruned[a][b]['weight'] for a, b in smaller.edges])
            longest = max(0.00001, longest)
            clumpy_values.append(1 - (longest / mst[u][v]['weight']))

    # Largest weighted shortest-path distance over all source nodes.
    diameter = max(
        [max(entry[1][0].values()) for entry in nx.all_pairs_dijkstra(mst)]
    )

    self.clumpy_measure = max(clumpy_values)
    self.sparse_measure = min(1, quant90)
    self.stringy_measure = diameter / mst.size(weight='weight')
    first_coords = [point[0] for point in self.projects]
    second_coords = [point[1] for point in self.projects]
    self.spearmanr_measure = spearmanr(first_coords, second_coords).correlation
    self.outlying_measure = longEdgesSum / np.sum(weights)
    self.skewed_measure = (quant90 - quant50) / (quant90 - quant10)
def correlate_traits(eigengene_path, trait_path, trait_types, ordinals):
    """Correlate every module eigengene with every (encoded) trait.

    Parameters
    ----------
    eigengene_path : str
        CSV file whose first column is the index and whose remaining columns
        are module eigengenes.
    trait_path : str
        CSV file whose first column is the index and whose remaining columns
        are traits.
    trait_types : dict
        Maps trait name to ``'C'`` (continuous) or ``'N'`` (nominal).
    ordinals : dict
        Maps trait name to the ordered list of its levels.  These traits are
        replaced by the position of each level in that list and are excluded
        from the nominal (dummy-encoded) group.

    Returns
    -------
    pandas.DataFrame
        Spearman correlation with eigengene columns as index and the
        continuous, dummy-encoded nominal, and ordinal trait columns (in that
        order) as columns.
    """
    traits = pd.read_csv(trait_path, index_col=0)

    continuous_names = [
        trait for trait, type_ in trait_types.items() if type_ == 'C'
    ]
    nominal_names = [
        trait for trait, type_ in trait_types.items()
        if type_ == 'N' and trait not in ordinals
    ]
    continuous_traits = traits[continuous_names]
    nominal_traits = pd.get_dummies(traits[nominal_names])

    # Explicit copy: the columns are overwritten below, and assigning into a
    # selection of ``traits`` triggers pandas' SettingWithCopyWarning.
    ordinal_traits = traits[list(ordinals.keys())].copy()
    for trait, variables in ordinals.items():
        level_codes = {v: i for i, v in enumerate(variables)}
        ordinal_traits[trait] = ordinal_traits[trait].replace(level_codes)

    traits = pd.concat([continuous_traits, nominal_traits, ordinal_traits], axis=1)
    eigengenes = pd.read_csv(eigengene_path, index_col=0)

    corrs = pd.DataFrame(index=eigengenes.columns, columns=traits.columns)
    for module in corrs.index:
        for trait in corrs.columns:
            corrs.at[module, trait] = spearmanr(
                eigengenes[module], traits[trait]
            ).correlation
    return corrs
def spearman(self):
    """Return one Spearman coefficient per variable slice, clipped to [0, 1].

    Slices come from ``self.slices(method='variable')``.  A non-empty slice
    is passed through ``self.dtw_resample`` and the two resulting series are
    correlated; an empty slice scores 0.0.  NaN coefficients are mapped
    through ``np.nan_to_num`` before clipping.
    """
    slicez, _islicez = self.slices(method='variable')
    raw = []
    for sl in slicez:
        if len(sl):
            raw.append(spearmanr(*self.dtw_resample(sl))[0])
        else:
            raw.append(0.0)
    return np.clip(np.nan_to_num(raw), a_min=0.0, a_max=1.0)
def get_r(data):
    """Add EMA columns and a rank-correlation column ``'r'`` to ``data``.

    For each period in 8/13/21/34/55 an EMA of ``data['CL']`` is stored in
    column ``'m<period>'``.  For every row whose ``'m55'`` is not NaN, the
    periods are ordered by descending EMA value and correlated (Spearman)
    with the ascending period list; the coefficient goes into ``'r'``.
    ``data`` is modified in place and ``data['r']`` is returned.
    """
    x = [8, 13, 21, 34, 55]
    cl = np.array(data['CL'], dtype='f8')
    for period in x:
        data['m' + str(period)] = ta.EMA(cl, timeperiod=period)

    for idx, row in data.iterrows():
        if math.isnan(row['m55']):
            # The longest EMA is not available yet for this row.
            continue
        pairs = [[row['m' + str(period)], period] for period in x]
        ordered_periods = [pair[1] for pair in sorted(pairs, reverse=True)]
        data.loc[idx, 'r'] = mstats.spearmanr(x, ordered_periods)[0]
    return data['r']
def get_spearman(self, human_assessment_dict) -> Tuple[float, float]:
    """Correlate human similarity judgements with cosine similarities.

    Parameters
    ----------
    human_assessment_dict : dict
        Maps a concept to a dict of ``other_concept -> human score``.

    Returns
    -------
    (cor, benchmark_coverage)
        ``cor`` is Spearman's rho between the human scores and the cosine
        values of all usable pairs (0 when there are none).
        ``benchmark_coverage`` is the fraction of all pairs that were usable
        (0.0 when the benchmark contains no pairs at all).

    A pair is usable when both concepts have a vector
    (``self.get_concept_vector(..., give_none=True)`` is not ``None``) and it
    is not a pair of *different* concepts whose cosine is exactly 1.
    """
    human_assessment_values = []
    cosine_values = []
    total_count = 0
    found_count = 0

    tqdm_bar = tqdm(human_assessment_dict.items(), total=len(human_assessment_dict))
    for concept, other_concepts in tqdm_bar:
        total_count += len(other_concepts)

        concept_vec = self.get_concept_vector(concept, give_none=True)
        if concept_vec is not None:
            for other_concept in other_concepts:
                other_concept_vec = self.get_concept_vector(other_concept, give_none=True)
                if other_concept_vec is None:
                    continue
                cos = self.cosine(vector1=concept_vec, vector2=other_concept_vec,
                                  same_vec_zero=False)
                if cos == 1 and concept != other_concept:
                    # Distinct concepts with identical vectors are left out.
                    continue
                human_assessment_values.append(
                    human_assessment_dict[concept][other_concept])
                cosine_values.append(cos)
                found_count += 1

        tqdm_bar.set_description(
            f"{self.__class__.__name__} Beam ({self.dataset}|{self.algorithm}|{self.preprocessing})"
        )
        # No manual tqdm_bar.update(): iterating the bar already advances it,
        # so the extra call counted every concept twice.

    # Both lists always grow together, so checking one is sufficient.
    if human_assessment_values:
        cor, _ = spearmanr(human_assessment_values, cosine_values)
    else:
        cor = 0

    # An empty benchmark has no pairs; report zero coverage instead of
    # raising ZeroDivisionError.
    benchmark_coverage = found_count / total_count if total_count else 0.0
    return cor, benchmark_coverage
def score_decontextualized(embeddings, layers, RG, WS, SL, SV, embedding_keys):
    """Score static embeddings on four word-similarity benchmarks.

    ``RG``, ``WS``, ``SL`` and ``SV`` are iterables of ``(w1, w2, score)``
    triples.  For every benchmark and every key in ``embedding_keys`` the
    cosine similarity of ``embeddings[w][key]`` is correlated (Spearman) with
    the human scores; rho is rounded to four decimals.  ``layers`` is
    accepted but not used.

    Returns a dict ``{benchmark: {embedding_key: rho}}`` with the benchmark
    names ``"RG65"``, ``"WS353"``, ``"SL999"`` and ``"SV3500"``.
    """
    originals = {"RG65": RG, "WS353": WS, "SL999": SL, "SV3500": SV}
    scores = {}
    for name, pairs in originals.items():
        per_key = {}
        for embedding_key in embedding_keys:
            # One pass over the benchmark: (human score, model similarity).
            rows = [
                (score, 1 - cosine(embeddings[w1][embedding_key],
                                   embeddings[w2][embedding_key]))
                for w1, w2, score in pairs
            ]
            scores_human = [human for human, _ in rows]
            scores_embed = [model for _, model in rows]
            per_key[embedding_key] = round(spearmanr(scores_human, scores_embed)[0], 4)
        scores[name] = per_key
    return scores
def test_spearmanr(self):
    """Tests some computations of Spearman's rho.

    Both reference samples are checked as plain lists and again with a
    trailing NaN pair masked by ``ma.fix_invalid``.  The named-tuple fields
    of the result are verified on the last masked sample.
    """
    cases = (
        (
            [5.05, 6.75, 3.21, 2.66],
            [1.65, 2.64, 2.64, 6.95],
            -0.6324555,
        ),
        (
            [2.0, 47.4, 42.0, 10.8, 60.1, 1.7, 64.0, 63.1,
             1.0, 1.4, 7.9, 0.3, 3.9, 0.3, 6.7],
            [22.6, 8.3, 44.4, 11.9, 24.6, 0.6, 5.7, 41.6,
             0.0, 0.6, 6.7, 3.8, 1.0, 1.2, 1.4],
            0.6887299,
        ),
    )
    for xs, ys, expected in cases:
        assert_almost_equal(mstats.spearmanr(xs, ys)[0], expected)
        # Masking the appended NaN pair must not change the coefficient.
        x = ma.fix_invalid(xs + [np.nan])
        y = ma.fix_invalid(ys + [np.nan])
        assert_almost_equal(mstats.spearmanr(x, y)[0], expected)

    # test for namedtuple attributes
    res = mstats.spearmanr(x, y)
    attributes = ('correlation', 'pvalue')
    check_named_results(res, attributes, ma=True)
def _append_keys(path, keys):
    """Append every key in ``keys`` to ``path``, each followed by a blank line."""
    keys = list(keys)
    if not keys:
        # As before, the file is only touched when there is something to write.
        return
    # One handle per batch, closed by the context manager (the original
    # reopened the file for every key and never closed it).
    with open(path, "a") as out:
        for key in keys:
            out.write(key + "\n\n")


def write_results(search_query, file_destination, input_query):
    """Run one or two searches and append the chosen results to a text file.

    Parameters
    ----------
    search_query : list
        One or two search terms; each is passed to ``search``.
    file_destination : str
        Directory of the output file.
    input_query : str
        Base name of the output file (``<input_query>.txt``).

    With two terms, the longer result ranking is trimmed from its end until
    both have the same length and their scores are correlated (Spearman):

    * correlation > 0.8: both rankings are merged and the six highest-scoring
      keys are written;
    * correlation == 0 and a non-empty second ranking: the first three keys
      of each ranking are written;
    * otherwise (including the single-term case): the complete first ranking
      is written.
    """
    search_result_2 = {}
    correlation = 0

    search_result_1 = search(search_query[0])
    # If there are more than one search term in a query then separate them for
    # individual search.  If they produce uneven numbers of documents, remove
    # documents from the end of the longer rank until both are the same length.
    if len(search_query) > 1:
        search_result_2 = search(search_query[1])
        surplus = len(search_result_1) - len(search_result_2)
        longer = search_result_1 if surplus > 0 else search_result_2
        for _ in range(abs(surplus)):
            longer.popitem()
        # check correlation between queries
        correlation = spearmanr(list(search_result_1.values()),
                                list(search_result_2.values())).correlation

    output_path = file_destination + "/" + input_query + '.txt'

    if correlation > 0.8:
        # Similar enough: pool both rankings and keep the most relevant keys.
        final_list = dict(search_result_1)
        final_list.update(search_result_2)
        _append_keys(output_path,
                     sorted(final_list, key=final_list.get, reverse=True)[:6])
    elif correlation == 0 and len(search_result_2) > 0:
        # NOTE(review): only an exact 0 reaches this branch; any other
        # correlation <= 0.8 (or NaN) falls through to the last branch.
        # Confirm that this is intended.
        _append_keys(output_path, list(search_result_1)[:3])
        _append_keys(output_path, list(search_result_2)[:3])
    else:
        # Single term (or no special case): write the complete first rank.
        _append_keys(output_path, search_result_1)
def get_r(stk):
    """Return the last two column values of the EMA-ordering table for ``stk``.

    Data is loaded with ``load_from_db``; fewer than 144 rows yields ``[]``.
    For each period in the module-level ``x`` an EMA of ``'CL'`` is stored in
    ``'m<period>'``.  Per row, the periods 8..233 are ordered by descending
    EMA value and correlated (Spearman) with ``x``; the coefficient is
    written to ``'r'`` and its 10-period SMA to ``'mr'``.
    """
    data = load_from_db(stk)
    if len(data) < 144:
        return []

    cl = np.array(data['CL'], dtype='f8')
    for period in x:
        data['m' + str(period)] = ta.EMA(cl, timeperiod=period)

    periods = (8, 13, 21, 34, 55, 89, 144, 233)
    for idx, row in data.iterrows():
        pairs = [[row['m' + str(period)], period] for period in periods]
        ordered_periods = [pair[1] for pair in sorted(pairs, reverse=True)]
        data.loc[idx, 'r'] = mstats.spearmanr(x, ordered_periods)[0]

    data['mr'] = ta.SMA(np.array(data['r'], dtype='f8'), 10)
    last_row = data.values[-1]
    return last_row[-2:]
def cross_vect_score(vect_a, vect_b, scoring='euclidean', inv_noise_cov=None):
    """
    Use the scoring function to compute a value between two vectors

    Parameters
    ----------
    vect_a, vect_b: vector
        Data vectors.
    scoring: str
        Scoring function in euclidean / mahalanobis / crossnobis /
        spearmanr / spearmanr_dist / pearsonr.
        If "spearmanr_dist", return 1 - spearmanr correlation.
    inv_noise_cov: 2D array
        Inverse of the noise covariance matrix needed for mahalanobis and
        crossnobis scorings.

    Returns
    -------
    score: float
        Score value.

    Raises
    ------
    NotImplementedError
        If ``scoring`` is "crossnobis".
    ValueError
        If ``scoring`` is not one of the names listed above.
    """
    if scoring == 'euclidean':
        score = euclidean(vect_a, vect_b)
    elif scoring == "mahalanobis":
        score = mahalanobis(vect_a, vect_b, inv_noise_cov)
    elif scoring == "crossnobis":
        # NotImplementedError is the exception class; the ``NotImplemented``
        # constant used before is not callable and raised TypeError instead.
        raise NotImplementedError(
            "Cross validated Mahalanobis distance is not yet available")
    elif scoring in ("spearmanr", "spearmanr_dist"):
        # Warning: ranking takes time, it's faster to input ranked vectors and
        # use pearsonr distance when doing multiple test on same vectors
        score, _ = spearmanr(vect_a, vect_b)
    elif scoring == "pearsonr":
        score, _ = pearsonr(vect_a, vect_b)
    else:
        raise ValueError("Unknown scoring function")

    # "*_dist" variants turn the correlation into a distance.
    if scoring.endswith("_dist"):
        return 1 - score
    return score
def score_contextualized(embeddings, layers, RG, WS, SL, SV, embedding_keys):
    """Score per-layer embeddings on four word-similarity benchmarks.

    ``RG``, ``WS``, ``SL`` and ``SV`` are iterables of ``(w1, w2, score)``
    triples and ``embedding_keys`` is an iterable of ``(macro, micro)``
    pairs.  For every benchmark, key pair and layer ``0..layers`` the cosine
    similarity of ``embeddings[w][layer][macro][micro]`` is correlated
    (Spearman) with the human scores.

    Returns ``(scores, bests)``: ``scores[benchmark][(macro, micro)]`` is the
    list of rho values per layer, and ``bests[benchmark]`` is the
    ``(layer, rho rounded to 4 decimals)`` pair with the highest rho for the
    ``('mean', 'vec_mean')`` key (the last such layer on ties).
    """
    originals = {"RG65": RG, "WS353": WS, "SL999": SL, "SV3500": SV}
    scores = {name: {} for name in originals}
    bests = {name: {} for name in originals}

    for name, pairs in originals.items():
        for macro, micro in embedding_keys:
            per_layer = []
            scores[name][(macro, micro)] = per_layer
            for layer in range(layers + 1):
                # One pass over the benchmark: (human score, model similarity).
                rows = [
                    (score, 1 - cosine(embeddings[w1][layer][macro][micro],
                                       embeddings[w2][layer][macro][micro]))
                    for w1, w2, score in pairs
                ]
                scores_human = [human for human, _ in rows]
                scores_embed = [model for _, model in rows]
                per_layer.append(spearmanr(scores_human, scores_embed)[0])

    for name in scores:
        ranked = [
            (layer, round(rho, 4))
            for layer, rho in enumerate(scores[name][('mean', 'vec_mean')])
        ]
        # A stable ascending sort followed by [-1] keeps the last layer on ties.
        bests[name] = sorted(ranked, key=lambda item: item[1])[-1]
    return scores, bests
def compute_mean_correlation(nannos):
    """Print the mean Spearman rho between ``bws`` and subsampled BWS scores.

    Repeats ten times: shuffle the pair ids of the module-level ``pairs``,
    keep positions until each unique id has been taken ``nannos`` times,
    recompute the scores with ``compute_bws`` on that subsample and correlate
    them with the module-level ``bws``.  The average rho is printed; nothing
    is returned.
    """
    nreps = 10
    mean_rho = 0
    for _ in range(nreps):
        pair_ids = [get_pid(pair) for pair in pairs]
        upair_ids = np.unique(pair_ids)
        anno_counts = np.zeros(len(upair_ids))

        subsample = []
        shuffled_ids = np.random.choice(pair_ids, len(pair_ids), replace=False)
        for p, pid in enumerate(shuffled_ids):
            slot = upair_ids == pid
            if anno_counts[slot] < nannos:
                anno_counts[slot] += 1
                subsample.append(p)
        print('Got subsample')

        sub_bws = compute_bws(pairs[subsample])
        # Now compute the correlations again
        mean_rho += spearmanr(bws, sub_bws)[0]

    mean_rho /= nreps
    print('Mean rho for %i = %f' % (nannos, mean_rho))
# Build a (samples-per-class x num_classes) table whose column c lists, in
# the order given by ``sorted_``, the indices of the samples labelled c.
# NOTE: ``np.int`` was an alias of the builtin ``int`` and has been removed
# from NumPy; the builtin is used instead.
sorted_indices = np.zeros((int(num_samples / num_classes), num_classes))
for c in range(num_classes):
    class_members = sorted_[np.where(class_labels[sorted_] == c)]
    print(class_members)
    sorted_indices[:, c] = class_members

parent_path = '/cs/labs/daphna/gadic/curriculum_learning/'
save_path = 'cifar100/subset1/'

with open(os.path.join(dataset.data_path, 'sorted_indices_mc_large.pkl'), mode='wb') as file:
    pickle.dump(sorted_indices, file)

# Compare the stored ordering with the one that was just written.
sorted_indices1 = unpickle(os.path.join(dataset.data_path, 'sorted_indices.pkl')).astype(int).reshape(-1, )
# Alternative second ordering: 'sorted_indices_mc.pkl'
sorted_indices2 = unpickle(os.path.join(dataset.data_path, 'sorted_indices_mc_large.pkl')).astype(int).reshape(-1, )

import scipy.stats.mstats as st
print(st.spearmanr(sorted_indices1, sorted_indices2))
print(sorted_indices1[:100])
print(sorted_indices2[:100])
def compute(self, x, y, i):
    """Store Spearman's rho and p-value of ``x`` and ``y`` at slot ``i``.

    The coefficient is written to ``self.Matrices["SPEARMAN"][i]`` and the
    p-value to ``self.Matrices["SPEARMAN_PV"][i]``.  An ``AssertionError`` is
    raised when the samples differ in size or ``i`` is negative.
    """
    assert np.size(x) == np.size(y) and i >= 0
    rho, pv = mstats.spearmanr(x, y)
    self.Matrices["SPEARMAN"][i] = rho
    self.Matrices["SPEARMAN_PV"][i] = pv
''' # plotting domain = [-55, 90, 10, 180] #[-55, -270, 10, -180] #[-55, 90, 10, 180] domain_draw = [-55, 90, 10, 180] #[-55, -270, 10, -180] #[-55, 90, 10, 180] dlat = 10 #30 #10 dlon = 30 #90 #30 llon_obs, llat_obs = np.meshgrid(lon, lat) llon_mdl, llat_mdl = np.meshgrid(lon, lat) bg_col = '0.6' cont_col = '1.0' lev = np.hstack((np.arange(-0.06,-0.0005+0.0005,0.001), \ np.arange(0.0005,0.06+0.0005,0.001))) # PC1 pc = 3 SpearC, tmp = st.spearmanr(pcs_mdl[:, pc], pcs_obs[:, pc]) plt.figure plt.plot(pcs_mdl[:, pc], color='b', linewidth=2) plt.plot(pcs_obs[:, pc], color='k', linewidth=2) ax = plt.gca() ax.axhline(0, color='k') #ax.set_ylim(-3, 3) ax.set_xlabel('Year') ax.legend(['mdl','obs']) ax.set_ylabel('PC amplitude00') ax.set_title('PC4 Time Series', fontsize=16) plt.text(0.3, 0.1, 'Spearman Correlation coefficient:' + str(round(SpearC,3)), \ ha='center', va='center', transform=ax.transAxes, \ fontsize=14) plt.grid() plt.show()
def spear_corr(X, Y):
    """Return the ``mstats.spearmanr`` result for ``X`` and ``Y``.

    The call is made with ``use_ties=True``; the full result
    (coefficient first, p-value second) is passed back unchanged.
    """
    result = mstats.spearmanr(X, Y, use_ties=True)
    return result
data = pd.read_csv(resfile, usecols=[0, 1, 2]) ids = data['id'].values bws = data['bws'].values gppl = data['predicted'].values # ### Ties in the BWS Scores contribute to the discrepeancies between BWS and GPPL # # GPPL scores are all unique, but BWS contains many ties. # Selecting only one of the tied items increases the Spearman correlation. # # Find the ties in BWS. Compute correlations between those tied items for the GPPL scores vs. original BWS scores and GPPL vs. scaled BWS scores. # Do the ties contribute a lot of the differences in the overall ranking? # Another way to test if the ties contribute differences to the ranking: # Select only one random item from each tie and exclude the rest, then recompute. print('with ties included:') print(spearmanr(bws, gppl)[0]) print('with ties present but no correction for ties:') print(spearmanr(bws, gppl, False)[0]) print('with a random sample of one item if there is a tie in bws scores:') total = 0 for sample in range(10): untied_sample_bws = [] untied_sample_gppl = [] ties = [] tiesgppl = [] for i, item in enumerate(ids): if i >= 1 and bws[i] == bws[i - 1]:
def spearman(ypred, y):
    """Return only the Spearman coefficient of ``ypred`` and ``y``."""
    # The second element of the result (the p-value) is discarded.
    return spearmanr(ypred, y)[0]
def run(self):
    """Report how well marker-gene order is conserved between archaeal genomes.

    For every pair of 'Final' Archaea genomes, the relative position
    (first position divided by genome size) of each lineage marker gene is
    compared.  The second genome's order is rotated once per marker gene and
    the largest absolute Spearman and Pearson coefficients over all rotations
    are kept.  Mean and standard deviation over all pairs are printed.
    """
    img = IMG()
    markerset = MarkerSet()

    print('Reading metadata.')
    metadata = img.genomeMetadata('Final')

    print('Getting marker genes.')
    pfamMarkers, tigrMarkers = markerset.getLineageMarkerGenes('Archaea')
    markerGenes = pfamMarkers.union(tigrMarkers)
    print(' Marker genes: ' + str(len(markerGenes)))

    print('Getting genomes of interest.')
    genomeIds = img.genomeIdsByTaxonomy('Archaea', 'Final')
    print(' Genomes: ' + str(len(genomeIds)))

    print('Getting position of each marker gene.')
    geneDistTable = img.geneDistTable(genomeIds, markerGenes,
                                      spacingBetweenContigs=1e6)

    def relative_positions(genome_id):
        # Position of every marker gene relative to the genome size; genes
        # missing from the genome get position -1 and mask flag 1.
        order, mask = [], []
        for marker_id in markerGenes:
            if marker_id in geneDistTable[genome_id]:
                order.append(
                    float(geneDistTable[genome_id][marker_id][0][0]) /
                    metadata[genome_id]['genome size'])
                mask.append(0)
            else:
                order.append(-1)
                mask.append(1)
        return order, mask

    spearmanValues = []
    pearsonValues = []
    genomeIds = list(genomeIds)
    genome_count = len(genomeIds)
    for i in range(genome_count):
        print(str(i + 1) + ' of ' + str(genome_count))
        geneOrderI, maskI = relative_positions(genomeIds[i])

        for j in range(i + 1, genome_count):
            geneOrderJ, maskJ = relative_positions(genomeIds[j])

            # test all translations
            bestSpearman = 0
            bestPearson = 0
            for _ in range(len(markerGenes)):
                # Keep only the genes present in both genomes.
                shared = [
                    k for k in range(len(maskI))
                    if maskI[k] == 0 and maskJ[k] == 0
                ]
                maskedI = [geneOrderI[k] for k in shared]
                maskedJ = [geneOrderJ[k] for k in shared]

                r, _ = spearmanr(maskedI, maskedJ)
                if abs(r) > bestSpearman:
                    bestSpearman = abs(r)

                r, _ = pearsonr(maskedI, maskedJ)
                if abs(r) > bestPearson:
                    bestPearson = abs(r)

                # Rotate genome J by one gene for the next translation.
                geneOrderJ = geneOrderJ[1:] + [geneOrderJ[0]]
                maskJ = maskJ[1:] + [maskJ[0]]

            spearmanValues.append(bestSpearman)
            pearsonValues.append(bestPearson)

    print('Spearman: %.2f +/- %.2f: ' % (mean(spearmanValues), std(spearmanValues)))
    print('Pearson: %.2f +/- %.2f: ' % (mean(pearsonValues), std(pearsonValues)))
# NaN-initialised flat correlation maps, one per climate index.
corr_map_nino34 = np.full(X * Y, np.nan)
corr_map_dmi = np.full(X * Y, np.nan)
corr_map_sam = np.full(X * Y, np.nan)
corr_map_nni = np.full(X * Y, np.nan)
corr_map_eac = np.full(X * Y, np.nan)

# Reconstruct the selected mode at every time step: EOF pattern * PC value.
for tt in np.arange(0, T):
    mode_map[tt, :, :] = np.squeeze(var_EOF[ind, :, :]) * var_PC[tt]
mode_map_ = mode_map.reshape(T, X * Y)

# Spearman correlation of every grid point's series with each index.
index_maps = (
    (corr_map_mei, mei_monthly),
    (corr_map_nino34, nino34_monthly),
    (corr_map_dmi, dmi_monthly),
    (corr_map_sam, sam_monthly),
    (corr_map_nni, nni_monthly),
    # (corr_map_eac, eac_monthly) is not computed
)
for ll in np.arange(0, X * Y):
    for corr_map, index_series in index_maps:
        corr_map[ll], tmp = st.spearmanr(mode_map_[:, ll], index_series)

# Back to the 2-D grid layout.
corr_map_mei = corr_map_mei.reshape(X, Y)
corr_map_nino34 = corr_map_nino34.reshape(X, Y)
corr_map_dmi = corr_map_dmi.reshape(X, Y)
corr_map_sam = corr_map_sam.reshape(X, Y)
corr_map_nni = corr_map_nni.reshape(X, Y)
# corr_map_eac is left flat (its reshape is disabled)

# plot setting
domain = [-55, 90, 10, 180]  # alternatives: [-80, 0, 85, 360]
def train_predict_adaboost(normalized_features, feature_selector, y, num_runs=20, cv=False):
    '''Train AdaBoost repeatedly and return the test Spearman rho of each run.

    :normalized_features: 2-D feature array, one row per sample
    :feature_selector: column selector applied to normalized_features
    :y: targets, indexable by an array of row indices
    :num_runs: number of random splits, or number of folds when cv is True
    :cv: if True, num_runs is used as num_folds; otherwise every run trains
         on the first 4000 rows of a fresh random permutation and tests on
         the rest
    '''
    # adaboost test
    selected_features = normalized_features[:, feature_selector]

    gene_names = [
        'CD5', 'CD45', 'THY1', 'H2-K', 'CD28', 'CD43', 'CD33', 'CD13', 'CD15',
        'CCDC101', 'MED12', 'TADA2B', 'TADA1', 'HPRT1', 'CUL3', 'NF1', 'NF2',
    ]

    def gene_array():
        # A separate array per option, as in the literal configuration.
        return np.array(gene_names, dtype=object)

    learn_options = {
        'V': 3,
        'train_genes': gene_array(),
        'test_genes': gene_array(),
        'target_name': 'score_drug_gene_rank',
        'testing_non_binary_target_name': 'ranks',
        'include_pi_nuc_feat': True,
        'gc_features': True,
        'nuc_features': True,
        'include_gene_position': True,
        'include_NGGX_interaction': True,
        'include_Tm': True,
        'include_strand': False,
        'include_gene_feature': False,
        'include_gene_guide_feature': 0,
        'extra pairs': False,
        'weighted': None,
        'training_metric': 'spearmanr',
        'NDGC_k': 10,
        'cv': 'gene',
        'adaboost_loss': 'ls',
        'include_gene_effect': False,
        'include_drug': False,
        'include_sgRNAscore': False,
        'adaboost_alpha': 0.5,
        'adaboost_CV': False,
        'num_proc': 8,
        'num_thread_per_proc': None,
        'order': 2,
        'normalize_features': True,
        'all pairs': False,
        'include_known_pairs': False,
        'seed': None,
        'flipV1target': False,
        'num_genes_remove_train': None,
        'include_microhomology': False,
        'algorithm_hyperparam_search': 'grid',
        'binary target name': 'score_drug_gene_threshold',
        'rank-transformed target name': 'score_drug_gene_rank',
        'raw target name': None,
        'all_genes': gene_array(),
        'ground_truth_label': 'score_drug_gene_rank',
        'method': 'AdaBoostRegressor',
        'adaboost_version': 'python',
        'adaboost_learning_rate': 0.1,
        'adaboost_n_estimators': 100,
        'adaboost_max_depth': 3,
    }

    n_samples = normalized_features.shape[0]
    sps = []
    if cv:
        # One permutation shared by all folds.
        cv_indices = np.random.permutation(n_samples)
        fold_length = n_samples // num_runs

    for i in range(num_runs):
        if cv:
            fold_start = i * fold_length
            fold_stop = (i + 1) * fold_length
            test = cv_indices[fold_start:fold_stop]
            train = np.concatenate(
                (cv_indices[0:fold_start], cv_indices[fold_stop:n_samples]))
        else:
            indices = np.random.permutation(n_samples)
            train = indices[:4000]
            test = indices[4000:]

        predictions, model = azimuth_adaboost(
            None, train, test, y, None, selected_features, None, None,
            learn_options, False)
        sps.append(spearmanr(predictions, y[test])[0])
    return sps
eac_str = int(np.array(np.where(tim_vec == eac_time[0]))) eac_end = int(np.array(np.where(tim_vec == eac_time[-1]))) eac_monthly = signal.detrend(eac_monthly) ''' # EAC transport -- BRAN from Zeya's analysis # 1994-2016 (Aug) df = pd.read_csv('/v_Munk_Drive/ecougnon/data/transport_y.csv', header=None) eac_monthly = (df.iloc[:, 0]) * 110000 * np.cos( (np.pi / 180) * 37) * 0.1 * 10**(-6) eac_monthly = signal.detrend(eac_monthly) eac_str = int(12 * 12) eac_end = int(12 * 12 + len(eac_monthly) - 1) # calculate correlation coefficient using the spearman rank corr_mei, p_mei = st.spearmanr(var_PC, mei_monthly) corr_mei_std, p_mei_std = st.spearmanr(var_PC, mei_monthly_std) corr_nino34, p_nino34 = st.spearmanr(var_PC, nino34_monthly) corr_nino34_a, p_nino34_a = st.spearmanr(var_PC, nino34_monthly_a) corr_nino34_std, p_nino34_std = st.spearmanr(var_PC, nino34_monthly_std) corr_nino3, p_nino3 = st.spearmanr(var_PC, nino3_monthly) corr_nino3_a, p_nino3_a = st.spearmanr(var_PC, nino3_monthly_a) corr_nino3_std, p_nino3_std = st.spearmanr(var_PC, nino3_monthly_std) corr_nino4, p_nino4 = st.spearmanr(var_PC, nino4_monthly) corr_nino4_a, p_nino4_a = st.spearmanr(var_PC, nino4_monthly_a) corr_nino4_std, p_nino4_std = st.spearmanr(var_PC, nino4_monthly_std) corr_dmi, p_dmi = st.spearmanr(var_PC, dmi_monthly)
def run(self):
    """Report how well marker-gene order is conserved between archaeal genomes.

    For every pair of 'Final' Archaea genomes, the relative position
    (first position divided by genome size) of each lineage marker gene is
    compared.  The second genome's order is rotated once per marker gene and
    the largest absolute Spearman and Pearson coefficients over all rotations
    are kept.  Mean and standard deviation over all pairs are printed.

    Written with single-argument ``print(...)`` calls and ``range`` so that it
    runs unchanged on Python 2 and Python 3 (the print statements and
    ``xrange`` used before were Python 2 only).
    """
    img = IMG()
    markerset = MarkerSet()

    print('Reading metadata.')
    metadata = img.genomeMetadata('Final')

    print('Getting marker genes.')
    pfamMarkers, tigrMarkers = markerset.getLineageMarkerGenes('Archaea')
    markerGenes = pfamMarkers.union(tigrMarkers)
    print(' Marker genes: ' + str(len(markerGenes)))

    print('Getting genomes of interest.')
    genomeIds = img.genomeIdsByTaxonomy('Archaea', 'Final')
    print(' Genomes: ' + str(len(genomeIds)))

    print('Getting position of each marker gene.')
    geneDistTable = img.geneDistTable(genomeIds, markerGenes)

    def relative_positions(genome_id):
        # Position of every marker gene relative to the genome size; genes
        # missing from the genome get position -1 and mask flag 1.
        order, mask = [], []
        for marker_id in markerGenes:
            if marker_id in geneDistTable[genome_id]:
                order.append(
                    float(geneDistTable[genome_id][marker_id][0][0]) /
                    metadata[genome_id]['genome size'])
                mask.append(0)
            else:
                order.append(-1)
                mask.append(1)
        return order, mask

    spearmanValues = []
    pearsonValues = []
    genomeIds = list(genomeIds)
    for i in range(0, len(genomeIds)):
        print(str(i + 1) + ' of ' + str(len(genomeIds)))
        geneOrderI, maskI = relative_positions(genomeIds[i])

        for j in range(i + 1, len(genomeIds)):
            geneOrderJ, maskJ = relative_positions(genomeIds[j])

            # test all translations
            bestSpearman = 0
            bestPearson = 0
            for _ in range(0, len(markerGenes)):
                # Keep only the genes present in both genomes.
                maskedI = []
                maskedJ = []
                for k in range(0, len(maskI)):
                    if maskI[k] == 0 and maskJ[k] == 0:
                        maskedI.append(geneOrderI[k])
                        maskedJ.append(geneOrderJ[k])

                r, _ = spearmanr(maskedI, maskedJ)
                if abs(r) > bestSpearman:
                    bestSpearman = abs(r)

                r, _ = pearsonr(maskedI, maskedJ)
                if abs(r) > bestPearson:
                    bestPearson = abs(r)

                # Rotate genome J by one gene for the next translation.
                geneOrderJ = geneOrderJ[1:] + [geneOrderJ[0]]
                maskJ = maskJ[1:] + [maskJ[0]]

            spearmanValues.append(bestSpearman)
            pearsonValues.append(bestPearson)

    print('Spearman: %.2f +/- %.2f: ' % (mean(spearmanValues), std(spearmanValues)))
    print('Pearson: %.2f +/- %.2f: ' % (mean(pearsonValues), std(pearsonValues)))
def post(self, request):
    # Correlates comment statistics of news items with text properties.
    #
    # Reads a JSON body with 'parameters' -> 'database' and 'algorithm',
    # runs several raw SQL queries on that database and returns a
    # JsonResponse with one entry per (comment statistic, text property)
    # pair.
    input_json = json.loads(request.body)
    database = input_json['parameters']['database']
    if input_json['parameters']['database'] not in connections:
        # NOTE(review): `jsonify` is not defined in this block, while the
        # success path returns JsonResponse - confirm this name resolves.
        return jsonify(status='error', message="Invalid database!")
    # Per-news-item text properties.
    lotz_data_query = """ SELECT newsitem.id nid, text.id tid, text.wordcount, newsitem.qty_videos, newsitem.qty_images, LENGTH(text.text)/wordcount avg_wordlength, newsitem.qty_citations FROM newsitem, text WHERE newsitem.text_id = text.id """
    # Number of comments per news item that has a text.
    qty_comments_query = """ SELECT newsitem.id nid, COUNT(*) qty_comments FROM newsitem, comment WHERE comment.NewsItemID = newsitem.id AND newsitem.text_id IS NOT NULL GROUP BY nid """
    # Positive / negative / neutral comment counts for one algorithm.
    comments_score_query = """ SELECT newsitem_id nid, pos_comments, neg_comments, neutral_comments FROM newsitem, newsitem_pos_neg_comments WHERE algorithm_id = %s AND newsitem.id = newsitem_id AND newsitem.text_id IS NOT NULL """
    # Result values of one algorithm per news item.
    algorithm_results_query = """ SELECT newsitem.id nid, result.value FROM newsitem, text, result WHERE newsitem.text_id = text.id AND text.id = result.text_id AND result.algorithm_id = %s """
    algorithm_anew_valence, _ = Algorithm.objects.using(database)\
        .get_or_create(name="anew_valence")
    algorithm_anew_arousal, _ = Algorithm.objects.using(database)\
        .get_or_create(name="anew_arousal")
    algorithm_anew_dominance, _ = Algorithm.objects.using(database)\
        .get_or_create(name="anew_dominance")
    od_number_of_comments = OrderedDict()
    cursor = connections[database].cursor()
    cursor.execute(lotz_data_query)
    arr_number_of_words = []
    arr_number_of_images = []
    arr_number_of_videos = []
    arr_avg_wordlength = []
    arr_number_of_citations = []
    for row in dictfetch(cursor):
        # Every news item starts with 0.0 so that items missing from the
        # later queries keep a zero in the same position.
        od_number_of_comments[row['nid']] = float(0)
        arr_number_of_words.append(float(row['wordcount']))
        arr_number_of_images.append(float(row['qty_images']))
        arr_number_of_videos.append(float(row['qty_videos']))
        arr_avg_wordlength.append(float(row['avg_wordlength']))
        arr_number_of_citations.append(float(row['qty_citations']))
    # Copies are taken while all values are still 0.0; they share the key
    # order of od_number_of_comments.
    od_number_of_positive_comments = od_number_of_comments.copy()
    od_number_of_negative_comments = od_number_of_comments.copy()
    od_number_of_neutral_comments = od_number_of_comments.copy()
    od_valence = od_number_of_comments.copy()
    od_arousal = od_number_of_comments.copy()
    od_dominance = od_number_of_comments.copy()
    cursor.execute(comments_score_query, [input_json['algorithm']])
    for row in dictfetch(cursor):
        od_number_of_positive_comments[row['nid']] = float(row['pos_comments'])
        od_number_of_negative_comments[row['nid']] = float(row['neg_comments'])
        od_number_of_neutral_comments[row['nid']] = float(row['neutral_comments'])
    # NOTE(review): on Python 3 `.values()` returns a view, not a list -
    # confirm spearmanr/pearsonr accept it in the target environment.
    arr_number_of_positive_comments = od_number_of_positive_comments.values()
    arr_number_of_negative_comments = od_number_of_negative_comments.values()
    arr_number_of_neutral_comments = od_number_of_neutral_comments.values()
    cursor.execute(algorithm_results_query, [algorithm_anew_valence.id])
    for row in dictfetch(cursor):
        od_valence[row['nid']] = float(row['value'])
    arr_valence = od_valence.values()
    cursor.execute(algorithm_results_query, [algorithm_anew_arousal.id])
    for row in dictfetch(cursor):
        od_arousal[row['nid']] = float(row['value'])
    arr_arousal = od_arousal.values()
    cursor.execute(algorithm_results_query, [algorithm_anew_dominance.id])
    for row in dictfetch(cursor):
        od_dominance[row['nid']] = float(row['value'])
    arr_dominance = od_dominance.values()
    cursor.execute(qty_comments_query)
    for row in dictfetch(cursor):
        od_number_of_comments[row['nid']] = float(row['qty_comments'])
    arr_number_of_comments = od_number_of_comments.values()
    # `end` is not read anywhere in this block.
    end = time.time()
    # x axis of the result grid: comment statistics.
    column_arrays = [
        arr_number_of_comments, arr_number_of_positive_comments,
        arr_number_of_negative_comments, arr_number_of_neutral_comments
    ]
    # y axis of the result grid: text properties; indices 5-7 are the ANEW
    # valence / arousal / dominance values.
    row_arrays = [
        arr_number_of_words, arr_number_of_images, arr_number_of_videos,
        arr_avg_wordlength, arr_number_of_citations, arr_valence,
        arr_arousal, arr_dominance
    ]
    message = ""
    results = []
    for x_index in range(len(column_arrays)):
        for y_index in range(len(row_arrays)):
            if y_index >= 5:
                # ANEW rows are scored with Spearman's rho.
                coef = spearmanr(column_arrays[x_index], row_arrays[y_index])
                correlation = Decimal(coef[0])
                value = round(correlation, 3)
                # `.data` presumably unwraps a masked p-value - TODO confirm
                # which spearmanr implementation is imported.
                probability = Decimal(float(coef[1].data))
            else:
                # All other rows are scored with Pearson's r.
                coef = pearsonr(column_arrays[x_index], row_arrays[y_index])
                if numpy.isnan(coef[0]):
                    # NOTE(review): only `value` is set here; `correlation`
                    # and `probability` keep the values of an earlier
                    # iteration (or are undefined on the very first one).
                    value = -1
                else:
                    correlation = Decimal(coef[0])
                    value = round(correlation, 3)
                    probability = Decimal(coef[1])
            results.append({'x': x_index, 'y': y_index, 'value': value, 'correlation': (correlation, probability)})
    # make result
    res={ 'results': results, 'message': message }
    # return json
    return JsonResponse(res)
def compute(self, x, y):
    """Compute Spearman's rank correlation for two equally sized inputs.

    ``x`` and ``y`` must hold the same number of elements (compared with
    ``np.size``). They are passed to ``mstats.spearmanr`` and the two
    values it yields are returned in a dict: the coefficient under
    ``"SPEARMAN"`` and the p-value under ``"SPEARMAN_PV"``.
    """
    assert np.size(x) == np.size(y)
    outcome = mstats.spearmanr(x, y)
    coefficient, p_value = outcome
    return {"SPEARMAN": coefficient, "SPEARMAN_PV": p_value}
import shutil

# %-style template filled from a mapping with the keys npy_fname, function,
# start, end, m and date.
LOG_MSG = "#npy_fname=%(npy_fname)s, function=%(function)s, start=%(start)d, end=%(end)d, m=%(m)d, date=%(date)s"
# NOTE(review): presumably the progress-report interval; its use is not
# visible in this part of the file -- confirm.
REPORT_N = 1000

# get username
TMP_DIR = "/tmp/%s" % pwd.getpwuid(os.getuid()).pw_name


def euclidean(x, y):
    """Return ``ma.sqrt`` of the summed product of ``x - y`` and its transpose.

    For 1-D arrays the transpose is a no-op, so this is the usual Euclidean
    distance between ``x`` and ``y``.
    """
    q = x - y
    return ma.sqrt((q * q.T).sum())


# this should be in a separate file
# Maps a measure name to a callable taking ``(x, y)``. The mstats-based
# entries keep only element 0 of the result returned by the mstats function.
FUNCTIONS = {
    'pearson': lambda x, y: mstats.pearsonr(x, y)[0],
    'spearman': lambda x, y: mstats.spearmanr(x, y)[0],
    'euclidean': euclidean,
    'kendalltau': lambda x, y: mstats.kendalltau(x, y)[0],
    'dcor': dcor,
}


def main(npy_fname=None, function=None, batchname=None, outdir=None,
         start=None, end=None, m=None):
    """Compute pairs of dependency.

    Only the argument validation is shown here.

    Parameters
    ----------
    npy_fname : str
        Required (must be truthy).
    function : str
        Required; must be one of the keys of ``FUNCTIONS``.
    batchname, start, end
        Accepted but not used by the statements in this block.
    outdir : str
        Must name an existing directory.
    m
        Converted with ``int``; the result must be greater than zero.

    Raises
    ------
    AssertionError
        If any of the checks above fails.
    """
    # Written with ``and`` on purpose: ``assert a, b`` would only test ``a``
    # and use ``b`` as the failure message, leaving ``function`` unchecked.
    assert npy_fname and function
    assert function in FUNCTIONS
    assert os.path.exists(outdir)
    assert os.path.isdir(outdir)
    m = int(m)
    assert m > 0
def calculate_spearman(gold_filename, matrix_filename, similarity_function):
    """
    Calculate Spearman coefficient between a corpus of similarities of word
    pairs and a bigram model as produced by read_bigram_matrix.py.

    Only word pairs whose two words are both contained in the matrix
    contribute to the coefficient.

    Parameters
    ----------
    gold_filename : String
        Filename of the corpus. Assumes that there is one word pair per
        line in the format "word1 word2 similarity". Lines that do not
        match this format are printed and skipped.
    matrix_filename : String
        File containing the bigram model; it is loaded with ``DS_matrix``.
    similarity_function : function
        Called with the vectors of the two words; its return value is used
        as the model's similarity for the pair.

    Return
    ------
    spearman : Float
        Spearman coefficient
    """
    pair_pattern = re.compile(r"(\S+)\s(\S+)\s(\S+)")

    # Collect (word1, word2, similarity) string triples from the gold file.
    gold_pairs = []
    with open(gold_filename) as gold_file:
        for line in gold_file:
            hit = pair_pattern.match(line)
            if hit is None:
                # Echo the unparsable line and move on to the next one.
                print(line)
                continue
            gold_pairs.append(hit.groups())

    model = DS_matrix(matrix_filename)
    model_scores = []
    gold_scores = []
    for first, second, gold_score in gold_pairs:
        if model.contains(first) and model.contains(second):
            first_vec = model.get_vector(first)
            second_vec = model.get_vector(second)
            model_scores.append(similarity_function(first_vec, second_vec))
            # The gold score is parsed only for pairs that are actually used.
            gold_scores.append(float(gold_score))

    rho, _ = spearmanr(model_scores, gold_scores)
    return rho
# Draw a histogram of each sample in its own figure, then display them.
plt.figure()
plt.hist(d1)
plt.figure()
plt.hist(d2)
plt.show()

# Print the result of normaltest for each sample.
print(normaltest(d1))
print(normaltest(d2))

# Print the Pearson and then the Spearman correlation between d1 and d2;
# r and p are reused, so afterwards they hold the Spearman values.
r, p = pearsonr(d1, d2)
print('Pearson correlation coeff: {}, p-value: {}'.format(r, p))
r, p = spearmanr(d1, d2)
print('Spearman correlation coeff: {}, p-value: {}'.format(r, p))

print('-' * 30)

# e. compare distribution
from scipy.stats import ttest_ind

# t-test requirement: independent samples with identical variances
# compare samples
def do_ttest(dat1, dat2, name1, name2, alpha=0.05):
    # NOTE(review): only this first statement of the function is visible
    # here; presumably the rest runs ttest_ind on dat1/dat2 and reports the
    # result against alpha using name1/name2 as labels -- confirm.
    print('-' * 20)
output = output / torch.norm(output, p=2, dim=1) output = output - torch.mean(output) loss = -torch.sum( torch.exp(batch_out) * output[:, :, 0] - output[:, :, 0] ) # profit = SUM[gi*xi - xi], gi=linear gain, xi=investment (negative if short) # TODO: Add heavy L1 cost to promote small number of trades? # Compute gradients and update parameters loss.backward() optimizer.step() loss_history[i_iter] = loss.data.cpu().numpy() loss_baseline[i_iter] = criterion( TT(np.zeros(batch_out.size(), dtype=np.float32)), batch_out).data.cpu().numpy() spear[i_iter] = spearmanr(output[0, :, 0].data.cpu().numpy(), batch_out[0, :].data.cpu().numpy())[0] if (i_iter + 1) % n_i_iter_per_log == 0 | (i_iter == 0): print('Iteration %d, Loss=%0.5f, Duration=%0.3f' % (i_iter + 1, loss.data.cpu().numpy(), time.time() - t_start)) ## Put testing data through model net.eval() loss_day = np.asarray([]) for i_test_samp in range(n_days_test - n_days_input): # Build the batch ix_date_start = np.asarray([n_dates - n_days_test + i_test_samp]) batch_in = np.zeros((1, n_symbols, n_days_input, 4)) batch_out = np.zeros((1, n_symbols))