Example No. 1
    def test_spearmanr(self):
        # Tests some computations of Spearman's rho
        (x, y) = ([5.05, 6.75, 3.21, 2.66], [1.65, 2.64, 2.64, 6.95])
        assert_almost_equal(mstats.spearmanr(x, y)[0], -0.6324555)
        (x, y) = ([5.05, 6.75, 3.21, 2.66,
                   np.nan], [1.65, 2.64, 2.64, 6.95, np.nan])
        (x, y) = (ma.fix_invalid(x), ma.fix_invalid(y))
        assert_almost_equal(mstats.spearmanr(x, y)[0], -0.6324555)

        x = [
            2.0, 47.4, 42.0, 10.8, 60.1, 1.7, 64.0, 63.1, 1.0, 1.4, 7.9, 0.3,
            3.9, 0.3, 6.7
        ]
        y = [
            22.6, 8.3, 44.4, 11.9, 24.6, 0.6, 5.7, 41.6, 0.0, 0.6, 6.7, 3.8,
            1.0, 1.2, 1.4
        ]
        assert_almost_equal(mstats.spearmanr(x, y)[0], 0.6887299)
        x = [
            2.0, 47.4, 42.0, 10.8, 60.1, 1.7, 64.0, 63.1, 1.0, 1.4, 7.9, 0.3,
            3.9, 0.3, 6.7, np.nan
        ]
        y = [
            22.6, 8.3, 44.4, 11.9, 24.6, 0.6, 5.7, 41.6, 0.0, 0.6, 6.7, 3.8,
            1.0, 1.2, 1.4, np.nan
        ]
        (x, y) = (ma.fix_invalid(x), ma.fix_invalid(y))
        assert_almost_equal(mstats.spearmanr(x, y)[0], 0.6887299)

        # test for namedtuple attributes
        res = mstats.spearmanr(x, y)
        attributes = ('correlation', 'pvalue')
        check_named_results(res, attributes, ma=True)
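
A minimal sketch of the pattern this test exercises: ma.fix_invalid masks the NaN entries, and mstats.spearmanr then skips the masked pair, so the five-element inputs give the same rho as the clean four-element ones.

import numpy as np
import numpy.ma as ma
from scipy.stats import mstats

# Mask invalid (NaN) entries; spearmanr ignores masked pairs
x = ma.fix_invalid([5.05, 6.75, 3.21, 2.66, np.nan])
y = ma.fix_invalid([1.65, 2.64, 2.64, 6.95, np.nan])
res = mstats.spearmanr(x, y)
print(res.correlation)  # ~ -0.6324555, same as without the NaN pair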
Example No. 2
 def test_spearmanr(self):
     "Tests some computations of Spearman's rho"
     (x, y) = ([5.05, 6.75, 3.21, 2.66], [1.65, 2.64, 2.64, 6.95])
     assert_almost_equal(mstats.spearmanr(x, y)[0], -0.6324555)
     (x, y) = ([5.05, 6.75, 3.21, 2.66,
                np.nan], [1.65, 2.64, 2.64, 6.95, np.nan])
     (x, y) = (ma.fix_invalid(x), ma.fix_invalid(y))
     assert_almost_equal(mstats.spearmanr(x, y)[0], -0.6324555)
     #
     x = [
         2.0, 47.4, 42.0, 10.8, 60.1, 1.7, 64.0, 63.1, 1.0, 1.4, 7.9, 0.3,
         3.9, 0.3, 6.7
     ]
     y = [
         22.6, 8.3, 44.4, 11.9, 24.6, 0.6, 5.7, 41.6, 0.0, 0.6, 6.7, 3.8,
         1.0, 1.2, 1.4
     ]
     assert_almost_equal(mstats.spearmanr(x, y)[0], 0.6887299)
     x = [
         2.0, 47.4, 42.0, 10.8, 60.1, 1.7, 64.0, 63.1, 1.0, 1.4, 7.9, 0.3,
         3.9, 0.3, 6.7, np.nan
     ]
     y = [
         22.6, 8.3, 44.4, 11.9, 24.6, 0.6, 5.7, 41.6, 0.0, 0.6, 6.7, 3.8,
         1.0, 1.2, 1.4, np.nan
     ]
     (x, y) = (ma.fix_invalid(x), ma.fix_invalid(y))
     assert_almost_equal(mstats.spearmanr(x, y)[0], 0.6887299)
Example No. 3
    def test_spearmanr(self):
        # Tests some computations of Spearman's rho
        (x, y) = ([5.05, 6.75, 3.21, 2.66], [1.65, 2.64, 2.64, 6.95])
        assert_almost_equal(mstats.spearmanr(x, y)[0], -0.6324555)
        (x, y) = ([5.05, 6.75, 3.21, 2.66, np.nan], [1.65, 2.64, 2.64, 6.95, np.nan])
        (x, y) = (ma.fix_invalid(x), ma.fix_invalid(y))
        assert_almost_equal(mstats.spearmanr(x, y)[0], -0.6324555)

        x = [2.0, 47.4, 42.0, 10.8, 60.1, 1.7, 64.0, 63.1, 1.0, 1.4, 7.9, 0.3, 3.9, 0.3, 6.7]
        y = [22.6, 8.3, 44.4, 11.9, 24.6, 0.6, 5.7, 41.6, 0.0, 0.6, 6.7, 3.8, 1.0, 1.2, 1.4]
        assert_almost_equal(mstats.spearmanr(x, y)[0], 0.6887299)
        x = [2.0, 47.4, 42.0, 10.8, 60.1, 1.7, 64.0, 63.1, 1.0, 1.4, 7.9, 0.3, 3.9, 0.3, 6.7, np.nan]
        y = [22.6, 8.3, 44.4, 11.9, 24.6, 0.6, 5.7, 41.6, 0.0, 0.6, 6.7, 3.8, 1.0, 1.2, 1.4, np.nan]
        (x, y) = (ma.fix_invalid(x), ma.fix_invalid(y))
        assert_almost_equal(mstats.spearmanr(x, y)[0], 0.6887299)
Example No. 4
def all_pairs_spearman(M):
  """This should return a squareform matrix"""
  C = np.zeros((len(M), len(M)))
  for i in xrange(len(M)):
    for j in xrange(i+1, len(M)):
      C[i][j] = mstats.spearmanr(M[i],M[j])[0]
  return C
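
The helper above fills only the strict upper triangle and uses Python 2's xrange. A hedged Python 3 sketch that returns the full symmetric matrix with a unit diagonal (assuming M is a sequence of equal-length 1-D arrays):

import numpy as np
from scipy.stats import mstats

def all_pairs_spearman_full(M):
    # Symmetric Spearman matrix: rho(i, j) mirrored, ones on the diagonal
    n = len(M)
    C = np.eye(n)
    for i in range(n):
        for j in range(i + 1, n):
            C[i, j] = C[j, i] = mstats.spearmanr(M[i], M[j])[0]
    return C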
Example No. 5
def evalsemrel(l1, l2, pairsfile, l1colnum, l2colnum, l2gpmffile, l1l2methpmffile):
    '''Evaluate semantic relatedness.
    l1 and l2 = languages s and t in p(t|s)
    pairsfile = translation pairs in l1 and l2, l1 words in column <colnum>
    l2gpmffile = gold pmf over l2 words, for l2 words (including those in pairsfile)
    l1l2methpmffile = method-induced pmf over l2 words, for l1 words (including those
                      in pairsfile)
    '''
    l2gpmf = L1L2PMF(l2, l2, l2gpmffile)
    l1l2pmf = L1L2PMF(l1, l2, l1l2methpmffile)
#    print "#JSDiv Spearmanr"
    jsds, rhos = [], []
    for line in open(pairsfile):
        line = line.decode('utf-8').rstrip()
        pair = line.split()
        w1, w2 = pair[l1colnum-1], pair[l2colnum-1]
        gpmf = l2gpmf.pmf[w2] #gold pmf
        mpmf = l1l2pmf.pmf[w1] #method pmf
        vecs = [ (gpmf[x2], mpmf[x2]) for x2 in gpmf if x2 in mpmf]
        vecs.extend( [ (gpmf[x2], 0.0) for x2 in gpmf if x2 not in mpmf] )
        vecs.extend( [ (0.0, mpmf[x2]) for x2 in mpmf if x2 not in gpmf] )
        gvec, mvec = zip(*vecs)
        jsd = MyUtils.jsd(gvec, mvec, base=2)
        rho, pval = mstats.spearmanr(gvec, mvec, use_ties=True)
        jsds.append(jsd)
        rhos.append(rho)
        print "%f\t%f\t%f" % (jsd, rho, pval)
    print "\t\t\t%f\t%f" % ( sum(jsds)/len(jsds), sum(rhos)/len(rhos) )
Example No. 6
def evalsemrel(l1, l2, pairsfile, l1colnum, l2colnum, l2gpmffile,
               l1l2methpmffile):
    '''Evaluate semantic relatedness.
    l1 and l2 = languages s and t in p(t|s)
    pairsfile = translation pairs in l1 and l2, l1 words in column <colnum>
    l2gpmffile = gold pmf over l2 words, for l2 words (including those in pairsfile)
    l1l2methpmffile = method-induced pmf over l2 words, for l1 words (including those
                      in pairsfile)
    '''
    l2gpmf = L1L2PMF(l2, l2, l2gpmffile)
    l1l2pmf = L1L2PMF(l1, l2, l1l2methpmffile)
    #    print "#JSDiv Spearmanr"
    jsds, rhos = [], []
    for line in open(pairsfile):
        line = line.decode('utf-8').rstrip()
        pair = line.split()
        w1, w2 = pair[l1colnum - 1], pair[l2colnum - 1]
        gpmf = l2gpmf.pmf[w2]  #gold pmf
        mpmf = l1l2pmf.pmf[w1]  #method pmf
        vecs = [(gpmf[x2], mpmf[x2]) for x2 in gpmf if x2 in mpmf]
        vecs.extend([(gpmf[x2], 0.0) for x2 in gpmf if x2 not in mpmf])
        vecs.extend([(0.0, mpmf[x2]) for x2 in mpmf if x2 not in gpmf])
        gvec, mvec = zip(*vecs)
        jsd = MyUtils.jsd(gvec, mvec, base=2)
        rho, pval = mstats.spearmanr(gvec, mvec, use_ties=True)
        jsds.append(jsd)
        rhos.append(rho)
        print "%f\t%f\t%f" % (jsd, rho, pval)
    print "\t\t\t%f\t%f" % (sum(jsds) / len(jsds), sum(rhos) / len(rhos))
Example No. 7
 def compute(self, x, y):
   assert np.size(x) == np.size(y)
   rho, pv = mstats.spearmanr(x,y)
   return {
     "SPEARMAN": rho,
     "SPEARMAN_PV": pv
     }
Example No. 8
    def m_scagnostics(self):
        graph = self._mst_graph
        weights = [graph[a][b]['weight'] for a,b in graph.edges]
        weights.sort()
        quant25 = np.quantile(weights,0.25)
        quant75 = np.quantile(weights,0.75)
        quant10 = np.quantile(weights,0.1)
        quant90 = np.quantile(weights,0.9)
        quant50 = np.quantile(weights,0.5)
        crit = quant75 + 1.5*(quant75-quant25)
        longEdgesSum = np.sum(list(filter(lambda x: x>crit, weights)))
        subgraphs = [graph.copy() for a,b in graph.edges]
        edges = [[a,b] for a,b in graph.edges]
        clumpylist = []
        i= 0
        for g in subgraphs:
            g.remove_edge(edges[i][0],edges[i][1])
            # connected_component_subgraphs was removed in networkx 2.4
            minComp = min((g.subgraph(c) for c in nx.connected_components(g)),
                          key=len)
            if len(minComp.edges) >0 and graph[edges[i][0]][edges[i][1]]['weight'] > 0:
                maxEdge = max([g[a][b]['weight'] for a,b in minComp.edges])
                maxEdge = max(0.00001,maxEdge)
                val = 1- (maxEdge/graph[edges[i][0]][edges[i][1]]['weight'])
                clumpylist.append(val)
            i+=1
        diameter = max([max(i[1][0].values()) for i in nx.all_pairs_dijkstra(graph)])

        self.clumpy_measure  = max(clumpylist)
        self.sparse_measure  = min(1,quant90)
        self.stringy_measure = diameter/graph.size(weight='weight')
        self.spearmanr_measure = spearmanr([i[0] for i in self.projects],[i[1] for i in self.projects]).correlation
        self.outlying_measure = longEdgesSum/np.sum(weights)
        self.skewed_measure = (quant90-quant50)/(quant90-quant10)
Example No. 9
def correlate_traits(eigengene_path, trait_path, trait_types, ordinals):
    traits = pd.read_csv(trait_path, index_col=0)
    continuous_traits = traits[[
        trait for trait, type_ in trait_types.items() if type_ == 'C'
    ]]
    nominal_traits = traits[[
        trait for trait, type_ in trait_types.items() if type_ == 'N'
        if trait not in ordinals
    ]]
    nominal_traits = pd.get_dummies(nominal_traits)
    ordinal_traits = traits[list(ordinals.keys())]
    for trait, variables in ordinals.items():
        variables = {v: i for i, v in enumerate(variables)}
        ordinal_traits[trait] = ordinal_traits[trait].replace(variables)
    traits = pd.concat([continuous_traits, nominal_traits, ordinal_traits],
                       axis=1)

    eigengenes = pd.read_csv(eigengene_path, index_col=0)

    corrs = pd.DataFrame(index=eigengenes.columns, columns=traits.columns)
    for module in corrs.index:
        for trait in corrs.columns:
            corrs.at[module, trait] = spearmanr(eigengenes[module],
                                                traits[trait]).correlation

    return corrs
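
A small illustration of the ordinal recoding step above, with made-up trait values:

import pandas as pd

ordinals = {'stage': ['I', 'II', 'III']}  # categories in rank order
traits = pd.DataFrame({'stage': ['II', 'I', 'III']})
codes = {v: i for i, v in enumerate(ordinals['stage'])}
print(traits['stage'].replace(codes).tolist())  # [1, 0, 2]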
Example No. 10
 def spearman(self):
     slicez, islicez = self.slices(method='variable')
     #corrs = np.array([spearmanr(*self.dtw_resample(sl))[0] if len(sl) else 0.0 for sl in slicez])
     corrs = np.nan_to_num([
         spearmanr(*self.dtw_resample(sl))[0] if len(sl) else 0.0
         for sl in slicez
     ])
     corrs = np.clip(corrs, a_min=0.0, a_max=1.0)
     return corrs
Example No. 11
def get_r(data):
    x = [8,13,21,34,55]
    cl = np.array(data['CL'],dtype='f8')
    for i in x:
        data['m'+str(i)] = ta.EMA(cl,timeperiod=i)
    for i,r in data.iterrows():
        if math.isnan(r['m55']): continue
        z = [[r['m8'],8],[r['m13'],13],[r['m21'],21],[r['m34'],34],[r['m55'],55]]
        c,p = mstats.spearmanr(x,[k[1] for k in sorted(z,reverse=True)])
        data.loc[i,'r'] = c
    return data['r']
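
For intuition, a tiny worked example of the ranking trick above with hypothetical EMA values: in a clean uptrend the shortest EMA is largest, the periods come out in ascending order, and rho is +1; in a downtrend the order reverses and rho approaches -1.

from scipy.stats import mstats

x = [8, 13, 21, 34, 55]
z = [[105.0, 8], [103.0, 13], [101.0, 21], [100.0, 34], [99.0, 55]]  # uptrend
periods_by_value = [k[1] for k in sorted(z, reverse=True)]  # [8, 13, 21, 34, 55]
c, p = mstats.spearmanr(x, periods_by_value)
print(c)  # 1.0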
Example No. 12
    def get_spearman(self, human_assessment_dict) -> Tuple[float, float]:
        human_assessment_values = []
        cosine_values = []
        total_count = 0
        found_count = 0
        found_concepts = []
        all_concepts = []
        tqdm_bar = tqdm(human_assessment_dict.items(),
                        total=len(human_assessment_dict))
        # suma = 0
        for concept, other_concepts in tqdm_bar:
            total_count += len(other_concepts)
            for c in other_concepts:
                all_concepts.append((concept, c))
            concept_vec = self.get_concept_vector(concept, give_none=True)
            if concept_vec is not None:
                for other_concept in other_concepts:
                    other_concept_vec = self.get_concept_vector(other_concept,
                                                                give_none=True)
                    if other_concept_vec is not None:
                        # cos = self.cosine(vector1=concept_vec, vector2=other_concept_vec,
                        #                   same_vec_zero=not(concept == other_concept))
                        cos = self.cosine(vector1=concept_vec,
                                          vector2=other_concept_vec,
                                          same_vec_zero=False)
                        if cos == 1 and concept != other_concept:
                            pass
                            # human_assessment_values.append(0)
                            # cosine_values.append(cos)
                        else:
                            human_assessment_values.append(
                                human_assessment_dict[concept][other_concept])
                            cosine_values.append(cos)
                            found_count += 1
                            found_concepts.append((concept, other_concept))
                        tqdm_bar.set_description(
                            f"{self.__class__.__name__} Beam ({self.dataset}|{self.algorithm}|{self.preprocessing})"
                        )
                        tqdm_bar.update()

        if len(human_assessment_values) > 0 and len(cosine_values) > 0:
            cor, _ = spearmanr(human_assessment_values, cosine_values)
        else:
            cor = 0
        benchmark_coverage = found_count / total_count
        # print(suma)
        # print('cov1:', benchmark_coverage)
        # print('cov2:', len(found_concepts)/len(all_concepts))
        # print(len(found_concepts), len(all_concepts))
        # print('found:', found_concepts)
        # print('total', all_concepts)
        return cor, benchmark_coverage
Example No. 13
def score_decontextualized(embeddings, layers, RG, WS, SL, SV, embedding_keys):
	originals = {"RG65" : RG, "WS353" : WS, "SL999" : SL, "SV3500" : SV}
	scores = {"RG65" : {}, "WS353" : {}, "SL999" : {}, "SV3500" : {}}
	for key in scores:
		for embedding_key in embedding_keys:
			scores_human = []
			scores_embed = []
			for w1, w2, score in originals[key]:
				e1 = embeddings[w1][embedding_key]
				e2 = embeddings[w2][embedding_key]
				cos = 1 - cosine(e1, e2)
				scores_human.append(score)
				scores_embed.append(cos)
			scores[key][embedding_key]= round(spearmanr(scores_human, scores_embed)[0], 4)	
	return scores
Example No. 14
    def test_spearmanr(self):
        # Tests some computations of Spearman's rho
        (x, y) = ([5.05,6.75,3.21,2.66],[1.65,2.64,2.64,6.95])
        assert_almost_equal(mstats.spearmanr(x,y)[0], -0.6324555)
        (x, y) = ([5.05,6.75,3.21,2.66,np.nan],[1.65,2.64,2.64,6.95,np.nan])
        (x, y) = (ma.fix_invalid(x), ma.fix_invalid(y))
        assert_almost_equal(mstats.spearmanr(x,y)[0], -0.6324555)

        x = [2.0, 47.4, 42.0, 10.8, 60.1, 1.7, 64.0, 63.1,
              1.0, 1.4, 7.9, 0.3, 3.9, 0.3, 6.7]
        y = [22.6, 8.3, 44.4, 11.9, 24.6, 0.6, 5.7, 41.6,
              0.0, 0.6, 6.7, 3.8, 1.0, 1.2, 1.4]
        assert_almost_equal(mstats.spearmanr(x,y)[0], 0.6887299)
        x = [2.0, 47.4, 42.0, 10.8, 60.1, 1.7, 64.0, 63.1,
              1.0, 1.4, 7.9, 0.3, 3.9, 0.3, 6.7, np.nan]
        y = [22.6, 8.3, 44.4, 11.9, 24.6, 0.6, 5.7, 41.6,
              0.0, 0.6, 6.7, 3.8, 1.0, 1.2, 1.4, np.nan]
        (x, y) = (ma.fix_invalid(x), ma.fix_invalid(y))
        assert_almost_equal(mstats.spearmanr(x,y)[0], 0.6887299)

        # test for namedtuple attributes
        res = mstats.spearmanr(x, y)
        attributes = ('correlation', 'pvalue')
        check_named_results(res, attributes, ma=True)
Example No. 15
def write_results(search_query, file_destination, input_query):
    search_result_1 = {}
    search_result_2 = {}
    correlation = 0

    # If there is more than one search term in a query, run a separate search
    # for each. If the searches return different numbers of documents, drop
    # items from the end of the longer ranking until both rankings have the
    # same length (dict.popitem() removes the most recently inserted entry,
    # so search() is assumed to return results ordered from best to worst).
    if len(search_query) > 1:
        search_result_1 = search(search_query[0])
        search_result_2 = search(search_query[1])
        if len(search_result_1) > len(search_result_2):
            for i in range(len(search_result_1) - len(search_result_2)):
                search_result_1.popitem()
        else:
            for i in range(len(search_result_2) - len(search_result_1)):
                search_result_2.popitem()
        # check correlation between queries
        correlation = spearmanr(list(search_result_1.values()),
                                list(search_result_2.values())).correlation
    else:
        search_result_1 = search(search_query[0])

    # greater correlation means that these two products are similar enough,
    # so combine the results and pick the most relevant products from the same pool
    if correlation > 0.8:
        final_list = dict(search_result_1)
        final_list.update(search_result_2)
        final_list_sorted = sorted(final_list,
                                   key=final_list.get,
                                   reverse=True)[:6]
        for k in final_list_sorted:
            file = open(file_destination + "/" + input_query + '.txt', "a")
            file.write(k + "\n\n")
    # If they aren't similar, pick the top products from each rank
    elif correlation == 0 and len(search_result_2) > 0:
        for k in list(search_result_1)[:3]:
            file = open(file_destination + "/" + input_query + '.txt', "a")
            file.write(k + "\n\n")
        for k in list(search_result_2)[:3]:
            file = open(file_destination + "/" + input_query + '.txt', "a")
            file.write(k + "\n\n")
    # If only one term is being searched, provide the user with the complete rank
    else:
        for k in search_result_1:
            file = open(file_destination + "/" + input_query + '.txt', "a")
            file.write(k + "\n\n")
Example No. 16
def get_r(stk):
    data = load_from_db(stk)
    if len(data) < 144: return []

    cl = np.array(data['CL'], dtype='f8')
    for i in x:
        data['m' + str(i)] = ta.EMA(cl, timeperiod=i)

    for i, r in data.iterrows():
        z = [[r['m8'], 8], [r['m13'], 13], [r['m21'], 21], [r['m34'], 34],
             [r['m55'], 55], [r['m89'], 89], [r['m144'], 144],
             [r['m233'], 233]]
        c, p = mstats.spearmanr(x, [k[1] for k in sorted(z, reverse=True)])
        data.loc[i, 'r'] = c
    data['mr'] = ta.SMA(np.array(data['r'], dtype='f8'), 10)

    return data.values[-1][-2:]
Example No. 17
def cross_vect_score(vect_a, vect_b, scoring='euclidean', inv_noise_cov=None):
    """ Use the scoring function to compute a value between two vectors

    Parameters
    ----------
    vect_a, vect_b: vector
        Data vectors.

    scoring:
        Scoring function: one of euclidean / mahalanobis / crossnobis /
        spearmanr / pearsonr. If "spearmanr_dist", return 1 - Spearman
        correlation.

    inv_noise_cov: 2D array
        Inverse of the noise covariance matrix needed for mahalanobis and
        crossnobis scorings.

    Returns
    -------
    score: float
        Score value.
        
    """
    if scoring == 'euclidean':
        score = euclidean(vect_a, vect_b)
    elif scoring == "mahalanobis":
        score = mahalanobis(vect_a, vect_b, inv_noise_cov)
    elif scoring == "crossnobis":
        raise NotImplementedError("Cross-validated Mahalanobis distance is "
                                  "not yet available")
    elif scoring in ["spearmanr", "spearmanr_dist"]:
        # Warning: ranking takes time, it's faster to input ranked vectors and
        # use pearsonr distance when doing multiple test on same vectors
        score, _ = spearmanr(vect_a, vect_b)
    elif scoring == "pearsonr":
        score, _ = pearsonr(vect_a, vect_b)
    else:
        raise ValueError("Unknown scoring function")

    if scoring[-5:] == "_dist":
        return 1 - score
    return score
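
A hedged usage sketch with made-up vectors: the trailing "_dist" branch turns the correlation into a dissimilarity.

import numpy as np

vect_a = np.array([0.2, 0.9, 0.4, 0.7])
vect_b = np.array([0.1, 0.8, 0.5, 0.6])
rho = cross_vect_score(vect_a, vect_b, scoring="spearmanr")        # correlation
dist = cross_vect_score(vect_a, vect_b, scoring="spearmanr_dist")  # 1 - rho
print(rho, dist)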
Example No. 18
def score_contextualized(embeddings, layers, RG, WS, SL, SV, embedding_keys):
	originals = {"RG65" : RG, "WS353" : WS, "SL999" : SL, "SV3500" : SV}
	scores = {"RG65" : {}, "WS353" : {}, "SL999" : {}, "SV3500" : {}}
	bests = {"RG65" : {}, "WS353" : {}, "SL999" : {}, "SV3500" : {}}
	for key in scores:
		for macro, micro in embedding_keys:
			scores[key][(macro, micro)] = []
			for i in range(layers + 1):
				scores_human = []
				scores_embed = []
				for w1, w2, score in originals[key]:
					e1 = embeddings[w1][i][macro][micro]
					e2 = embeddings[w2][i][macro][micro]
					cos = 1 - cosine(e1, e2)
					scores_human.append(score)
					scores_embed.append(cos)
				scores[key][(macro, micro)].append(spearmanr(scores_human, scores_embed)[0])	
	for key in scores:
		l = [(i,round(v,4)) for i, v in enumerate(scores[key][('mean', 'vec_mean')])]
		bests[key] = sorted(l, key = lambda t: t[1])[-1] 	
	return scores, bests
Example No. 19
def compute_mean_correlation(nannos):
    nreps = 10

    mean_rho = 0

    for rep in range(nreps):
        pair_ids = list([get_pid(pair) for pair in pairs])
        upair_ids = np.unique(pair_ids)
        anno_counts = np.zeros(len(upair_ids))
        subsample = []
        for p, pid in enumerate(
                np.random.choice(pair_ids, len(pair_ids), replace=False)):

            if anno_counts[upair_ids == pid] < nannos:
                anno_counts[upair_ids == pid] += 1
                subsample.append(p)
        print('Got subsample')
        sub_pairs = pairs[subsample]
        sub_bws = compute_bws(sub_pairs)
        # Now compute the correlations again
        mean_rho += spearmanr(bws, sub_bws)[0]

    mean_rho /= nreps
    print('Mean rho for %i = %f' % (nannos, mean_rho))
Example No. 20
            # sorted_ = np.squeeze(sorted_)
            # print(np.squeeze(sorted_).shape)
            # print(sorted_.shape)
            # print(sorted_)
            # exit()
            # print(labels_count[sorted_[:100]])
            # print(labels_count[sorted_[:100]] / index)

    sorted_indices = np.zeros((int(num_samples / num_classes), num_classes))

    for c in range(num_classes):
        # print(c)
        # print(class_labels[sorted_])
        print(sorted_[np.where(class_labels[sorted_] == c)])
        sorted_indices[:, c] = sorted_[np.where(class_labels[sorted_] == c)]

    # print(sort_indices[:10])
    # print(sort_indices[:10].dtype)
    parent_path = '/cs/labs/daphna/gadic/curriculum_learning/'
    save_path = 'cifar100/subset1/'
    with open(os.path.join(dataset.data_path, 'sorted_indices_mc_large.pkl'), mode='wb') as file:
        pickle.dump(sorted_indices, file)
    sorted_indices1 = unpickle(os.path.join(dataset.data_path, 'sorted_indices.pkl')).astype(int).reshape(-1, )
    # sorted_indices2 = unpickle(os.path.join(dataset.data_path, 'sorted_indices_mc.pkl')).astype(np.int).reshape(-1, )
    sorted_indices2 = unpickle(os.path.join(dataset.data_path, 'sorted_indices_mc_large.pkl')).astype(int).reshape(-1, )

    import scipy.stats.mstats as st
    print(st.spearmanr(sorted_indices1, sorted_indices2))
    print(sorted_indices1[:100])
    print(sorted_indices2[:100])
Example No. 21
 def compute(self, x, y, i):
   assert np.size(x) == np.size(y) and i >= 0
   self.Matrices["SPEARMAN"][i], self.Matrices["SPEARMAN_PV"][i] = mstats.spearmanr(x,y)
Example No. 22
'''
# plotting
domain = [-55, 90, 10, 180] #[-55, -270, 10, -180] #[-55, 90, 10, 180]
domain_draw = [-55, 90, 10, 180] #[-55, -270, 10, -180] #[-55, 90, 10, 180]
dlat = 10 #30 #10
dlon = 30 #90 #30
llon_obs, llat_obs = np.meshgrid(lon, lat)
llon_mdl, llat_mdl = np.meshgrid(lon, lat)
bg_col = '0.6'
cont_col = '1.0'
lev = np.hstack((np.arange(-0.06,-0.0005+0.0005,0.001), \
                 np.arange(0.0005,0.06+0.0005,0.001)))

# PC4 (index 3)
pc = 3
SpearC, tmp = st.spearmanr(pcs_mdl[:, pc], pcs_obs[:, pc])
plt.figure
plt.plot(pcs_mdl[:, pc], color='b', linewidth=2)
plt.plot(pcs_obs[:, pc], color='k', linewidth=2)
ax = plt.gca()
ax.axhline(0, color='k')
#ax.set_ylim(-3, 3)
ax.set_xlabel('Year')
ax.legend(['mdl','obs'])
ax.set_ylabel('PC amplitude')
ax.set_title('PC4 Time Series', fontsize=16)
plt.text(0.3, 0.1, 'Spearman Correlation coefficient:' + str(round(SpearC,3)), \
         ha='center', va='center', transform=ax.transAxes, \
         fontsize=14)
plt.grid()
plt.show()
Example No. 23
 def spear_corr(X, Y):
     return mstats.spearmanr(X, Y, use_ties=True)
Example No. 24
data = pd.read_csv(resfile, usecols=[0, 1, 2])
ids = data['id'].values
bws = data['bws'].values
gppl = data['predicted'].values

# ### Ties in the BWS Scores contribute to the discrepancies between BWS and GPPL
#
# GPPL scores are all unique, but BWS contains many ties.
# Selecting only one of the tied items increases the Spearman correlation.
#
# Find the ties in BWS. Compute correlations between those tied items for the GPPL scores vs. original BWS scores and GPPL vs. scaled BWS scores.
# Do the ties contribute a lot of the differences in the overall ranking?
# Another way to test if the ties contribute differences to the ranking:
# Select only one random item from each tie and exclude the rest, then recompute.
print('with ties included:')
print(spearmanr(bws, gppl)[0])
print('with ties present but no correction for ties:')
print(spearmanr(bws, gppl, False)[0])
print('with a random sample of one item if there is a tie in bws scores:')
total = 0
for sample in range(10):
    untied_sample_bws = []
    untied_sample_gppl = []

    ties = []
    tiesgppl = []

    for i, item in enumerate(ids):

        if i >= 1 and bws[i] == bws[i - 1]:
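
The snippet above is cut off mid-loop. As a rough, self-contained sketch of the procedure it describes (keep one randomly chosen item from each run of tied BWS scores, then recompute), assuming bws and gppl are aligned NumPy arrays:

import numpy as np
from scipy.stats.mstats import spearmanr

def spearman_with_ties_broken(bws, gppl, rng=None):
    rng = rng or np.random.default_rng()
    # Keep one randomly chosen index from each group of tied BWS scores
    keep = np.array([rng.choice(np.flatnonzero(bws == v)) for v in np.unique(bws)])
    return spearmanr(bws[keep], gppl[keep])[0]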
Example No. 25
 def spearman(ypred, y):
     corr, _ = spearmanr(ypred, y)
     return corr
Example No. 26
    def run(self):
        img = IMG()
        markerset = MarkerSet()

        print('Reading metadata.')
        metadata = img.genomeMetadata('Final')

        print('Getting marker genes.')
        pfamMarkers, tigrMarkers = markerset.getLineageMarkerGenes('Archaea')
        markerGenes = pfamMarkers.union(tigrMarkers)
        print('  Marker genes: ' + str(len(markerGenes)))

        print('Getting genomes of interest.')
        genomeIds = img.genomeIdsByTaxonomy('Archaea', 'Final')
        print('  Genomes: ' + str(len(genomeIds)))

        print('Getting position of each marker gene.')
        geneDistTable = img.geneDistTable(genomeIds,
                                          markerGenes,
                                          spacingBetweenContigs=1e6)

        spearmanValues = []
        pearsonValues = []
        genomeIds = list(genomeIds)
        for i in range(0, len(genomeIds)):
            print(str(i + 1) + ' of ' + str(len(genomeIds)))

            geneOrderI = []
            maskI = []
            for markerGenesId in markerGenes:
                if markerGenesId in geneDistTable[genomeIds[i]]:
                    geneOrderI.append(
                        float(geneDistTable[genomeIds[i]][markerGenesId][0][0])
                        / metadata[genomeIds[i]]['genome size'])
                    maskI.append(0)
                else:
                    geneOrderI.append(-1)
                    maskI.append(1)

            for j in range(i + 1, len(genomeIds)):
                geneOrderJ = []
                maskJ = []
                for markerGenesId in markerGenes:
                    if markerGenesId in geneDistTable[genomeIds[j]]:
                        geneOrderJ.append(
                            float(geneDistTable[genomeIds[j]][markerGenesId][0]
                                  [0]) / metadata[genomeIds[j]]['genome size'])
                        maskJ.append(0)
                    else:
                        geneOrderJ.append(-1)
                        maskJ.append(1)

                # test all translations
                bestSpearman = 0
                bestPearson = 0
                for _ in range(0, len(markerGenes)):
                    maskedI = []
                    maskedJ = []
                    for k in range(0, len(maskI)):
                        if maskI[k] == 0 and maskJ[k] == 0:
                            maskedI.append(geneOrderI[k])
                            maskedJ.append(geneOrderJ[k])
                    r, _ = spearmanr(maskedI, maskedJ)
                    if abs(r) > bestSpearman:
                        bestSpearman = abs(r)

                    r, _ = pearsonr(maskedI, maskedJ)
                    if abs(r) > bestPearson:
                        bestPearson = abs(r)

                    geneOrderJ = geneOrderJ[1:] + [geneOrderJ[0]]
                    maskJ = maskJ[1:] + [maskJ[0]]

                spearmanValues.append(bestSpearman)
                pearsonValues.append(bestPearson)

        print('Spearman: %.2f +/- %.2f: ' %
              (mean(spearmanValues), std(spearmanValues)))
        print('Pearson: %.2f +/- %.2f: ' %
              (mean(pearsonValues), std(pearsonValues)))
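
The inner for-loop above searches over circular shifts of one gene ordering and keeps the best absolute correlation. A compact sketch of the same idea on toy data (values are illustrative):

from scipy.stats.mstats import spearmanr

def best_shift_spearman(a, b):
    # Max |rho| over all circular shifts of b relative to a
    best = 0.0
    b = list(b)
    for _ in range(len(b)):
        r, _p = spearmanr(a, b)
        best = max(best, abs(r))
        b = b[1:] + [b[0]]  # rotate by one position
    return best

print(best_shift_spearman([0.1, 0.3, 0.5, 0.9], [0.5, 0.9, 0.1, 0.3]))  # 1.0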
Example No. 27
corr_map_nino34 = np.empty(X * Y)
corr_map_nino34.fill(np.nan)
corr_map_dmi = np.empty(X * Y)
corr_map_dmi.fill(np.nan)
corr_map_sam = np.empty(X * Y)
corr_map_sam.fill(np.nan)
corr_map_nni = np.empty(X * Y)
corr_map_nni.fill(np.nan)
corr_map_eac = np.empty(X * Y)
corr_map_eac.fill(np.nan)
for tt in np.arange(0, T):
    mode_map[tt, :, :] = np.squeeze(var_EOF[ind, :, :]) * var_PC[tt]
mode_map_ = mode_map.reshape(T, X * Y)

for ll in np.arange(0, X * Y):
    corr_map_mei[ll], tmp = st.spearmanr(mode_map_[:, ll], mei_monthly)
    corr_map_nino34[ll], tmp = st.spearmanr(mode_map_[:, ll], nino34_monthly)
    corr_map_dmi[ll], tmp = st.spearmanr(mode_map_[:, ll], dmi_monthly)
    corr_map_sam[ll], tmp = st.spearmanr(mode_map_[:, ll], sam_monthly)
    corr_map_nni[ll], tmp = st.spearmanr(mode_map_[:, ll], nni_monthly)
#    corr_map_eac[ll], tmp = st.spearmanr(mode_map_[:,ll],eac_monthly)

corr_map_mei = corr_map_mei.reshape(X, Y)
corr_map_nino34 = corr_map_nino34.reshape(X, Y)
corr_map_dmi = corr_map_dmi.reshape(X, Y)
corr_map_sam = corr_map_sam.reshape(X, Y)
corr_map_nni = corr_map_nni.reshape(X, Y)
#corr_map_eac = corr_map_eac.reshape(X,Y)

# plot setting
domain = [-55, 90, 10, 180]  #[-80, 0, 85, 360] #[-55, 90, 10, 180]
Example No. 28
def train_predict_adaboost(normalized_features,
                           feature_selector,
                           y,
                           num_runs=20,
                           cv=False):
    '''
    :cv: if True, num_runs is used as num_folds
    '''
    # adaboost test
    selected_features = normalized_features[:, feature_selector]
    learn_options = {
        'V': 3,
        'train_genes': np.array(['CD5', 'CD45', 'THY1', 'H2-K', 'CD28', 'CD43',
                                 'CD33', 'CD13', 'CD15', 'CCDC101', 'MED12',
                                 'TADA2B', 'TADA1', 'HPRT1', 'CUL3', 'NF1',
                                 'NF2'], dtype=object),
        'test_genes': np.array(['CD5', 'CD45', 'THY1', 'H2-K', 'CD28', 'CD43',
                                'CD33', 'CD13', 'CD15', 'CCDC101', 'MED12',
                                'TADA2B', 'TADA1', 'HPRT1', 'CUL3', 'NF1',
                                'NF2'], dtype=object),
        'target_name': 'score_drug_gene_rank',
        'testing_non_binary_target_name': 'ranks',
        'include_pi_nuc_feat': True,
        'gc_features': True,
        'nuc_features': True,
        'include_gene_position': True,
        'include_NGGX_interaction': True,
        'include_Tm': True,
        'include_strand': False,
        'include_gene_feature': False,
        'include_gene_guide_feature': 0,
        'extra pairs': False,
        'weighted': None,
        'training_metric': 'spearmanr',
        'NDGC_k': 10,
        'cv': 'gene',
        'adaboost_loss': 'ls',
        'include_gene_effect': False,
        'include_drug': False,
        'include_sgRNAscore': False,
        'adaboost_alpha': 0.5,
        'adaboost_CV': False,
        'num_proc': 8,
        'num_thread_per_proc': None,
        'order': 2,
        'normalize_features': True,
        'all pairs': False,
        'include_known_pairs': False,
        'seed': None,
        'flipV1target': False,
        'num_genes_remove_train': None,
        'include_microhomology': False,
        'algorithm_hyperparam_search': 'grid',
        'binary target name': 'score_drug_gene_threshold',
        'rank-transformed target name': 'score_drug_gene_rank',
        'raw target name': None,
        'all_genes': np.array(['CD5', 'CD45', 'THY1', 'H2-K', 'CD28', 'CD43',
                               'CD33', 'CD13', 'CD15', 'CCDC101', 'MED12',
                               'TADA2B', 'TADA1', 'HPRT1', 'CUL3', 'NF1',
                               'NF2'], dtype=object),
        'ground_truth_label': 'score_drug_gene_rank',
        'method': 'AdaBoostRegressor',
        'adaboost_version': 'python',
        'adaboost_learning_rate': 0.1,
        'adaboost_n_estimators': 100,
        'adaboost_max_depth': 3
    }
    sps = []
    if cv:
        cv_indices = np.random.permutation(normalized_features.shape[0])
        fold_length = normalized_features.shape[0] // num_runs
    for i in range(num_runs):
        if not cv:
            indices = np.random.permutation(normalized_features.shape[0])
            train = indices[:4000]
            test = indices[4000:]
        else:
            test = cv_indices[i * fold_length:(i + 1) * fold_length]
            train = np.concatenate(
                (cv_indices[0:i * fold_length],
                 cv_indices[(i + 1) *
                            fold_length:normalized_features.shape[0]]))
        predictions, model = azimuth_adaboost(None, train, test, y, None,
                                              selected_features, None, None,
                                              learn_options, False)
        sps.append(spearmanr(predictions, y[test])[0])
    return sps
Example No. 29
eac_str = int(np.array(np.where(tim_vec == eac_time[0])))
eac_end = int(np.array(np.where(tim_vec == eac_time[-1])))
eac_monthly = signal.detrend(eac_monthly)
'''

# EAC transport -- BRAN from Zeya's analysis
# 1994-2016 (Aug)
df = pd.read_csv('/v_Munk_Drive/ecougnon/data/transport_y.csv', header=None)
eac_monthly = (df.iloc[:, 0]) * 110000 * np.cos(
    (np.pi / 180) * 37) * 0.1 * 10**(-6)
eac_monthly = signal.detrend(eac_monthly)
eac_str = int(12 * 12)
eac_end = int(12 * 12 + len(eac_monthly) - 1)

# calculate correlation coefficient using the spearman rank
corr_mei, p_mei = st.spearmanr(var_PC, mei_monthly)
corr_mei_std, p_mei_std = st.spearmanr(var_PC, mei_monthly_std)

corr_nino34, p_nino34 = st.spearmanr(var_PC, nino34_monthly)
corr_nino34_a, p_nino34_a = st.spearmanr(var_PC, nino34_monthly_a)
corr_nino34_std, p_nino34_std = st.spearmanr(var_PC, nino34_monthly_std)

corr_nino3, p_nino3 = st.spearmanr(var_PC, nino3_monthly)
corr_nino3_a, p_nino3_a = st.spearmanr(var_PC, nino3_monthly_a)
corr_nino3_std, p_nino3_std = st.spearmanr(var_PC, nino3_monthly_std)

corr_nino4, p_nino4 = st.spearmanr(var_PC, nino4_monthly)
corr_nino4_a, p_nino4_a = st.spearmanr(var_PC, nino4_monthly_a)
corr_nino4_std, p_nino4_std = st.spearmanr(var_PC, nino4_monthly_std)

corr_dmi, p_dmi = st.spearmanr(var_PC, dmi_monthly)
Example No. 30
    def run(self):
        img = IMG()
        markerset = MarkerSet()

        print 'Reading metadata.'
        metadata = img.genomeMetadata('Final')

        print 'Getting marker genes.'
        pfamMarkers, tigrMarkers = markerset.getLineageMarkerGenes('Archaea')
        markerGenes = pfamMarkers.union(tigrMarkers)
        print '  Marker genes: ' + str(len(markerGenes))

        print 'Getting genomes of interest.'
        genomeIds = img.genomeIdsByTaxonomy('Archaea', 'Final')
        print '  Genomes: ' + str(len(genomeIds))

        print 'Getting position of each marker gene.'
        geneDistTable = img.geneDistTable(genomeIds, markerGenes)

        spearmanValues = []
        pearsonValues = []
        genomeIds = list(genomeIds)
        for i in xrange(0, len(genomeIds)):
            print str(i+1) + ' of ' + str(len(genomeIds))

            geneOrderI = []
            maskI = []
            for markerGenesId in markerGenes:
                if markerGenesId in geneDistTable[genomeIds[i]]:
                    geneOrderI.append(float(geneDistTable[genomeIds[i]][markerGenesId][0][0]) / metadata[genomeIds[i]]['genome size'])
                    maskI.append(0)
                else:
                    geneOrderI.append(-1)
                    maskI.append(1)


            for j in xrange(i+1, len(genomeIds)):
                geneOrderJ = []
                maskJ = []
                for markerGenesId in markerGenes:
                    if markerGenesId in geneDistTable[genomeIds[j]]:
                        geneOrderJ.append(float(geneDistTable[genomeIds[j]][markerGenesId][0][0]) / metadata[genomeIds[j]]['genome size'])
                        maskJ.append(0)
                    else:
                        geneOrderJ.append(-1)
                        maskJ.append(1)

                # test all translations
                bestSpearman = 0
                bestPearson = 0
                for _ in xrange(0, len(markerGenes)):
                    maskedI = []
                    maskedJ = []
                    for k in xrange(0, len(maskI)):
                        if maskI[k] == 0 and maskJ[k] == 0:
                            maskedI.append(geneOrderI[k])
                            maskedJ.append(geneOrderJ[k])
                    r, _ = spearmanr(maskedI, maskedJ)
                    if abs(r) > bestSpearman:
                        bestSpearman = abs(r)

                    r, _ = pearsonr(maskedI, maskedJ)
                    if abs(r) > bestPearson:
                        bestPearson = abs(r)

                    geneOrderJ = geneOrderJ[1:] + [geneOrderJ[0]]
                    maskJ = maskJ[1:] + [maskJ[0]]

                spearmanValues.append(bestSpearman)
                pearsonValues.append(bestPearson)

        print 'Spearman: %.2f +/- %.2f: ' % (mean(spearmanValues), std(spearmanValues))
        print 'Pearson: %.2f +/- %.2f: ' % (mean(pearsonValues), std(pearsonValues))
Example No. 31
 def post(self, request):
     input_json = json.loads(request.body)
     
     database = input_json['parameters']['database']
     
     if input_json['parameters']['database'] not in connections:
         # jsonify is Flask's helper; this Django view should return a JsonResponse
         return JsonResponse({'status': 'error', 'message': "Invalid database!"})
     
     lotz_data_query = """
         SELECT newsitem.id nid, text.id tid, 
                                 text.wordcount,
                                 newsitem.qty_videos, 
                                 newsitem.qty_images, 
                                 LENGTH(text.text)/wordcount avg_wordlength,
                                 newsitem.qty_citations
           FROM newsitem, text
          WHERE newsitem.text_id = text.id        
     """
     
     qty_comments_query = """
         SELECT newsitem.id nid, COUNT(*) qty_comments
           FROM newsitem, comment
          WHERE comment.NewsItemID = newsitem.id
            AND newsitem.text_id IS NOT NULL
          GROUP BY nid        
     """
     
     comments_score_query = """
         SELECT newsitem_id nid, pos_comments, neg_comments, neutral_comments
           FROM newsitem, newsitem_pos_neg_comments 
          WHERE algorithm_id = %s
            AND newsitem.id = newsitem_id
            AND newsitem.text_id IS NOT NULL
     """                                          
     
     algorithm_results_query = """
         SELECT newsitem.id nid, result.value
         FROM newsitem, text, result
         WHERE newsitem.text_id = text.id
           AND text.id = result.text_id
           AND result.algorithm_id = %s      
     """
     
     algorithm_anew_valence, _ = Algorithm.objects.using(database)\
                             .get_or_create(name="anew_valence")
     algorithm_anew_arousal, _ = Algorithm.objects.using(database)\
                                         .get_or_create(name="anew_arousal")
     algorithm_anew_dominance, _ = Algorithm.objects.using(database)\
                                         .get_or_create(name="anew_dominance")        
                                 
     od_number_of_comments = OrderedDict()    
     
     cursor = connections[database].cursor()
     cursor.execute(lotz_data_query)                        
                             
     arr_number_of_words = []
     arr_number_of_images = []
     arr_number_of_videos = []
     arr_avg_wordlength = []
     arr_number_of_citations = []                        
                             
     for row in dictfetch(cursor):
         od_number_of_comments[row['nid']] = float(0)
         arr_number_of_words.append(float(row['wordcount']))
         arr_number_of_images.append(float(row['qty_images']))
         arr_number_of_videos.append(float(row['qty_videos']))
         arr_avg_wordlength.append(float(row['avg_wordlength']))
         arr_number_of_citations.append(float(row['qty_citations']))
         
     od_number_of_positive_comments = od_number_of_comments.copy()
     od_number_of_negative_comments = od_number_of_comments.copy()
     od_number_of_neutral_comments = od_number_of_comments.copy()
     od_valence = od_number_of_comments.copy()
     od_arousal = od_number_of_comments.copy()
     od_dominance = od_number_of_comments.copy()
     
     cursor.execute(comments_score_query, [input_json['algorithm']])
     for row in dictfetch(cursor):
         od_number_of_positive_comments[row['nid']] = float(row['pos_comments'])
         od_number_of_negative_comments[row['nid']] = float(row['neg_comments'])
         od_number_of_neutral_comments[row['nid']] = float(row['neutral_comments'])
     arr_number_of_positive_comments = od_number_of_positive_comments.values()
     arr_number_of_negative_comments = od_number_of_negative_comments.values()         
     arr_number_of_neutral_comments = od_number_of_neutral_comments.values()        
         
     cursor.execute(algorithm_results_query, [algorithm_anew_valence.id])
     for row in dictfetch(cursor):
         od_valence[row['nid']] = float(row['value'])     
     arr_valence = od_valence.values() 
         
     cursor.execute(algorithm_results_query, [algorithm_anew_arousal.id])
     for row in dictfetch(cursor):
         od_arousal[row['nid']] = float(row['value'])
     arr_arousal = od_arousal.values()
         
     cursor.execute(algorithm_results_query, [algorithm_anew_dominance.id])
     for row in dictfetch(cursor):
         od_dominance[row['nid']] = float(row['value'])
     arr_dominance = od_dominance.values()
                              
     cursor.execute(qty_comments_query)
     for row in dictfetch(cursor):
         od_number_of_comments[row['nid']] = float(row['qty_comments'])
     arr_number_of_comments = od_number_of_comments.values()
     
     end = time.time()
         
     column_arrays = [
         arr_number_of_comments,
         arr_number_of_positive_comments,
         arr_number_of_negative_comments,
         arr_number_of_neutral_comments                
     ]
     row_arrays = [
         arr_number_of_words,
         arr_number_of_images,
         arr_number_of_videos,
         arr_avg_wordlength,
         arr_number_of_citations,
         arr_valence,
         arr_arousal,
         arr_dominance
     ]
     
     message = ""
     results = []
     
     for x_index in range(len(column_arrays)):    
         for y_index in range(len(row_arrays)):        
             
             if y_index >= 5:
                 coef = spearmanr(column_arrays[x_index], 
                                 row_arrays[y_index])
                 correlation = Decimal(coef[0])
                 value = round(correlation, 3)
                 probability = Decimal(float(coef[1].data))
             else:
                 coef = pearsonr(column_arrays[x_index], 
                                 row_arrays[y_index])
                  if numpy.isnan(coef[0]):
                      # Avoid reusing correlation/probability from a previous pair
                      value = -1
                      correlation = probability = None
                  else:
                      correlation = Decimal(coef[0])
                      value = round(correlation, 3)
                      probability = Decimal(coef[1])
                 
             results.append({'x': x_index, 
                             'y': y_index, 
                             'value': value, 
                             'correlation': (correlation, probability)})
         
     # make result
     res={
         'results': results,
         'message': message
     }
     
     # return json
     return JsonResponse(res)         
Example No. 32
 def compute(self, x, y):
     assert np.size(x) == np.size(y)
     rho, pv = mstats.spearmanr(x, y)
     return {"SPEARMAN": rho, "SPEARMAN_PV": pv}
Example No. 33
 def spear_corr(X, Y):
     return mstats.spearmanr(X, Y, use_ties=True)
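
For context: in older SciPy releases use_ties toggled the tie-correction term, while recent releases always apply it and keep the keyword mainly for backwards compatibility, so treat its effect as version-dependent. A small sketch with deliberately tied values:

from scipy.stats import mstats

X = [1.0, 2.0, 2.0, 3.0]  # tied middle values
Y = [10.0, 20.0, 25.0, 30.0]
rho, pval = mstats.spearmanr(X, Y, use_ties=True)
print(rho, pval)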
Example No. 34
import os
import pwd
import shutil

import numpy.ma as ma
from scipy.stats import mstats
# `dcor` is provided elsewhere in the original project (not shown in this snippet)

LOG_MSG = "#npy_fname=%(npy_fname)s, function=%(function)s, start=%(start)d, end=%(end)d, m=%(m)d, date=%(date)s"
REPORT_N = 1000
# get username
TMP_DIR = "/tmp/%s" % pwd.getpwuid(os.getuid()).pw_name

def euclidean(x,y):
  q=x-y
  return ma.sqrt((q*q.T).sum())


# this should be in a separate file
FUNCTIONS = {
  'pearson': lambda x, y: mstats.pearsonr(x,y)[0],
  'spearman': lambda x, y: mstats.spearmanr(x,y)[0],
  'euclidean': euclidean,
  'kendalltau': lambda x,y: mstats.kendalltau(x,y)[0],
  'dcor': dcor,
  }

def main(npy_fname=None, function=None, batchname=None, outdir=None, start=None, end=None, m=None):
  """Compute pairs of dependency"""
  assert npy_fname and function
  assert function in FUNCTIONS
  assert os.path.exists(outdir)
  assert os.path.isdir(outdir)

  m = int(m)
  assert m > 0
Example No. 35
def calculate_spearman(gold_filename, matrix_filename, similarity_function):
    """
    Calculate Spearman coefficient
    between a corpus of similarities of word pairs
    and a bigram model as produced by read_bigram_matrix.py.

    Parameters
    ----------
    gold_filename : String
        Filename of the corpus.
        Assumes that there is one word pair per line
        in the format "word1 word2 similarity"
    matrix_filename : String
        File containing the bigram model.
    similarity_function : function
        Function that computes the similarity of two vectors.

    Return
    ------
    spearman : Float
        Spearman coefficient
    """

    reg = r"(\S+)\s(\S+)\s(\S+)"

    gold_list = []

    with open(gold_filename) as gold_file:

        for line in gold_file:
            m = re.match(reg, line)
            if not m:
                print(line)
                continue

            word1 = m.group(1)
            word2 = m.group(2)
            sim = m.group(3)

            gold_list.append((word1, word2, sim))

    matrix_similarity_list = []
    gold_similarity_list = []

    matrix = DS_matrix(matrix_filename)

    for word1, word2, sim in gold_list:
        if not matrix.contains(word1) or not matrix.contains(word2):
            continue

        vec1 = matrix.get_vector(word1)
        vec2 = matrix.get_vector(word2)
        similarity = similarity_function(vec1, vec2)
        matrix_similarity_list.append(similarity)

        gold_similarity_list.append(float(sim))

    spearman, _ = spearmanr(matrix_similarity_list, gold_similarity_list)

    return spearman
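
A hedged usage sketch; the file names are hypothetical, DS_matrix comes from the surrounding project, and get_vector is assumed to return dense 1-D arrays:

from scipy.spatial.distance import cosine

def cosine_similarity(vec1, vec2):
    # Cosine similarity derived from SciPy's cosine distance
    return 1.0 - cosine(vec1, vec2)

rho = calculate_spearman("gold_pairs.txt", "bigram_matrix.pkl", cosine_similarity)
print(rho)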
Example No. 36
plt.figure()
plt.hist(d1)

plt.figure()
plt.hist(d2)

plt.show()

print(normaltest(d1))
print(normaltest(d2))


r, p = pearsonr(d1, d2)
print('Pearson correlation coeff: {}, p-value: {}'.format(r, p))

r, p = spearmanr(d1, d2)
print('Spearman correlation coeff: {}, p-value: {}'.format(r, p))


print('-' * 30)


# e. compare distribution
from scipy.stats import ttest_ind

# t-test requirement: independent samples with identical variances


# compare samples
def do_ttest(dat1, dat2, name1, name2, alpha=0.05):
    print('-' * 20)
Example No. 37
    output = output / torch.norm(output, p=2, dim=1)
    output = output - torch.mean(output)
    loss = -torch.sum(
        torch.exp(batch_out) * output[:, :, 0] - output[:, :, 0]
    )  # profit = SUM[gi*xi - xi], gi=linear gain, xi=investment (negative if short)
    # TODO: Add heavy L1 cost to promote small number of trades?

    # Compute gradients and update parameters
    loss.backward()
    optimizer.step()

    loss_history[i_iter] = loss.data.cpu().numpy()
    loss_baseline[i_iter] = criterion(
        TT(np.zeros(batch_out.size(), dtype=np.float32)),
        batch_out).data.cpu().numpy()
    spear[i_iter] = spearmanr(output[0, :, 0].data.cpu().numpy(),
                              batch_out[0, :].data.cpu().numpy())[0]

    if ((i_iter + 1) % n_i_iter_per_log == 0) or (i_iter == 0):
        print('Iteration %d, Loss=%0.5f, Duration=%0.3f' %
              (i_iter + 1, loss.data.cpu().numpy(), time.time() - t_start))

        ## Put testing data through model
        net.eval()

        loss_day = np.asarray([])
        for i_test_samp in range(n_days_test - n_days_input):

            # Build the batch
            ix_date_start = np.asarray([n_dates - n_days_test + i_test_samp])
            batch_in = np.zeros((1, n_symbols, n_days_input, 4))
            batch_out = np.zeros((1, n_symbols))