示例#1
0
    def test_pearsonr(self):
        # Tests some computations of Pearson's r
        x = ma.arange(10)
        with warnings.catch_warnings():
            # The tests in this context are edge cases, with perfect
            # correlation or anticorrelation, or totally masked data.
            # None of these should trigger a RuntimeWarning.
            warnings.simplefilter("error", RuntimeWarning)

            assert_almost_equal(mstats.pearsonr(x, x)[0], 1.0)
            assert_almost_equal(mstats.pearsonr(x, x[::-1])[0], -1.0)

            x = ma.array(x, mask=True)
            pr = mstats.pearsonr(x, x)
            assert_(pr[0] is masked)
            assert_(pr[1] is masked)

        x1 = ma.array([-1.0, 0.0, 1.0])
        y1 = ma.array([0, 0, 3])
        r, p = mstats.pearsonr(x1, y1)
        assert_almost_equal(r, np.sqrt(3)/2)
        assert_almost_equal(p, 1.0/3)

        # (x2, y2) have the same unmasked data as (x1, y1).
        mask = [False, False, False, True]
        x2 = ma.array([-1.0, 0.0, 1.0, 99.0], mask=mask)
        y2 = ma.array([0, 0, 3, -1], mask=mask)
        r, p = mstats.pearsonr(x2, y2)
        assert_almost_equal(r, np.sqrt(3)/2)
        assert_almost_equal(p, 1.0/3)
示例#2
0
    def test_pearsonr(self):
        # Tests some computations of Pearson's r
        x = ma.arange(10)
        with warnings.catch_warnings():
            # The tests in this context are edge cases, with perfect
            # correlation or anticorrelation, or totally masked data.
            # None of these should trigger a RuntimeWarning.
            warnings.simplefilter("error", RuntimeWarning)

            assert_almost_equal(mstats.pearsonr(x, x)[0], 1.0)
            assert_almost_equal(mstats.pearsonr(x, x[::-1])[0], -1.0)

            x = ma.array(x, mask=True)
            pr = mstats.pearsonr(x, x)
            assert_(pr[0] is masked)
            assert_(pr[1] is masked)

        x1 = ma.array([-1.0, 0.0, 1.0])
        y1 = ma.array([0, 0, 3])
        r, p = mstats.pearsonr(x1, y1)
        assert_almost_equal(r, np.sqrt(3)/2)
        assert_almost_equal(p, 1.0/3)

        # (x2, y2) have the same unmasked data as (x1, y1).
        mask = [False, False, False, True]
        x2 = ma.array([-1.0, 0.0, 1.0, 99.0], mask=mask)
        y2 = ma.array([0, 0, 3, -1], mask=mask)
        r, p = mstats.pearsonr(x2, y2)
        assert_almost_equal(r, np.sqrt(3)/2)
        assert_almost_equal(p, 1.0/3)
示例#3
0
def R2(obs, mod, axis=None):
    """Return the squared Pearson correlation between obs and mod.

    With ``axis=None`` the two inputs are correlated as a whole; otherwise
    the computation is delegated to ``apply_along_axis_2v`` along ``axis``.
    """
    from scipy.stats.mstats import pearsonr

    def squared_r(a, b):
        return pearsonr(a, b)[0]**2

    if axis is not None:
        return apply_along_axis_2v(squared_r, axis, obs, mod)
    return squared_r(obs, mod)
 def test_pearsonr(self):
     "Tests some computations of Pearson's r"
     x = ma.arange(10)
     assert_almost_equal(mstats.pearsonr(x,x)[0], 1.0)
     assert_almost_equal(mstats.pearsonr(x,x[::-1])[0], -1.0)
     #
     x = ma.array(x, mask=True)
     pr = mstats.pearsonr(x,x)
     assert(pr[0] is masked)
     assert(pr[1] is masked)
示例#5
0
 def test_pearsonr(self):
     "Tests some computations of Pearson's r"
     x = ma.arange(10)
     assert_almost_equal(mstats.pearsonr(x, x)[0], 1.0)
     assert_almost_equal(mstats.pearsonr(x, x[::-1])[0], -1.0)
     #
     x = ma.array(x, mask=True)
     pr = mstats.pearsonr(x, x)
     assert (pr[0] is masked)
     assert (pr[1] is masked)
示例#6
0
    def test_pearsonr(self):
        "Tests some computations of Pearson's r"
        x = ma.arange(10)
        olderr = np.seterr(all='ignore')
        try:
            assert_almost_equal(mstats.pearsonr(x,x)[0], 1.0)
            assert_almost_equal(mstats.pearsonr(x,x[::-1])[0], -1.0)

            x = ma.array(x, mask=True)
            pr = mstats.pearsonr(x,x)
        finally:
            np.seterr(**olderr)
        assert_(pr[0] is masked)
        assert_(pr[1] is masked)
示例#7
0
    def test_pearsonr(self):
        "Tests some computations of Pearson's r"
        x = ma.arange(10)
        olderr = np.seterr(all='ignore')
        try:
            assert_almost_equal(mstats.pearsonr(x,x)[0], 1.0)
            assert_almost_equal(mstats.pearsonr(x,x[::-1])[0], -1.0)

            x = ma.array(x, mask=True)
            pr = mstats.pearsonr(x,x)
        finally:
            np.seterr(**olderr)
        assert_(pr[0] is masked)
        assert_(pr[1] is masked)
示例#8
0
def performance_indicators(y, y_true, modelname, verbose=False, plot_scatter=False):
    """Compute accuracy scores for predictions; optionally print and plot them.

    Parameters
    ----------
    y : array-like
        Predicted values (plotted as "predictions").
    y_true : array-like
        Reference values (plotted as "true_values").
    modelname : str
        Label used in the header line of the verbose output.
    verbose : bool, optional
        When true, print every score.
    plot_scatter : bool, optional
        When true, draw a seaborn joint scatter plot of predictions against
        true values together with a red identity line.

    Returns
    -------
    dict
        Scores under the keys "rmse", "r2", "pearson" and "spearman".
    """
    # NOTE(review): every metric is called as f(y, y_true). If `r2` is an
    # asymmetric score that expects (truth, prediction), this order changes
    # the result -- confirm against the definition of `r2`.
    r2_score = r2(y, y_true)
    spearman_corr = spearmanr(y, y_true)[0]
    rms_error = np.sqrt(mean_squared_error(y, y_true))
    pearson_corr = pearsonr(y, y_true)[0]

    if verbose:
        print(f"prediction accuracy for {modelname}")
        print(f"R^2 score: \t {r2_score}")
        print(f"RMS error: \t {rms_error}")
        print(f"Pearson: \t {pearson_corr}")
        print(f"Spearman: \t {spearman_corr}")

    if plot_scatter:
        data = pd.DataFrame({'true_values': y_true.reshape(-1), 'predictions': y.reshape(-1)})

        # Both axes share the range of the true values so that the identity
        # line runs corner to corner.
        lower, upper = min(y_true), max(y_true)

        # Column names are passed by keyword: recent seaborn releases accept
        # `x` and `y` only as keyword arguments.
        joint_grid = sns.jointplot(x="true_values", y="predictions", data=data,
                                   kind="scatter",
                                   xlim=(lower, upper), ylim=(lower, upper),
                                   height=7)
        joint_grid.ax_joint.plot([lower, upper], [lower, upper], 'r')

    summary_dict = {"rmse": rms_error,
                    "r2": r2_score,
                    "pearson": pearson_corr,
                    "spearman": spearman_corr}

    return summary_dict
示例#9
0
 def compute(self, x, y):
     """Return Pearson's r and its p-value for two equally sized inputs."""
     assert np.size(x) == np.size(y)
     coefficient, p_value = mstats.pearsonr(x, y)
     try:
         p_as_float = float(p_value)
     except ValueError:
         # float() rejected the value; fall back to the first element of its
         # underlying data buffer.
         p_as_float = float(p_value.data[0])
     return dict(PEARSON=coefficient, PEARSON_PV=p_as_float)
示例#10
0
def all_pairs_pearson(M):
  """Return pairwise Pearson correlations between the rows of M.

  The result is a square ``len(M) x len(M)`` array in which only the strict
  upper triangle (``j > i``) is filled; the diagonal and the lower triangle
  stay zero.

  Original author's note: this is about 15% faster than "correlate all",
  although correlate_all calculates twice as many values.
  """
  n = len(M)
  C = np.zeros((n, n))
  # `range` instead of `xrange` so the function runs on Python 2 and 3 alike.
  for i in range(n):
    for j in range(i+1, n):
      C[i][j] = mstats.pearsonr(M[i],M[j])[0]
  return C
示例#11
0
 def getMetrics(self):
     """Compute the comparison metrics of segment vs. mask and display them.

     Each value is stored in ``self.metrics`` and written, rounded to three
     decimals, to the matching ``LB_*`` label.
     """
     # Hausdorff
     self.metrics['Hausdorff'] = hd(self.segment, self.mask)
     self.LB_HausdorffValue.setText(
         str(round(self.metrics['Hausdorff'], 3)) + " Pixels")
     # Dice
     self.metrics['Dice'] = 100 * dc(self.segment, self.mask)
     self.LB_DiceValue.setText(str(round(self.metrics['Dice'], 3)) + " %")
     # Jaccard
     self.metrics['Jaccard'] = 100 * jc(self.segment, self.mask)
     self.LB_JaccardValue.setText(
         str(round(self.metrics['Jaccard'], 3)) + " %")
     # One pearsonr call supplies both the coefficient (index 0) and the
     # p-value (index 1); there is no need to correlate the images twice.
     pearson_result = pearsonr(self.segment.ravel(), self.mask.ravel())
     # P
     self.metrics['P_Value'] = pearson_result[1]
     self.LB_P_Value.setText(str(round(self.metrics['P_Value'], 3)))
     # Pearson Correlation Coefficient
     self.metrics['Pearson'] = pearson_result[0]
     self.LB_PearsonValue.setText(str(round(self.metrics['Pearson'], 3)))
 def compute(self, x, y):
   """Compute Pearson's r and a plain-float p-value for x and y."""
   assert np.size(x) == np.size(y)
   result = mstats.pearsonr(x,y)
   r, raw_pv = result
   try:
     pv_float = float(raw_pv)
   except ValueError:
     # Conversion failed; use the first element of the value's data instead.
     pv_float = float(raw_pv.data[0])
   summary = {}
   summary['PEARSON'] = r
   summary['PEARSON_PV'] = pv_float
   return summary
示例#13
0
def calc_correlation(target_array, reference_array):
    '''Return Pearson's correlation coefficient between two arrays.

    Both inputs are flattened before being correlated, so only their
    element order has to match.

    :param target_array: array to be evaluated, e.g. model output
    :type target_array: :class:`numpy.ma.core.MaskedArray`

    :param reference_array: array holding the reference dataset
    :type reference_array: :class:`numpy.ma.core.MaskedArray`

    :returns: the coefficient, i.e. the first element of the
        ``mstats.pearsonr`` result
    '''
    flat_reference = reference_array.flatten()
    flat_target = target_array.flatten()
    result = mstats.pearsonr(flat_reference, flat_target)
    return result[0]
示例#14
0
文件: metrics.py 项目: CWSL/climate
def calc_correlation(target_array, reference_array):
    """Correlate the flattened reference and target arrays.

    :param target_array: array to be evaluated, e.g. model output
    :type target_array: :class:`numpy.ma.core.MaskedArray`

    :param reference_array: array holding the reference dataset
    :type reference_array: :class:`numpy.ma.core.MaskedArray`

    :returns: Pearson's correlation coefficient (element 0 of the
        ``mstats.pearsonr`` result)
    """
    reference_1d, target_1d = reference_array.flatten(), target_array.flatten()
    coefficient = mstats.pearsonr(reference_1d, target_1d)[0]
    return coefficient
示例#15
0
def pearson_correlation(target: np.ndarray, source: np.ndarray,
                        map: np.ndarray) -> float:
    """
    Compute the Pearson correlation of two images after alignment.

    The source image is zero-padded to the shape of the target, warped with
    ``map.inverse`` and compared with the target. A mask warped the same way
    marks the pixels covered by the warped source; everything outside it is
    masked in both images.

    Parameters
    ----------
    target: np.array
        target image in gray scale
    source: np.array
        source image in gray scale; must not be larger than ``target``
    map: transformation object
        computed transformation; only its ``inverse`` attribute is used here.
        NOTE(review): annotated as ``np.ndarray``, but an ndarray has no
        ``inverse`` attribute -- presumably a skimage transform; confirm.

    Returns
    -------
    corr_coef
        Whatever ``pearsonr`` returns for the flattened masked images.
        NOTE(review): the ``-> float`` annotation assumes a scalar; if
        ``pearsonr`` is SciPy's, this is an (r, p-value) pair -- confirm.

    Example
    -------
    >>> source = np.ones((1000, 1000))
    >>> target = np.ones((1000, 1000))
    >>> source_points = np.array([[1.0, 1.0], [500, 500], [700, 500]])
    >>> target_points = source_points
    >>> M = transform.estimate_transform("affine", source_points, target_points)
    >>> result = pearson_correlation(target, source, M)  # doctest: +SKIP
    """
    mask = np.zeros_like(target)
    mask[0:source.shape[0], 0:source.shape[1]] = 1
    source_extended = np.zeros_like(target)
    source_extended[0:source.shape[0], 0:source.shape[1]] = source

    # The builtin `bool` replaces the `np.bool` alias, which NumPy removed.
    binary_mask = transform.warp(mask, inverse_map=map.inverse).astype(bool)

    source_warped = transform.warp(source_extended,
                                   inverse_map=map.inverse) * 255
    masked_source = np.ma.array(data=source_warped,
                                mask=np.logical_not(binary_mask))

    masked_target = np.ma.array(data=target, mask=np.logical_not(binary_mask))

    # NOTE(review): the masks only take effect if `pearsonr` is mask-aware
    # (e.g. scipy.stats.mstats.pearsonr) -- verify which one is imported.
    corr_coef = pearsonr(masked_source.flatten(), masked_target.flatten())

    print("image:", corr_coef)  # masked correlation only compares visible parts

    return corr_coef
示例#16
0
def plot_matrix(aa_list, par_child_mat, par_gen_mat):
    """Plot two amino-acid transition matrices as stacked heat maps.

    Prints the Pearson correlation between the two matrices, draws each
    matrix with ``aa_list`` as tick labels on both axes, adds one colour bar
    (taken from the upper panel) and shows the figure. The title also reads
    the module-level names ``clade_parent`` and ``clade_children``.
    """
    correlation = pearsonr(par_child_mat, par_gen_mat)
    print(
        "Pearson correlation between true par-child mut and true par-gen mut: {}"
        .format(str(correlation)))

    # generate plots
    plt.rcParams.update({'font.size': 14})
    fig, axs = plt.subplots(2)
    tick_positions = list(np.arange(0, len(aa_list)))

    panels = (
        (par_child_mat, "(A) Parent-child AA transition frequency"),
        (par_gen_mat, "(B) Parent-gen AA transition frequency"),
    )
    images = []
    for axis, (matrix, title) in zip(axs, panels):
        images.append(axis.imshow(matrix,
                                  cmap="Blues",
                                  interpolation="none",
                                  aspect='auto'))
        axis.set_title(title)
        axis.set_ylabel("From")
        axis.set_xlabel("To")
        axis.set_xticks(tick_positions)
        axis.set_xticklabels(aa_list, rotation='horizontal')
        axis.set_yticks(tick_positions)
        axis.set_yticklabels(aa_list, rotation='horizontal')

    # The colour bar is built from the first (parent-child) image.
    colorbar_axes = fig.add_axes([0.92, 0.15, 0.03, 0.7])
    fig.colorbar(images[0], cax=colorbar_axes)
    rounded_r = str(np.round(correlation[0], 2))
    plt.suptitle(
        "AA transition frequency in true and generated datasets. Parent: {}, children: {}. Pearson correlation of A & B: {}"
        .format(clade_parent, ",".join(clade_children), rounded_r))
    plt.show()
示例#17
0
def cross_vect_score(vect_a, vect_b, scoring='euclidean', inv_noise_cov=None):
    """ Use the scoring function to compute a value between two vectors

    Parameters
    ----------
    vect_a, vect_b: vector
        Data vectors.

    scoring:
        Scoring function in euclidean / mahalanobis / crossnobis / spearmanr /
        pearsonr. If "spearmanr_dist", return 1 - spearmanr correlation.

    inv_noise_cov: 2D array
        Inverse of the noise covariance matrix needed for mahalanobis and
        crossnobis scorings.

    Returns
    -------
    score: float
        Score value.

    Raises
    ------
    NotImplementedError
        If ``scoring`` is "crossnobis".
    ValueError
        If ``scoring`` is not one of the supported names.

    """
    if scoring == 'euclidean':
        score = euclidean(vect_a, vect_b)
    elif scoring == "mahalanobis":
        score = mahalanobis(vect_a, vect_b, inv_noise_cov)
    elif scoring == "crossnobis":
        # NotImplemented is a non-callable constant; the exception class is
        # NotImplementedError.
        raise NotImplementedError("Cross validated Mahalanobis distance is "
                                  "not yet available")
    elif scoring in ["spearmanr", "spearmanr_dist"]:
        # Warning: ranking takes time, it's faster to input ranked vectors and
        # use pearsonr distance when doing multiple test on same vectors
        score, _ = spearmanr(vect_a, vect_b)
    elif scoring == "pearsonr":
        score, _ = pearsonr(vect_a, vect_b)
    else:
        raise ValueError("Unknown scoring function")

    # A "_dist" suffix turns the correlation into a distance.
    if scoring.endswith("_dist"):
        return 1 - score
    return score
示例#18
0
def correlacaoPearson(vetA, vetB):
    """Return Pearson's r between two rating vectors, ignoring "?" entries.

    Pearson correlation needs the items rated by both users, so every
    position at which either vector holds "?" is removed from both vectors
    before the remaining entries are converted to int and correlated.

    Parameters
    ----------
    vetA, vetB : sequence
        Ratings of the two users, position by position; "?" marks a
        missing rating.

    Returns
    -------
    The first element of the ``pearsonr`` result (the coefficient).
    """
    # A set keeps each position once. Queuing a position twice (both users
    # have "?" there) would make the deletion also drop a valid rating.
    missing = {i for i, nota in enumerate(vetA) if nota == "?"}
    missing.update(j for j, nota in enumerate(vetB) if nota == "?")
    drop = sorted(missing)

    # A single np.delete call removes all positions relative to the original
    # indices and also turns list input into an array.
    usuarioA = np.delete(vetA, drop).astype(int)
    usuarioB = np.delete(vetB, drop).astype(int)

    # pearsonr receives the two cleaned vectors
    return pearsonr(usuarioA, usuarioB)[0]
示例#19
0
def LyungBoxTest(ts, tested_lag, significance=0.95):
    """
    Print the outcome of a Ljung-Box style autocorrelation test.

    The statistic is Q = n * (n + 2) * sum_{i=1..lag} r_i**2 / (n - i), where
    r_i is the Pearson correlation between the mean-centred series and the
    same series shifted by i. Q is compared with the ``significance``
    quantile of a chi-squared distribution with ``tested_lag`` degrees of
    freedom, and a one-line verdict is printed. Nothing is returned.

    ts: a time series; it must provide ``mean()``, ``shift()`` and ``shape``
        (presumably a pandas Series -- confirm against callers).
    tested_lag: is the lag being tested, but must be an int.
    significance: quantile of the chi-squared distribution used as threshold.
    """
    tested_lag = int(tested_lag)
    f_ts = ts - ts.mean()
    n = f_ts.shape[0]
    Q = 0
    for i in range(1, tested_lag + 1):
        lagged_f_ts = f_ts.shift(i)
        # Shifting leaves NaN where no lagged value exists; mask those so
        # they are excluded from the correlation.
        m_f_ts = ma.masked_array(lagged_f_ts, mask=np.isnan(lagged_f_ts))
        Q += mstats.pearsonr(f_ts, m_f_ts)[0] ** 2 / (n - i)

    Q = Q * n * (n + 2)
    t = stats.chi2(tested_lag).ppf(significance)
    # print(...) with a single pre-formatted string behaves the same on
    # Python 2 and Python 3.
    if Q < t:
        print("%d   | Not enough evidence to reject Null: Q = %.4f < %.4f" % (tested_lag, Q, t))
    else:
        print("%d   | Reject Null: Q = %.4f > %.4f" % (tested_lag, Q, t))
示例#20
0
def LyungBoxTest(ts, tested_lag, significance=0.95):
    """
    Run a Ljung-Box style test for autocorrelation and print the verdict.

    For every lag i from 1 to ``tested_lag`` the squared Pearson correlation
    between the mean-centred series and its i-shifted copy is divided by
    (n - i); the sum is scaled by n * (n + 2) to give Q. Q is then compared
    with the ``significance`` quantile of chi-squared(``tested_lag``).
    The function only prints; it returns None.

    ts: a time series; it must provide ``mean()``, ``shift()`` and ``shape``
        (presumably a pandas Series -- confirm against callers).
    tested_lag: is the lag being tested, but must be an int.
    significance: quantile of the chi-squared distribution used as threshold.
    """
    tested_lag = int(tested_lag)
    centred = ts - ts.mean()
    n = centred.shape[0]
    Q = 0
    for lag in range(1, tested_lag + 1):
        shifted = centred.shift(lag)
        # The first `lag` entries of the shifted series are NaN; mask them.
        shifted_masked = ma.masked_array(shifted, mask=np.isnan(shifted))
        Q += mstats.pearsonr(centred, shifted_masked)[0]**2 / (n - lag)

    Q = Q * n * (n + 2)
    t = stats.chi2(tested_lag).ppf(significance)
    # A parenthesised single-string print works on Python 2 and Python 3.
    if Q < t:
        print("%d   | Not enough evidence to reject Null: Q = %.4f < %.4f" % (
            tested_lag, Q, t))
    else:
        print("%d   | Reject Null: Q = %.4f > %.4f" % (tested_lag, Q, t))
 def compute(self, x, y, i):
   """Store Pearson's r and its p-value for (x, y) at index i of the result matrices."""
   assert np.size(x) == np.size(y) and i >= 0
   coefficient, p_value = mstats.pearsonr(x,y)
   self.Matrices["PEARSON"][i] = coefficient
   self.Matrices["PEARSON_PV"][i] = p_value
#
# One entry per item: the signed sum of the GPPL ranks of every item it was
# paired with (see the loop below).
all_comp_gppl = []

# Do diffs correlate with sum(- worse_item_rank + better_item_rank)?
for idx in range(len(diffs)):
    #print('Item: %i' % ids[idx])
    #print('Diff: %f; BWS rank=%i, GPPL rank=%i' % (diffs[idx], rank_bws[idx], rank_gppl[idx]))

    # Partners from pairs in which this item occupies column 0 ...
    otherids = pairs[pairs[:, 0] == ids[idx], 1]
    # ... translated from item id to position in `ids` (first match only).
    otheridxs = [
        np.argwhere(ids == otherid).flatten()[0] for otherid in otherids
    ]

    tot_rank_gppl = 0

    # Ranks of column-1 partners are subtracted.
    # NOTE(review): presumably column 0 holds the preferred item of a pair --
    # confirm against how `pairs` is built.
    for otheridx in otheridxs:
        tot_rank_gppl -= rank_gppl[otheridx]

    # Partners from pairs in which this item occupies column 1; their ranks
    # are added.
    otherids = pairs[pairs[:, 1] == ids[idx], 0]
    otheridxs = [
        np.argwhere(ids == otherid).flatten()[0] for otherid in otherids
    ]
    for otheridx in otheridxs:
        tot_rank_gppl += rank_gppl[otheridx]

    #print('Total rank differences: BWS=%i, GPPL=%i' % (tot_rank_gppl, tot_rank_bws))
    all_comp_gppl.append(tot_rank_gppl)
# Report Spearman's coefficient, then the full pearsonr result.
print('Correlation between rank diff and total ranks of compared items: %f' %
      spearmanr(all_comp_gppl, diffs)[0])
print(pearsonr(all_comp_gppl, diffs))
示例#23
0
文件: Eval.py 项目: gtholpadi/WikiTSu
 def pears_corr(X, Y):
     """Return the full ``mstats.pearsonr`` result for X and Y."""
     result = mstats.pearsonr(X, Y)
     return result
# Score every test sentence with the KenLM model stored in the model
# directory and derive four sentence-level measures from the scores.
model = kenlm.LanguageModel(os.path.join(args.model_dir, "model.klm"))
logprobs = []        # summed model log probability per sentence
mean_logprobs = []   # logprob divided by the number of scored tokens
norm_logprobs = []   # -(logprob / unigram logprob)
slors = []           # (logprob - unigram logprob) / number of scored tokens
for s in test_sentences:
    # Unigram log probability of the sentence, end-of-sentence token included.
    uni = 0.0
    for w in s.split() + ["</s>"]:
    #for w in s.split():
        uni += unigram_logprob[w]

    # Sum the first field of each full_scores entry and count the entries.
    fs = model.full_scores(s)
    n = 0
    logprob = 0.0
    for p, l in fs:
        logprob += p
        n += 1
        
    logprobs.append(logprob)
    mean_logprobs.append(logprob / n)
    norm_logprobs.append(logprob / uni * -1.0)
    slors.append((logprob - uni) / n)

# Pearson correlation of each measure with the test ratings
print "logprob =", pearsonr(logprobs, test_ratings)[0]
print "mean logprob =", pearsonr(mean_logprobs, test_ratings)[0]
print "norm logprob =", pearsonr(norm_logprobs, test_ratings)[0]
print "slor =", pearsonr(slors, test_ratings)[0]
示例#25
0
    norm_lp_div.append((-1.0 * lp) / unigram_lp[i])
    norm_lp_sub.append(lp - unigram_lp[i])
    slor.append( (lp - unigram_lp[i]) / sent_lens[i] )

    #bottom 5 lowest word logprobs, mean, m1q and m2q
    wordlp = wordlps[i]
    wordlp_min5 = sorted(wordlp)[:5]
    wlp_min1.append(wordlp_min5[0])
    wlp_min2.append(wordlp_min5[1])
    wlp_min3.append(wordlp_min5[2])
    wlp_min4.append(wordlp_min5[3])
    wlp_min5.append(wordlp_min5[4])
    wlp_mean.append(numpy.mean(wordlp))
    wlp_m1q.append(mean_of_percentile(wordlp, 25.0))
    wlp_m2q.append(mean_of_percentile(wordlp, 50.0))

    if (args.test_csv_output):
        test_out.write(str(i) + ",," + str(sent_lens[i]) + "," + str(lp) + "," + str(unigram_lp[i]))
        test_out.write("," + str(mean_lp[-1]) + "," + str(norm_lp_div[-1]) + ",")
        test_out.write(str(norm_lp_sub[-1]) + "," + str(slor[-1]) + ",")
        test_out.write(",".join([str(item) for item in wordlp_min5]) + ",")
        test_out.write(str(wlp_mean[-1]) + "," + str(wlp_m1q[-1]) + "," + str(wlp_m2q[-1]) + "\n")

# Metric names are the header columns from the fourth one onwards.
# NOTE(review): `results` must list the score vectors in the same order as
# those header columns -- verify against where `header` is defined.
metrics_list = header.split(",")[3:]
results = [lps, unigram_lp, mean_lp, norm_lp_div, norm_lp_sub, slor, wlp_min1, wlp_min2, wlp_min3, wlp_min4, wlp_min5, wlp_mean, wlp_m1q, wlp_m2q]

# Print one tab-separated line per metric: its name and its Pearson
# correlation with the gold values.
print "METRICS\tCORRELATION"
for i, m in enumerate(metrics_list):
    print m + "\t" + str(pearsonr(results[i], gold)[0])
示例#26
0
    def run(self):
        """Report how similar marker-gene positions are between archaeal genomes.

        For every pair of 'Archaea' genomes, each marker gene is given a
        position value (the first entry of its gene-distance record divided
        by the genome size). The second genome's values are rotated through
        every cyclic shift, and the largest absolute Spearman and Pearson
        correlations over the markers present in both genomes are kept.
        The mean and standard deviation over all pairs are printed.
        """
        img = IMG()
        markerset = MarkerSet()

        # print(...) with a single argument and range() behave the same on
        # Python 2 and Python 3.
        print('Reading metadata.')
        metadata = img.genomeMetadata('Final')

        print('Getting marker genes.')
        pfamMarkers, tigrMarkers = markerset.getLineageMarkerGenes('Archaea')
        markerGenes = pfamMarkers.union(tigrMarkers)
        print('  Marker genes: ' + str(len(markerGenes)))

        print('Getting genomes of interest.')
        genomeIds = img.genomeIdsByTaxonomy('Archaea', 'Final')
        print('  Genomes: ' + str(len(genomeIds)))

        print('Getting position of each marker gene.')
        geneDistTable = img.geneDistTable(genomeIds, markerGenes)

        spearmanValues = []
        pearsonValues = []
        genomeIds = list(genomeIds)
        for i in range(0, len(genomeIds)):
            print(str(i+1) + ' of ' + str(len(genomeIds)))

            # Position value per marker for genome i; a mask of 1 flags a
            # marker that is absent (its placeholder value is -1).
            geneOrderI = []
            maskI = []
            for markerGenesId in markerGenes:
                if markerGenesId in geneDistTable[genomeIds[i]]:
                    geneOrderI.append(float(geneDistTable[genomeIds[i]][markerGenesId][0][0]) / metadata[genomeIds[i]]['genome size'])
                    maskI.append(0)
                else:
                    geneOrderI.append(-1)
                    maskI.append(1)

            for j in range(i+1, len(genomeIds)):
                geneOrderJ = []
                maskJ = []
                for markerGenesId in markerGenes:
                    if markerGenesId in geneDistTable[genomeIds[j]]:
                        geneOrderJ.append(float(geneDistTable[genomeIds[j]][markerGenesId][0][0]) / metadata[genomeIds[j]]['genome size'])
                        maskJ.append(0)
                    else:
                        geneOrderJ.append(-1)
                        maskJ.append(1)

                # test all translations
                bestSpearman = 0
                bestPearson = 0
                for _ in range(0, len(markerGenes)):
                    # Keep only the markers present in both genomes.
                    maskedI = []
                    maskedJ = []
                    for k in range(0, len(maskI)):
                        if maskI[k] == 0 and maskJ[k] == 0:
                            maskedI.append(geneOrderI[k])
                            maskedJ.append(geneOrderJ[k])
                    r, _ = spearmanr(maskedI, maskedJ)
                    if abs(r) > bestSpearman:
                        bestSpearman = abs(r)

                    r, _ = pearsonr(maskedI, maskedJ)
                    if abs(r) > bestPearson:
                        bestPearson = abs(r)

                    # Rotate genome j by one position for the next round.
                    geneOrderJ = geneOrderJ[1:] + [geneOrderJ[0]]
                    maskJ = maskJ[1:] + [maskJ[0]]

                spearmanValues.append(bestSpearman)
                pearsonValues.append(bestPearson)

        print('Spearman: %.2f +/- %.2f: ' % (mean(spearmanValues), std(spearmanValues)))
        print('Pearson: %.2f +/- %.2f: ' % (mean(pearsonValues), std(pearsonValues)))
示例#27
0
    wlp_min2.append(wordlp_min5[1])
    wlp_min3.append(wordlp_min5[2])
    wlp_min4.append(wordlp_min5[3])
    wlp_min5.append(wordlp_min5[4])
    wlp_mean.append(numpy.mean(wordlp))
    wlp_m1q.append(mean_of_percentile(wordlp, 25.0))
    wlp_m2q.append(mean_of_percentile(wordlp, 50.0))

    if (args.test_csv_output):
        test_out.write(
            str(i) + ",," + str(sent_lens[i]) + "," + str(lp) + "," +
            str(unigram_lp[i]))
        test_out.write("," + str(mean_lp[-1]) + "," + str(norm_lp_div[-1]) +
                       ",")
        test_out.write(str(norm_lp_sub[-1]) + "," + str(slor[-1]) + ",")
        test_out.write(",".join([str(item) for item in wordlp_min5]) + ",")
        test_out.write(
            str(wlp_mean[-1]) + "," + str(wlp_m1q[-1]) + "," +
            str(wlp_m2q[-1]) + "\n")

# Metric names are the header columns from the fourth one onwards.
# NOTE(review): `results` must list the score vectors in the same order as
# those header columns -- verify against where `header` is defined.
metrics_list = header.split(",")[3:]
results = [
    lps, unigram_lp, mean_lp, norm_lp_div, norm_lp_sub, slor, wlp_min1,
    wlp_min2, wlp_min3, wlp_min4, wlp_min5, wlp_mean, wlp_m1q, wlp_m2q
]

# Print one tab-separated line per metric: its name and its Pearson
# correlation with the gold values.
print "METRICS\tCORRELATION"
for i, m in enumerate(metrics_list):
    print m + "\t" + str(pearsonr(results[i], gold)[0])
示例#28
0
 def pears_corr(X, Y):
     """Correlate X with Y via ``mstats.pearsonr`` and pass its result through."""
     correlation = mstats.pearsonr(X, Y)
     return correlation
示例#29
0
    import xarray as xr
    fname = '/home/ecougnon/ana/SSTa_daily_Aus_20032016Dec.nc'
    lat_obs = xr.open_dataset(fname)['lat']
#    lat_obs = lat_obs.sel(lat=slice(lat_px_min,lat_px_max))
    lat_obs = lat_obs.sel(lat=lat_mdl, method='nearest')
    lon_obs = xr.open_dataset(fname)['lon']
#    lon_obs = lon_obs.sel(lon=slice(lon_px_min,lon_px_max))
    lon_obs = lon_obs.sel(lon=lon_mdl+360, method='nearest')
    tim_obs = xr.open_dataset(fname)['time']
    tim_obs = tim_obs.sel(time=slice('2003-01-01','2016-12-31'))
    sst_obs = xr.open_dataset(fname)['SSTa']
    sst_obs = sst_obs.sel(time=tim_obs, lat=lat_obs,lon=lon_obs)
    sst_obs = np.nanmean(np.nanmean(sst_obs,0),0)

# Pearson correlation between the model and observed SST anomaly series;
# the second return value is discarded into `tmp`.
PearC, tmp = st.pearsonr(sst_mdl, sst_obs) 

## plotting
# Only the observed series is drawn; the model curve, the legend and the
# annotation with the correlation coefficient are currently disabled.
plt.figure(figsize=(13,7))
ax = plt.subplot(111)
#plt.plot(tim_mdl,sst_mdl)
plt.plot(tim_vec,sst_obs)
#plt.legend(['mdl','obs'])
#plt.title('SSTa TAS area time series -- 37-45S 147-155E')
plt.title('SSTa TAS area time series -- 42-44S 144-146E')
plt.grid()
#plt.text(0.3, 0.1, 'Pearson Correlation coefficient:' + str(round(PearC,3)), \
#         ha='center', va='center', transform=ax.transAxes, \
#         fontsize=14)

#plt.savefig(figfile, bbox_inches='tight', dpi=300)
            plot_path = os.path.join(outdir, "loghist" + "_".join(name_tuple) + ".png")
            print "Saving figure as %s..." % plot_path
            plt.savefig(plot_path, dpi=600)

        # compute all-pairs
        for x, y in itertools.combinations(R.keys(), 2):

            name_tuple = (x, y, enrich_name, D["gse_id"])
            print "Scatter Plotting %s versus %s for %s from %s" % (name_tuple)
            X_Q = R[x]["Q"]
            Y_Q = R[y]["Q"]
            if E_Mask is not None:
                X_Q = X_Q[E_Mask]
                Y_Q = Y_Q[E_Mask]

            pcc = mstats.pearsonr(X_Q, Y_Q)
            print "PCC of %s and %s:" % (x, y), pcc

            plt.clf()
            plt.cla()
            plt.title(" ".join(name_tuple))
            plt.xlabel(x)
            plt.ylabel(y)
            plot_path = os.path.join(outdir, "scatter" + "_".join(name_tuple) + ".png")
            plt.plot(X_Q, Y_Q, "b.")
            print "Saving figure as %s..." % plot_path
            plt.savefig(plot_path, dpi=600)

        #   if not no enrichment, compute enrichment
        #     output stats
        #     plot stats
示例#31
0
import pwd
import shutil

# %-style template for a log header line; it is filled from a mapping with
# the keys npy_fname, function, start, end, m and date.
LOG_MSG = "#npy_fname=%(npy_fname)s, function=%(function)s, start=%(start)d, end=%(end)d, m=%(m)d, date=%(date)s"
# NOTE(review): presumably the progress-report interval -- confirm at its use.
REPORT_N = 1000
# Per-user scratch directory: /tmp/<login name of the current uid>
TMP_DIR = "/tmp/%s" % pwd.getpwuid(os.getuid()).pw_name

def euclidean(x,y):
  """Return sqrt(sum((x - y) * (x - y).T)).

  For 1-D input the transpose is a no-op, so this is the Euclidean distance
  between x and y.
  """
  diff = x - y
  squared = diff * diff.T
  return ma.sqrt(squared.sum())


# Registry of dependency measures, keyed by the name accepted for `function`
# (main() asserts that the requested name is one of these keys). Each value
# is called as f(x, y); the mstats-based entries return only the statistic
# (element 0 of the result).
# TODO: this should be in a separate file
FUNCTIONS = {
  'pearson': lambda x, y: mstats.pearsonr(x,y)[0],
  'spearman': lambda x, y: mstats.spearmanr(x,y)[0],
  'euclidean': euclidean,
  'kendalltau': lambda x,y: mstats.kendalltau(x,y)[0],
  'dcor': dcor,
  }

def main(npy_fname=None, function=None, batchname=None, outdir=None, start=None, end=None, m=None):
  """Compute pairs of dependency"""
  assert npy_fname, function
  assert function in FUNCTIONS
  assert os.path.exists(outdir)
  assert os.path.isdir(outdir)

  m = int(m)
  assert m > 0
    ratings.append(float(line.strip()))

if debug:
    print "Ratings", len(ratings), "=", ratings[:10]

# Process the test.csv file: the first line is the header, whose columns from
# the fourth onwards name the metrics; on every other line the same columns
# hold one score per metric.
metrics = []
probs = []  # probs[i] collects the scores of metrics[i], one per data line
# NOTE(review): the file handle is never closed explicitly.
for line_id, line in enumerate(open(args.test_csv)):
    data = line.strip().split(",")
    if line_id == 0:
        metrics = data[3:]
        if debug:
            print "\nmetrics =", metrics
    else:
        for i, score in enumerate(data[3:]):
            # First data line: create the per-metric list on demand.
            if len(probs) == i:
                probs.append([])
            # An empty cell counts as a score of 0.
            if score == "":
                score = 0
            probs[i].append(float(score))

#print "\n".join(metrics), "\n"
# One tab-separated line per metric: its name and the Pearson correlation of
# its scores with the ratings.
print "METRICS\tCORRELATION"
for i, prob in enumerate(probs):
    if debug:
        print "\nmetric =", metrics[i]
        print "\tprob", len(prob), "=", prob[:5]
    corr = pearsonr(ratings, prob)[0]
    print metrics[i] + "\t" + str(corr)
# Flatten the two trailing dimensions so that every column is the time
# series of one grid point.
mode4_mdl = mode4_mdl.reshape((t,Y*X))
mode1_obs = mode1_obs.reshape((t,Y*X))
mode2_obs = mode2_obs.reshape((t,Y*X))
mode3_obs = mode3_obs.reshape((t,Y*X))
mode4_obs = mode4_obs.reshape((t,Y*X))

# Per-grid-point correlation maps, initialised to NaN.
corr_map_mode1 = np.empty(X*Y)
corr_map_mode2 = np.empty(X*Y)
corr_map_mode3 = np.empty(X*Y)
corr_map_mode4 = np.empty(X*Y)
corr_map_mode1.fill(np.nan)
corr_map_mode2.fill(np.nan)
corr_map_mode3.fill(np.nan)
corr_map_mode4.fill(np.nan)
# Correlate model and observation column by column for each mode; the second
# return value of pearsonr is discarded into `tmp`.
for ii in range(0,(X*Y)):
    corr_map_mode1[ii], tmp = st.pearsonr(mode1_mdl[:,ii], mode1_obs[:,ii]) 
    corr_map_mode2[ii], tmp = st.pearsonr(mode2_mdl[:,ii], mode2_obs[:,ii])
    corr_map_mode3[ii], tmp = st.pearsonr(mode3_mdl[:,ii], mode3_obs[:,ii])
    corr_map_mode4[ii], tmp = st.pearsonr(mode4_mdl[:,ii], mode4_obs[:,ii])
# change shape back to lat/lon
corr_map_mode1 = np.reshape(corr_map_mode1,(Y,X))
corr_map_mode2 = np.reshape(corr_map_mode2,(Y,X))
corr_map_mode3 = np.reshape(corr_map_mode3,(Y,X))
corr_map_mode4 = np.reshape(corr_map_mode4,(Y,X))

## plotting
# NOTE(review): the four numbers look like map bounds; their order is not
# established in this view -- confirm where `domain` is consumed.
domain = [-55, 90, 10, 180] #[-55, -270, 10, -180] #[-55, 90, 10, 180]
domain_draw = [-55, 90, 10, 180] #[-55, -270, 10, -180] #[-55, 90, 10, 180]
dlat = 10 #30 #10
dlon = 30 #90 #30
llon_obs, llat_obs = np.meshgrid(lon, lat)
示例#34
0
    def run(self):
        """Report how similar marker-gene positions are between archaeal genomes.

        Every pair of 'Archaea' genomes is compared: each marker gene gets a
        position value (first entry of its gene-distance record divided by the
        genome size), the second genome's values are rotated through every
        cyclic shift, and the largest absolute Spearman and Pearson
        correlations over the markers present in both genomes are kept. The
        mean and standard deviation over all pairs are printed at the end.
        """
        img = IMG()
        markerset = MarkerSet()

        print('Reading metadata.')
        metadata = img.genomeMetadata('Final')

        print('Getting marker genes.')
        pfamMarkers, tigrMarkers = markerset.getLineageMarkerGenes('Archaea')
        markerGenes = pfamMarkers.union(tigrMarkers)
        print('  Marker genes: ' + str(len(markerGenes)))

        print('Getting genomes of interest.')
        genomeIds = img.genomeIdsByTaxonomy('Archaea', 'Final')
        print('  Genomes: ' + str(len(genomeIds)))

        print('Getting position of each marker gene.')
        geneDistTable = img.geneDistTable(genomeIds,
                                          markerGenes,
                                          spacingBetweenContigs=1e6)

        def positions_and_mask(genome_id):
            # Position value per marker; a mask of 1 flags an absent marker,
            # whose placeholder value is -1.
            positions = []
            absent = []
            for marker_id in markerGenes:
                if marker_id in geneDistTable[genome_id]:
                    positions.append(
                        float(geneDistTable[genome_id][marker_id][0][0])
                        / metadata[genome_id]['genome size'])
                    absent.append(0)
                else:
                    positions.append(-1)
                    absent.append(1)
            return positions, absent

        spearmanValues = []
        pearsonValues = []
        genomeIds = list(genomeIds)
        for i, genome_i in enumerate(genomeIds):
            print(str(i + 1) + ' of ' + str(len(genomeIds)))

            order_i, mask_i = positions_and_mask(genome_i)

            for genome_j in genomeIds[i + 1:]:
                order_j, mask_j = positions_and_mask(genome_j)

                # test all translations
                best_spearman = 0
                best_pearson = 0
                for _ in range(0, len(markerGenes)):
                    # Keep only the markers present in both genomes.
                    shared = [
                        (pos_i, pos_j)
                        for pos_i, pos_j, gone_i, gone_j
                        in zip(order_i, order_j, mask_i, mask_j)
                        if gone_i == 0 and gone_j == 0
                    ]
                    kept_i = [pair[0] for pair in shared]
                    kept_j = [pair[1] for pair in shared]

                    rho, _ = spearmanr(kept_i, kept_j)
                    if abs(rho) > best_spearman:
                        best_spearman = abs(rho)

                    r, _ = pearsonr(kept_i, kept_j)
                    if abs(r) > best_pearson:
                        best_pearson = abs(r)

                    # Rotate genome j by one position for the next round.
                    order_j = order_j[1:] + [order_j[0]]
                    mask_j = mask_j[1:] + [mask_j[0]]

                spearmanValues.append(best_spearman)
                pearsonValues.append(best_pearson)

        print('Spearman: %.2f +/- %.2f: ' %
              (mean(spearmanValues), std(spearmanValues)))
        print('Pearson: %.2f +/- %.2f: ' %
              (mean(pearsonValues), std(pearsonValues)))
示例#35
0
def _tally(counts, key):
    """Add one to ``counts[key]``, creating the entry at zero if needed."""
    counts[key] = counts.get(key, 0) + 1


def _sorted_by_count(counts):
    """Return a new dict holding the items of ``counts`` by descending count."""
    return dict(
        sorted(counts.items(), key=lambda item: item[1], reverse=True))


def _overlap_fraction(keys_a, keys_b, denominator):
    """Return the number of keys present in both lists over ``denominator``.

    NaN is returned when ``denominator`` is zero so that an empty mutation
    table does not abort the whole report with a ZeroDivisionError.
    """
    if not denominator:
        return float("nan")
    shared = set(keys_a).intersection(set(keys_b))
    return len(shared) / float(denominator)


def _show_matrix_panels(matrices, titles, tick_labels, cmap, suptitle,
                        **imshow_kwargs):
    """Draw ``matrices`` as vertically stacked heat maps and show the figure.

    Every panel gets "From"/"To" axis labels and ``tick_labels`` on both
    axes.  Extra keyword arguments (e.g. ``vmin``/``vmax``) are forwarded to
    ``imshow``.  The single colour bar is built from the first panel's image
    only.
    """
    fig, axs = plt.subplots(len(matrices))
    pos_ticks = list(np.arange(0, len(tick_labels)))

    images = []
    for axis, matrix, title in zip(axs, matrices, titles):
        images.append(
            axis.imshow(matrix,
                        cmap=cmap,
                        interpolation="none",
                        aspect='auto',
                        **imshow_kwargs))
        axis.set_title(title)
        axis.set_ylabel("From")
        axis.set_xlabel("To")
        axis.set_xticks(pos_ticks)
        axis.set_xticklabels(tick_labels, rotation='horizontal')
        axis.set_yticks(pos_ticks)
        axis.set_yticklabels(tick_labels, rotation='horizontal')

    cbar_ax = fig.add_axes([0.92, 0.15, 0.03, 0.7])
    fig.colorbar(images[0], cax=cbar_ax)
    plt.suptitle(suptitle)
    plt.show()


def plot_mutation_counts():
    """Count, report, save and plot substitutions found in the prediction CSV.

    The CSV at ``results_path + file_name_mut_ct`` is read and its first three
    columns are treated as comma-separated token strings (presumably parent,
    true child and generated child sequences -- confirm against the writer of
    that file).  Tokens are mapped through ``f_dict`` and compared position by
    position:

    * column 0 vs column 1 feeds the "parent-child" counts,
    * column 0 vs column 2 feeds the "parent-gen" counts.

    Counts are kept both per substitution (``"A>B"``) and per position
    (``"A>pos>B"``, 1-based), written to JSON via ``write_dict``, printed,
    turned into matrices with ``get_mat``, compared with the training matrix
    from ``get_train_mat`` using ``pearsonr``, and finally shown as two
    three-panel figures (frequencies, then pairwise differences).

    Takes no arguments and returns ``None``; all inputs come from module-level
    names and all results are printed, written to disk or plotted.
    """
    df_true_pred = pd.read_csv(results_path + file_name_mut_ct, sep=",")
    print(df_true_pred)
    cols = list(df_true_pred.columns)

    f_dict = read_json(results_path + "f_word_dictionaries.json")
    rev_dict = read_json(results_path + "r_word_dictionaries.json")
    # NOTE(review): the encoded reference sequence is not used below.  The
    # call is kept because its side effects, if any, are not visible here.
    utils.read_wuhan_seq(WUHAN_SEQ, rev_dict)

    parent_child = {}
    parent_gen = {}
    parent_child_pos = {}
    parent_gen_pos = {}

    # Compare the three sequences one position at a time.
    for _, row in df_true_pred.iterrows():
        true_x = row[cols[0]].split(",")
        true_y = row[cols[1]].split(",")
        pred_y = row[cols[2]].split(",")

        for i, parent_token in enumerate(true_x):
            first_mut = f_dict[parent_token]
            second_mut = f_dict[true_y[i]]
            third_mut = f_dict[pred_y[i]]

            if first_mut != second_mut:
                _tally(parent_child_pos,
                       "{}>{}>{}".format(first_mut, i + 1, second_mut))
                _tally(parent_child, "{}>{}".format(first_mut, second_mut))

            if first_mut != third_mut:
                _tally(parent_gen_pos,
                       "{}>{}>{}".format(first_mut, i + 1, third_mut))
                _tally(parent_gen, "{}>{}".format(first_mut, third_mut))

    # The per-substitution tables are written in first-seen order, before
    # they are sorted for display further down.
    write_dict(
        results_path +
        "te_parent_child_{}_{}.json".format(clade_parent, clade_child),
        parent_child)
    write_dict(
        results_path +
        "te_parent_gen_{}_{}.json".format(clade_parent, clade_child),
        parent_gen)

    aa_list = list('QNKWFPYLMTEIARGHSDVC')
    print("---------------------")
    print("Parent child mutations with POS")
    parent_child_pos = _sorted_by_count(parent_child_pos)
    print(len(parent_child_pos), parent_child_pos)
    print()
    print("Parent gen mutations with POS")
    parent_gen_pos = _sorted_by_count(parent_gen_pos)
    print(len(parent_gen_pos), parent_gen_pos)
    print()

    write_dict(
        results_path +
        "te_parent_child_pos_{}_{}.json".format(clade_parent, clade_child),
        parent_child_pos)
    write_dict(
        results_path +
        "te_parent_gen_pos_{}_{}.json".format(clade_parent, clade_child),
        parent_gen_pos)

    # Positional substitutions seen in both the true and generated children.
    keys1 = list(parent_child_pos.keys())
    keys2 = list(parent_gen_pos.keys())
    inter = list(set(keys1).intersection(set(keys2)))
    print(len(inter), inter)
    print()
    print("---------------------")
    test_size = df_true_pred.shape[0]

    parent_child = _sorted_by_count(parent_child)
    print("Test: Mutation freq between parent-child: {}".format(parent_child))
    print("Test: # Mutations between parent-child: {}".format(
        str(len(parent_child))))
    print()

    parent_gen = _sorted_by_count(parent_gen)
    print("Test: Mutation freq between parent-gen: {}".format(parent_gen))
    # This count belongs to the parent-gen table; the label used to say
    # "parent-child".
    print("Test: # Mutations between parent-gen: {}".format(
        str(len(parent_gen))))
    print()

    # NOTE(review): get_mat presumably builds a from/to frequency matrix over
    # aa_list scaled by test_size -- confirm against its definition.
    par_child_mat = get_mat(aa_list, parent_child, test_size)
    print()
    par_gen_mat = get_mat(aa_list, parent_gen, test_size)

    print("Preparing train data...")
    tr_par_child_mat, tr_parent_child = get_train_mat()

    pearson_corr_tr_par_child_mut = pearsonr(tr_par_child_mat, par_child_mat)
    pearson_corr_tr_par_child_par_gen_mut = pearsonr(tr_par_child_mat,
                                                     par_gen_mat)
    pearson_corr_te_par_child_par_gen_mut = pearsonr(par_child_mat,
                                                     par_gen_mat)

    print(
        "Pearson correlation between train and test par-child mut: {}".format(
            str(pearson_corr_tr_par_child_mut)))
    print(
        "Pearson correlation between train par-child mut and test par-gen mut: {}"
        .format(str(pearson_corr_tr_par_child_par_gen_mut)))
    print("Pearson correlation between test par-child mut and par-gen mut: {}".
          format(str(pearson_corr_te_par_child_par_gen_mut)))

    tr_par_child_keys = list(tr_parent_child.keys())
    te_par_child_keys = list(parent_child.keys())
    te_par_gen_keys = list(parent_gen.keys())

    print("Size of mutations - tr par-child, te par-child, te par-gen")
    print(len(tr_parent_child), len(parent_child), len(parent_gen))

    intersection_tr_par_child_te_par_child = _overlap_fraction(
        tr_par_child_keys, te_par_child_keys, len(tr_parent_child))
    print("% intersection between tr par-child and te par-child: {}".format(
        str(np.round(intersection_tr_par_child_te_par_child, 2))))

    intersection_tr_par_child_te_par_gen = _overlap_fraction(
        tr_par_child_keys, te_par_gen_keys, len(tr_parent_child))
    print("% intersection between tr par-child and te par-gen: {}".format(
        str(np.round(intersection_tr_par_child_te_par_gen, 2))))

    intersection_te_par_child_te_par_gen = _overlap_fraction(
        te_par_child_keys, te_par_gen_keys, len(te_par_child_keys))
    print("% intersection between te par-child and te par-gen: {}".format(
        str(np.round(intersection_te_par_child_te_par_gen, 2))))
    print()
    print("Common mutations in tr, test and gen for {}>{} branch".format(
        clade_parent, clade_child))
    for mut in tr_parent_child:
        if mut in parent_child and mut in parent_gen:
            print(mut, tr_parent_child[mut], parent_child[mut],
                  parent_gen[mut])

    # Figure 1: the three frequency matrices.
    plt.rcParams.update({'font.size': 10})
    _show_matrix_panels(
        [tr_par_child_mat, par_child_mat, par_gen_mat],
        [
            "(A) Train parent-child mutation frequency",
            "(B) Test parent-child mutation frequency",
            "(C) Test parent-generated mutation frequency",
        ],
        aa_list,
        "Blues",
        "Mutation frequency in test, train and generated datasets. Pearson correlation of A & B: {}, A & C: {}, B & C: {}"
        .format(str(np.round(pearson_corr_tr_par_child_mut[0], 2)),
                str(np.round(pearson_corr_tr_par_child_par_gen_mut[0], 2)),
                str(np.round(pearson_corr_te_par_child_par_gen_mut[0], 2))))

    # Figure 2: pairwise differences on a fixed, symmetric colour scale.
    diff_tr_par_child_te_par_child = par_child_mat - tr_par_child_mat
    diff_te_par_gen_te_par_child = par_gen_mat - par_child_mat
    diff_tr_par_child_te_par_gen = par_gen_mat - tr_par_child_mat

    _show_matrix_panels(
        [
            diff_tr_par_child_te_par_child,
            diff_te_par_gen_te_par_child,
            diff_tr_par_child_te_par_gen,
        ],
        ["Test vs training", "Generated vs test", "Generated vs training"],
        aa_list,
        "RdBu",
        "Delta of mutation frequency plots",
        vmin=-0.08,
        vmax=0.08)
 def calcPearsonCC(self, pred, gt):
     """Return Pearson's correlation coefficient between ``pred`` and ``gt``.

     Only the first element of the ``pearsonr`` result (the coefficient) is
     returned; anything else it reports is discarded.
     """
     return pearsonr(pred, gt)[0]
示例#37
0
# Score every test sentence with the KenLM model and derive four
# sentence-level measures from its total log probability:
#   logprob       - sum of the per-token log probabilities
#   mean logprob  - logprob divided by the number of scored tokens
#   norm logprob  - -(logprob / unigram log probability)
#   slor          - (logprob - unigram log probability) / number of tokens
model = kenlm.LanguageModel(os.path.join(args.model_dir, "model.klm"))
logprobs = []
mean_logprobs = []
norm_logprobs = []
slors = []
for s in test_sentences:
    # Unigram log probability of the sentence plus the end-of-sentence token.
    uni = 0.0
    for w in s.split() + ["</s>"]:
        uni += unigram_logprob[w]

    # Each item yielded by full_scores is a tuple whose first element is the
    # token's log probability.  It is read by index rather than unpacked so
    # the loop works whether the installed KenLM yields 2- or 3-tuples.
    fs = model.full_scores(s)
    n = 0
    logprob = 0.0
    for token_scores in fs:
        logprob += token_scores[0]
        n += 1

    logprobs.append(logprob)
    mean_logprobs.append(logprob / n)
    norm_logprobs.append(logprob / uni * -1.0)
    slors.append((logprob - uni) / n)

# Calculate the correlation of each measure with the human ratings.
# Single-argument print calls emit the same text on Python 2 and Python 3.
print("logprob = " + str(pearsonr(logprobs, test_ratings)[0]))
print("mean logprob = " + str(pearsonr(mean_logprobs, test_ratings)[0]))
print("norm logprob = " + str(pearsonr(norm_logprobs, test_ratings)[0]))
print("slor = " + str(pearsonr(slors, test_ratings)[0]))
示例#38
0
	#create similarity matrix, start by creating the numpy structure
	pearsonMatrix = numpy.zeros((rows,rows) , dtype=numpy.float)
	counter = 1.
	totalrows = rows
	i = 0
	j = 0
	for userA in userList:
		for userB in userList:
			if userA <> userB:
				userARatings = userRatings[userMap[userA]]
				userBRatings = userRatings[userMap[userB]]
				#print userARatings.shape
				#print userBRatings.shape
				#pearsonMatrix[i,j] = pearsonCorrelation(userA,userB)
				if pearsonMatrix[i,j] == 0.:
					pearsonMatrix[i,j] = pearsonr(userARatings,userBRatings)[0]
					pearsonMatrix[j,i] = pearsonMatrix[i,j]
				j = j+1
		progress = round ((counter * 100)/totalrows,3)
                print "Progress: "+str(progress) + "%"
                counter = counter + 1	
		i = i+1
		j = 0.


print "Saving Pearson Similarity matrix, please wait"
numpy.savetxt("pearsonMatrix.csv", pearsonMatrix, delimiter=',')
print "Pearson matrix saved"
	

	  
示例#39
0
# One figure with three hand-placed axes: the map (ax), a narrow colour bar
# next to it (cax) and a narrow panel on the right (pax) that receives the
# histogram outline drawn inside the loop.
fig = plt.figure(figsize=(10, 6))
ax = fig.add_axes([.025, .025, .675, .875])
cax = fig.add_axes([0.725, .025, .025, .875])
pax = fig.add_axes([0.85, .025, .1, .875])
# Static map background, drawn once on the map axes.
bmap.drawcoastlines(ax=ax)
bmap.drawcountries(ax=ax)
bmap.drawstates(ax=ax)
for season, months in seasons.items():
    print(season)
    # Clear only the histogram panel; the map axes are not cleared between
    # seasons.
    pax.cla()
    # Indices of the time steps whose month belongs to this season.
    tidx = np.array([ti for ti, t in enumerate(times) if t.month in months])
    # NOTE(review): mO3 / oO3 are presumably modelled and observed ozone with
    # time on axis 0 -- confirm against where they are loaded.
    tmO3 = mO3[tidx]
    toO3 = oO3[tidx]
    # Difference of the axis-0 means of the two selections.
    tbO3 = tmO3[:].mean(0) - toO3[:].mean(0)
    # Pearson r for each pair of rows of the transposed arrays, with invalid
    # (NaN/inf) results masked.  `rs` is not used again in the visible code.
    rs = np.ma.masked_invalid(
        [mstats.pearsonr(m, o)[0] for m, o in zip(tmO3.T, toO3.T)])
    print(tbO3.min(), tbO3.max())
    # Scatter the per-point differences on the map, coloured via bnorm/bcmap.
    s = bmap.scatter(lon,
                     lat,
                     c=tbO3,
                     norm=bnorm,
                     cmap=bcmap,
                     ax=ax,
                     edgecolors='k')
    cbar = plt.colorbar(s, cax=cax, label='ppb')
    # Histogram of the differences over the bins in bedges, drawn as a step
    # outline: every count and every bin edge is repeated twice, and counts
    # are converted to a percentage of the total.
    hist, edgesdummy = np.histogram(tbO3, bins=bedges)
    pax.plot(hist.repeat(2, 0) / hist.sum() * 100, bedges.repeat(2, 0)[1:-1])
    pax.set_ylim(bedges[0], bedges[-1])
    pax.xaxis.tick_top()
    # Hide the y tick labels of the histogram panel.
    pax.yaxis.set_major_formatter(plt.NullFormatter())
    pax.set_xlabel('% Sites')
# Single-argument print calls are used throughout so the script emits the
# same text on Python 2 and Python 3.
if debug:
    print("Ratings %s = %s" % (len(ratings), ratings[:10]))

# Process the test.csv file.  The first line is the header; on every line the
# values from the fourth column onwards are used: metric names in the header,
# one score per metric on each later line.  probs[i] collects the scores of
# metrics[i].
metrics = []
probs = []
# The context manager closes the file once it has been read.
with open(args.test_csv) as test_file:
    for line_id, line in enumerate(test_file):
        data = line.strip().split(",")
        if line_id == 0:
            metrics = data[3:]
            if debug:
                print("\nmetrics = %s" % (metrics,))
        else:
            for i, score in enumerate(data[3:]):
                if len(probs) == i:
                    probs.append([])
                # An empty cell counts as a score of 0.
                if score == "":
                    score = 0
                probs[i].append(float(score))

# Report the Pearson correlation between the ratings and each metric.
print("METRICS\tCORRELATION")
for i, prob in enumerate(probs):
    if debug:
        print("\nmetric = %s" % (metrics[i],))
        print("\tprob %s = %s" % (len(prob), prob[:5]))
    corr = pearsonr(ratings, prob)[0]
    print(metrics[i] + "\t" + str(corr))