Exemplo n.º 1
0
def consistancy(explanation_objects, stype='kendall'):
    """Average pairwise rank agreement between explanations of one document.

    Every unordered pair of explanations is compared. A pair is skipped
    unless both members have more than three entries. When the two lengths
    differ, both are truncated to the shorter length for that comparison
    only; the truncation never leaks into later comparisons.

    :param explanation_objects: indexable collection of sliceable sequences,
        one per sampling run.
    :param stype: ``'kendall'`` uses ``kendalltau`` and keeps only the
        coefficients whose p-value is below 0.05; any other value uses
        ``weightedtau`` with ``rank=False`` and keeps every coefficient.
    :return: mean of the retained coefficients, or NaN when no coefficient
        was retained.
    """
    scores = []
    count = len(explanation_objects)
    for i in range(count):
        for j in range(i + 1, count):
            # Always restart from the untouched originals so that a
            # truncation made for one pair does not affect the next pair.
            l1 = explanation_objects[i]
            l2 = explanation_objects[j]

            if len(l1) <= 3 or len(l2) <= 3:
                continue

            if len(l1) != len(l2):
                min_len = min(len(l1), len(l2))
                l1 = l1[:min_len]
                l2 = l2[:min_len]

            if stype == 'kendall':
                kscore = kendalltau(l1, l2)
                # Only statistically significant coefficients are counted.
                if kscore[1] < 0.05:
                    scores.append(kscore[0])
            else:
                kscore = weightedtau(l1, l2, rank=False)
                scores.append(kscore[0])

    if not scores:
        # Same value np.mean([]) would give, without the runtime warning.
        return float('nan')
    return np.mean(scores)
def kendal_weighted(featImp=np.array([.55, .33, .07, .05]),
                    pcRank=np.array([1, 2, 4, 3])):
    """Weighted Kendall's tau between feature importance and inverse PCA rank.

    Corresponds to "Snippet 8.6" as labelled in the original docstring.

    featImp: Feature importance
    pcRank: PCA rank; its element-wise reciprocal is what gets compared
    Returns element [0] of the ``weightedtau`` result (the coefficient).
    """
    from scipy.stats import weightedtau

    inverse_rank = pcRank**-1.
    result = weightedtau(featImp, inverse_rank)
    return result[0]
Exemplo n.º 3
0
def get_pca_rank_weighted_kendall_tau(feature_imp, pca_rank):
    """
    Weighted Kendall's tau of feature importance against the reciprocal PCA rank.

    :param feature_imp: (np.array): mean importance of each feature
    :param pca_rank: (np.array): PCA based rank of each feature
    :return: the ``weightedtau`` result, passed through unchanged
        (coefficient together with its p_value)
    """
    inverse_pca_rank = pca_rank**-1.
    return weightedtau(feature_imp, inverse_pca_rank)
Exemplo n.º 4
0
def get_pca_rank_weighted_kendall_tau(feature_imp, pca_rank):
    """
    Compare feature importance with inverse PCA ranking via weighted Kendall's tau.

    Labelled "Snippet 8.6, page 121" in the original docstring.

    :param feature_imp: (np.array): feature mean importance
    :param pca_rank: (np.array): PCA based feature importance rank; it is
        raised to the power -1 before the comparison
    :return: whatever ``weightedtau`` returns for the two arrays
        (the tau coefficient accompanied by a p_value)
    """
    reciprocal_rank = pca_rank ** -1.
    tau_result = weightedtau(feature_imp, reciprocal_rank)
    return tau_result
Exemplo n.º 5
0
def correlation(x, y, xlabel, ylabel):
    """Print several association measures between ``x`` and ``y``.

    Nothing is returned; every measure is written to stdout under its own
    heading, preceded by a ``xlabel - ylabel`` title line.

    :param x: first sample
    :param y: second sample, same length as ``x``
    :param xlabel: label printed for ``x``
    :param ylabel: label printed for ``y``
    """
    print('\n', xlabel, '-', ylabel)
    print('Covariance:\n', np.cov(x, y))
    # Computed once and reused for the Fisher transformation below.
    pearson = stats.pearsonr(x, y)
    print('Pearson Correlation\n', pearson)
    print('Spearman Correlation\n', stats.spearmanr(x, y))
    # Fisher's z is the inverse hyperbolic tangent of the coefficient r
    # alone; it is not arctan and must not be applied to the p-value.
    print('Fisher-Z Transformation\n', np.arctanh(pearson[0]))
    print('Kendall Correlation\n', stats.kendalltau(x, y))
    print('Weighted Kendall\n', stats.weightedtau(x, y))
    print('Cosine Similarity\n', cosine_similarity(x, y))
Exemplo n.º 6
0
 def correlation(self, a: np.ndarray, b: np.ndarray,
                 **kwargs) -> CorrelationMap:
     """Compute one weighted Kendall tau per exponent in ``self.alphas``.

     For each ``alpha`` the rank weight is ``1 / (rank + 1)**alpha``. The
     coefficient is stored under the key ``"<self.id>_<alpha>"`` in a
     ``CorrelationResult`` whose ``k`` is ``len(a)``. ``kwargs`` is unused.
     """
     scores = {}
     for alpha in self.alphas:

         def rank_weight(rank):
             return 1 / (rank + 1)**alpha

         tau = weightedtau(a, b, weigher=rank_weight)[0]
         entry = CorrelationResult(correlation=tau, k=len(a))
         scores[f"{self.id}_{alpha}"] = entry
     return scores
Exemplo n.º 7
0
def kendall_predictand(data: np.ndarray) -> float:
    """
    Weighted Kendall tau between the two columns of a 2D array (n_obs,[x,y]).

    Column 1 is passed to weightedtau as its ``x`` and column 0 as its ``y``,
    with ``rank=None``, so the weighting follows the ranks of column 1
    (``rank=True`` would instead compute it twice, once per column).
    Significance is not implemented here; the original note suggests a
    bootstrap decorator may supply it.
    """
    second_column = data[:, 1]
    first_column = data[:, 0]
    result = weightedtau(x=second_column, y=first_column, rank=None)
    return result[0]
Exemplo n.º 8
0
def kendall_choice(data: np.ndarray) -> float:
    """
    Weighted Kendall tau between the two columns of a 2D array (n_obs,[x,y]).

    The ``rank`` argument handed to weightedtau is whatever
    ``rankdirection`` returns for the two columns (per the original note,
    the direction is chosen based on Pearson's correlation).
    Significance is not implemented.
    """
    first_column = data[:, 0]
    second_column = data[:, 1]
    direction = rankdirection(x=data[:, 0], y=data[:, 1])
    result = weightedtau(x=first_column, y=second_column, rank=direction)
    return result[0]
Exemplo n.º 9
0
def Find_Wtau(Rankings, num_comps):
    """Build a symmetric matrix of pairwise weighted Kendall taus.

    The matrix is square with one row/column per column of ``Rankings``.
    Only column pairs drawn from the first ``num_comps`` columns are
    filled in; every other cell, including the diagonal, stays 0.
    """
    n_columns = Rankings.shape[1]
    adjacency = np.zeros((n_columns, n_columns))
    for left, right in itertools.combinations(range(num_comps), 2):
        left_ranking = list(Rankings[:, left])
        right_ranking = list(Rankings[:, right])
        tau = weightedtau(left_ranking, right_ranking).correlation
        adjacency[left, right] = adjacency[right, left] = tau
    return adjacency
Exemplo n.º 10
0
  def update(self, prediction_probs, eviction_mask, oracle_scores):
    """Record one weighted tau per batch row, plus the eviction masks.

    Each row of ``prediction_probs`` is sorted in descending order and the
    resulting index order is compared against the identity order
    0..n-1. ``oracle_scores`` is ignored.
    """
    del oracle_scores

    predicted_order = prediction_probs.sort(descending=True)[1]
    for order in predicted_order.cpu().data.numpy():
      # Both arrays are rank-like, so they are negated before the call:
      # see the weightedtau docs on passing ranks instead of scores.
      # NOTE: This is incorporating potentially masked & padded probs
      positions = np.array(range(len(order)))
      result = stats.weightedtau(-order, -positions, rank=False)
      self._weighted_taus.append(result[0])
    self._masks.extend(eviction_mask.cpu().data.numpy())
Exemplo n.º 11
0
def get_pca_rank_weighted_kendall_tau(feature_imp, pca_rank):
    """
    Weighted Kendall's Tau between feature importance and inverse PCA ranking.

    Cited in the original docstring as Advances in Financial Machine
    Learning, Snippet 8.6, page 121.

    :param feature_imp: (np.array): Feature mean importance.
    :param pca_rank: (np.array): PCA based feature importance rank; its
        element-wise reciprocal is the second series compared.
    :return: The ``weightedtau`` result as-is (Weighted Kendall Tau with p_value).
    """
    inverted_rank = pca_rank**-1.0
    outcome = weightedtau(feature_imp, inverted_rank)
    return outcome
Exemplo n.º 12
0
    def _query_differences(self, run1, run2, *args, **kwargs):
        """
        Score each query shared by both runs and keep the ``self.topk`` lowest-scoring ones.

        For every shared qid the documents of both runs are merged; a document
        missing from one run receives a score slightly below the smallest
        score seen for that query in either run. The two aligned score lists
        are then compared with the measure named by ``self.metric``.

        :param run1: TREC run. Has the format {qid: {docid: score}, ...}
        :param run2: Same format as run1
        :param args: unused
        :param kwargs: unused
        :return: tuple ``(qids, id2measure, metric, None)`` where ``qids`` are
            the kept query ids in ascending order of their measure and
            ``id2measure`` maps each kept qid to that measure
        :raises ValueError: if the runs share no qid, or ``self.metric`` is
            not one of the supported names
        """
        topk = self.topk
        metric = self.metric
        shared_qids = run1.keys() & run2.keys()
        if not shared_qids:
            raise ValueError("run1 and run2 have no shared qids")

        measure_by_qid = {}
        for qid in shared_qids:
            scores_a = run1[qid]
            scores_b = run2[qid]
            # Fallback score for documents present in only one of the runs.
            floor = min(min(scores_a.values()), min(scores_b.values())) - 1e-5

            all_docs = set(scores_a.keys()).union(set(scores_b.keys()))
            ordered_docs = sorted(
                list(all_docs),
                key=lambda doc: scores_a.get(doc, floor) + scores_b.get(doc, floor),
                reverse=True)
            aligned_a = [scores_a.get(doc, floor) for doc in ordered_docs]
            aligned_b = [scores_b.get(doc, floor) for doc in ordered_docs]

            if metric == "weightedtau":
                measure = stats.weightedtau(aligned_a, aligned_b)[0]
            elif metric == "tauap":
                forward = self.tauap_fast(aligned_a, aligned_b)
                backward = self.tauap_fast(aligned_b, aligned_a)
                measure = (forward + backward) / 2
            elif metric == "spearmanr":
                measure = stats.spearmanr(aligned_a, aligned_b)[0]
            elif metric == "pearsonrank":
                forward = self.pearson_rank(aligned_a, aligned_b)
                backward = self.pearson_rank(aligned_b, aligned_a)
                measure = (forward + backward) / 2
            elif metric == "kldiv":
                measure = self.kl_div(aligned_a, aligned_b)
            else:
                raise ValueError(
                    "Metric {} not supported for the measure {}".format(
                        self.metric, "metric"))
            measure_by_qid[qid] = measure

        kept_qids = sorted(shared_qids, key=lambda q: measure_by_qid[q])[:topk]
        kept_measures = {q: measure_by_qid[q] for q in kept_qids}
        return kept_qids, kept_measures, metric, None
def get_correlation_scores(actual_performances,
                           transferability_scores,
                           metric='w-kendall'):
    """Return the correlation coefficient selected by ``metric``.

    ``'w-kendall'`` uses ``stats.weightedtau``, ``'kendall'`` uses
    ``stats.kendalltau`` and ``'pearson'`` uses ``stats.pearsonr``; in each
    case only element [0] of the result is returned. Any other ``metric``
    fails the assertion.
    """
    assert metric in ['w-kendall', 'kendall', 'pearson']

    scorers = {
        'w-kendall': stats.weightedtau,
        'kendall': stats.kendalltau,
        'pearson': stats.pearsonr,
    }
    scorer = scorers.get(metric)
    if scorer is None:
        # Only reachable when assertions are disabled.
        return None
    return scorer(actual_performances, transferability_scores)[0]
Exemplo n.º 14
0
def kendalltau_correlation(X, rowvar=False, weighted=False):
    """
    Estimate a correlation matrix from pairwise Kendall's tau.

    Tau is computed for every pair of features, mirrored into a symmetric
    matrix with a unit diagonal, and mapped through ``sin(pi / 2 * tau)``.
    Using scipy.stats.weightedtau (``weighted=True``) is not recommended,
    as that implementation does not appear to handle ties correctly.

    Parameters
    ----------
    X: array-like, shape = [n_samples, n_features]
        Data matrix from which the empirical correlation is computed

    rowvar: bool, default False
        If True, X is transposed first, i.e. rows are treated as features

    weighted: bool, default False
        If True, use weightedtau with rank=False instead of kendalltau

    Returns
    -------
    rank_correlation: array, shape = [n_features, n_features]

    References
    ----------

    Liu, Han, Fang; Yuan, Ming; Lafferty, John; Wasserman, Larry.
    "High-dimensional semiparametric Gaussian copula graphical models."
    Ann. Statist. 40.4 (2012): 2293-2326. doi:10.1214/12-AOS1037

    Barber, Rina Foygel; Kolar, Mladen.
    "ROCKET: Robust Confidence Intervals via Kendall's Tau
    for Transelliptical Graphical Models."
     arXiv:1502.07641
    """
    data = X.T if rowvar else X

    n_samples, n_features = data.shape
    taus = np.eye(n_features)
    for row in range(n_features):
        for col in range(row + 1, n_features):
            left, right = data[:, row], data[:, col]
            if weighted:
                tau = weightedtau(left, right, rank=False)[0]
            else:
                tau = kendalltau(left, right)[0]
            taus[row, col] = tau

    # Mirror the strict upper triangle into the lower one; the transpose
    # contributes the unit diagonal.
    symmetric = np.triu(taus, 1) + taus.T
    return np.sin(symmetric * np.pi / 2)
Exemplo n.º 15
0
def calcCorrelationCoef(PDF1,PDF2,mode='simple'):
    """Compare row 1 of two 2D arrays using the strategy named by ``mode``.

    'simple': prints Kendall's tau, the weighted tau and Pearson's r, and
        returns the weighted tau (PDF2 is the first argument, rank=None).
    'complex' / 'complex1': per element, the ratio of the smaller to the
        larger of the two values, given an expected 1:1 relationship;
        the array is printed and returned.
    'gradient' / 'gradient_avg': per element, the local slope of PDF2 with
        respect to PDF1 expressed as a ratio against the expected slope of
        1.0. 'gradient' returns the array, 'gradient_avg' returns the median
        after NaNs are replaced by ``np.nan_to_num``.
    Any other mode prints a message and returns None.
    """
    if mode in ('simple',):
        tau = ss.kendalltau(PDF1[1,:], PDF2[1,:])[0]
        wtaur = ss.weightedtau(PDF2[1,:], PDF1[1,:], rank=None)[0]
        pr = ss.pearsonr(PDF1[1,:], PDF2[1,:])[0]
        for value in (tau, wtaur, pr):
            print(value)
        return wtaur

    if mode in ('complex', 'complex1'):
        # The correlation coefficient is the degree to which the
        # observations conform to the expectation.
        def expectation(x):
            # Both PDFs measure the same quantity, so a 1:1 relationship is
            # expected in the absence of bias or poor sampling.
            return x

        n_points = len(PDF1[1,:])
        rcorr = np.zeros(n_points)
        for i in range(n_points):
            observed = PDF2[1,i]
            expected = expectation(PDF1[1,i])
            if observed < expected:
                rcorr[i] = observed / expected
            else:
                rcorr[i] = expected / observed
        print(rcorr)
        return rcorr

    if mode in ('gradient', 'gradient_avg'):
        n_points = len(PDF1[1,:])
        rcorr = np.zeros(n_points)
        stretchlength = 2
        expectedGradient = 1.0
        for i in range(n_points):
            if stretchlength < i < n_points - stretchlength:
                # Interior point: centred difference over the stretch.
                lo, hi = i - stretchlength, i + stretchlength
            elif i <= stretchlength:
                # Near the start: forward difference.
                lo, hi = i, i + 1
            elif i >= n_points - stretchlength:
                # Near the end: backward difference.
                lo, hi = i - 1, i
            else:
                print("something unexpected has occured")
            measgrad = (PDF2[1,hi] - PDF2[1,lo]) / (PDF1[1,hi] - PDF1[1,lo])
            if measgrad > expectedGradient:
                rcorr[i] = expectedGradient / measgrad
            else:
                rcorr[i] = measgrad / expectedGradient
        if mode in ('gradient',):
            return rcorr
        if mode in ('gradient_avg',):
            return np.median(np.nan_to_num(rcorr))
        print('something has gone horribly wrong')
        return None

    print('mode does not exist')
def compute_weighted_tau(ranking_A, ranking_B):
    """Print and return the weighted Kendall tau between two rankings.

    Both rankings are first converted to score vectors over a common list
    of app ids. Returns the tuple ``(weighted_tau, p_value)`` as produced by
    ``stats.weightedtau``.
    """
    app_ids = list_app_ids(ranking_A, ranking_B)
    scores_A = convert_ranking_to_vector_of_scores(ranking_A, app_ids=app_ids)
    scores_B = convert_ranking_to_vector_of_scores(ranking_B, app_ids=app_ids)

    # NB: scores, NOT ranks, must be fed to the weighted tau. Quoting the
    # SciPy documentation:
    #
    # > Note that if you are computing the weighted on arrays of ranks, rather than of scores (i.e., a larger value
    # > implies a lower rank) you must negate the ranks, so that elements of higher rank are associated with a larger
    # > value.
    #
    # Reference: http://scipy.github.io/devdocs/generated/scipy.stats.weightedtau.html#scipy.stats.weightedtau
    result = stats.weightedtau(scores_A, scores_B)
    weighted_tau, p_value = result

    print(f'Weighted Kendall rank-order correlation coefficient: {weighted_tau:.4f}')
    print(f'p-value to test for non-correlation: {p_value:.4f}')

    return weighted_tau, p_value
def evaluate(answers, predictions):
    """Sum and average the weighted Kendall tau over answer/prediction pairs.

    Every prediction must have exactly three entries and contain 0, 1 and 2.

    Returns a dict with keys ``"sum_wkt"`` and ``"avg_wkt"``.
    Raises ``Exception`` when the two inputs differ in length or a
    prediction is invalid.
    """
    n_items = len(answers)
    if n_items != len(predictions):
        raise Exception("Invalid Answers or Predictions!")

    total = 0
    for answer, prediction in zip(answers, predictions):
        is_valid = len(prediction) == 3 and all(
            label in prediction for label in (1, 2, 0))
        if not is_valid:
            raise Exception("Invalid Prediction! %s" % prediction)

        total += stats.weightedtau(answer, prediction)[0]

    return {"sum_wkt": total, "avg_wkt": total / n_items}
Exemplo n.º 18
0
def quick_kendall(data: np.ndarray) -> tuple:
    """
    Weighted Kendall tau of column 1 (as x) against column 0 (as y), rank=None.

    No significance testing: the p-value from weightedtau is discarded and
    the fixed placeholder 1e-9 is returned in its place.
    """
    result = weightedtau(x=data[:, 1], y=data[:, 0], rank=None)
    placeholder_pvalue = 1e-9
    return (result[0], placeholder_pvalue)
Exemplo n.º 19
0
 def func_correlation_kendall(lhs, rhs):
     """Delegate to ``MultiRollingAggregate.func_correlation`` using weighted tau.

     The correlation callable handed over is ``stats.weightedtau`` with
     ``rank=False``.
     """

     def weighted_tau(x, y):
         return stats.weightedtau(x, y, rank=False)

     return MultiRollingAggregate.func_correlation(lhs, rhs, weighted_tau)
Exemplo n.º 20
0
    dfZ = dfX.sub(dfX.mean(), axis=1).div(dfX.std(), axis=1)  # standartize
    dot = pd.DataFrame(np.dot(dfZ.T, dfZ),
                       index=dfX.columns,
                       columns=dfX.columns)
    eVal, eVec = get_eVec(dot, varThres)

    dfP = np.dot(dfZ, eVec)
    return dfP


import numpy as np
from scipy.stats import weightedtau
# Worked example: weighted Kendall's tau between feature importances and
# the element-wise reciprocal of the PCA ranks.
featImp = np.array([.55, .33, .07, .05])  # feature importance
pcRank = np.array([1, 2, 4, 3])  # PCA rank
# Element [0] of the result is the coefficient. As a bare expression its
# value is not stored anywhere.
weightedtau(featImp, pcRank**-1.)[0]


def getTestData(n_features=40,
                n_informative=10,
                n_redundant=10,
                n_samples=10000):

    # Generate random dataset for a classification problem

    from sklearn.datasets import make_classification

    trnsX, cont = make_classification(n_samples=n_samples,
                                      n_features=n_features,
                                      n_informative=n_informative,
                                      n_redundant=n_redundant,
Exemplo n.º 21
0
    1, 2, 3, 4, 5, 20, 20, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
    21, 22, 23, 24, 25, 26, 27
]
# Second sample for the paired test below; `a` is defined just above.
b = [
    1, 2, 5, 20, 20, 3, 4, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
    21, 22, 23, 24, 25, 26, 27
]

# NOTE(review): wilcoxon is presumably scipy.stats.wilcoxon; confirm against
# the file's imports. The two returned values are unpacked as (s, p).
s, p = wilcoxon(a, b)

print(s, p)

# Two short lists of the values 1..10 in different orders.
a = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
b = [1, 2, 5, 6, 3, 4, 7, 8, 9, 10]

# Weighted Kendall tau of the two lists; `p` is overwritten with the second
# value returned by weightedtau.
cor, p = weightedtau(a, b)

print(cor, p)


def read_list(path, rows):
    """Read the leading names from the first ``rows`` lines of a text file.

    For each line, the text before the first comma is taken and then cut at
    its first underscore; the line ending is removed first so that a line
    without a comma does not keep its newline.

    :param path: path of the file to read
    :param rows: maximum number of lines to read; zero or a negative value
        yields an empty list
    :return: list of names in file order
    """
    ranked_genes = []
    with open(path) as f:
        for index, row in enumerate(f):
            # Check the limit before consuming the row so that rows <= 0
            # returns nothing instead of the first line.
            if index >= rows:
                break
            first_field = row.rstrip("\r\n").split(",")[0]
            ranked_genes.append(first_field.split("_")[0])
    return ranked_genes
    correlation_return_tau = []
    correlation_return_pearson = []
    correlation_return_spearmann = []

    for extern_index_i in range(len(extern)):  #
        temp_tau = [
        ]  # this contains values based on computation of two K2 columns (intern and external validation)
        temp_pearson = [
        ]  # this contains values.But it checks for some conditions before that value is added
        temp_spearmann = [
        ]  # this contains values.But the value is based on what pearson says
        for intern_index_i in range(len(intern)):
            # pass in k2 (Box)(internal) and K2 (Box)(external)
            temp_tau.append(
                weightedtau(extern[extern_index_i],
                            intern[intern_index_i]).correlation)

            # if K2 Box(external) contains only 1 unique number (like 0) (all scores were the same)
            if len(np.unique(extern[extern_index_i])) == 1:
                temp_pearson.append(0)
                value = 0
            else:
                temp_pearson.append(
                    pearsonr(extern[extern_index_i],
                             intern[intern_index_i])[0])
                value = spearmanr(extern[extern_index_i],
                                  intern[intern_index_i]).correlation
            temp_spearmann.append(value)

            valueL.append(value)
            interL.append(header_1d[intern_index_i]
def get_pca_rank_weighted_kendall_tau(feature_imp, pca_rank):
    """Weighted Kendall's tau of ``feature_imp`` against ``pca_rank`` raised to -1.

    Returns the ``weightedtau`` result unchanged.
    """
    inverse_rank = pca_rank**-1.
    return weightedtau(feature_imp, inverse_rank)
Exemplo n.º 24
0
# %%
# Fit a random forest model to the PCA features, then plot the mean
# importance of each feature against its eigenvalue.

rf = RandomForestClassifier(n_estimators=200)
rf.fit(features_pca, target)
# NOTE(review): get_mdi is defined elsewhere; presumably it returns per-feature
# mean-decrease-impurity statistics including a "mean" column - confirm.
mdi = get_mdi(rf.estimators_, features_pca.columns)
# Attach the eigenvalues so both quantities live in one table.
mdi = mdi.join(eig_vals)
mdi[["mean", "eig_vals"]].plot(kind="scatter", y="mean", x="eig_vals");
ax = plt.gca()
ax.set_title("Plot of eigenvalues vs. mean MDI")
# Show the y axis (mean MDI) as percentages.
ax.yaxis.set_major_formatter(mtick.FuncFormatter("{:.0%}".format))
fig = plt.gcf()
fig.set_size_inches(15, 5)

# %%
# Weighted Kendall tau between mean MDI and the eigenvalues.
weightedtau(mdi["mean"], mdi["eig_vals"]).correlation

# %% [markdown]
# The correlation does not look great. This might be because we have relatively few features, and also because including binary variables (e.g. sex) in this analysis might give odd results. I should look at this in more detail.

# %% [markdown]
# ## Testing on synthetic dataset

# %% [markdown]
# To conclude the notebook, we create some synthetic data and show how the above methods perform.

# %%
from sklearn.datasets import make_classification


# %%