def consistancy(explanation_objects, stype='kendall'):
    '''
    Compare how consistently the same document is explained across samplings.

    Every unordered pair of explanations is scored with a rank correlation
    and the mean of the collected scores is returned. Pairs in which either
    explanation has three or fewer entries are skipped. When the two
    explanations of a pair differ in length, both are cut to the shorter
    length for that pair only.

    :param explanation_objects: indexable collection of sliceable
        explanation vectors, one per sampling run.
    :param stype: 'kendall' scores with ``kendalltau`` and keeps only the
        scores whose p-value is below 0.05; any other value scores with
        ``weightedtau`` (``rank=False``) and keeps every score.
    :return: mean of the kept scores, or ``nan`` when no pair was kept.
    '''
    scores = []
    n_objects = len(explanation_objects)
    for i in range(n_objects):
        for j in range(i + 1, n_objects):
            # Re-read both vectors for every pair so that a truncation made
            # for one comparison never leaks into the following ones.
            l1 = explanation_objects[i]
            l2 = explanation_objects[j]
            if len(l1) <= 3 or len(l2) <= 3:
                continue
            if len(l1) != len(l2):
                min_len = min(len(l1), len(l2))
                l1 = l1[:min_len]
                l2 = l2[:min_len]
            if stype == 'kendall':
                kscore = kendalltau(l1, l2)
                # Only statistically significant correlations are counted.
                if kscore[1] < 0.05:
                    scores.append(kscore[0])
            else:
                kscore = weightedtau(l1, l2, False)
                scores.append(kscore[0])
    if not scores:
        # Same value np.mean([]) would give, without the RuntimeWarning.
        return np.nan
    return np.mean(scores)
def kendal_weighted(featImp=np.array([.55, .33, .07, .05]), pcRank=np.array([1, 2, 4, 3])):
    """Snippet 8.6: weighted Kendall's tau between feature importance and
    inverse PCA ranking.

    featImp: array of feature importances
    pcRank: array of PCA ranks; it is raised to the power -1 before being
        compared with featImp
    Returns only the first element (the correlation) of SciPy's result.
    """
    from scipy.stats import weightedtau

    inverse_rank = pcRank ** -1.
    tau_result = weightedtau(featImp, inverse_rank)
    return tau_result[0]
def get_pca_rank_weighted_kendall_tau(feature_imp, pca_rank):
    """
    Compute the weighted Kendall tau of feature importance against inverse PCA rank.

    :param feature_imp: (np.array): feature mean importance
    :param pca_rank: (np.array): PCA based feature importance rank; it is
        inverted (``pca_rank ** -1``) before the comparison
    :return: the complete result of ``weightedtau`` (correlation first,
        p-value second), not a bare float
    """
    inverse_pca_rank = pca_rank ** -1.
    tau_result = weightedtau(feature_imp, inverse_pca_rank)
    return tau_result
def get_pca_rank_weighted_kendall_tau(feature_imp, pca_rank):
    """
    Snippet 8.6, page 121. Weighted Kendall's tau between feature importance
    and inverse PCA ranking.

    :param feature_imp: (np.array): feature mean importance
    :param pca_rank: (np.array): PCA based feature importance rank
    :return: result of ``weightedtau(feature_imp, pca_rank ** -1)``; the
        correlation is its first element and the p-value its second
    """
    # The rank is inverted so that a better (smaller) rank yields a larger value.
    reciprocal_rank = pca_rank ** -1.
    return weightedtau(feature_imp, reciprocal_rank)
def correlation(x, y, xlabel, ylabel):
    """Print several association measures between ``x`` and ``y``.

    Printed, in order: covariance matrix, Pearson, Spearman, Fisher z of the
    Pearson coefficient, Kendall tau, weighted Kendall tau and cosine
    similarity. Nothing is returned.

    :param x: first sample
    :param y: second sample
    :param xlabel: label printed for ``x`` in the header line
    :param ylabel: label printed for ``y`` in the header line
    """
    print('\n', xlabel, '-', ylabel)
    print('Covariance:\n', np.cov(x, y))
    # Computed once and reused for the Fisher transformation below.
    pearson = stats.pearsonr(x, y)
    print('Pearson Correlation\n', pearson)
    print('Spearman Correlation\n', stats.spearmanr(x, y))
    # Fisher's z is the inverse hyperbolic tangent of r (arctanh, not
    # arctan) and applies to the coefficient only, never to the p-value.
    print('Fisher-Z Transformation\n', np.arctanh(pearson[0]))
    print('Kendall Correlation\n', stats.kendalltau(x, y))
    print('Weighted Kendall\n', stats.weightedtau(x, y))
    # NOTE(review): cosine_similarity is defined outside this block; if it is
    # scikit-learn's, it expects 2-D inputs - confirm against the callers.
    print('Cosine Similarity\n', cosine_similarity(x, y))
def correlation(self, a: np.ndarray, b: np.ndarray, **kwargs) -> CorrelationMap:
    """Weighted Kendall tau of ``a`` and ``b`` for every exponent in ``self.alphas``.

    For each ``alpha`` the rank weigher is ``1 / (rank + 1) ** alpha``. The
    result is stored under the key ``"<self.id>_<alpha>"`` as a
    ``CorrelationResult`` holding the correlation and ``k = len(a)``.
    ``kwargs`` is accepted but not used.
    """
    def _power_weigher(exponent):
        # Build the weigher for one exponent.
        def weigher(rank):
            return (1 / (rank + 1)**exponent)
        return weigher

    results = {}
    for alpha in self.alphas:
        tau = weightedtau(a, b, weigher=_power_weigher(alpha))[0]
        key = f"{self.id}_{alpha}"
        results[key] = CorrelationResult(correlation=tau, k=len(a))
    return results
def kendall_predictand(data: np.ndarray) -> float:
    """
    Weighted Kendall tau between the two columns of a 2D array (n_obs, [x, y]).

    Column 1 is passed as SciPy's ``x`` and column 0 as SciPy's ``y``, with
    ``rank=None``. Per SciPy's documentation, ``rank=None`` ranks elements
    by decreasing (x, y), so the weighting follows the first argument, i.e.
    column 1 of ``data``.
    Significance is not computed here; only the correlation is returned.
    """
    second_column = data[:, 1]
    first_column = data[:, 0]
    tau_result = weightedtau(x=second_column, y=first_column, rank=None)
    return tau_result[0]
def kendall_choice(data: np.ndarray) -> float:
    """
    Weighted Kendall tau between the two columns of a 2D array (n_obs, [x, y]).

    The ``rank`` argument handed to ``weightedtau`` is whatever
    ``rankdirection`` returns for the same two columns.
    NOTE(review): the original docstring says the direction is chosen from
    Pearson's correlation - confirm in ``rankdirection``.
    Significance is not computed; only the correlation is returned.
    """
    first_column = data[:, 0]
    second_column = data[:, 1]
    direction = rankdirection(x=first_column, y=second_column)
    tau_result = weightedtau(x=first_column, y=second_column, rank=direction)
    return tau_result[0]
def Find_Wtau(Rankings, num_comps):
    """Build a symmetric matrix of pairwise weighted Kendall taus between columns.

    Rankings: 2D array; each column is compared with the others.
    num_comps: number of leading columns to compare pairwise.
    Returns a (n_columns, n_columns) array; the diagonal and any row/column
    at index >= num_comps stay zero.
    """
    n_columns = Rankings.shape[1]
    adjacency = np.zeros((n_columns, n_columns))
    for left in range(num_comps):
        left_values = list(Rankings[:, left])
        for right in range(left + 1, num_comps):
            right_values = list(Rankings[:, right])
            tau = weightedtau(left_values, right_values).correlation
            # The measure is stored symmetrically.
            adjacency[left, right] = tau
            adjacency[right, left] = tau
    return adjacency
def update(self, prediction_probs, eviction_mask, oracle_scores):
    """Record one weighted tau per batch row, plus the eviction masks.

    Each row of ``prediction_probs`` is sorted in descending order; the
    resulting index order is compared with the identity order 0..n-1 and the
    correlation is appended to ``self._weighted_taus``. The rows of
    ``eviction_mask`` are appended to ``self._masks``. ``oracle_scores`` is
    unused.
    """
    del oracle_scores
    sorted_indices = prediction_probs.sort(descending=True)[1]
    batch_orders = sorted_indices.cpu().data.numpy()
    for order in batch_orders:
        identity_order = np.array(range(len(order)))
        # Need to negate arguments for rank: see weightedtau docs
        # NOTE: This is incorporating potentially masked & padded probs
        tau = stats.weightedtau(-order, -identity_order, rank=False)[0]
        self._weighted_taus.append(tau)
    mask_rows = eviction_mask.cpu().data.numpy()
    self._masks.extend(mask_rows)
def get_pca_rank_weighted_kendall_tau(feature_imp, pca_rank):
    """
    Advances in Financial Machine Learning, Snippet 8.6, page 121.

    Weighted Kendall's tau between feature importance and inverse PCA ranking.

    :param feature_imp: (np.array): Feature mean importance.
    :param pca_rank: (np.array): PCA based feature importance rank.
    :return: The ``weightedtau`` result for ``feature_imp`` against
        ``pca_rank ** -1``: correlation first, p-value second.
    """
    inverted_rank = pca_rank**-1.0
    outcome = weightedtau(feature_imp, inverted_rank)
    return outcome
def _query_differences(self, run1, run2, *args, **kwargs): """ :param run1: TREC run. Has the format {qid: {docid: score}, ...} :param run2: Same as above :param args: :param kwargs: :return: The union of top k qids in both runs, sorted by the order in which the queries appear in run 1 ^ This is because run 1 appears on the left hand side in the web ui """ topk = self.topk metric = self.metric qids = run1.keys() & run2.keys() if not qids: raise ValueError("run1 and run2 have no shared qids") id2measure = {} for qid in qids: from collections import defaultdict min_value = min(min(run1[qid].values()), min( run2[qid].values())) - 1e-5 doc_score_1 = defaultdict(lambda: min_value, run1[qid]) doc_score_2 = defaultdict(lambda: min_value, run2[qid]) doc_ids_1 = doc_score_1.keys() doc_ids_2 = doc_score_2.keys() doc_ids_union = set(doc_ids_1).union(set(doc_ids_2)) doc_ids_union = sorted(list(doc_ids_union), key=lambda id: (doc_score_1[id] + doc_score_2[id]), reverse=True) union_score1 = [doc_score_1[doc_id] for doc_id in doc_ids_union] union_score2 = [doc_score_2[doc_id] for doc_id in doc_ids_union] if metric == "weightedtau": tau, p_value = stats.weightedtau(union_score1, union_score2) elif metric == "tauap": tau = (self.tauap_fast(union_score1, union_score2) + self.tauap_fast(union_score2, union_score1)) / 2 elif metric == "spearmanr": tau, p_value = stats.spearmanr(union_score1, union_score2) elif metric == "pearsonrank": tau = (self.pearson_rank(union_score1, union_score2) + self.pearson_rank(union_score2, union_score1)) / 2 elif metric == "kldiv": tau = self.kl_div(union_score1, union_score2) else: raise ValueError( "Metric {} not supported for the measure {}".format( self.metric, "metric")) id2measure[qid] = tau qids = sorted(qids, key=lambda x: id2measure[x]) qids = qids[:topk] id2measure = {idx: id2measure[idx] for idx in qids} return qids, id2measure, metric, None
def get_correlation_scores(actual_performances, transferability_scores, metric='w-kendall'):
    """Return the correlation between performances and transferability scores.

    ``metric`` selects the estimator: 'w-kendall' (weighted Kendall tau),
    'kendall' (Kendall tau) or 'pearson'. Only the coefficient, the first
    element of SciPy's result, is returned. Any other metric trips the
    assertion.
    """
    assert metric in ['w-kendall', 'kendall', 'pearson']
    if metric == 'w-kendall':
        scorer = stats.weightedtau
    elif metric == 'kendall':
        scorer = stats.kendalltau
    elif metric == 'pearson':
        scorer = stats.pearsonr
    else:
        # Only reachable when assertions are disabled.
        return None
    outcome = scorer(actual_performances, transferability_scores)
    return outcome[0]
def kendalltau_correlation(X, rowvar=False, weighted=False):
    """
    Computes kendall's tau correlation estimate.

    The pairwise tau of every pair of features is computed, arranged in a
    symmetric matrix with unit diagonal, and mapped through
    ``sin(tau * pi / 2)``.

    The option to use scipy.stats.weightedtau is not recommended
    as the implementation does not appear to handle ties correctly.

    Parameters
    ----------
    X: array-like, shape = [n_samples, n_features]
        Data matrix using which we compute the empirical
        correlation

    rowvar: bool, default False
        If true, X is transposed first, i.e. rows are treated as features.

    weighted: bool, default False
        If true, use ``weightedtau(..., rank=False)`` instead of
        ``kendalltau``.

    Returns
    -------
    rank_correlation: array, shape = [n_features, n_features]

    References
    ----------

    Liu, Han, Fang; Yuan, Ming; Lafferty, John; Wasserman, Larry.
    "High-dimensional semiparametric Gaussian copula graphical models."
    Ann. Statist. 40.4 (2012): 2293-2326. doi:10.1214/12-AOS1037

    Barber, Rina Foygel; Kolar, Mladen.
    "ROCKET: Robust Confidence Intervals via Kendall's Tau for
    Transelliptical Graphical Models."
    arXiv:1502.07641
    """
    data = X.T if rowvar else X
    _, n_features = data.shape

    tau_matrix = np.eye(n_features)
    for first in range(n_features):
        for second in range(first + 1, n_features):
            if weighted:
                outcome = weightedtau(data[:, first], data[:, second], rank=False)
            else:
                outcome = kendalltau(data[:, first], data[:, second])
            # Write both halves directly instead of mirroring afterwards.
            tau_matrix[first, second] = outcome[0]
            tau_matrix[second, first] = outcome[0]

    return np.sin(tau_matrix * np.pi / 2)
def calcCorrelationCoef(PDF1,PDF2,mode='simple'):
    """Compare row 1 of two PDF arrays according to ``mode``.

    Only ``PDF1[1, :]`` and ``PDF2[1, :]`` are read.

    mode:
      'simple'               - prints Kendall tau, weighted tau and Pearson r,
                               returns the weighted tau (a scalar)
      'complex'/'complex1'   - per-element ratio of the smaller to the larger
                               of the two values; prints and returns the array
      'gradient'             - per-element ratio between the measured local
                               gradient of PDF2 against PDF1 and the expected
                               gradient 1.0 (always smaller over larger)
      'gradient_avg'         - median of the 'gradient' array after
                               ``np.nan_to_num``
    Any other mode prints a message and returns None.
    """
    if mode in ('simple',):
        reference = PDF1[1,:]
        observed = PDF2[1,:]
        tau, p_value = ss.kendalltau(reference, observed)
        # Note the swapped argument order for the weighted variant.
        wtaur, wtp = ss.weightedtau(observed, reference, rank=None)
        pr, prp = ss.pearsonr(reference, observed)
        print(tau)
        print(wtaur)
        print(pr)
        return wtaur

    if mode in ('complex', 'complex1'):
        # The correlation coefficient is the degree to which the observations
        # conform to the expectation. Both PDFs measure the same quantity, so
        # the expectation is a 1:1 relationship.
        n_points = len(PDF1[1,:])
        rcorr = np.zeros(n_points)
        for idx in range(n_points):
            expected = PDF1[1,idx]
            if PDF2[1,idx] < expected:
                rcorr[idx] = PDF2[1,idx] / expected
            else:
                rcorr[idx] = expected / PDF2[1,idx]
        print(rcorr)
        return rcorr

    if mode in ('gradient', 'gradient_avg'):
        n_points = len(PDF1[1,:])
        rcorr = np.zeros(n_points)
        stretchlength = 2
        expectedGradient = 1.0
        upper_edge = n_points - stretchlength
        for idx in range(n_points):
            if (idx > stretchlength) and (idx < upper_edge):
                # Interior point: centred difference over +-stretchlength.
                lo, hi = idx - stretchlength, idx + stretchlength
                measgrad = (PDF2[1,hi] - PDF2[1,lo]) / (PDF1[1,hi] - PDF1[1,lo])
            elif idx <= stretchlength:
                # Left edge: forward difference.
                measgrad = (PDF2[1,idx+1] - PDF2[1,idx]) / (PDF1[1,idx+1] - PDF1[1,idx])
            elif idx >= upper_edge:
                # Right edge: backward difference.
                measgrad = (PDF2[1,idx] - PDF2[1,idx-1]) / (PDF1[1,idx] - PDF1[1,idx-1])
            else:
                print("something unexpected has occured")
            if measgrad > expectedGradient:
                rcorr[idx] = expectedGradient / measgrad
            else:
                rcorr[idx] = measgrad / expectedGradient
        if mode in ('gradient',):
            return rcorr
        elif mode in ('gradient_avg',):
            return np.median(np.nan_to_num(rcorr))
        else:
            print('something has gone horribly wrong')
            return None

    print('mode does not exist')
def compute_weighted_tau(ranking_A, ranking_B):
    """Print and return the weighted Kendall tau (and p-value) of two rankings.

    Both rankings are first converted to score vectors over the app ids
    returned by ``list_app_ids`` (presumably the ids present in either
    ranking - verify in that helper).
    """
    # Arrays of scores
    shared_app_ids = list_app_ids(ranking_A, ranking_B)
    scores_a = convert_ranking_to_vector_of_scores(ranking_A, app_ids=shared_app_ids)
    scores_b = convert_ranking_to_vector_of_scores(ranking_B, app_ids=shared_app_ids)

    # NB: it is important NOT to feed arrays of ranks for the weighted tau!
    #
    # > Note that if you are computing the weighted on arrays of ranks, rather than of scores (i.e., a larger value
    # > implies a lower rank) you must negate the ranks, so that elements of higher rank are associated with a larger
    # > value.
    #
    # Reference: http://scipy.github.io/devdocs/generated/scipy.stats.weightedtau.html#scipy.stats.weightedtau
    outcome = stats.weightedtau(scores_a, scores_b)
    weighted_tau, p_value = outcome

    coefficient_line = 'Weighted Kendall rank-order correlation coefficient: {:.4f}'.format(
        weighted_tau)
    p_value_line = 'p-value to test for non-correlation: {:.4f}'.format(p_value)
    print(coefficient_line)
    print(p_value_line)

    return weighted_tau, p_value
def evaluate(answers, predictions):
    """Score predicted 3-item rankings against the answers with weighted Kendall tau.

    :param answers: sequence of reference rankings
    :param predictions: sequence of predicted rankings, same length as
        ``answers``; each must have length 3 and contain 0, 1 and 2
    :return: dict with ``sum_wkt`` (sum of the per-item weighted taus) and
        ``avg_wkt`` (that sum divided by ``len(answers)``)
    :raises Exception: if the lengths differ or a prediction is malformed
    :raises ZeroDivisionError: if ``answers`` is empty
    """
    if len(answers) != len(predictions):
        raise Exception("Invalid Answers or Predictions!")
    total_wkt = 0
    for answer, prediction in zip(answers, predictions):
        has_all_labels = all(label in prediction for label in (1, 2, 0))
        if len(prediction) != 3 or not has_all_labels:
            # Wrap in a 1-tuple: a tuple prediction would otherwise be
            # unpacked by % and raise TypeError instead of this message.
            raise Exception("Invalid Prediction! %s" % (prediction,))
        total_wkt += stats.weightedtau(answer, prediction)[0]
    return {"sum_wkt": total_wkt, "avg_wkt": total_wkt / len(answers)}
def quick_kendall(data: np.ndarray) -> tuple:
    """
    Weighted Kendall tau of column 1 against column 0, with no significance testing.

    The second element of the returned tuple is a fixed placeholder (1e-9),
    not a computed p-value.
    """
    placeholder_pvalue = 1e-9
    outcome = weightedtau(x=data[:, 1], y=data[:, 0], rank=None)
    correlation_value = outcome[0]
    return (correlation_value, placeholder_pvalue)
def func_correlation_kendall(lhs, rhs):
    """Delegate to ``MultiRollingAggregate.func_correlation`` with a weighted-tau callback.

    The callback is ``scipy.stats.weightedtau`` called with ``rank=False``.
    """
    def _weighted_tau(x, y):
        return stats.weightedtau(x, y, rank=False)

    return MultiRollingAggregate.func_correlation(lhs, rhs, _weighted_tau)
dfZ = dfX.sub(dfX.mean(), axis=1).div(dfX.std(), axis=1) # standartize dot = pd.DataFrame(np.dot(dfZ.T, dfZ), index=dfX.columns, columns=dfX.columns) eVal, eVec = get_eVec(dot, varThres) dfP = np.dot(dfZ, eVec) return dfP import numpy as np from scipy.stats import weightedtau featImp = np.array([.55, .33, .07, .05]) # feature importance pcRank = np.array([1, 2, 4, 3]) # PCA rank weightedtau(featImp, pcRank**-1.)[0] def getTestData(n_features=40, n_informative=10, n_redundant=10, n_samples=10000): # Generate random dataset for a classification problem from sklearn.datasets import make_classification trnsX, cont = make_classification(n_samples=n_samples, n_features=n_features, n_informative=n_informative, n_redundant=n_redundant,
1, 2, 3, 4, 5, 20, 20, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 ] b = [ 1, 2, 5, 20, 20, 3, 4, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 ] s, p = wilcoxon(a, b) print(s, p) a = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] b = [1, 2, 5, 6, 3, 4, 7, 8, 9, 10] cor, p = weightedtau(a, b) print(cor, p) def read_list(path, rows): ranked_genes = list() i = 0 with open(path) as f: for row in f: name = (row.split(",")[0]).split("_")[0] ranked_genes.append(name) i = i + 1 if i >= rows: break return ranked_genes
correlation_return_tau = [] correlation_return_pearson = [] correlation_return_spearmann = [] for extern_index_i in range(len(extern)): # temp_tau = [ ] # this contains values based on computation of two K2 columns (intern and external validation) temp_pearson = [ ] # this contains values.But it checks for some conditions before that value is added temp_spearmann = [ ] # this contains values.But the value is based on what pearson says for intern_index_i in range(len(intern)): # pass in k2 (Box)(internal) and K2 (Box)(external) temp_tau.append( weightedtau(extern[extern_index_i], intern[intern_index_i]).correlation) # if K2 Box(external) contains only 1 unique number (like 0) (all scores were the same) if len(np.unique(extern[extern_index_i])) == 1: temp_pearson.append(0) value = 0 else: temp_pearson.append( pearsonr(extern[extern_index_i], intern[intern_index_i])[0]) value = spearmanr(extern[extern_index_i], intern[intern_index_i]).correlation temp_spearmann.append(value) valueL.append(value) interL.append(header_1d[intern_index_i]
def get_pca_rank_weighted_kendall_tau(feature_imp, pca_rank):
    """Return ``weightedtau`` of feature importance against inverse PCA rank.

    The whole SciPy result is returned: correlation first, p-value second.
    """
    inverse_rank = pca_rank ** -1.
    return weightedtau(feature_imp, inverse_rank)
# %% # fit random forest model to the PCA features rf = RandomForestClassifier(n_estimators=200) rf.fit(features_pca, target) mdi = get_mdi(rf.estimators_, features_pca.columns) mdi = mdi.join(eig_vals) mdi[["mean", "eig_vals"]].plot(kind="scatter", y="mean", x="eig_vals"); ax = plt.gca() ax.set_title("Plot of eigenvalues vs. mean MDI") ax.yaxis.set_major_formatter(mtick.FuncFormatter("{:.0%}".format)) fig = plt.gcf() fig.set_size_inches(15, 5) # %% weightedtau(mdi["mean"], mdi["eig_vals"]).correlation # %% [markdown] # Correlation does not look great - I think this might be because we have relatively few features, and also because using the binary variables (eg sex) in this analysis might give weird results. I should look at this in more detail. # %% [markdown] # ## Testing on synthetic dataset # %% [markdown] # To conclude the notebook, we create some synthetic data and show how the above methods perform. # %% from sklearn.datasets import make_classification # %%