def cal_info(X, Y, k):
    """Return the largest energy distance between class-conditional samples of X.

    The rows of ``X`` are grouped by their label in ``Y``. The most populated
    observed class is used as the reference sample, the energy distance from
    that reference to every other observed class is computed with
    ``dcor.energy_distance``, and the maximum of those distances is returned.

    Requirements:
        dcor, which is on PyPI and can be installed using pip or conda.

    Input:
        X (:obj:`numpy.ndarray`): matrix of size n x d, n observations of X,
            where X is a d-dimensional random vector taking continuous values
        Y (:obj:`numpy.ndarray`): matrix of size n x 1, n observations of Y,
            where Y is a 1-dimensional random variable taking the discrete
            values 0, 1, ..., k-1
        k (:obj:`int`): number of discrete values Y can take

    Output:
        dist (:obj:`float`): maximum energy distance between the reference
            class and each of the other observed classes; the larger the
            value, the more the conditional distributions of X differ and the
            stronger the dependence on Y. ``0.0`` when fewer than two of the
            k classes are observed.
        err (:obj:`bool`): True if the output can be used, False otherwise
            (fewer than two classes were observed in Y)

    Raises:
        ValueError: if X is not two-dimensional.

    Example:
        If X is a 1000-dimensional input, Y is a 10-class label and there are
        2000 observations, then
            X: float-valued matrix of size 2000 x 1000
            Y: discrete-valued vector of size 2000 x 1
            k: integer 10
        The algorithm returns a near-zero value if X and Y are independent,
        and a positive value otherwise.
    """
    if X.ndim != 2:
        raise ValueError("X must be a 2-D array of shape (n, d)")

    # Row indices of every label that actually occurs in Y. Labels in
    # range(k) that are never observed are dropped, so the effective number
    # of classes may be smaller than k.
    class_ind = [
        ind
        for ind in (np.where(Y == label)[0] for label in range(k))
        if len(ind) > 0
    ]
    if len(class_ind) < 2:
        # At least two observed classes are needed to compare anything.
        return 0.0, False

    # The largest class is the reference for the pairwise comparisons
    # (ties resolve to the lowest label, as np.argmax returns the first max).
    ref = int(np.argmax([len(ind) for ind in class_ind]))
    X_ref = X[class_ind[ref], :]

    dists = [
        dcor.energy_distance(X_ref, X[ind, :])
        for j, ind in enumerate(class_ind)
        if j != ref
    ]
    return np.max(np.array(dists)), True
print(feat) print("Pearson's = {}".format( pearsonr(biochemistry_data[feat], questionnaire_data["HYPERTENSION"])[0])) print("Spearman's = {}".format( spearmanr(biochemistry_data[feat], questionnaire_data["HYPERTENSION"])[0])) print("Kendall's Tau = {}\n".format( kendalltau(biochemistry_data[feat], questionnaire_data["HYPERTENSION"])[0])) print("Distance Correlation = {}".format( dcor.distance_correlation(biochemistry_data[feat], questionnaire_data["HYPERTENSION"]))) print("Energy Distance = {}\n".format( dcor.energy_distance(biochemistry_data[feat], questionnaire_data["HYPERTENSION"]))) #Plot correlation matrix using Pearson's correlation measure if False: import seaborn as sns cols = list(biochemistry_data.columns) corr_matrix = np.corrcoef(biochemistry_data[cols].values.T) print(corr_matrix) plt.figure(1, figsize=(12, 18)) sns.set(font_scale=1.0) heat_map = sns.heatmap(corr_matrix, cbar=False, annot=True, square=True, fmt='.2f', annot_kws={'size': 10},
def energy_dist(self):
    """Return the energy distance between ``self.s1`` and ``self.s2``.

    The computation is delegated to ``energy_distance``. This is a
    best-effort metric: if the call raises an ordinary exception, ``0.0`` is
    returned instead of propagating the error.

    Returns:
        The value of ``energy_distance(self.s1, self.s2)``, or ``0.0`` if
        that call fails.
    """
    try:
        return energy_distance(self.s1, self.s2)
    except Exception:
        # Deliberately broad to keep the 0.0 fallback, but unlike a bare
        # ``except:`` this lets KeyboardInterrupt and SystemExit propagate.
        return 0.0
# Reporting / sampling section of a script.
# NOTE(review): the original indentation of this section was lost; the loop
# nesting below is the conventional reading (blank-line prints after each
# loop, not inside it) -- confirm against the original file.
# All names used here (z, tz, zs, train_z, train_d, gen_d, train_nl, gen_nl,
# tot_p, z_mets, t_mets, nz and the helper functions) are defined elsewhere.

# Print every element of the result of get_level_from_z(z), then a blank gap.
level = get_level_from_z(z)
for l in level:
    print(l)
print('\n')

# Same dump for tz, under a "Training: " heading.
print("Training: ")
level = get_level_from_z(tz)
for l in level:
    print(l)
print('\n\n')

# Summary statistics: each accumulated total is divided by the size of the
# collection it is reported for (train_z or zs).
print(len(zs), len(train_z))
print("Avg Train Density: ", train_d / len(train_z))
print("Avg Gen Density: ", gen_d / len(zs))
print("Avg Train NL: ", train_nl / len(train_z))
print("Avg Gen NL: ", gen_nl / len(zs))
# The denominator here is the product of the two collection sizes.
print("Avg Plagiarism: ", tot_p / (len(train_z) * len(zs)))
print("ED: ", dcor.energy_distance(z_mets, t_mets))

# NOTE(review): if the statements below sit at the same nesting level as this
# call, they are unreachable -- confirm whether they are leftover experiments.
sys.exit()

z = get_z_from_file('CV_chunk_100.txt')
print(nonlinearity(z))

# Ten rounds: fill a 1 x nz double tensor in place with normal_(0, 1) samples
# and print the result of get_level_from_z for it.
for _ in range(10):
    z = torch.DoubleTensor(1, nz).normal_(0, 1)
    level = get_level_from_z(z)
    for l in level:
        print(l)
    print('\n')

interpolate_chunks('CV_chunk_100.txt', 'CV_chunk_1000.txt')
z = get_z_from_file('smb_chunk_150.txt')
"""
Created on 5 Nov 2019

@author: snake91
"""
import dcor
import numpy as np

# Two samples of 100 draws each from the standard normal distribution.
# X is drawn before Y, so NumPy's global random state is consumed in the
# same order as before.
X = np.random.normal(loc=0.0, scale=1.0, size=100)
Y = np.random.normal(loc=0.0, scale=1.0, size=100)

# Energy distance between the two samples; the value is neither stored
# nor printed.
dcor.energy_distance(X, Y)