예제 #1
0
def cal_info(X, Y, k):
    """Return the largest energy distance between class-conditional samples of X.

    The rows of X are grouped by the label stored in Y. The most populated
    observed class is used as the reference group, the energy distance between
    the reference group and every other observed group is computed with
    ``dcor.energy_distance``, and the maximum of those distances is returned.

    Requirements:
        dcor, which is on PyPI and can be installed using pip or conda.

    Args:
        X (numpy.ndarray): matrix of size n x d, n observations of X, where X
            is a d-dimensional random vector taking continuous values.
        Y (numpy.ndarray): matrix of size n x 1, n observations of Y, where Y
            is a discrete random variable taking values 0, 1, ..., k-1.
        k (int): number of discrete values Y can take.

    Returns:
        tuple: ``(dist, ok)`` where

        * dist (float): maximum energy distance between the reference class
          and each other observed class; the larger it is, the more the
          conditional distributions of X differ and the stronger the
          dependence on Y. It is ``0.0`` when fewer than two classes are
          observed.
        * ok (bool): True if ``dist`` can be used, False otherwise (fewer
          than two of the k classes occur in Y).

    Raises:
        ValueError: if X is not two-dimensional.

    Example:
        If X is a 1000-dimensional input, Y is a 10-class label and there are
        2000 observations, then

        * X: float-valued matrix of size 2000 x 1000
        * Y: discrete-valued vector of size 2000 x 1
        * k: integer 10

        The result is expected to be near zero if X and Y are independent,
        and positive otherwise.
    """
    # The row/column indexing below requires a 2-D X; fail early and clearly.
    if len(X.shape) != 2:
        raise ValueError("X must be a 2-D array of shape (n, d)")

    # Row indices of every label value that actually occurs in Y.
    class_ind = []
    for label in range(k):
        rows = np.where(Y == label)[0]
        if len(rows) > 0:
            class_ind.append(rows)

    # Some classes may be unobserved; only the observed ones are compared,
    # and at least two of them are needed for a pairwise distance.
    if len(class_ind) < 2:
        return 0.0, False

    # Use the largest class as the reference for the pairwise calculation
    # (np.argmax picks the first one on ties).
    ref = np.argmax([len(rows) for rows in class_ind])
    X_ref = X[class_ind[ref], :]

    dists = [
        dcor.energy_distance(X_ref, X[rows, :])
        for j, rows in enumerate(class_ind)
        if j != ref
    ]
    return np.max(np.array(dists)), True
        print(feat)

        # Fetch the feature column and the hypertension column once; every
        # statistic below is computed on this same pair.
        feature_values = biochemistry_data[feat]
        hypertension = questionnaire_data["HYPERTENSION"]

        # For the three coefficient functions only element [0] of the
        # returned result is reported.
        pearson_coef = pearsonr(feature_values, hypertension)[0]
        print("Pearson's = {}".format(pearson_coef))

        spearman_coef = spearmanr(feature_values, hypertension)[0]
        print("Spearman's = {}".format(spearman_coef))

        kendall_coef = kendalltau(feature_values, hypertension)[0]
        print("Kendall's Tau = {}\n".format(kendall_coef))

        # The dcor measures are printed exactly as returned.
        distance_corr = dcor.distance_correlation(feature_values, hypertension)
        print("Distance Correlation = {}".format(distance_corr))

        energy_dist_value = dcor.energy_distance(feature_values, hypertension)
        print("Energy Distance = {}\n".format(energy_dist_value))

#Plot correlation matrix using Pearson's correlation measure
if False:
    import seaborn as sns
    cols = list(biochemistry_data.columns)
    corr_matrix = np.corrcoef(biochemistry_data[cols].values.T)
    print(corr_matrix)
    plt.figure(1, figsize=(12, 18))
    sns.set(font_scale=1.0)
    heat_map = sns.heatmap(corr_matrix,
                           cbar=False,
                           annot=True,
                           square=True,
                           fmt='.2f',
                           annot_kws={'size': 10},
예제 #3
0
 def energy_dist(self):
     """Return the energy distance between ``self.s1`` and ``self.s2``.

     This is a best-effort computation: if ``energy_distance`` raises an
     ordinary exception, the failure is swallowed and ``0.0`` is returned.

     Returns:
         The value returned by ``energy_distance(self.s1, self.s2)``, or
         ``0.0`` if that call raised an ``Exception``.
     """
     try:
         return energy_distance(self.s1, self.s2)
     except Exception:
         # Keep the deliberate fallback, but unlike a bare ``except:`` let
         # KeyboardInterrupt and SystemExit propagate to the caller.
         return 0.0
예제 #4
0
# Print the result of get_level_from_z(z) one element per line, followed by
# blank lines as a separator.
level = get_level_from_z(z)
for l in level:
    print(l)
print('\n')
# Same dump for ``tz`` under a "Training:" heading.
# NOTE(review): presumably ``tz`` is a latent vector taken from the training
# set -- confirm where it is defined.
print("Training: ")
level = get_level_from_z(tz)
for l in level:
    print(l)
print('\n\n')
# Summary: sizes of the two collections, then each running total divided by
# the size of the collection it is reported for. The plagiarism total is
# divided by len(train_z) * len(zs), i.e. the number of (train, generated)
# pairs.
# NOTE(review): train_d, gen_d, train_nl, gen_nl and tot_p are accumulated
# outside this fragment -- verify they are sums over the matching collection.
print(len(zs), len(train_z))
print("Avg Train Density: ", train_d / len(train_z))
print("Avg Gen Density: ", gen_d / len(zs))
print("Avg Train NL: ", train_nl / len(train_z))
print("Avg Gen NL: ", gen_nl / len(zs))
print("Avg Plagiarism: ", tot_p / (len(train_z) * len(zs)))
# Energy distance between z_mets and t_mets as computed by dcor.
print("ED: ", dcor.energy_distance(z_mets, t_mets))
# sys.exit() raises SystemExit here, so nothing below runs unless this call
# is removed or the exception is caught by code outside this fragment.
sys.exit()

# Load a latent vector from a file and print its nonlinearity() value.
z = get_z_from_file('CV_chunk_100.txt')
print(nonlinearity(z))

# Print the levels for ten latent vectors, each a 1 x nz double tensor filled
# in place with normal(0, 1) samples.
for _ in range(10):
    z = torch.DoubleTensor(1, nz).normal_(0, 1)
    level = get_level_from_z(z)
    for l in level:
        print(l)

    print('\n')

# Call interpolate_chunks on two chunk files, then load another latent vector
# from file.
interpolate_chunks('CV_chunk_100.txt', 'CV_chunk_1000.txt')
z = get_z_from_file('smb_chunk_150.txt')
예제 #5
0
'''
Created on 5 Nov 2019

@author: snake91
'''

import dcor
import numpy as np

# Two independent draws of 100 standard-normal values. The generator is
# consumed left to right, so X is sampled before Y.
X, Y = (np.random.normal(loc=0.0, scale=1.0, size=100) for _ in range(2))

# Energy distance between the two samples; the value is computed but not
# stored or printed.
dcor.energy_distance(X, Y)