def compute_mmd2u_and_null_distributions(Ks, m, n, iterations=1000, seed=0,
                                         parallel=True, permutation=None,
                                         n_jobs=-1, verbose=False):
    """Return the MMD2u statistic and its null distribution for every unit.

    Each kernel matrix in Ks is one unit. For every unit the statistic is
    MMD2u(K, m, n), and its null distribution is approximated with
    `iterations` values.

    When `permutation` is None the null distribution comes from
    compute_null_distribution, called with the SAME `seed` for every unit
    (this is fundamental, see the note in the code). Otherwise the
    precomputed permutations (array of size iterations x (m+n)) are handed
    to compute_null_distribution_given_permutations, which enforces
    reproducibility and keeps the desired permutation schema for each
    kernel/unit. This is important during parallel computation.

    With parallel=True the per-unit work is dispatched through
    Parallel/delayed using n_jobs workers (n_jobs=-1 means 'use all
    available cores'); `verbose` is only forwarded in the sequential path.

    Returns a tuple (unit_statistic, unit_statistic_permutation): one
    statistic per unit, and one row of null values per unit.
    """
    n_units = len(Ks)
    # Allocated up front for both code paths; the parallel path rebinds it.
    unit_statistic_permutation = np.zeros((n_units, iterations))
    unit_statistic = np.zeros(n_units)

    print("Computing MMD2u for each unit.")
    for idx, kernel in enumerate(Ks):
        unit_statistic[idx] = MMD2u(kernel, m, n)

    print("Computing MMD2u's null-distribution, for each unit.")
    if not parallel:
        for idx, kernel in enumerate(Ks):
            if permutation is not None:
                null_values = compute_null_distribution_given_permutations(
                    kernel, m, n, permutation, iterations=iterations)
            else:
                # NOTE: IT IS FUNDAMENTAL THAT THE SAME SEED IS USED
                # FOR EACH UNIT!
                null_values = compute_null_distribution(
                    kernel, m, n, iterations=iterations, verbose=verbose,
                    random_state=seed, marker_interval=100)
            unit_statistic_permutation[idx, :] = null_values
        return unit_statistic, unit_statistic_permutation

    print("Parallel computation!")
    runner = Parallel(n_jobs=n_jobs, verbose=10)
    if permutation is not None:
        results = runner(
            delayed(compute_null_distribution_given_permutations)(
                K, m, n, permutation, iterations=iterations)
            for K in Ks)
    else:
        # NOTE: IT IS FUNDAMENTAL THAT THE SAME SEED IS USED FOR EACH UNIT!
        results = runner(
            delayed(compute_null_distribution)(
                K, m, n, iterations=iterations, verbose=False,
                random_state=seed)
            for K in Ks)

    # One row per unit, in the order of Ks.
    unit_statistic_permutation = np.vstack(results)
    return unit_statistic, unit_statistic_permutation
def apply_ktst(K, y, iterations=10000, subjects=False, verbose=True):
    """
    Compute MMD^2_u, its null distribution and the p-value of the
    kernel two-sample test.

    Parameters:
    ----------
    K: array-like
        Kernel matrix. The first m rows are assumed to come from the first
        class and the other n rows from the second class, where the
        "first" class is the smaller of the two label values found in y
        (class 0 for the usual 0/1 labels).
    y: array_like
        Class labels. Must contain exactly two distinct values; they do
        not have to be 0 and 1.
    iterations: int
        Number of values used to approximate the null distribution; it
        also sets the resolution (1 / iterations) of the p-value.
    subjects: bool
        If True, `iterations` permutations are drawn with
        _permutation_subjects_ktst(y) and evaluated through
        compute_null_distribution_given_permutations; otherwise
        compute_null_distribution is used.
    verbose: bool
        Verbosity

    Returns:
    -------
    mmd2u: float
        MMD^2_u value.
    mmd2u_null: array
        Null distribution of the MMD^2_u
    p_value: float
        p-value, never reported below 1 / iterations.

    Raises:
    ------
    AssertionError
        If y does not contain exactly two distinct labels.
    """
    labels = np.asarray(y)
    classes = np.unique(labels)
    # Explicit raise instead of `assert`, so the check survives `python -O`
    # while callers still see the same exception type.
    if len(classes) != 2:
        raise AssertionError('KTST only works on binary problems')

    # Class sizes are counted from the two labels actually present, not
    # from hard-coded 0/1, so e.g. {-1, 1} or {1, 2} labels work as well.
    m = int(np.count_nonzero(labels == classes[0]))
    n = int(np.count_nonzero(labels == classes[1]))

    mmd2u = MMD2u(K, m, n)
    if verbose:
        print("MMD^2_u = %s" % mmd2u)
        print("Computing the null distribution.")

    if subjects:
        # The caller's original y is forwarded untouched to the helper.
        perms = [_permutation_subjects_ktst(y) for _ in range(iterations)]
        mmd2u_null = compute_null_distribution_given_permutations(
            K, m, n, perms, iterations)
    else:
        mmd2u_null = compute_null_distribution(K, m, n, iterations,
                                               verbose=verbose)

    # Fraction of null values above the observed statistic, floored at the
    # resolution of the permutation test.
    p_value = max(1.0 / iterations,
                  (mmd2u_null > mmd2u).sum() / float(iterations))
    if verbose:
        print("p-value ~= %s \t (resolution : %s)" % (p_value,
                                                      1.0 / iterations))

    return mmd2u, mmd2u_null, p_value
def apply_ktst(K, y, iterations=10000, subjects=False, verbose=True):
    """
    Compute MMD^2_u, its null distribution and the p-value of the
    kernel two-sample test.

    Parameters:
    ----------
    K: array-like
        Kernel matrix. The first m rows are assumed to come from the first
        class and the other n rows from the second class, where the
        "first" class is the smaller of the two label values found in y
        (class 0 for the usual 0/1 labels).
    y: array_like
        Class labels. Must contain exactly two distinct values; they do
        not have to be 0 and 1.
    iterations: int
        Number of values used to approximate the null distribution; it
        also sets the resolution (1 / iterations) of the p-value.
    subjects: bool
        If True, `iterations` permutations are drawn with
        _permutation_subjects_ktst(y) and evaluated through
        compute_null_distribution_given_permutations; otherwise
        compute_null_distribution is used.
    verbose: bool
        Verbosity

    Returns:
    -------
    mmd2u: float
        MMD^2_u value.
    mmd2u_null: array
        Null distribution of the MMD^2_u
    p_value: float
        p-value, never reported below 1 / iterations.

    Raises:
    ------
    AssertionError
        If y does not contain exactly two distinct labels.
    """
    labels = np.asarray(y)
    classes = np.unique(labels)
    # Explicit raise instead of `assert`, so the check survives `python -O`
    # while callers still see the same exception type.
    if len(classes) != 2:
        raise AssertionError('KTST only works on binary problems')

    # Class sizes are counted from the two labels actually present, not
    # from hard-coded 0/1, so e.g. {-1, 1} or {1, 2} labels work as well.
    m = int(np.count_nonzero(labels == classes[0]))
    n = int(np.count_nonzero(labels == classes[1]))

    mmd2u = MMD2u(K, m, n)
    if verbose:
        print("MMD^2_u = %s" % mmd2u)
        print("Computing the null distribution.")

    if subjects:
        # The caller's original y is forwarded untouched to the helper.
        perms = [_permutation_subjects_ktst(y) for _ in range(iterations)]
        mmd2u_null = compute_null_distribution_given_permutations(
            K, m, n, perms, iterations)
    else:
        mmd2u_null = compute_null_distribution(K, m, n, iterations,
                                               verbose=verbose)

    # Fraction of null values above the observed statistic, floored at the
    # resolution of the permutation test.
    p_value = max(1.0/iterations,
                  (mmd2u_null > mmd2u).sum() / float(iterations))
    if verbose:
        print("p-value ~= %s \t (resolution : %s)" % (p_value,
                                                      1.0/iterations))

    return mmd2u, mmd2u_null, p_value
def MMD_single_modality(data_b6, data_btbr, modality='Structural',
                        iterations=100000, plot=True):
    """
    Process the data with the following approach:
    Embedding + RBF_kernel + KTST

    Parameters:
    -----------
    data_b6: array like
        Vectors of the B6 class (one sample per row).
    data_btbr: array like
        Vectors of the BTBR class (one sample per row).
    modality: str
        Name of the modality; only used in printed messages and in the
        plot legend.
    iterations: int
        Number of values used to approximate the null distribution; it
        also sets the resolution (1 / iterations) of the p-value.
    plot: bool
        If True, show the kernel matrix (plot_similarity_matrix) and a
        histogram of the null distribution with the observed MMD^2_u
        marked and annotated with its p-value.

    Return:
    ----------
    mmd2u: float
        MMD^2_u value.
    mmd2u_null: array
        Null distribution of the MMD^2_u.
    p_value: float
        p-value, never reported below 1 / iterations.
    """
    print('Analyzing %s data' % modality)

    # Concatenating the data: B6 rows first, BTBR rows after.
    vectors = np.vstack((data_b6, data_btbr))
    n_b6 = len(data_b6)
    n_btbr = len(data_btbr)

    # RBF kernel whose gamma is the inverse of the squared median pairwise
    # Euclidean distance between all samples.
    sigma2 = np.median(pairwise_distances(vectors, metric='euclidean'))**2
    k_matrix = pairwise_kernels(vectors, metric='rbf', gamma=1.0/sigma2)

    if plot:
        plot_similarity_matrix(k_matrix)

    # Computing the MMD
    mmd2u = MMD2u(k_matrix, n_b6, n_btbr)
    print("MMD^2_u = %s" % mmd2u)

    # Computing the null-distribution
    mmd2u_null = compute_null_distribution(k_matrix, n_b6, n_btbr,
                                           iterations, seed=123,
                                           verbose=False)
    print(np.max(mmd2u_null))

    # Computing the p-value
    p_value = max(1.0/iterations,
                  (mmd2u_null > mmd2u).sum() / float(iterations))
    print("p-value ~= %s \t (resolution : %s)" % (p_value, 1.0/iterations))
    print('Number of stds from MMD^2_u to mean value of null distribution: %s'
          % ((mmd2u - np.mean(mmd2u_null)) / np.std(mmd2u_null)))

    if plot:
        fig = plt.figure()
        ax = fig.add_subplot(111)
        # `density=True` replaces the `normed=True` keyword that newer
        # matplotlib releases no longer accept.
        prob, bins, patches = ax.hist(mmd2u_null, bins=50, density=True)
        # Star marker at the observed statistic, placed low in the plot.
        ax.plot(mmd2u, prob.max()/30, 'w*', markersize=15,
                markeredgecolor='k', markeredgewidth=2,
                label="$%s MMD^2_u = %s$" % (modality, mmd2u))
        ax.annotate('p-value: %s' % (p_value),
                    xy=(float(mmd2u), prob.max()/9.), xycoords='data',
                    xytext=(-105, 30), textcoords='offset points',
                    bbox=dict(boxstyle="round", fc="1."),
                    arrowprops=dict(
                        arrowstyle="->",
                        connectionstyle="angle,angleA=0,angleB=90,rad=10"),
                    )
        ax.set_xlabel('$MMD^2_u$')
        ax.set_ylabel('$p(MMD^2_u)$')
        ax.legend(numpoints=1)

    print('')
    # The docstring has always promised these values; actually return them.
    return mmd2u, mmd2u_null, p_value
def compute_mmd_struc_func(k_mat, struc_b6, struc_btbr, func_b6, func_btbr,
                           iterations=100000, plot=True):
    """
    Computes the mmd values for the structural and functional problems and
    plot them with the null distributions.

    Both observed values are compared against a single null distribution
    computed on the whole kernel matrix.

    Parameters:
    ----------
    k_mat: ndarray
        Kernel matrix. The structural samples (B6 then BTBR) must occupy
        the leading rows/columns and the functional samples (B6 then BTBR)
        the remaining ones, because the matrix is split at
        len(struc_b6) + len(struc_btbr).
    struc_b6: array like
        Structural vectors for B6 class
    struc_btbr: array like
        Structural vectors for BTBR class
    func_b6: array like
        Functional vectors for B6 class
    func_btbr: array like
        Functional vectors for BTBR class
    iterations: int
        Number of values used to approximate the null distribution; it
        also sets the resolution (1 / iterations) of the p-values.
    plot: bool
        If True, draw the histogram of the null distribution with both
        observed MMD values marked and annotated with their p-values.
    """
    n_struc_b6, n_struc_btbr = len(struc_b6), len(struc_btbr)
    n_func_b6, n_func_btbr = len(func_b6), len(func_btbr)

    # Computing the number of samples belonging to structural data in order
    # to split the kernel matrix.
    l_struc = n_struc_b6 + n_struc_btbr

    # Computing MMD values on the structural and functional diagonal blocks.
    struc_mmd = MMD2u(k_mat[:l_struc][:, :l_struc], n_struc_b6, n_struc_btbr)
    func_mmd = MMD2u(k_mat[l_struc:][:, l_struc:], n_func_b6, n_func_btbr)
    print("struc_mmd = %s, func_mmd = %s" % (struc_mmd, func_mmd))

    # Computing the null-distribution on the full matrix, with class sizes
    # pooled over both modalities.
    mmd2u_null_all = compute_null_distribution(
        k_mat,
        n_struc_b6 + n_func_b6,
        n_struc_btbr + n_func_btbr,
        iterations,
        seed=123,
        verbose=False,
    )

    # Computing the p-values
    struc_p_value = max(1.0 / iterations,
                        (mmd2u_null_all > struc_mmd).sum() / float(iterations))
    print("struc_p-value ~= %s \t (resolution : %s)"
          % (struc_p_value, 1.0 / iterations))
    func_p_value = max(1.0 / iterations,
                       (mmd2u_null_all > func_mmd).sum() / float(iterations))
    print("func_p-value ~= %s \t (resolution : %s)"
          % (func_p_value, 1.0 / iterations))

    if not plot:
        return

    fig = plt.figure()
    ax = fig.add_subplot(111)
    # `density=True` replaces the `normed=True` keyword that newer
    # matplotlib releases no longer accept.
    prob, bins, patches = ax.hist(mmd2u_null_all, bins=50, density=True)

    # Star marks the structural value, triangle the functional one.
    marker_style = dict(markersize=15, markeredgecolor="k", markeredgewidth=2)
    ax.plot(struc_mmd, prob.max() / 30, "w*",
            label="$MMD^2_S = %s$" % struc_mmd, **marker_style)
    ax.plot(func_mmd, prob.max() / 30, "w^",
            label="$MMD^2_F = %s$" % func_mmd, **marker_style)
    ax.set_xlabel("$MMD^2_u$")
    ax.set_ylabel("$p(MMD^2_u)$")

    # Both arrows point at a fixed height (4.0 in data coordinates); the
    # text boxes are offset to opposite sides so they do not overlap.
    ax.annotate(
        "p-value: %s" % (struc_p_value),
        xy=(float(struc_mmd), 4.0),
        xycoords="data",
        xytext=(-105, 30),
        textcoords="offset points",
        bbox=dict(boxstyle="round", fc="1."),
        arrowprops=dict(arrowstyle="->",
                        connectionstyle="angle,angleA=0,angleB=90,rad=10"),
    )
    ax.annotate(
        "p-value: %s" % (func_p_value),
        xy=(float(func_mmd), 4.0),
        xycoords="data",
        xytext=(10, 30),
        textcoords="offset points",
        bbox=dict(boxstyle="round", fc="1."),
        arrowprops=dict(arrowstyle="->",
                        connectionstyle="angle,angleA=0,angleB=90,rad=10"),
    )
    ax.legend(numpoints=1)
def compute_mmd2u_and_null_distributions(Ks, m, n, iterations=1000, seed=0,
                                         parallel=True, permutation=None,
                                         n_jobs=-1, verbose=False):
    """Return the scaled MMD2u statistic and its null distribution per unit.

    Each kernel matrix in Ks is one unit. The statistic stored for a unit
    is (n + m) * MMD2u(K, m, n); the null-distribution values are stored
    exactly as the helper functions return them. Each null distribution is
    approximated with `iterations` values.

    When `permutation` is None the null distribution comes from
    compute_null_distribution, called with the SAME `seed` for every unit.
    Otherwise the precomputed permutations (array of size
    iterations x (m+n)) are handed to
    compute_null_distribution_given_permutations, which enforces
    reproducibility and keeps the desired permutation schema for each
    kernel/unit. This is important during parallel computation.

    With parallel=True the per-unit work is dispatched through
    Parallel/delayed using n_jobs workers (n_jobs=-1 means 'use all
    available cores'); `verbose` is only forwarded in the sequential path.

    Returns a tuple (unit_statistic, unit_statistic_permutation): one
    statistic per unit, and one row of null values per unit.
    """
    n_units = len(Ks)
    unit_statistic = np.zeros(n_units)
    # Allocated up front for both code paths; the parallel path rebinds it.
    unit_statistic_permutation = np.zeros((n_units, iterations))

    print("Computing MMD2u for each unit.")
    for unit, kernel in enumerate(Ks):
        raw_mmd2u = MMD2u(kernel, m, n)
        # For using asymptotic distribution of MMD the statistic is
        # (n+m)*mmd2u
        unit_statistic[unit] = (n + m) * raw_mmd2u

    print("Computing MMD2u's null-distribution, for each unit.")
    if parallel:
        print("Parallel computation!")
        if permutation is None:
            # NOTE: IT IS FUNDAMENTAL THAT THE SAME SEED IS USED FOR EACH
            # UNIT!
            tasks = (
                delayed(compute_null_distribution)(
                    K, m, n, iterations=iterations, verbose=False, seed=seed)
                for K in Ks
            )
        else:
            tasks = (
                delayed(compute_null_distribution_given_permutations)(
                    K, m, n, permutation, iterations=iterations)
                for K in Ks
            )
        results = Parallel(n_jobs=n_jobs, verbose=10)(tasks)
        # One row per unit, in the order of Ks.
        unit_statistic_permutation = np.vstack(results)
    else:
        for unit, kernel in enumerate(Ks):
            if permutation is None:
                # NOTE: IT IS FUNDAMENTAL THAT THE SAME SEED IS USED FOR
                # EACH UNIT!
                unit_null = compute_null_distribution(
                    kernel, m, n, iterations=iterations, verbose=verbose,
                    seed=seed, marker_interval=100)
            else:
                unit_null = compute_null_distribution_given_permutations(
                    kernel, m, n, permutation, iterations=iterations)
            unit_statistic_permutation[unit, :] = unit_null

    return unit_statistic, unit_statistic_permutation