def apply_ktst(K, y, iterations=10000, subjects=False, verbose=True):
    """
    Compute MMD^2_u, its null distribution and the p-value of the
    kernel two-sample test.

    Parameters:
    ----------
    K: array-like
        Kernel matrix. Rows/columns are assumed to be ordered by class:
        the first m belong to one class and the remaining n to the other.
    y: array_like
        Class labels. Must contain exactly two distinct values; they do
        not have to be 0 and 1.
    iterations: int
        Number of permutations used to build the null distribution. It
        also sets the p-value resolution (1 / iterations).
    subjects: bool
        If True, permutations are drawn with permutation_subjects_ktst(y)
        and handed to compute_null_distribution_given_permutations;
        otherwise compute_null_distribution is used.
    verbose: bool
        Verbosity

    Returns:
    -------
    mmd2u: float
        MMD^2_u value.
    acc_null: array
        Null distribution of the MMD^2_u
    p_value: float
        p-value
    """
    labels = np.asarray(y)
    classes = np.unique(labels)
    assert len(classes) == 2, 'KTST only works on binary problems'

    # Assuming that the first m rows of the kernel matrix are from one
    # class and the other n rows from the second class.
    # The counts come from the two labels actually present (np.unique
    # returns them sorted), so encodings such as -1/+1 work as well as 0/1.
    m = int(np.sum(labels == classes[0]))
    n = int(np.sum(labels == classes[1]))

    mmd2u = MMD2u(K, m, n)
    if verbose:
        print("MMD^2_u = %s" % mmd2u)
        print("Computing the null distribution.")

    if subjects:
        # The caller's original `y` object is passed on untouched.
        perms = [permutation_subjects_ktst(y) for _ in range(iterations)]
        mmd2u_null = compute_null_distribution_given_permutations(
            K, m, n, perms, iterations)
    else:
        mmd2u_null = compute_null_distribution(K, m, n, iterations,
                                               verbose=verbose)

    # The estimate is floored at 1 / iterations so it is never exactly zero.
    resolution = 1.0 / iterations
    p_value = max(resolution,
                  (mmd2u_null > mmd2u).sum() / float(iterations))
    if verbose:
        print("p-value ~= %s \t (resolution : %s)" % (p_value, resolution))

    return mmd2u, mmd2u_null, p_value
def apply_ktst(K, y, iterations=10000, subjects=False, verbose=True):
    """
    Compute MMD^2_u, its null distribution and the p-value of the
    kernel two-sample test.

    Parameters:
    ----------
    K: array-like
        Kernel matrix. Rows/columns are assumed to be ordered by class:
        the first m belong to one class and the remaining n to the other.
    y: array_like
        Class labels. Must contain exactly two distinct values; they do
        not have to be 0 and 1.
    iterations: int
        Number of permutations used to build the null distribution. It
        also sets the p-value resolution (1 / iterations).
    subjects: bool
        If True, permutations are drawn with permutation_subjects_ktst(y)
        and handed to compute_null_distribution_given_permutations;
        otherwise compute_null_distribution is used.
    verbose: bool
        Verbosity

    Returns:
    -------
    mmd2u: float
        MMD^2_u value.
    acc_null: array
        Null distribution of the MMD^2_u
    p_value: float
        p-value
    """
    labels = np.asarray(y)
    classes = np.unique(labels)
    assert len(classes) == 2, 'KTST only works on binary problems'

    # Assuming that the first m rows of the kernel matrix are from one
    # class and the other n rows from the second class.
    # The counts come from the two labels actually present (np.unique
    # returns them sorted), so encodings such as -1/+1 work as well as 0/1.
    m = int(np.sum(labels == classes[0]))
    n = int(np.sum(labels == classes[1]))

    mmd2u = MMD2u(K, m, n)
    if verbose:
        print("MMD^2_u = %s" % mmd2u)
        print("Computing the null distribution.")

    if subjects:
        # The caller's original `y` object is passed on untouched.
        perms = [permutation_subjects_ktst(y) for _ in range(iterations)]
        mmd2u_null = compute_null_distribution_given_permutations(
            K, m, n, perms, iterations)
    else:
        mmd2u_null = compute_null_distribution(K, m, n, iterations,
                                               verbose=verbose)

    # The estimate is floored at 1 / iterations so it is never exactly zero.
    resolution = 1.0 / iterations
    p_value = max(resolution,
                  (mmd2u_null > mmd2u).sum() / float(iterations))
    if verbose:
        print("p-value ~= %s \t (resolution : %s)" % (p_value, resolution))

    return mmd2u, mmd2u_null, p_value
X = np.vstack([A, B]) y = np.concatenate([np.zeros(nA), np.ones(nB)]) distances = pairwise_distances(X, metric='euclidean') sigma2 = np.median(distances)**2.0 K = np.exp(-distances * distances / sigma2) # K = X.dot(X.T) iterations = 10000 mmd2u_unpermuted = MMD2u(K, nA, nB) print("mmd2u: %s" % mmd2u_unpermuted) mmd2us[r] = mmd2u_unpermuted mmd2us_null = compute_null_distribution(K, nA, nB, iterations, random_state=rng_ktst) p_value_mmd2u = estimate_pvalue(mmd2u_unpermuted, mmd2us_null) print("mmd2u p-value: %s" % p_value_mmd2u) p_value_mmd2us[r] = p_value_mmd2u scoring = 'accuracy' n_folds = 5 iterations = 1 # score_unpermuted = compute_svm_score_nestedCV(K, y, n_folds, # scoring=scoring, # random_state=rng_cv) rngs = [ np.random.RandomState(rng_cv.randint(low=MIN_INT, high=MAX_INT))
# Draw nB samples for the second group via rng_data.multivariate_normal
# with mean muB and covariance covB (rng_data is presumably a NumPy
# RandomState -- confirm against the code that creates it).
B = rng_data.multivariate_normal(muB, covB, size=nB)
# Stack the two samples row-wise, A first and B second, and build matching
# labels: nA zeros followed by nB ones (assumes A has nA rows -- A is
# defined outside this fragment).
X = np.vstack([A, B])
y = np.concatenate([np.zeros(nA), np.ones(nB)])
# Gaussian kernel on the Euclidean distances. sigma2 is the squared median
# of ALL entries of the pairwise distance matrix.
distances = pairwise_distances(X, metric='euclidean')
sigma2 = np.median(distances) ** 2.0
K = np.exp(- distances * distances / sigma2)
# K = X.dot(X.T)
# --- Kernel two-sample test ---
# `iterations` is passed to compute_null_distribution as the size of the
# null distribution here; the same name is re-assigned further down.
iterations = 10000
mmd2u_unpermuted = MMD2u(K, nA, nB)
print("mmd2u: %s" % mmd2u_unpermuted)
# Store the observed statistic at index r (r comes from outside this
# fragment; presumably the repetition index -- confirm).
mmd2us[r] = mmd2u_unpermuted
mmd2us_null = compute_null_distribution(K, nA, nB, iterations,
                                        random_state=rng_ktst)
p_value_mmd2u = estimate_pvalue(mmd2u_unpermuted, mmd2us_null)
print("mmd2u p-value: %s" % p_value_mmd2u)
p_value_mmd2us[r] = p_value_mmd2u
# --- SVM score on the same kernel matrix ---
scoring = 'accuracy'
n_folds = 5
# From here on `iterations` is the number of nested-CV evaluations whose
# scores are averaged below (1 means a single evaluation).
iterations = 1
# score_unpermuted = compute_svm_score_nestedCV(K, y, n_folds,
#                                               scoring=scoring,
#                                               random_state=rng_cv)
# One independent RandomState per evaluation, each seeded from rng_cv.
rngs = [np.random.RandomState(rng_cv.randint(low=MIN_INT, high=MAX_INT))
        for i in range(iterations)]
# Run the evaluations through Parallel/delayed (presumably joblib) and
# collect one score per evaluation.
scores_unpermuted = Parallel(n_jobs=-1)(
    delayed(compute_svm_score_nestedCV)(K, y, n_folds, scoring, rngs[i],
                                        param_grid=svm_param_grid)
    for i in range(iterations))
score_unpermuted = np.mean(scores_unpermuted)
print("accuracy: %s" % score_unpermuted)
def compute_mmd_struc_func(k_mat, struc_b6, struc_btbr, func_b6, func_btbr,
                           iterations=100000):
    """
    Computes the mmd values for the structural and functional problems and
    plot them with the null distributions.

    Both observed values are compared against a single null distribution
    computed on the full kernel matrix.

    Parameters:
    ----------
    k_mat: ndarray
           Kernel matrix. The first len(struc_b6) + len(struc_btbr)
           rows/columns are used as the structural block and the rest as
           the functional block.
    struc_b6: array like
           Structural vectors for B6 class
    struc_btbr: array like
           Structural vectors for BTBR class
    func_b6: array like
           Functional vectors for B6 class
    func_btbr: array like
           Functional vectors for BTBR class
    iterations: int
           Number of permutations for the null distribution. It also sets
           the p-value resolution (1 / iterations).
    """
    n_struc_b6, n_struc_btbr = len(struc_b6), len(struc_btbr)
    n_func_b6, n_func_btbr = len(func_b6), len(func_btbr)

    # Computing the number of samples belonging to structural data in order
    # to split the kernel matrix.
    l_struc = n_struc_b6 + n_struc_btbr

    # Computing MMD values on the two diagonal blocks of the kernel matrix.
    struc_mmd = MMD2u(k_mat[:l_struc][:, :l_struc],
                      n_struc_b6, n_struc_btbr)
    func_mmd = MMD2u(k_mat[l_struc:][:, l_struc:],
                     n_func_b6, n_func_btbr)
    # Function-call form of print: valid on Python 3 and prints the same
    # text on Python 2.
    print("struc_mmd = %s, func_mmd = %s" % (struc_mmd, func_mmd))

    # Computing the null-distribution on the whole kernel matrix, with the
    # B6 and BTBR counts pooled over both modalities.
    mmd2u_null_all = compute_null_distribution(
        k_mat, n_struc_b6 + n_func_b6, n_struc_btbr + n_func_btbr,
        iterations, seed=123, verbose=False)

    # Computing the p-values; each is floored at 1 / iterations so it is
    # never exactly zero.
    resolution = 1.0 / iterations
    struc_p_value = max(resolution,
                        (mmd2u_null_all > struc_mmd).sum()
                        / float(iterations))
    print("struc_p-value ~= %s \t (resolution : %s)"
          % (struc_p_value, resolution))

    func_p_value = max(resolution,
                       (mmd2u_null_all > func_mmd).sum()
                       / float(iterations))
    print("func_p-value ~= %s \t (resolution : %s)"
          % (func_p_value, resolution))

    fig = plt.figure()
    ax = fig.add_subplot(111)
    # density=True replaces the former normed=True keyword, which current
    # Matplotlib releases no longer accept.
    prob, _, _ = plt.hist(mmd2u_null_all, bins=50, density=True)

    # Markers for the observed values, drawn just above the x axis.
    ax.plot(struc_mmd, prob.max() / 30, 'w*', markersize=15,
            markeredgecolor='k', markeredgewidth=2,
            label="$Structural MMD^2_u = %s$" % struc_mmd)
    ax.plot(func_mmd, prob.max() / 30, 'w^', markersize=15,
            markeredgecolor='k', markeredgewidth=2,
            label="$Functional MMD^2_u = %s$" % func_mmd)

    plt.xlabel('$MMD^2_u$')
    plt.ylabel('$p(MMD^2_u)$')
    plt.title('$MMD^2_u$: null-distribution and observed values')

    # Annotate each observed value with its p-value. The arrows point at
    # the fixed data height y = 4.
    ax.annotate(
        'p-value: %s' % (struc_p_value),
        xy=(float(struc_mmd), 4.), xycoords='data',
        xytext=(-105, 30), textcoords='offset points',
        bbox=dict(boxstyle="round", fc="1."),
        arrowprops=dict(arrowstyle="->",
                        connectionstyle="angle,angleA=0,angleB=90,rad=10"),
    )
    ax.annotate(
        'p-value: %s' % (func_p_value),
        xy=(float(func_mmd), 4.), xycoords='data',
        xytext=(10, 30), textcoords='offset points',
        bbox=dict(boxstyle="round", fc="1."),
        arrowprops=dict(arrowstyle="->",
                        connectionstyle="angle,angleA=0,angleB=90,rad=10"),
    )

    plt.legend(numpoints=1)