def power_given_epsilon(independence_test, simulation_type, epsilon, repeats=1000, alpha=.05, additional_params=None):
    """
    Estimate the empirical power of ``independence_test`` on a three-sample problem.

    In every repetition three samples are drawn with ``generate_three_two_d_gaussians``,
    stacked, labelled 1/2/3 and passed through ``k_sample_transform`` before the test is run.

    :param independence_test: object exposing ``test_statistic(U, V, **params)`` and
        ``p_value(U, V, **params)``; only the first element of each returned pair is used
    :param simulation_type: forwarded to ``generate_three_two_d_gaussians``
    :param epsilon: forwarded to ``generate_three_two_d_gaussians``
    :param repeats: number of simulated data sets
    :param alpha: significance level
    :param additional_params: optional keyword arguments forwarded to the test; a truthy
        ``"is_fast"`` entry switches to counting p-values directly instead of estimating a cutoff
    :return: fraction of repetitions in which the null hypothesis is rejected
    """
    # ``None`` replaces the former shared mutable ``{}`` default; copying leaves the caller's dict alone
    additional_params = dict(additional_params) if additional_params else {}
    # ``.get`` keeps extra parameters without an "is_fast" key from raising KeyError
    use_fast = bool(additional_params.get("is_fast"))

    # test statistics under the null, used to estimate the cutoff value under the null distribution
    test_stats_null = np.zeros(repeats)
    # test statistics under the alternative
    test_stats_alternative = np.zeros(repeats)
    # direct p values on permutation (only filled on the fast path)
    p_values = np.zeros(repeats)

    for rep in range(repeats):
        # NOTE(review): the literal 100 is presumably the sample size per Gaussian - confirm
        matrix_X, matrix_Y, matrix_Z = generate_three_two_d_gaussians(
            epsilon, 100, simulation_type)
        data = np.concatenate([matrix_X, matrix_Y, matrix_Z], axis=0)
        labels = np.concatenate([
            np.repeat(1, matrix_X.shape[0]),
            np.repeat(2, matrix_Y.shape[0]),
            np.repeat(3, matrix_Z.shape[0]),
        ], axis=0).reshape(-1, 1)
        matrix_U, matrix_V = k_sample_transform(data, labels, is_y_categorical=True)

        if use_fast:
            p_values[rep], _ = independence_test.p_value(
                matrix_U, matrix_V, **additional_params)
        else:
            # shuffling V supplies one draw from the null distribution
            permuted_V = np.random.permutation(matrix_V)
            test_stats_null[rep], _ = independence_test.test_statistic(
                matrix_U, permuted_V, **additional_params)
            test_stats_alternative[rep], _ = independence_test.test_statistic(
                matrix_U, matrix_V, **additional_params)

    if use_fast:
        return np.count_nonzero(p_values <= alpha) / repeats

    # the cutoff is determined so that 1-alpha of the test statistics under the null
    # distribution is less than the cutoff; the index is clamped to the last element so a
    # very small alpha (ceil(...) == repeats) no longer reads past the end of the array
    cutoff_index = min(math.ceil(repeats * (1 - alpha)), repeats - 1)
    cutoff = np.sort(test_stats_null)[cutoff_index]

    # the proportion of test statistics under the alternative which is no less than the
    # cutoff (in which case the null is rejected) is the empirical power
    return np.count_nonzero(test_stats_alternative >= cutoff) / repeats
def test_k_sample():
    """Check MGC p-values after ``k_sample_transform`` on salary data and on fixed draws."""
    np.random.seed(1234)

    salary_table = pd.read_csv("./mgcpy/hypothesis_tests/salary_data.csv")
    salary_column = "Current Annual Salary"

    # --- two samples: salaries of men vs. salaries of women ---
    is_male = salary_table['Gender'] == "M"
    is_female = salary_table['Gender'] == "F"
    male_pay = salary_table.loc[is_male, salary_column].values
    female_pay = salary_table.loc[is_female, salary_column].values
    # the men are resampled before the women, which fixes the order of RNG consumption
    male_draw = np.random.choice(male_pay, 1000)
    female_draw = np.random.choice(female_pay, 1000)
    u, v = k_sample_transform(male_draw, female_draw)
    fast_p, _ = MGC().p_value(u, v, is_fast=True)
    assert np.allclose(fast_p, 0.0, atol=0.01)

    # --- k samples: first 100 salaries grouped by department ---
    all_pay = salary_table[salary_column].values
    departments = salary_table["Department"].values.reshape(-1, 1)
    u, v = k_sample_transform(all_pay[:100], departments[:100], is_y_categorical=True)
    dept_p, _ = MGC().p_value(u, v)
    assert np.allclose(dept_p, 0.0, atol=0.01)

    # --- two samples where H_0 is valid ---
    # 100 fixed draws from one distribution (originally x = np.random.randn(100))
    x = np.array([
        0.34270011, 1.30064541, -0.41888945, 1.40367111, 0.31901975,
        -1.83695735, -0.70370144, 0.89338428, 0.86047303, -0.98841287,
        0.78325279, 0.55864254, 0.33317265, 2.22286831, -0.22349382,
        0.40376754, -1.05356267, 0.54994568, -1.39765046, 0.41427267,
        -0.24457334, 0.2464725, -0.32179342, -1.77106008, -0.52824522,
        1.57839019, -1.66455582, -0.97663735, -0.55176702, -1.95347702,
        1.01934119, 1.05765468, -0.69941067, -1.12479123, 0.85236935,
        -0.77356459, 0.30217738, 0.95246919, -0.61210025, 1.09253269,
        0.13576324, 0.62642456, 0.1859519, 0.32209166, 1.98633424,
        -0.57271182, 1.18247811, 2.05352048, -0.28297455, 0.25754106,
        0.80790087, -0.26995007, 1.8223331, -1.80151834, 0.71496981,
        -0.5119113, -1.45558062, 1.24115387, 1.44295579, -0.24726018,
        -2.07078337, 1.90810404, -1.36892494, -0.39004086, 1.35998082,
        1.50891149, -1.29257757, 0.05513461, -1.58889596, 0.48703248,
        0.83443891, 0.46234541, 2.20457643, 1.47884097, -0.05316384,
        0.72591566, 0.14339927, -1.29137912, 0.07908333, 0.80684167,
        0.22417797, 0.45241074, -1.03024521, 0.6615743, 0.27216365,
        2.4188678, 0.20561134, 0.71095061, -1.02478312, 0.54512964,
        0.16582386, -0.39648338, -0.77905918, -0.33196771, 0.69407125,
        -0.81484451, 3.01568098, -0.49053868, -0.60987204, 1.72967348,
    ])
    # first half labelled as sample 1, second half as sample 2
    y = np.repeat([1, 2], 50).reshape(-1, 1)
    u, v = k_sample_transform(x, y, is_y_categorical=True)
    null_p, _ = MGC().p_value(u, v)
    assert np.allclose(null_p, 0.819, atol=0.1)
def test_local_corr():
    """Check the Manova test statistic and name on three simulated 2-D Gaussian samples."""
    np.random.seed(0)

    samples = generate_three_two_d_gaussians(2, 100, 3)
    matrix_X, matrix_Y, matrix_Z = samples

    # stack the three samples and label their rows 1, 2 and 3 respectively
    data = np.concatenate([matrix_X, matrix_Y, matrix_Z], axis=0)
    group_sizes = [matrix_X.shape[0], matrix_Y.shape[0], matrix_Z.shape[0]]
    labels = np.repeat([1, 2, 3], group_sizes).reshape(-1, 1)

    matrix_U, matrix_V = k_sample_transform(data, labels, is_y_categorical=True)

    manova = Manova()
    result = manova.test_statistic(matrix_U, matrix_V)
    test_stat = result[0]

    assert manova.get_name() == 'manova'
    assert np.allclose(test_stat, 0.06, atol=1.e-2)
def power_scipy(base_path, simulation_type, num_samples, repeats=1000, alpha=.05):
    """
    Estimate empirical power of ``multiscale_graphcorr`` on stored benchmark data.

    Each repetition pairs a data set with a copy rotated by 60 degrees, applies
    ``k_sample_transform``, fits a 500-tree ``RandomForestRegressor`` and feeds a
    matrix derived from ``proximityMatrix`` into ``multiscale_graphcorr``.

    :param base_path: directory containing ``sample_data_power_sample_sizes``
    :param simulation_type: simulation identifier used in the data file names
    :param num_samples: sample size used in the data file names
    :param repeats: number of stored repetitions to evaluate
    :param alpha: significance level
    :return: fraction of repetitions whose p-value is at most ``alpha``
    """
    # one p-value per repetition
    p_values = np.zeros(repeats)

    # common prefix of the two .mat files for this simulation type and sample size
    data_stem = os.path.join(
        base_path,
        'sample_data_power_sample_sizes/type_{}_size_{}'.format(simulation_type, num_samples))
    x_stack = scipy.io.loadmat(data_stem + '_X.mat')['x_mtx'][..., np.newaxis]
    y_stack = scipy.io.loadmat(data_stem + '_Y.mat')['y_mtx'][..., np.newaxis]

    # 2x2 matrix for the 60 degree rotation
    angle = math.radians(60)
    cos_a, sin_a = np.cos(angle), np.sin(angle)
    rotation = np.array([[cos_a, sin_a], [-sin_a, cos_a]])

    for rep in range(repeats):
        # two-sample problem: the data set versus its rotated copy
        original = np.concatenate([x_stack[rep, :, :], y_stack[rep, :, :]], axis=1)
        rotated = np.dot(rotation, original.T).T
        matrix_U, matrix_V = k_sample_transform(original, rotated)

        forest = RandomForestRegressor(n_estimators=500)
        forest.fit(matrix_U, matrix_V.reshape(-1))

        # NOTE(review): presumably converts forest proximity into a distance - confirm
        matrix_U = 1 - proximityMatrix(forest, matrix_U, normalize=True)
        matrix_U = np.power(matrix_U, 0.5)

        p_values[rep] = multiscale_graphcorr(matrix_U, matrix_V).pvalue

    rejections = np.count_nonzero(p_values <= alpha)
    return rejections / repeats
def power_given_data(base_path, independence_test, simulation_type, num_samples, repeats=1000, alpha=.05, additional_params=None):
    """
    Estimate the empirical power of ``independence_test`` on stored benchmark data.

    Each repetition pairs a stored data set with a copy rotated by 60 degrees and passes
    both through ``k_sample_transform`` before the test is run.

    :param base_path: directory containing ``sample_data_power_sample_sizes``
    :param independence_test: object exposing ``test_statistic(U, V, **params)``,
        ``p_value(U, V, **params)`` and ``get_name()``; only the first element of each
        returned pair is used
    :param simulation_type: simulation identifier used in the data file names
    :param num_samples: sample size used in the data file names
    :param repeats: number of stored repetitions to evaluate
    :param alpha: significance level
    :param additional_params: optional keyword arguments forwarded to the test; a truthy
        ``"is_fast"`` entry switches to counting p-values directly instead of estimating a cutoff
    :return: fraction of repetitions in which the null hypothesis is rejected
    """
    # ``None`` replaces the former shared mutable ``{}`` default; copying leaves the caller's dict alone
    additional_params = dict(additional_params) if additional_params else {}
    # ``.get`` keeps extra parameters without an "is_fast" key from raising KeyError
    use_fast = bool(additional_params.get("is_fast"))

    # test statistics under the null, used to estimate the cutoff value under the null distribution
    test_stats_null = np.zeros(repeats)
    # test statistics under the alternative
    test_stats_alternative = np.zeros(repeats)
    # direct p values on permutation (only filled on the fast path)
    p_values = np.zeros(repeats)

    # common prefix of the two .mat files for this simulation type and sample size
    file_name_prefix = os.path.join(
        base_path,
        'sample_data_power_sample_sizes/type_{}_size_{}'.format(simulation_type, num_samples))
    all_matrix_X = scipy.io.loadmat(file_name_prefix + '_X.mat')['x_mtx'][..., np.newaxis]
    all_matrix_Y = scipy.io.loadmat(file_name_prefix + '_Y.mat')['y_mtx'][..., np.newaxis]

    # rotation transform matrix (60 degrees)
    c, s = np.cos(math.radians(60)), np.sin(math.radians(60))
    rotation_matrix = np.array([[c, s], [-s, c]])

    # the test's name does not change between repetitions, so look it up once, and only
    # on the path that needs it
    is_pearson = (not use_fast) and independence_test.get_name() == 'pearson'

    for rep in range(repeats):
        matrix_X = all_matrix_X[rep, :, :]
        matrix_Y = all_matrix_Y[rep, :, :]

        # apply two sample transform: the data set versus its rotated copy
        data_matrix = np.concatenate([matrix_X, matrix_Y], axis=1)
        rotated_data_matrix = np.dot(rotation_matrix, data_matrix.T).T
        matrix_U, matrix_V = k_sample_transform(data_matrix, rotated_data_matrix)

        if use_fast:
            p_values[rep], _ = independence_test.p_value(
                matrix_U, matrix_V, **additional_params)
            continue

        # shuffling V supplies one draw from the null distribution
        permuted_V = np.random.permutation(matrix_V)
        test_stats_null[rep], _ = independence_test.test_statistic(
            matrix_U, permuted_V, **additional_params)
        test_stats_alternative[rep], _ = independence_test.test_statistic(
            matrix_U, matrix_V, **additional_params)

        # if the test is pearson, use absolute value of the test statistic
        # so the more extreme test statistic is still in a one-sided interval
        if is_pearson:
            test_stats_null[rep] = abs(test_stats_null[rep])
            test_stats_alternative[rep] = abs(test_stats_alternative[rep])

    if use_fast:
        return np.count_nonzero(p_values <= alpha) / repeats

    # the cutoff is determined so that 1-alpha of the test statistics under the null
    # distribution is less than the cutoff; the index is clamped to the last element so a
    # very small alpha (ceil(...) == repeats) no longer reads past the end of the array
    cutoff_index = min(math.ceil(repeats * (1 - alpha)), repeats - 1)
    cutoff = np.sort(test_stats_null)[cutoff_index]

    # the proportion of test statistics under the alternative which is no less than the
    # cutoff (in which case the null is rejected) is the empirical power
    return np.count_nonzero(test_stats_alternative >= cutoff) / repeats
# Profiling script: collect DCorr p-values over repeated two-sample simulations
# and plot how they are distributed.
plt.style.use("seaborn-white")
sns.set_palette("deep")

n_sims = 100  # number of simulated two-sample problems
n_samples = 100  # first dimension of each simulated sample
n_components = 2  # second dimension of each simulated sample
n_permutations = 1000  # passed to DCorr.p_value as replication_factor
size = (n_samples, n_components)

#%% mgcpy package
# Both samples come from the same Uniform(0.2, 0.7) distribution, so the
# two-sample null hypothesis holds in every simulation.
# NOTE(review): no random seed is set in the visible code, so runs are not reproducible.
p_vals = np.zeros(n_sims)
for i in tqdm(range(n_sims)):
    sample1 = np.random.uniform(0.2, 0.7, size=size)
    sample2 = np.random.uniform(0.2, 0.7, size=size)
    sample, indicator = k_sample_transform(sample1, sample2)
    test = DCorr(which_test="unbiased")
    p, p_meta = test.p_value(
        sample, indicator, replication_factor=n_permutations, is_fast=False
    )
    p_vals[i] = p

# Plot the distribution of the collected p-values and save the figure to disk.
plt.figure()
sns.distplot(p_vals)
plt.title("MGCPy DCorr, 2-sample under null, unbiased, not fast")
plt.xlabel("p-value")
plt.savefig("graspy-misc/profile_dcorr/mgcpy_dcorr.png", facecolor="w")

#%% mgcpy with is_fast=True
# p_vals = np.zeros(n_sims)
# for i in tqdm(range(n_sims)):