Пример #1
0
def power_given_epsilon(independence_test,
                        simulation_type,
                        epsilon,
                        repeats=1000,
                        alpha=.05,
                        additional_params=None):
    """Estimate the empirical power of a test on simulated three-sample data.

    Each repetition draws three samples with
    ``generate_three_two_d_gaussians(epsilon, 100, simulation_type)``, stacks
    them with integer group labels 1/2/3 and converts them with
    ``k_sample_transform(..., is_y_categorical=True)``.

    Two modes are supported:

    * fast mode (``additional_params["is_fast"]`` is truthy): the p-value from
      ``independence_test.p_value`` is recorded per repetition and the power
      is the fraction of p-values ``<= alpha``.
    * otherwise: one statistic is computed on row-permuted labels (null) and
      one on the real labels (alternative); the power is the fraction of
      alternative statistics that reach the cutoff taken from the sorted null
      statistics.

    :param independence_test: object providing ``test_statistic(U, V, **kw)``
        and, for fast mode, ``p_value(U, V, **kw)``; both must return a pair
        whose first item is the scalar of interest.
    :param simulation_type: forwarded unchanged to the data generator.
    :param epsilon: forwarded unchanged to the data generator.
    :param repeats: number of repetitions (must be at least 1).
    :param alpha: significance level.
    :param additional_params: optional dict of keyword arguments forwarded to
        the test; the ``"is_fast"`` key, if present and truthy, selects fast
        mode. ``None`` (the default) means no extra arguments.
    :return: the empirical power as a float in ``[0, 1]``.
    """
    # A fresh dict per call instead of a shared mutable default.
    if additional_params is None:
        additional_params = {}

    # .get() so that extra parameters may be supplied without an "is_fast"
    # key (plain indexing raised KeyError in that case).
    use_fast = bool(additional_params.get("is_fast"))

    # test statistics under the null, used to estimate the cutoff value under the null distribution
    test_stats_null = np.zeros(repeats)

    # test statistic under the alternative
    test_stats_alternative = np.zeros(repeats)

    # direct p values on permutation (now, only for fast_mgc)
    p_values = np.zeros(repeats)

    for rep in range(repeats):
        # NOTE(review): the literal 100 is presumably the per-group sample
        # size expected by the generator -- confirm against its definition.
        matrix_X, matrix_Y, matrix_Z = generate_three_two_d_gaussians(
            epsilon, 100, simulation_type)

        data = np.concatenate([matrix_X, matrix_Y, matrix_Z], axis=0)
        # one integer label per row: 1 for X rows, 2 for Y rows, 3 for Z rows
        labels = np.concatenate([
            np.repeat(1, matrix_X.shape[0]),
            np.repeat(2, matrix_Y.shape[0]),
            np.repeat(3, matrix_Z.shape[0])
        ],
                                axis=0).reshape(-1, 1)

        matrix_U, matrix_V = k_sample_transform(data,
                                                labels,
                                                is_y_categorical=True)

        # permutation test
        if use_fast:
            p_values[rep], _ = independence_test.p_value(
                matrix_U, matrix_V, **additional_params)
        else:
            # shuffling the rows of V breaks its pairing with U
            permuted_V = np.random.permutation(matrix_V)
            test_stats_null[rep], _ = independence_test.test_statistic(
                matrix_U, permuted_V, **additional_params)
            test_stats_alternative[rep], _ = independence_test.test_statistic(
                matrix_U, matrix_V, **additional_params)

    if use_fast:
        return np.count_nonzero(p_values <= alpha) / repeats

    # the cutoff is determined so that 1-alpha of the test statistics under the null distribution
    # is less than the cutoff; the index is clamped to the last element so a
    # very small alpha (alpha < 1 / repeats) no longer indexes past the end
    cutoff_index = min(math.ceil(repeats * (1 - alpha)), repeats - 1)
    cutoff = np.sort(test_stats_null)[cutoff_index]

    # the proportion of test statistics under the alternative which is no less than the cutoff (in which case
    # the null is rejected) is the empirical power
    return np.count_nonzero(test_stats_alternative >= cutoff) / repeats
Пример #2
0
def test_k_sample():
    """Check MGC p-values after ``k_sample_transform`` in three scenarios.

    1. two samples (men's vs. women's salaries), fast MGC;
    2. k samples (salaries labelled by department), regular MGC;
    3. two samples obtained by splitting one fixed sample in half, where a
       large p-value is expected.
    """
    np.random.seed(1234)

    # salary table shared by the first two scenarios
    salary_table = pd.read_csv("./mgcpy/hypothesis_tests/salary_data.csv")
    salary_column = "Current Annual Salary"

    # --- scenario 1: two samples ---
    is_male = salary_table['Gender'] == "M"
    male_pay = salary_table.loc[is_male][salary_column].values
    is_female = salary_table['Gender'] == "F"
    female_pay = salary_table.loc[is_female][salary_column].values
    # draw 1000 salaries from each group (men first, so the RNG order is kept)
    male_draw = np.random.choice(male_pay, 1000)
    female_draw = np.random.choice(female_pay, 1000)
    u, v = k_sample_transform(male_draw, female_draw)
    two_sample_p, _ = MGC().p_value(u, v, is_fast=True)
    assert np.allclose(two_sample_p, 0.0, atol=0.01)

    # --- scenario 2: k samples ---
    all_pay = salary_table[salary_column].values
    departments = salary_table["Department"].values.reshape(-1, 1)
    u, v = k_sample_transform(all_pay[:100], departments[:100], is_y_categorical=True)
    k_sample_p, _ = MGC().p_value(u, v)
    assert np.allclose(k_sample_p, 0.0, atol=0.01)

    # --- scenario 3: two samples, H_0 holds ---

    # 100 fixed draws from a single distribution (originally np.random.randn(100))
    fixed_draws = np.array([
        0.34270011,  1.30064541, -0.41888945,  1.40367111,  0.31901975, -1.83695735, -0.70370144,  0.89338428,  0.86047303, -0.98841287,
        0.78325279,  0.55864254,  0.33317265,  2.22286831, -0.22349382, 0.40376754, -1.05356267,  0.54994568, -1.39765046,  0.41427267,
        -0.24457334,  0.2464725, -0.32179342, -1.77106008, -0.52824522, 1.57839019, -1.66455582, -0.97663735, -0.55176702, -1.95347702,
        1.01934119,  1.05765468, -0.69941067, -1.12479123,  0.85236935, -0.77356459,  0.30217738,  0.95246919, -0.61210025,  1.09253269,
        0.13576324,  0.62642456,  0.1859519,  0.32209166,  1.98633424, -0.57271182,  1.18247811,  2.05352048, -0.28297455,  0.25754106,
        0.80790087, -0.26995007,  1.8223331, -1.80151834,  0.71496981, -0.5119113, -1.45558062,  1.24115387,  1.44295579, -0.24726018,
        -2.07078337,  1.90810404, -1.36892494, -0.39004086,  1.35998082, 1.50891149, -1.29257757,  0.05513461, -1.58889596,  0.48703248,
        0.83443891,  0.46234541,  2.20457643,  1.47884097, -0.05316384, 0.72591566,  0.14339927, -1.29137912,  0.07908333,  0.80684167,
        0.22417797,  0.45241074, -1.03024521,  0.6615743,  0.27216365, 2.4188678,  0.20561134,  0.71095061, -1.02478312,  0.54512964,
        0.16582386, -0.39648338, -0.77905918, -0.33196771,  0.69407125, -0.81484451,  3.01568098, -0.49053868, -0.60987204,  1.72967348,
    ])
    # first 50 draws are labelled as sample 1, the remaining 50 as sample 2
    half_labels = np.repeat([1, 2], 50).reshape(-1, 1)

    u, v = k_sample_transform(fixed_draws, half_labels, is_y_categorical=True)
    null_p, _ = MGC().p_value(u, v)
    assert np.allclose(null_p, 0.819, atol=0.1)
Пример #3
0
def test_local_corr():
    """Check the Manova test statistic on three simulated samples.

    Despite its name, this test exercises ``Manova``: it verifies the reported
    test name and that the statistic is within 0.01 of 0.06 for seed 0.
    """
    np.random.seed(0)
    groups = generate_three_two_d_gaussians(2, 100, 3)
    matrix_X, matrix_Y, matrix_Z = groups

    # stack the three samples row-wise and label their rows 1, 2 and 3
    stacked = np.concatenate([matrix_X, matrix_Y, matrix_Z], axis=0)
    label_parts = [
        np.repeat(group_id, group.shape[0])
        for group_id, group in enumerate((matrix_X, matrix_Y, matrix_Z), start=1)
    ]
    group_labels = np.concatenate(label_parts, axis=0).reshape(-1, 1)

    matrix_U, matrix_V = k_sample_transform(stacked, group_labels, is_y_categorical=True)

    # Against linear simulations
    manova = Manova()
    # test_statistic returns a sequence; only its first item is checked here
    statistic = manova.test_statistic(matrix_U, matrix_V)[0]

    assert manova.get_name() == 'manova'
    assert np.allclose(statistic, 0.06, atol=1.e-2)
Пример #4
0
def power_scipy(base_path,
                simulation_type,
                num_samples,
                repeats=1000,
                alpha=.05):
    """Estimate empirical power of ``multiscale_graphcorr`` on stored data.

    For each repetition the stored X and Y columns are joined into one data
    matrix, a copy rotated by 60 degrees is produced, and the pair is passed
    through ``k_sample_transform``. A random forest fitted on the result
    supplies a proximity matrix, which is turned into ``sqrt(1 - proximity)``
    and used in place of the first input to ``multiscale_graphcorr``.

    :param base_path: directory containing ``sample_data_power_sample_sizes``.
    :param simulation_type: value substituted into the data file name.
    :param num_samples: value substituted into the data file name.
    :param repeats: number of repetitions; one stored data set is read per
        repetition, indexed along the first axis of the loaded arrays.
    :param alpha: significance level.
    :return: fraction of repetitions whose p-value is ``<= alpha``.
    """
    # direct p values on permutation
    p_values = np.zeros(repeats)

    # common prefix of the two .mat files for this simulation/sample size
    relative_stem = 'sample_data_power_sample_sizes/type_{}_size_{}'.format(
        simulation_type, num_samples)
    stem = os.path.join(base_path, relative_stem)

    # a trailing axis is appended so each per-repetition slice is 2-D
    x_stack = scipy.io.loadmat(stem + '_X.mat')['x_mtx'][..., np.newaxis]
    y_stack = scipy.io.loadmat(stem + '_Y.mat')['y_mtx'][..., np.newaxis]

    # 2x2 rotation by 60 degrees
    cos_a = np.cos(math.radians(60))
    sin_a = np.sin(math.radians(60))
    rotation = np.array([[cos_a, sin_a], [-sin_a, cos_a]])

    for rep in range(repeats):
        # X and Y become the two columns of the data matrix
        paired = np.concatenate(
            [x_stack[rep, :, :], y_stack[rep, :, :]], axis=1)
        rotated = np.dot(rotation, paired.T).T

        # original vs. rotated data as the two samples
        matrix_U, matrix_V = k_sample_transform(paired, rotated)

        # the forest is fitted on a flattened copy of V
        forest = RandomForestRegressor(n_estimators=500)
        forest.fit(matrix_U, matrix_V.reshape(-1))

        # replace U by sqrt(1 - normalized proximity)
        dissimilarity = 1 - proximityMatrix(forest, matrix_U, normalize=True)
        matrix_U = np.power(dissimilarity, 0.5)

        p_values[rep] = multiscale_graphcorr(matrix_U, matrix_V).pvalue

    rejections = np.count_nonzero(p_values <= alpha)
    return rejections / repeats
Пример #5
0
def power_given_data(base_path,
                     independence_test,
                     simulation_type,
                     num_samples,
                     repeats=1000,
                     alpha=.05,
                     additional_params=None):
    """Estimate the empirical power of a test on stored simulation data.

    For each repetition the stored X and Y columns are joined into one data
    matrix, a copy rotated by 60 degrees is produced, and the pair is passed
    through ``k_sample_transform`` as two samples.

    Two modes are supported:

    * fast mode (``additional_params["is_fast"]`` is truthy): the p-value from
      ``independence_test.p_value`` is recorded per repetition and the power
      is the fraction of p-values ``<= alpha``.
    * otherwise: one statistic is computed on a row-permuted V (null) and one
      on the real V (alternative); the power is the fraction of alternative
      statistics that reach the cutoff taken from the sorted null statistics.

    :param base_path: directory containing ``sample_data_power_sample_sizes``.
    :param independence_test: object providing ``get_name()``,
        ``test_statistic(U, V, **kw)`` and, for fast mode,
        ``p_value(U, V, **kw)``; the latter two must return a pair whose first
        item is the scalar of interest.
    :param simulation_type: value substituted into the data file name.
    :param num_samples: value substituted into the data file name.
    :param repeats: number of repetitions (must be at least 1); one stored
        data set is read per repetition.
    :param alpha: significance level.
    :param additional_params: optional dict of keyword arguments forwarded to
        the test; the ``"is_fast"`` key, if present and truthy, selects fast
        mode. ``None`` (the default) means no extra arguments.
    :return: the empirical power as a float in ``[0, 1]``.
    """
    # A fresh dict per call instead of a shared mutable default.
    if additional_params is None:
        additional_params = {}

    # .get() so that extra parameters may be supplied without an "is_fast"
    # key (plain indexing raised KeyError in that case).
    use_fast = bool(additional_params.get("is_fast"))

    # test statistics under the null, used to estimate the cutoff value under the null distribution
    test_stats_null = np.zeros(repeats)
    # test statistic under the alternative
    test_stats_alternative = np.zeros(repeats)

    # direct p values on permutation (now, only for fast_mgc)
    p_values = np.zeros(repeats)

    # common prefix of the two .mat files inside the benchmark directory
    file_name_prefix = os.path.join(
        base_path, 'sample_data_power_sample_sizes/type_{}_size_{}'.format(
            simulation_type, num_samples))

    # a trailing axis is appended so each per-repetition slice is 2-D
    all_matrix_X = scipy.io.loadmat(file_name_prefix +
                                    '_X.mat')['x_mtx'][..., np.newaxis]
    all_matrix_Y = scipy.io.loadmat(file_name_prefix +
                                    '_Y.mat')['y_mtx'][..., np.newaxis]

    # rotation transform matrix (60 degrees)
    c, s = np.cos(math.radians(60)), np.sin(math.radians(60))
    rotation_matrix = np.array([[c, s], [-s, c]])

    for rep in range(repeats):
        matrix_X = all_matrix_X[rep, :, :]
        matrix_Y = all_matrix_Y[rep, :, :]

        # apply two sample transform: original data vs. its rotated copy
        data_matrix = np.concatenate([matrix_X, matrix_Y], axis=1)
        rotated_data_matrix = np.dot(rotation_matrix, data_matrix.T).T
        matrix_U, matrix_V = k_sample_transform(data_matrix,
                                                rotated_data_matrix)

        # permutation test
        if use_fast:
            p_values[rep], _ = independence_test.p_value(
                matrix_U, matrix_V, **additional_params)
        else:
            # shuffling the rows of V breaks its pairing with U
            permuted_V = np.random.permutation(matrix_V)
            test_stats_null[rep], _ = independence_test.test_statistic(
                matrix_U, permuted_V, **additional_params)
            test_stats_alternative[rep], _ = independence_test.test_statistic(
                matrix_U, matrix_V, **additional_params)

        # if the test is pearson, use absolute value of the test statistic
        # so the more extreme test statistic is still in a one-sided interval
        if independence_test.get_name() == 'pearson':
            test_stats_null[rep] = abs(test_stats_null[rep])
            test_stats_alternative[rep] = abs(test_stats_alternative[rep])

    if use_fast:
        return np.count_nonzero(p_values <= alpha) / repeats

    # the cutoff is determined so that 1-alpha of the test statistics under the null distribution
    # is less than the cutoff; the index is clamped to the last element so a
    # very small alpha (alpha < 1 / repeats) no longer indexes past the end
    cutoff_index = min(math.ceil(repeats * (1 - alpha)), repeats - 1)
    cutoff = np.sort(test_stats_null)[cutoff_index]

    # the proportion of test statistics under the alternative which is no less than the cutoff (in which case
    # the null is rejected) is the empirical power
    return np.count_nonzero(test_stats_alternative >= cutoff) / repeats
Пример #6
0
# Benchmark script: distribution of DCorr two-sample p-values when both
# samples come from the same distribution (i.e. the null hypothesis holds).
plt.style.use("seaborn-white")
sns.set_palette("deep")

# Simulation settings
n_sims = 100  # number of independent simulations (one p-value each)
n_samples = 100  # rows per sample
n_components = 2  # columns per sample
n_permutations = 1000  # passed to p_value() as replication_factor
size = (n_samples, n_components)

#%% mgcpy package
p_vals = np.zeros(n_sims)
for i in tqdm(range(n_sims)):
    # both samples are drawn from the same Uniform(0.2, 0.7) distribution
    sample1 = np.random.uniform(0.2, 0.7, size=size)
    sample2 = np.random.uniform(0.2, 0.7, size=size)

    # convert the two samples into the pair of inputs given to the test
    sample, indicator = k_sample_transform(sample1, sample2)
    test = DCorr(which_test="unbiased")
    # non-fast p-value; the second returned item (metadata) is not used
    p, p_meta = test.p_value(
        sample, indicator, replication_factor=n_permutations, is_fast=False
    )
    p_vals[i] = p

# Plot the collected p-values and save the figure to disk
plt.figure()
sns.distplot(p_vals)
plt.title("MGCPy DCorr, 2-sample under null, unbiased, not fast")
plt.xlabel("p-value")
plt.savefig("graspy-misc/profile_dcorr/mgcpy_dcorr.png", facecolor="w")

#%% mgcpy with is_fast=True
# p_vals = np.zeros(n_sims)
# for i in tqdm(range(n_sims)):