Пример #1
0
def run_ba_cov_neutral_sims(shape=1,
                            scale=1,
                            G=50,
                            N=50,
                            iter1=1000,
                            iter2=1000):
    """Null simulations mixing adaptive (BA-covariance) and neutral counts.

    For each neutral rate and covariance value, draws per-gene rates from
    a gamma(shape, scale), generates N populations of counts under a
    Barabasi-Albert covariance matrix plus an independent neutral
    component, and compares the observed mean pairwise Euclidean distance
    in PCA space against iter2 permuted matrices. Results are appended as
    a tab-delimited table.

    Parameters
    ----------
    shape, scale : gamma-distribution parameters for per-gene rates.
    G : number of genes.
    N : number of populations per simulated matrix.
    iter1 : simulated matrices per (neutral rate, covariance) pair.
    iter2 : permutations per simulated matrix.
    """
    # with-block guarantees the file is closed even if a simulation fails
    with open(pt.get_path() + '/data/simulations/ba_cov_neutral_sims.txt',
              'w') as df_out:
        # BUG FIX: header typo 'lamba_mean' -> 'lambda_mean'; update any
        # downstream readers that matched the misspelled column name.
        df_out.write('\t'.join([
            'N', 'G', 'lambda_mean', 'lambda_neutral', 'Cov', 'Iteration',
            'dist_percent'
        ]) + '\n')
        covs = [0.2]
        mean_gamma = shape * scale
        # neutral rates: 20 log-spaced points from 10**-2 to 10**1, descending
        neutral_range = np.logspace(-2, 1, num=20, endpoint=True,
                                    base=10.0)[::-1]
        for neutral_ in neutral_range:
            for cov in covs:
                for i in range(iter1):
                    C = pt.get_ba_cov_matrix(G, cov)
                    lambda_genes = np.random.gamma(shape=shape,
                                                   scale=scale,
                                                   size=G)
                    lambda_genes_null = np.asarray([neutral_] * G)
                    test_cov_adapt = np.stack(
                        [pt.get_count_pop(lambda_genes, C=C)
                         for x in range(N)],
                        axis=0)
                    # neutral component: identity covariance (genes
                    # independent; matrix with diagonal values equal to one)
                    test_cov_neutral = np.stack(
                        [pt.get_count_pop(lambda_genes_null,
                                          C=np.identity(G))
                         for x in range(N)],
                        axis=0)
                    test_cov = test_cov_adapt + test_cov_neutral

                    X = pt.hellinger_transform(test_cov)
                    pca = PCA()
                    pca_fit = pca.fit_transform(X)
                    euc_dist = pt.get_mean_pairwise_euc_distance(pca_fit)
                    euc_dists = []
                    for j in range(iter2):
                        X_j = pt.hellinger_transform(
                            pt.get_random_matrix(test_cov))
                        pca_fit_j = pca.fit_transform(X_j)
                        euc_dists.append(
                            pt.get_mean_pairwise_euc_distance(pca_fit_j))
                    # fraction of null distances below the observed distance
                    euc_percent = len([k for k in euc_dists if k < euc_dist
                                       ]) / len(euc_dists)
                    print(neutral_, cov, i, euc_percent)
                    df_out.write('\t'.join([
                        str(N),
                        str(G),
                        str(mean_gamma),
                        str(neutral_),
                        str(cov),
                        str(i),
                        str(euc_percent)
                    ]) + '\n')
Пример #2
0
def wannier_hist(iter=10000):
    """Permutation-test histograms of F2 and its variance components V1, V2.

    Loads the two Wannier et al. mutation tables, concatenates them,
    computes the observed F_2 statistic (and its two variance components)
    on likelihood-transformed counts, then builds null distributions by
    permuting the count matrix `iter` times. Saves one histogram per
    statistic with the observed value marked as a red vertical line.

    Parameters
    ----------
    iter : number of permutations for the null distributions.
    """
    def _plot_null_hist(null_vals, obs, xlabel, out_path):
        # one-panel histogram of a null distribution with the observed
        # statistic marked in red; identical styling for all three figures
        fig = plt.figure()
        plt.hist(null_vals, bins=30, alpha=0.8, color='#175ac6')
        plt.axvline(obs, color='red', lw=3)
        plt.xlabel(xlabel, fontsize=20)
        plt.ylabel("Frequency", fontsize=12)
        fig.tight_layout()
        fig.savefig(out_path, bbox_inches="tight", pad_inches=0.4, dpi=600)
        plt.close()

    dir = os.path.expanduser("~/GitHub/ParEvol")
    df1 = pd.read_csv(dir + '/data/Wannier_et_al/C321.deltaA_mutation_table_clean.txt', sep='\t', index_col=0)
    df2 = pd.read_csv(dir + '/data/Wannier_et_al/C321_mutation_table_clean.txt', sep='\t', index_col=0)
    # DataFrame.append() was deprecated and removed in pandas 2.0;
    # pd.concat is the supported equivalent.
    df = pd.concat([df1, df2], sort=False)
    df = df.fillna(0)
    df_np = df.values
    gene_names = df.columns.values
    N1 = df1.shape[0]
    N2 = df2.shape[0]
    df_np_delta = cd.likelihood_matrix_array(df_np, gene_names, 'Wannier_et_al').get_likelihood_matrix()
    F2_all = pt.get_F_2(df_np_delta, N1, N2)
    print(F2_all)
    F2 = F2_all[0]
    V1 = F2_all[1]
    V2 = F2_all[2]

    F2_null = []
    V1_null = []
    V2_null = []
    for i in range(iter):
        if i % 1000 == 0:
            print(i)
        df_np_i = pt.get_random_matrix(df_np)
        # likelihood transform may divide/log by zero; silence the warning
        np.seterr(divide='ignore')
        df_np_i_delta = cd.likelihood_matrix_array(df_np_i, gene_names, 'Wannier_et_al').get_likelihood_matrix()
        F2_all_iter = pt.get_F_2(df_np_i_delta, N1, N2)
        F2_null.append(F2_all_iter[0])
        V1_null.append(F2_all_iter[1])
        V2_null.append(F2_all_iter[2])

    _plot_null_hist(F2_null, F2, r'$ F_{2}$', dir + '/figs/test_hist_F.png')
    _plot_null_hist(V1_null, V1, r'$ V_{1}$', dir + '/figs/test_hist_F_V1.png')
    _plot_null_hist(V2_null, V2, r'$ V_{2}$', dir + '/figs/test_hist_F_V2.png')
Пример #3
0
def rndm_sample_tenaillon(iter1=1000, iter2=1000):
    """Sample-size simulation on the Tenaillon et al. gene-by-population data.

    For each sample size N (2..38 in steps of 2), repeatedly resamples N
    populations with replacement, drops genes absent from the subsample,
    and compares the observed mean pairwise Euclidean distance in PCA
    space against iter2 permuted matrices. Writes the tail probability
    and z-score for every replicate.
    """
    in_path = pt.get_path() + '/data/Tenaillon_et_al/gene_by_pop.txt'
    count_df = pd.read_csv(in_path, sep='\t', header='infer', index_col=0)
    counts = count_df.values
    all_genes = count_df.columns.values
    n_pops = counts.shape[0]
    out_fh = open(pt.get_path() + '/data/Tenaillon_et_al/sample_size_sim.txt',
                  'w')
    out_fh.write(
        '\t'.join(['N', 'G', 'Iteration', 'dist_percent', 'z_score']) + '\n')
    for N in range(2, 40, 2):
        for i in range(iter1):
            # resample N populations with replacement
            sample = counts[np.random.randint(n_pops, size=N), :]
            # genes absent from every sampled population get dropped;
            # keep the matching names for the likelihood transform
            absent = np.all(sample == 0, axis=0)
            sample_genes = list(
                compress(all_genes, list(map(operator.not_, absent))))
            sample = sample[:, ~absent]
            np.seterr(divide='ignore')
            sample_delta = pt.likelihood_matrix_array(
                sample, sample_genes,
                'Tenaillon_et_al').get_likelihood_matrix()
            pca = PCA()
            obs_fit = pca.fit_transform(pt.hellinger_transform(sample_delta))
            obs_dist = pt.get_mean_pairwise_euc_distance(obs_fit)
            null_dists = []
            for j in range(iter2):
                permuted = pt.get_random_matrix(sample)
                np.seterr(divide='ignore')
                permuted_delta = pt.likelihood_matrix_array(
                    permuted, sample_genes,
                    'Tenaillon_et_al').get_likelihood_matrix()
                null_fit = pca.fit_transform(
                    pt.hellinger_transform(permuted_delta))
                null_dists.append(
                    pt.get_mean_pairwise_euc_distance(null_fit))

            G = sample.shape[1]
            # lower-tail fraction of null distances below the observed one
            euc_percent = sum(
                1 for d in null_dists if d < obs_dist) / len(null_dists)
            z_score = (obs_dist - np.mean(null_dists)) / np.std(null_dists)
            print(str(N), str(i), str(G), str(euc_percent), str(z_score))
            out_fh.write('\t'.join([
                str(N), str(G),
                str(i), str(euc_percent),
                str(z_score)
            ]) + '\n')

    out_fh.close()
Пример #4
0
def rndm_sample_tenaillon(iter1=1000, iter2=10000):
    """Distance-vs-sample-size simulation on the Tenaillon et al. matrix.

    Draws N populations without replacement, compares the observed mean
    pairwise distance in PCA space of mean-centered relative likelihood
    matrices against iter2 permuted matrices, and records the upper-tail
    probability and z-score for each replicate.
    """
    base = os.path.expanduser("~/GitHub/ParEvol")
    df = pd.read_csv(base + '/data/Tenaillon_et_al/gene_by_pop.txt',
                     sep='\t', header='infer', index_col=0)
    counts = df.values
    all_genes = df.columns.values
    row_idx = list(range(counts.shape[0]))
    out_fh = open(base + '/data/Tenaillon_et_al/dist_sample_size.txt', 'w')
    out_fh.write(
        '\t'.join(['N', 'G', 'Iteration', 'dist_percent', 'z_score']) + '\n')
    pca = PCA()
    for N in [40]:
        for i in range(iter1):
            chosen = np.random.choice(row_idx, size=N, replace=False, p=None)
            sample = counts[chosen, :]
            # genes absent from every sampled population get dropped;
            # keep the matching names for the likelihood transform
            absent = np.all(sample == 0, axis=0)
            sample_genes = list(
                compress(all_genes, list(map(operator.not_, absent))))
            sample = sample[:, ~absent]
            np.seterr(divide='ignore')
            sample_delta = cd.likelihood_matrix_array(
                sample, sample_genes,
                'Tenaillon_et_al').get_likelihood_matrix()
            # per-population relative contributions, then mean-centering
            sample_delta = sample_delta / sample_delta.sum(axis=1)[:, None]
            obs_fit = pca.fit_transform(pt.get_mean_center(sample_delta))
            obs_dist = pt.get_mean_pairwise_euc_distance(obs_fit)
            null_dists = []
            for j in range(iter2):
                permuted = pt.get_random_matrix(sample)
                np.seterr(divide='ignore')
                permuted_delta = cd.likelihood_matrix_array(
                    permuted, sample_genes,
                    'Tenaillon_et_al').get_likelihood_matrix()
                permuted_delta = permuted_delta / permuted_delta.sum(
                    axis=1)[:, None]
                null_fit = pca.fit_transform(
                    pt.get_mean_center(permuted_delta))
                null_dists.append(
                    pt.get_mean_pairwise_euc_distance(null_fit))

            G = sample.shape[1]
            # upper tail: fraction of null distances exceeding the observed
            euc_percent = sum(
                1 for d in null_dists if d > obs_dist) / len(null_dists)
            z_score = (obs_dist - np.mean(null_dists)) / np.std(null_dists)
            print(str(N), str(i), str(G), str(euc_percent), str(z_score))
            out_fh.write('\t'.join([
                str(N), str(G),
                str(i), str(euc_percent),
                str(z_score)
            ]) + '\n')

    out_fh.close()
Пример #5
0
def rndm_sample_tenaillon(k_eval=3, iter1=20, iter2=1000, sample_bs = 10, iter_bs=10000):
    """Power analysis vs. sample size for the Tenaillon et al. data.

    For each sample size N, draws iter1 subsamples without replacement,
    obtains a permutation p-value for the mean pairwise distance on the
    first k_eval PCA axes, then estimates power (fraction of p < 0.05)
    with a bootstrap 95% CI and writes one row per N.

    Parameters
    ----------
    k_eval : number of PCA axes used in the distance.
    iter1 : subsampling replicates per N.
    iter2 : permutations per replicate.
    sample_bs : bootstrap resample size.
    iter_bs : number of bootstrap iterations.
    """
    df_path = mydir + '/data/Tenaillon_et_al/gene_by_pop.txt'
    df = pd.read_csv(df_path, sep = '\t', header = 'infer', index_col = 0)
    df_np = df.values
    gene_names = df.columns.values
    n_rows = list(range(df_np.shape[0]))
    df_out = open(mydir + '/data/Tenaillon_et_al/power_sample_size.txt', 'w')
    df_out.write('\t'.join(['N', 'G', 'Power', 'Power_025', 'Power_975']) + '\n')

    Ns = [20, 30]
    for N in Ns:
        p_values = []
        G_list = []
        for i in range(iter1):
            # subsample N populations without replacement
            df_np_i = df_np[np.random.choice(n_rows, size=N, replace=False, p=None), :]
            gene_bool = np.all(df_np_i == 0, axis=0)
            # invert the all-zero mask to keep names of observed genes
            gene_names_i = list(compress(gene_names, list(map(operator.not_, gene_bool))))
            G_list.append(len(gene_names_i))
            df_np_i = df_np_i[:,~np.all(df_np_i == 0, axis=0)]
            np.seterr(divide='ignore')
            df_np_i_delta = cd.likelihood_matrix_array(df_np_i, gene_names_i, 'Tenaillon_et_al').get_likelihood_matrix()
            # relative contributions, mean-centered before PCA
            X = df_np_i_delta/df_np_i_delta.sum(axis=1)[:,None]
            X -= np.mean(X, axis = 0)
            pca = PCA()
            pca_X = pca.fit_transform(X)
            mpd = pt.get_mean_pairwise_euc_distance(pca_X, k=k_eval)
            mpd_null = []
            for j in range(iter2):
                df_np_i_j = pt.get_random_matrix(df_np_i)
                np.seterr(divide='ignore')
                df_np_i_j_delta = cd.likelihood_matrix_array(df_np_i_j, gene_names_i, 'Tenaillon_et_al').get_likelihood_matrix()
                X_j = df_np_i_j_delta/df_np_i_j_delta.sum(axis=1)[:,None]
                X_j -= np.mean(X_j, axis = 0)
                pca_X_j = pca.fit_transform(X_j)
                mpd_null.append(pt.get_mean_pairwise_euc_distance(pca_X_j, k=k_eval))
            # upper-tail permutation p-value
            p_values.append(len( [m for m in mpd_null if m > mpd] ) / len(mpd_null))
        # FIX: p_values was printed twice (before and after computing power);
        # keep a single diagnostic print.
        print(p_values)

        # power: fraction of replicates significant at alpha = 0.05
        power = len([n for n in p_values if n < 0.05]) / len(p_values)
        power_bootstrap = []
        for p in range(iter_bs):
            p_values_sample = random.sample(p_values, sample_bs)
            power_sample = len([n for n in p_values_sample if n < 0.05]) / len(p_values_sample)
            power_bootstrap.append(power_sample)
        power_bootstrap.sort()
        # write N, mean gene count, power, bootstrap 2.5% / 97.5% bounds
        df_out.write('\t'.join([str(N), str(np.mean(G_list)), str(power), str(power_bootstrap[int(iter_bs*0.025)]), str(power_bootstrap[int(iter_bs*0.975)])]) + '\n')

    df_out.close()
Пример #6
0
def rndm_sample_tenaillon(N,
                          df_np,
                          gene_names,
                          n_rows,
                          k=3,
                          iter1=100,
                          iter2=1000,
                          sample_bs=10,
                          iter_bs=10000):
    """Estimate statistical power at one sample size N.

    Runs iter1 replicates: subsample N populations without replacement,
    compute a permutation p-value for the mean pairwise distance on the
    first k PCA axes, then bootstrap the power estimate.

    Parameters
    ----------
    N : number of populations to subsample.
    df_np : gene-by-population count matrix (numpy array).
    gene_names : column (gene) names matching df_np.
    n_rows : population of row indices to sample from.
    k : number of PCA axes used in the distance.
    iter1 : number of subsampling replicates.
    iter2 : permutations per replicate.
    sample_bs : bootstrap resample size.
    iter_bs : number of bootstrap iterations.

    Returns
    -------
    (N, mean gene count, power, bootstrap 2.5% bound, bootstrap 97.5% bound)
    """
    p_values = []
    G_list = []
    for i in range(iter1):
        df_np_i = df_np[
            np.random.choice(n_rows, size=N, replace=False, p=None), :]
        gene_bool = np.all(df_np_i == 0, axis=0)
        # invert the all-zero mask to keep names of observed genes
        gene_names_i = list(
            compress(gene_names, list(map(operator.not_, gene_bool))))
        G_list.append(len(gene_names_i))
        df_np_i = df_np_i[:, ~np.all(df_np_i == 0, axis=0)]
        np.seterr(divide='ignore')
        df_np_i_delta = cd.likelihood_matrix_array(
            df_np_i, gene_names_i, 'Tenaillon_et_al').get_likelihood_matrix()
        # per-population relative contributions
        X = df_np_i_delta / df_np_i_delta.sum(axis=1)[:, None]
        e_vals, a_mat = pt.pca_np(X)
        euc_dist = pt.get_mean_pairwise_euc_distance(a_mat, k=k)
        print(euc_dist)
        euc_dists = []
        for j in range(iter2):
            df_np_i_j = pt.get_random_matrix(df_np_i)
            np.seterr(divide='ignore')
            df_np_i_j_delta = cd.likelihood_matrix_array(
                df_np_i_j, gene_names_i,
                'Tenaillon_et_al').get_likelihood_matrix()
            X_j = df_np_i_j_delta / df_np_i_j_delta.sum(axis=1)[:, None]
            e_vals_j, a_mat_j = pt.pca_np(X_j)
            euc_dists.append(pt.get_mean_pairwise_euc_distance(a_mat_j, k=k))
        # upper-tail permutation p-value
        p_values.append(
            len([m for m in euc_dists if m > euc_dist]) / len(euc_dists))

    power = len([n for n in p_values if n < 0.05]) / len(p_values)
    print(p_values)
    power_bootstrap = []
    for p in range(iter_bs):
        p_values_sample = random.sample(p_values, sample_bs)
        power_sample = len([n for n in p_values_sample if n < 0.05
                            ]) / len(p_values_sample)
        power_bootstrap.append(power_sample)
    power_bootstrap.sort()
    # BUG FIX: percentile indices were hard-coded to 10000 and broke (or
    # silently mis-indexed) whenever iter_bs != 10000; scale by iter_bs,
    # matching the sibling implementation that writes power_sample_size.txt.
    return N, np.mean(G_list), power, power_bootstrap[int(
        iter_bs * 0.025)], power_bootstrap[int(iter_bs * 0.975)]
Пример #7
0
def gene_svd_tenaillon(iter=10000):
    """Per-gene variance z-scores and p-values from a rank-3 truncated SVD.

    Computes each gene's squared loading norm in the 3-component basis of
    the mean-centered relative likelihood matrix, builds a null by
    permuting the count matrix `iter` times, and writes per-gene z-scores
    and upper-tail p-values.
    """
    base_dir = os.path.expanduser("~/GitHub/ParEvol")
    df = pd.read_csv(base_dir + '/data/Tenaillon_et_al/gene_by_pop.txt',
                     sep='\t', header='infer', index_col=0)
    gene_names = df.columns.tolist()
    counts = df.values
    observed_delta = cd.likelihood_matrix_array(
        counts, gene_names, 'Tenaillon_et_al').get_likelihood_matrix()
    observed_delta = observed_delta / observed_delta.sum(axis=1)[:, None]
    centered = pt.get_mean_center(observed_delta)
    # scipy's svds returns the V matrix already transposed
    U, s, V_T = svds(centered, k=3)
    # transpose back to build the basis (loading) matrix
    basis = (V_T.T @ np.diag(s)) / np.sqrt(centered.shape[0] - 1)
    gene_vars = np.linalg.norm(basis, axis=1)**2
    null_vars = []
    for i in range(iter):
        if i % 1000 == 0:
            print("Iteration " + str(i))
        permuted = pt.get_random_matrix(counts)
        np.seterr(divide='ignore')
        permuted_delta = cd.likelihood_matrix_array(
            permuted, gene_names, 'Tenaillon_et_al').get_likelihood_matrix()
        permuted_delta = permuted_delta / permuted_delta.sum(axis=1)[:, None]
        centered_i = pt.get_mean_center(permuted_delta)
        U_i, s_i, V_i_T = svds(centered_i, k=3)
        basis_i = (V_i_T.T @ np.diag(s_i)) / np.sqrt(centered_i.shape[0] - 1)
        null_vars.append(np.linalg.norm(basis_i, axis=1)**2)

    null_mat = np.stack(null_vars)
    z_scores = (gene_vars - np.mean(null_mat, axis=0)) / np.std(null_mat,
                                                                axis=0)
    # per-gene upper-tail p-value against the permutation null
    p_values = [
        np.count_nonzero(column > gene_vars[g]) / iter
        for g, column in enumerate(null_mat.T)
    ]

    label_z_scores = list(zip(gene_names, z_scores, p_values))
    print([x for x in label_z_scores if x[2] < 0.05])

    out_fh = open(base_dir + '/data/Tenaillon_et_al/gene_z_scores.txt', 'w')
    out_fh.write('\t'.join(['Gene', 'z_score', 'p_score']) + '\n')
    for gene, z, p in label_z_scores:
        out_fh.write('\t'.join([str(gene), str(z), str(p)]) + '\n')
    out_fh.close()
Пример #8
0
def run_ba_cov_sims(gene_list, pop_list, out_name, iter1=1000, iter2=1000):
    """Permutation simulations across gene counts, population sizes and covariances.

    For each (G, N, cov) combination, runs iter1 simulated count matrices
    under a Barabasi-Albert covariance structure, redrawing any matrix
    that contains an all-zero population, and records the fraction of
    iter2 permuted matrices whose mean pairwise PCA distance falls below
    the observed one.
    """
    out_fh = open(pt.get_path() + '/data/simulations/' + out_name + '.txt',
                  'w')
    out_fh.write('\t'.join(['N', 'G', 'Cov', 'Iteration', 'dist_percent']) +
                 '\n')
    for G in gene_list:
        for N in pop_list:
            for cov in [0.1, 0.15, 0.2]:
                for i in range(iter1):
                    C = pt.get_ba_cov_matrix(G, cov)
                    # redraw until every simulated population has at least
                    # one mutation (no all-zero rows)
                    while True:
                        rates = np.random.gamma(shape=1, scale=1, size=G)
                        sim_counts = np.stack(
                            [pt.get_count_pop(rates, cov=C)
                             for x in range(N)],
                            axis=0)
                        if not np.any(sim_counts.sum(axis=1) == 0):
                            break
                    pca = PCA()
                    obs_fit = pca.fit_transform(
                        pt.hellinger_transform(sim_counts))
                    obs_dist = pt.get_mean_pairwise_euc_distance(obs_fit)
                    null_dists = []
                    for j in range(iter2):
                        null_fit = pca.fit_transform(
                            pt.hellinger_transform(
                                pt.get_random_matrix(sim_counts)))
                        null_dists.append(
                            pt.get_mean_pairwise_euc_distance(null_fit))
                    # lower-tail fraction of null distances
                    euc_percent = sum(1 for d in null_dists
                                      if d < obs_dist) / len(null_dists)
                    print(N, G, cov, i, euc_percent)
                    out_fh.write('\t'.join(
                        [str(N),
                         str(G),
                         str(cov),
                         str(i),
                         str(euc_percent)]) + '\n')
    out_fh.close()
Пример #9
0
def gene_svd_tenaillon_sample_size(iter1=1000, iter2=10000, k=3):
    """Recovery of significant genes as a function of sample size.

    For each sample size N, repeatedly subsamples populations without
    replacement, recomputes per-gene variance z-scores from a rank-k
    truncated SVD against a permutation null, and records what fraction
    of the full-data significant gene set (|z| > 1.96, read from
    gene_z_scores.txt) is recovered.

    Parameters
    ----------
    iter1 : subsampling replicates per N.
    iter2 : permutations per replicate.
    k : SVD rank used for both observed and null decompositions.
    """
    df_path = os.path.expanduser(
        "~/GitHub/ParEvol") + '/data/Tenaillon_et_al/gene_by_pop.txt'
    df = pd.read_csv(df_path, sep='\t', header='infer', index_col=0)
    df_np = df.values
    gene_names = df.columns.values
    n_rows = list(range(df_np.shape[0]))
    df_out = open(
        os.path.expanduser("~/GitHub/ParEvol") +
        '/data/Tenaillon_et_al/gene_z_scores_sample_size.txt', 'w')
    df_out.write('\t'.join(['N', 'G', 'Iteration', 'set_percent']) + '\n')
    Ns = list(range(4, 40, 2))
    # reference set: genes with an absolute z-score greater than 1.96 in
    # the full-data analysis
    df_gene_path = os.path.expanduser(
        "~/GitHub/ParEvol") + '/data/Tenaillon_et_al/gene_z_scores.txt'
    df_genes = pd.read_csv(df_gene_path, sep='\t', header='infer')
    df_genes_sig = df_genes.loc[(df_genes['z_score'] > 1.96) |
                                (df_genes['z_score'] < -1.96)]
    genes = df_genes_sig.Gene.tolist()
    for N in Ns:
        for i in range(iter1):
            # subsample N populations without replacement
            # (removed an unused duplicate np.random.choice call)
            df_np_i = df_np[
                np.random.choice(n_rows, size=N, replace=False, p=None), :]
            gene_bool = np.all(df_np_i == 0, axis=0)
            # invert the all-zero mask to keep names of observed genes
            gene_names_i = list(
                compress(gene_names, list(map(operator.not_, gene_bool))))
            df_np_i = df_np_i[:, ~np.all(df_np_i == 0, axis=0)]
            np.seterr(divide='ignore')
            X = pt.get_mean_center(df_np_i)
            U, s, V_T = svds(X, k=k)
            # transpose back to compute the basis (loading) matrix
            F = (V_T.T @ np.diag(s)) / np.sqrt(X.shape[0] - 1)
            gene_vars = np.linalg.norm(F, axis=1)**2
            vars_null_list = []
            for j in range(iter2):
                df_np_i_j = pt.get_random_matrix(df_np_i)
                np.seterr(divide='ignore')
                X_j = pt.get_mean_center(df_np_i_j)
                # BUG FIX: the null SVD previously hard-coded k=3 and
                # ignored the k parameter; use the same rank as observed.
                U_j, s_j, V_j_T = svds(X_j, k=k)
                F_j = (V_j_T.T @ np.diag(s_j)) / np.sqrt(X_j.shape[0] - 1)
                vars_null_list.append(np.linalg.norm(F_j, axis=1)**2)

            vars_null_i = np.stack(vars_null_list)
            vars_null_i_mean = np.mean(vars_null_i, axis=0)
            vars_null_i_std = np.std(vars_null_i, axis=0)
            z_scores = (gene_vars - vars_null_i_mean) / vars_null_i_std
            label_z_scores = list(zip(gene_names_i, z_scores))
            label_sig_z_scores = [
                x for x in label_z_scores if abs(x[1]) > 1.96
            ]
            label_sig_z_scores_label = [x[0] for x in label_sig_z_scores]
            gene_inter = set(label_sig_z_scores_label) & set(genes)
            # fraction of the full-data significant set recovered here
            union_fract = len(gene_inter) / len(genes)
            print(N, i, union_fract)
            G = df_np_i.shape[1]
            df_out.write('\t'.join([str(
                N), str(G), str(i), str(union_fract)]) + '\n')

    df_out.close()
Пример #10
0
def time_partition_ltee(k=5, iter=100):
    """Z-scores for within- vs. to-final-timepoint divergence in the LTEE.

    For each sampled generation (up to 50000), splits the nonmutator
    gene-by-population counts into mutations present by that timepoint
    and mutations gained between it and the final timepoint, projects the
    stacked matrix onto k eigenvectors, and reports z-scores of the mean
    pairwise distances of both partitions against `iter` permuted
    matrices. One row per timepoint is written to
    time_partition_z_scores.txt.
    """
    df_path = os.path.expanduser(
        "~/GitHub/ParEvol") + '/data/Good_et_al/gene_by_pop.txt'
    df = pd.read_csv(df_path, sep='\t', header='infer', index_col=0)
    to_include = pt.complete_nonmutator_lines()
    df_nonmut = df[df.index.str.contains('|'.join(to_include))]
    # remove columns (genes) with all zeros
    df_nonmut = df_nonmut.loc[:, (df_nonmut != 0).any(axis=0)]
    # make sure it's sorted so row positions are stable below
    df_nonmut.sort_index(inplace=True)

    # row labels look like '<population>_<generation>'; collect generations
    time_points = [int(x.split('_')[1]) for x in df_nonmut.index.values]
    time_points_set = sorted(
        list(set([int(x.split('_')[1]) for x in df_nonmut.index.values])))
    # map each generation to the row positions sampled at that generation
    time_points_positions = {}
    for x in time_points_set:
        time_points_positions[x] = [
            i for i, j in enumerate(time_points) if j == x
        ]
    # counts at the final sampled generation
    t_final_df = df_nonmut.iloc[time_points_positions[max(time_points_set)]]
    t_final_np = t_final_df.values
    gene_names = df_nonmut.columns.tolist()

    df_out = open(
        os.path.expanduser("~/GitHub/ParEvol") +
        '/data/Good_et_al/time_partition_z_scores.txt', 'w')
    df_out.write(
        '\t'.join(['Time', 'Time_less_z_score', 'Time_greater_z_score']) +
        '\n')
    for time_point in time_points_set:
        # very few mutations after generation 50000
        if time_point > 50000:
            continue
        print("Time point " + str(time_point))
        t_i_df = df_nonmut.iloc[time_points_positions[time_point]]
        t_i_np = t_i_df.values
        # remove rows (populations) with all zeros at this timepoint
        t_i_np_zeros = np.where(~t_i_np.any(axis=1))[0]
        n_zeros_t_i_np = len(t_i_np_zeros)
        if n_zeros_t_i_np > 0:
            t_i_np = np.delete(t_i_np, t_i_np_zeros, axis=0)

        # mutations gained between this timepoint and the final one.
        # NOTE(review): rows were possibly deleted from t_i_np above but
        # not from t_final_np; if n_zeros_t_i_np > 0 the shapes differ and
        # this subtraction would broadcast/fail — confirm that case is
        # impossible in the data.
        t_i_to_final_np = t_final_np - t_i_np
        # remove rows with all zeros
        t_i_to_final_np_zeros = np.where(~t_i_to_final_np.any(axis=1))[0]
        n_zeros_t_i_to_final_np = len(t_i_to_final_np_zeros)
        if n_zeros_t_i_to_final_np > 0:
            t_i_to_final_np = np.delete(t_i_to_final_np,
                                        t_i_to_final_np_zeros,
                                        axis=0)

        # stack both partitions so one eigendecomposition covers them
        t_concat = np.concatenate((t_i_np, t_i_to_final_np), axis=0)
        t_norm = cd.likelihood_matrix_array(
            t_concat, gene_names, 'Good_et_al').get_likelihood_matrix()
        t_norm_rel = t_norm / t_norm.sum(axis=1, keepdims=True)

        # first five axes
        e_vals, e_vecs = pt.pca_np(t_norm_rel)
        # The column v[:, i] is the normalized eigenvector corresponding to the eigenvalue w[i]
        e_vecs_k5 = e_vecs[:, -1 - k:-1]
        # account for rows with zero mutations.
        # NOTE(review): the literal 5 presumably equals the number of
        # nonmutator populations returned by complete_nonmutator_lines()
        # — TODO confirm rather than hard-code.
        e_vec_t_less = e_vecs_k5[:5 - n_zeros_t_i_np, :]
        e_vec_t_greater = e_vecs_k5[5 - n_zeros_t_i_to_final_np:, :]

        dist_t_less = pt.get_mean_pairwise_euc_distance(e_vec_t_less, k=k)
        dist_t_greater = pt.get_mean_pairwise_euc_distance(e_vec_t_greater,
                                                           k=k)

        dist_t_less_list = []
        dist_t_greater_list = []

        for i in range(iter):
            t_i_np_rndm = pt.get_random_matrix(t_i_np)
            t_i_to_final_np_rndm = pt.get_random_matrix(t_i_to_final_np)
            t_rndm_concat = np.concatenate((t_i_np_rndm, t_i_to_final_np_rndm),
                                           axis=0)

            t_rndm_norm = cd.likelihood_matrix_array(
                t_rndm_concat, gene_names,
                'Good_et_al').get_likelihood_matrix()
            t_rndm_norm_rel = t_rndm_norm / t_rndm_norm.sum(axis=1,
                                                            keepdims=True)
            # first five axes
            e_vals_rndm, e_vecs_rndm = pt.pca_np(t_rndm_norm_rel)
            # The column v[:, i] is the normalized eigenvector corresponding to the eigenvalue w[i]
            e_vecs_rndm_k5 = e_vecs_rndm[:, -1 - k:-1]
            # NOTE(review): the null split uses a fixed 5, unlike the
            # observed split which subtracts the zero-row counts — confirm
            # this asymmetry is intended.
            e_vec_t_less_rndm = e_vecs_rndm_k5[:5, :]
            e_vec_t_greater_rndm = e_vecs_rndm_k5[5:, :]

            dist_t_less_rndm = pt.get_mean_pairwise_euc_distance(
                e_vec_t_less_rndm, k=k)
            dist_t_greater_rndm = pt.get_mean_pairwise_euc_distance(
                e_vec_t_greater_rndm, k=k)

            dist_t_less_list.append(dist_t_less_rndm)
            dist_t_greater_list.append(dist_t_greater_rndm)

        # z-scores of observed distances against the permutation null
        z_score_less = (dist_t_less -
                        np.mean(dist_t_less_list)) / np.std(dist_t_less_list)
        z_score_greater = (dist_t_greater - np.mean(dist_t_greater_list)
                           ) / np.std(dist_t_greater_list)

        df_out.write('\t'.join(
            [str(time_point),
             str(z_score_less),
             str(z_score_greater)]) + '\n')

    df_out.close()
Пример #11
0
def time_partition_ltee(k=5, iter=1000):
    """CI variant of the LTEE time-partition analysis.

    Like the z-score version, but reports the observed mean pairwise
    distances for the two partitions (mutations present by a timepoint
    vs. gained afterwards), their difference, and empirical 95% CIs from
    `iter` permuted matrices, using sklearn PCA on mean-centered relative
    likelihood matrices.
    """
    df_path = mydir + '/data/Good_et_al/gene_by_pop.txt'
    df = pd.read_csv(df_path, sep='\t', header='infer', index_col=0)
    to_include = pt.complete_nonmutator_lines()
    df_nonmut = df[df.index.str.contains('|'.join(to_include))]
    # remove columns (genes) with all zeros
    df_nonmut = df_nonmut.loc[:, (df_nonmut != 0).any(axis=0)]
    # make sure it's sorted so row positions are stable below
    df_nonmut.sort_index(inplace=True)

    # row labels look like '<population>_<generation>'; collect generations
    time_points = [int(x.split('_')[1]) for x in df_nonmut.index.values]
    time_points_set = sorted(
        list(set([int(x.split('_')[1]) for x in df_nonmut.index.values])))
    # map each generation to the row positions sampled at that generation
    time_points_positions = {}
    for x in time_points_set:
        time_points_positions[x] = [
            i for i, j in enumerate(time_points) if j == x
        ]
    # counts at the final sampled generation
    t_final_df = df_nonmut.iloc[time_points_positions[max(time_points_set)]]
    t_final_np = t_final_df.values
    gene_names = df_nonmut.columns.tolist()

    df_out = open(mydir + '/data/Good_et_al/time_partition_z_scores.txt', 'w')
    df_out.write('\t'.join([
        'Time', 'less_mbd', 'greater_mpd', 'delta_mpd', 'less_mbd_025',
        'less_mbd_975', 'greater_mpd_025', 'greater_mpd_975', 'delta_mpd_025',
        'delta_mpd_975'
    ]) + '\n')
    for time_point in time_points_set:
        # very few mutations after generation 50000
        if time_point > 50000:
            continue
        print("Time point " + str(time_point))
        t_i_df = df_nonmut.iloc[time_points_positions[time_point]]
        t_i_np = t_i_df.values
        # remove rows (populations) with all zeros at this timepoint
        t_i_np_zeros = np.where(~t_i_np.any(axis=1))[0]
        n_zeros_t_i_np = len(t_i_np_zeros)
        if n_zeros_t_i_np > 0:
            t_i_np = np.delete(t_i_np, t_i_np_zeros, axis=0)

        # mutations gained between this timepoint and the final one.
        # NOTE(review): if rows were deleted above, t_final_np and t_i_np
        # no longer align row-for-row — confirm that case cannot occur.
        t_i_to_final_np = t_final_np - t_i_np
        # remove rows with all zeros
        t_i_to_final_np_zeros = np.where(~t_i_to_final_np.any(axis=1))[0]
        n_zeros_t_i_to_final_np = len(t_i_to_final_np_zeros)
        if n_zeros_t_i_to_final_np > 0:
            t_i_to_final_np = np.delete(t_i_to_final_np,
                                        t_i_to_final_np_zeros,
                                        axis=0)

        # stack both partitions so a single PCA covers them
        t_concat = np.concatenate((t_i_np, t_i_to_final_np), axis=0)
        t_norm = cd.likelihood_matrix_array(
            t_concat, gene_names, 'Good_et_al').get_likelihood_matrix()
        t_norm_rel = t_norm / t_norm.sum(axis=1, keepdims=True)
        t_norm_rel -= np.mean(t_norm_rel, axis=0)
        pca = PCA()
        t_norm_rel_pca = pca.fit_transform(t_norm_rel)
        # NOTE(review): this slice takes the LAST k columns of sklearn's
        # PCA output, which orders components by decreasing explained
        # variance — i.e. the lowest-variance axes. Confirm [:k] (the top
        # components) wasn't intended.
        t_norm_rel_pca_k5 = t_norm_rel_pca[:, -1 - k:-1]
        # account for rows with zero mutations; the literal 5 presumably
        # equals the number of nonmutator populations — TODO confirm
        dist_t_less = pt.get_mean_pairwise_euc_distance(
            t_norm_rel_pca_k5[:5 - n_zeros_t_i_np, :], k=k)
        dist_t_greater = pt.get_mean_pairwise_euc_distance(
            t_norm_rel_pca_k5[5 - n_zeros_t_i_to_final_np:, :], k=k)
        dist_t_change = dist_t_greater - dist_t_less
        #F_t = pt.get_F_2(t_norm_rel_pca_k5, 5-n_zeros_t_i_np, 5-n_zeros_t_i_to_final_np)[0]
        dist_t_less_list = []
        dist_t_greater_list = []
        dist_t_change_list = []
        #F_t_list = []
        for i in range(iter):
            if i % 1000 == 0:
                print("Iteration " + str(i))
            t_i_np_rndm = pt.get_random_matrix(t_i_np)
            t_i_to_final_np_rndm = pt.get_random_matrix(t_i_to_final_np)
            t_rndm_concat = np.concatenate((t_i_np_rndm, t_i_to_final_np_rndm),
                                           axis=0)
            t_rndm_norm = cd.likelihood_matrix_array(
                t_rndm_concat, gene_names,
                'Good_et_al').get_likelihood_matrix()
            t_rndm_norm_rel = t_rndm_norm / t_rndm_norm.sum(axis=1,
                                                            keepdims=True)
            t_rndm_norm_rel -= np.mean(t_rndm_norm_rel, axis=0)
            t_rndm_norm_rel_pca = pca.fit_transform(t_rndm_norm_rel)
            # same axis slice as the observed data above
            t_rndm_norm_rel_pca_k5 = t_rndm_norm_rel_pca[:, -1 - k:-1]
            dist_t_less_rndm = pt.get_mean_pairwise_euc_distance(
                t_rndm_norm_rel_pca_k5[:5 - n_zeros_t_i_np, :], k=k)
            dist_t_greater_rndm = pt.get_mean_pairwise_euc_distance(
                t_rndm_norm_rel_pca_k5[5 - n_zeros_t_i_to_final_np:, :], k=k)
            dist_t_change_list.append(dist_t_greater_rndm - dist_t_less_rndm)
            dist_t_less_list.append(dist_t_less_rndm)
            dist_t_greater_list.append(dist_t_greater_rndm)
            #F_t_list.append(pt.get_F_2(t_rndm_norm_rel_pca, 5-n_zeros_t_i_np, 5-n_zeros_t_i_to_final_np)[0])

        # sort each null distribution to read off empirical percentiles
        dist_t_change_list.sort()
        dist_t_greater_list.sort()
        dist_t_less_list.sort()
        #F_t_list.sort()
        # get 95% CIs
        dist_t_change_025 = dist_t_change_list[int(iter * 0.025)]
        dist_t_change_975 = dist_t_change_list[int(iter * 0.975)]
        dist_t_greater_025 = dist_t_greater_list[int(iter * 0.025)]
        dist_t_greater_975 = dist_t_greater_list[int(iter * 0.975)]
        dist_t_less_025 = dist_t_less_list[int(iter * 0.025)]
        dist_t_less_975 = dist_t_less_list[int(iter * 0.975)]
        #F_t_025 = F_t_list[int(iter*0.025)]
        #F_t_975 = F_t_list[int(iter*0.975)]
        df_out.write('\t'.join([str(time_point), str(dist_t_less), str(dist_t_greater), \
                                str(dist_t_change), str(dist_t_less_025), str(dist_t_less_975), \
                                str(dist_t_greater_025), str(dist_t_greater_975), \
                                str(dist_t_change_025), str(dist_t_change_975)]) + '\n')

    df_out.close()
Пример #12
0
# Permutation test for the leading-eigenvalue statistic on the Tenaillon
# data. NOTE(review): this fragment assumes df_np, gene_names and iter are
# defined earlier in the enclosing script — it is not self-contained.
n_rows = list(range(df_np.shape[0]))

df_np_delta = cd.likelihood_matrix_array(df_np, gene_names, "Tenaillon_et_al").get_likelihood_matrix()

# per-population relative contributions, mean-centered before PCA
X = df_np_delta / df_np_delta.sum(axis=1)[:, None]
X = X - np.mean(X, axis=0)
# cov = np.cov(X.T)
# ev, eig = np.linalg.eig(cov)
pca = PCA()
pca_fit = pca.fit_transform(X)
# L = pt.get_L_stat(max(ev), N, cov.shape[0])
# observed eigenvalue test statistic (last component dropped)
eig = pt.get_x_stat(pca.explained_variance_[:-1], n_features=X.shape[1])

# null distribution of the statistic from permuted count matrices
eig_null = []
for j in range(iter):
    df_np_j = pt.get_random_matrix(df_np)
    np.seterr(divide="ignore")
    df_np_j_delta = cd.likelihood_matrix_array(df_np_j, gene_names, "Tenaillon_et_al").get_likelihood_matrix()
    X_j = df_np_j_delta / df_np_j_delta.sum(axis=1)[:, None]
    X_j -= np.mean(X_j, axis=0)
    pca_j = PCA()
    pca_X_j = pca_j.fit_transform(X_j)
    eig_null.append(pt.get_x_stat(pca_j.explained_variance_[:-1], n_features=X.shape[1]))


eig_null = np.asarray(eig_null)

# upper-tail p-value: fraction of null statistics exceeding the observed
P_eig = len(eig_null[eig_null > eig]) / len(eig_null)


# NOTE(review): opened with mode "r" although the variable name suggests
# an output file — confirm whether this should be "w".
eig_power = open(pt.get_path() + "/data/Tenaillon_et_al/power_sample_size_l_stat.txt", "r")
Пример #13
0
def run_ba_ntwk_cluster_sims(iter1=1000, iter2=1000, cov=0.2):
    """Power simulation for detecting covariance on clustered BA networks.

    For each rewiring probability ``p``, simulates ``iter1`` datasets of
    ``n_pops`` populations x ``n_genes`` Poisson-gamma counts whose genes
    covary according to a Barabasi-Albert covariance matrix with clustering
    controlled by ``p``. Five test statistics (Eig, MCD k=1/k=3, MPD k=1/k=3)
    are each compared against ``iter2`` permutation nulls, and per-statistic
    power, bootstrap CIs, and standardized effect sizes (z-scores) are
    written to ``cov_ba_ntwrk_cluster_methods.txt``.

    Parameters
    ----------
    iter1 : int
        Number of simulated datasets per ``p`` (outer loop).
    iter2 : int
        Number of permutations per dataset (inner, null loop).
    cov : float
        Pairwise covariance assigned to connected gene pairs.
    """
    df_out = open(mydir + '/data/simulations/cov_ba_ntwrk_cluster_methods.txt', 'w')
    df_out.write('\t'.join(['Prob', 'CC_mean', 'CC_025', 'CC_975', 'Method', 'Power', 'Power_025', 'Power_975', 'Z_mean', 'Z_025', 'Z_975']) + '\n')

    n_pops = 100
    n_genes = 50
    #covs = [0.05, 0.1, 0.15, 0.2]
    ps = [0, 0.2, 0.4, 0.6, 0.8, 1]
    for p in ps:
        eig_p_list = []
        mcd_k1_p_list = []
        mcd_k3_p_list = []
        mpd_k1_p_list = []
        mpd_k3_p_list = []

        eig_z_list = []
        mcd_k1_z_list = []
        mcd_k3_z_list = []
        mpd_k1_z_list = []
        mpd_k3_z_list = []

        cc_list = []
        for i in range(iter1):
            if i % 100 == 0:
                # BUG FIX: was print(ps, i) -- printed the whole probability
                # list instead of the current p (cf. the cov-sims sibling,
                # which prints the current cov).
                print(p, i)
            lambda_genes = np.random.gamma(shape=3, scale=1, size=n_genes)
            # C is the gene covariance matrix; cc is the realized clustering
            # coefficient of the underlying network.
            C, cc = pt.get_ba_cov_matrix(n_genes, cov=cov, p=p)
            test_cov = np.stack([pt.get_count_pop(lambda_genes, cov=C) for x in range(n_pops)], axis=0)
            # Relative abundances, column-centered, then PCA.
            X = test_cov / test_cov.sum(axis=1)[:, None]
            X -= np.mean(X, axis=0)
            pca = PCA()
            pca_fit = pca.fit_transform(X)
            mpd_k1 = pt.get_mean_pairwise_euc_distance(pca_fit, k=1)
            mpd_k3 = pt.get_mean_pairwise_euc_distance(pca_fit, k=3)

            eig = pt.get_x_stat(pca.explained_variance_[:-1], n_features=n_genes)
            mcd_k1 = pt.get_mean_centroid_distance(pca_fit, k=1)
            mcd_k3 = pt.get_mean_centroid_distance(pca_fit, k=3)

            # Permutation null distributions for all five statistics.
            eig_null_list = []
            mcd_k1_null_list = []
            mcd_k3_null_list = []
            mpd_k1_null_list = []
            mpd_k3_null_list = []
            for j in range(iter2):
                test_cov_rndm = pt.get_random_matrix(test_cov)
                X_j = test_cov_rndm / test_cov_rndm.sum(axis=1)[:, None]
                X_j -= np.mean(X_j, axis=0)
                pca_j = PCA()
                pca_fit_j = pca_j.fit_transform(X_j)
                mpd_k1_null_list.append(pt.get_mean_pairwise_euc_distance(pca_fit_j, k=1))
                mpd_k3_null_list.append(pt.get_mean_pairwise_euc_distance(pca_fit_j, k=3))
                mcd_k1_null_list.append(pt.get_mean_centroid_distance(pca_fit_j, k=1))
                mcd_k3_null_list.append(pt.get_mean_centroid_distance(pca_fit_j, k=3))
                eig_null_list.append(pt.get_x_stat(pca_j.explained_variance_[:-1], n_features=n_genes))

            # One-sided permutation p-values.
            # BUG FIX: the null lists contain iter2 samples, so the exceedance
            # count must be divided by iter2 (the original divided by iter1,
            # giving wrong p-values whenever iter1 != iter2).
            eig_p_list.append(len([k for k in eig_null_list if k > eig]) / iter2)
            mcd_k1_p_list.append(len([k for k in mcd_k1_null_list if k > mcd_k1]) / iter2)
            mcd_k3_p_list.append(len([k for k in mcd_k3_null_list if k > mcd_k3]) / iter2)

            mpd_k1_p_list.append(len([k for k in mpd_k1_null_list if k > mpd_k1]) / iter2)
            mpd_k3_p_list.append(len([k for k in mpd_k3_null_list if k > mpd_k3]) / iter2)

            cc_list.append(cc)

            # Standardized effect size of the observed statistic vs. its null.
            eig_z_list.append((eig - np.mean(eig_null_list)) / np.std(eig_null_list))
            mcd_k1_z_list.append((mcd_k1 - np.mean(mcd_k1_null_list)) / np.std(mcd_k1_null_list))
            mcd_k3_z_list.append((mcd_k3 - np.mean(mcd_k3_null_list)) / np.std(mcd_k3_null_list))
            mpd_k1_z_list.append((mpd_k1 - np.mean(mpd_k1_null_list)) / np.std(mpd_k1_null_list))
            mpd_k3_z_list.append((mpd_k3 - np.mean(mpd_k3_null_list)) / np.std(mpd_k3_null_list))

        # Bootstrap 95% CI for the mean clustering coefficient.
        # NOTE(review): resamples only 50 of the iter1 values per replicate --
        # presumably intentional to match a fixed bootstrap size; confirm.
        cc_mean = np.mean(cc_list)
        cc_bs_mean_list = []
        for iter_i in range(10000):
            cc_bs_mean_list.append(np.mean(np.random.choice(cc_list, size=50, replace=True)))
        cc_bs_mean_list.sort()
        cc_975 = cc_bs_mean_list[int(0.975 * 10000)]
        cc_025 = cc_bs_mean_list[int(0.025 * 10000)]

        # Power = fraction of simulated datasets with p < 0.05 (alpha level);
        # p-lists have iter1 entries, so iter1 is the correct denominator here.
        eig_power = len([n for n in eig_p_list if n < 0.05]) / iter1
        eig_power_025, eig_power_975 = get_bootstrap_power_ci(eig_p_list)

        mcd_k1_power = len([n for n in mcd_k1_p_list if n < 0.05]) / iter1
        mcd_k1_power_025, mcd_k1_power_975 = get_bootstrap_power_ci(mcd_k1_p_list)

        mcd_k3_power = len([n for n in mcd_k3_p_list if n < 0.05]) / iter1
        mcd_k3_power_025, mcd_k3_power_975 = get_bootstrap_power_ci(mcd_k3_p_list)

        mpd_k1_power = len([n for n in mpd_k1_p_list if n < 0.05]) / iter1
        mpd_k1_power_025, mpd_k1_power_975 = get_bootstrap_power_ci(mpd_k1_p_list)

        mpd_k3_power = len([n for n in mpd_k3_p_list if n < 0.05]) / iter1
        mpd_k3_power_025, mpd_k3_power_975 = get_bootstrap_power_ci(mpd_k3_p_list)

        eig_z_025, eig_z_975 = get_bootstrap_ci(eig_z_list)
        mcd_k1_z_025, mcd_k1_z_975 = get_bootstrap_ci(mcd_k1_z_list)
        mcd_k3_z_025, mcd_k3_z_975 = get_bootstrap_ci(mcd_k3_z_list)
        mpd_k1_z_025, mpd_k1_z_975 = get_bootstrap_ci(mpd_k1_z_list)
        mpd_k3_z_025, mpd_k3_z_975 = get_bootstrap_ci(mpd_k3_z_list)

        # One output row per statistic for this p.
        df_out.write('\t'.join([str(p), str(cc_mean), str(cc_025), str(cc_975), 'Eig', str(eig_power), str(eig_power_025), str(eig_power_975), str(np.mean(eig_z_list)), str(eig_z_025), str(eig_z_975)]) + '\n')
        df_out.write('\t'.join([str(p), str(cc_mean), str(cc_025), str(cc_975), 'MCD_k1', str(mcd_k1_power), str(mcd_k1_power_025), str(mcd_k1_power_975), str(np.mean(mcd_k1_z_list)), str(mcd_k1_z_025), str(mcd_k1_z_975)]) + '\n')
        df_out.write('\t'.join([str(p), str(cc_mean), str(cc_025), str(cc_975), 'MCD_k3', str(mcd_k3_power), str(mcd_k3_power_025), str(mcd_k3_power_975), str(np.mean(mcd_k3_z_list)), str(mcd_k3_z_025), str(mcd_k3_z_975)]) + '\n')
        df_out.write('\t'.join([str(p), str(cc_mean), str(cc_025), str(cc_975), 'MPD_k1', str(mpd_k1_power), str(mpd_k1_power_025), str(mpd_k1_power_975), str(np.mean(mpd_k1_z_list)), str(mpd_k1_z_025), str(mpd_k1_z_975)]) + '\n')
        df_out.write('\t'.join([str(p), str(cc_mean), str(cc_025), str(cc_975), 'MPD_k3', str(mpd_k3_power), str(mpd_k3_power_025), str(mpd_k3_power_975), str(np.mean(mpd_k3_z_list)), str(mpd_k3_z_025), str(mpd_k3_z_975)]) + '\n')

    df_out.close()
Пример #14
0
def run_ba_ntwk_cov_sims(iter1=1000, iter2=1000, n_pops=100, n_genes=50):
    """Power simulation for detecting covariance on BA gene networks.

    For each covariance strength in ``covs``, simulates ``iter1`` datasets of
    ``n_pops`` populations x ``n_genes`` Poisson-gamma counts whose genes
    covary according to a Barabasi-Albert covariance matrix. Five statistics
    (Eig, MCD k=1/k=3, MPD k=1/k=3) are each compared against ``iter2``
    permutation nulls; per-statistic power, bootstrap CIs, and standardized
    effect sizes (z-scores) are written to ``cov_ba_ntwrk_methods.txt``.

    Parameters
    ----------
    iter1 : int
        Number of simulated datasets per covariance value (outer loop).
    iter2 : int
        Number of permutations per dataset (inner, null loop).
    n_pops : int
        Number of simulated populations (rows) per dataset.
    n_genes : int
        Number of genes (columns) per dataset.
    """
    df_out = open(mydir + '/data/simulations/cov_ba_ntwrk_methods.txt', 'w')
    df_out.write('\t'.join(['Cov', 'Method', 'Power', 'Power_025', 'Power_975', 'Z_mean', 'Z_025', 'Z_975']) + '\n')

    covs = [0.05, 0.1, 0.15, 0.2]
    #covs = [0.2]
    for cov in covs:
        eig_p_list = []
        mcd_k1_p_list = []
        mcd_k3_p_list = []
        mpd_k1_p_list = []
        mpd_k3_p_list = []

        eig_z_list = []
        mcd_k1_z_list = []
        mcd_k3_z_list = []
        mpd_k1_z_list = []
        mpd_k3_z_list = []
        for i in range(iter1):
            if i % 100 == 0:
                print(cov, i)
            lambda_genes = np.random.gamma(shape=3, scale=1, size=n_genes)
            C = pt.get_ba_cov_matrix(n_genes, cov=cov)
            test_cov = np.stack([pt.get_count_pop(lambda_genes, cov=C) for x in range(n_pops)], axis=0)
            # Relative abundances, column-centered, then PCA.
            X = test_cov / test_cov.sum(axis=1)[:, None]
            X -= np.mean(X, axis=0)
            pca = PCA()
            pca_fit = pca.fit_transform(X)
            mpd_k1 = pt.get_mean_pairwise_euc_distance(pca_fit, k=1)
            mpd_k3 = pt.get_mean_pairwise_euc_distance(pca_fit, k=3)

            eig = pt.get_x_stat(pca.explained_variance_[:-1], n_features=n_genes)
            mcd_k1 = pt.get_mean_centroid_distance(pca_fit, k=1)
            mcd_k3 = pt.get_mean_centroid_distance(pca_fit, k=3)

            # Permutation null distributions for all five statistics.
            eig_null_list = []
            mcd_k1_null_list = []
            mcd_k3_null_list = []
            mpd_k1_null_list = []
            mpd_k3_null_list = []
            for j in range(iter2):
                test_cov_rndm = pt.get_random_matrix(test_cov)
                X_j = test_cov_rndm / test_cov_rndm.sum(axis=1)[:, None]
                X_j -= np.mean(X_j, axis=0)
                pca_j = PCA()
                pca_fit_j = pca_j.fit_transform(X_j)
                mpd_k1_null_list.append(pt.get_mean_pairwise_euc_distance(pca_fit_j, k=1))
                mpd_k3_null_list.append(pt.get_mean_pairwise_euc_distance(pca_fit_j, k=3))
                mcd_k1_null_list.append(pt.get_mean_centroid_distance(pca_fit_j, k=1))
                mcd_k3_null_list.append(pt.get_mean_centroid_distance(pca_fit_j, k=3))
                eig_null_list.append(pt.get_x_stat(pca_j.explained_variance_[:-1], n_features=n_genes))

            # One-sided permutation p-values.
            # BUG FIX: the null lists contain iter2 samples, so the exceedance
            # count must be divided by iter2 (the original divided by iter1,
            # giving wrong p-values whenever iter1 != iter2).
            eig_p_list.append(len([k for k in eig_null_list if k > eig]) / iter2)
            mcd_k1_p_list.append(len([k for k in mcd_k1_null_list if k > mcd_k1]) / iter2)
            mcd_k3_p_list.append(len([k for k in mcd_k3_null_list if k > mcd_k3]) / iter2)

            mpd_k1_p_list.append(len([k for k in mpd_k1_null_list if k > mpd_k1]) / iter2)
            mpd_k3_p_list.append(len([k for k in mpd_k3_null_list if k > mpd_k3]) / iter2)

            # Standardized effect size of the observed statistic vs. its null.
            eig_z_list.append((eig - np.mean(eig_null_list)) / np.std(eig_null_list))
            mcd_k1_z_list.append((mcd_k1 - np.mean(mcd_k1_null_list)) / np.std(mcd_k1_null_list))
            mcd_k3_z_list.append((mcd_k3 - np.mean(mcd_k3_null_list)) / np.std(mcd_k3_null_list))
            mpd_k1_z_list.append((mpd_k1 - np.mean(mpd_k1_null_list)) / np.std(mpd_k1_null_list))
            mpd_k3_z_list.append((mpd_k3 - np.mean(mpd_k3_null_list)) / np.std(mpd_k3_null_list))

        # Power = fraction of simulated datasets with p < 0.05 (alpha level);
        # p-lists have iter1 entries, so iter1 is the correct denominator here.
        eig_power = len([n for n in eig_p_list if n < 0.05]) / iter1
        eig_power_025, eig_power_975 = get_bootstrap_power_ci(eig_p_list)

        mcd_k1_power = len([n for n in mcd_k1_p_list if n < 0.05]) / iter1
        mcd_k1_power_025, mcd_k1_power_975 = get_bootstrap_power_ci(mcd_k1_p_list)

        mcd_k3_power = len([n for n in mcd_k3_p_list if n < 0.05]) / iter1
        mcd_k3_power_025, mcd_k3_power_975 = get_bootstrap_power_ci(mcd_k3_p_list)

        mpd_k1_power = len([n for n in mpd_k1_p_list if n < 0.05]) / iter1
        mpd_k1_power_025, mpd_k1_power_975 = get_bootstrap_power_ci(mpd_k1_p_list)

        mpd_k3_power = len([n for n in mpd_k3_p_list if n < 0.05]) / iter1
        mpd_k3_power_025, mpd_k3_power_975 = get_bootstrap_power_ci(mpd_k3_p_list)

        eig_z_025, eig_z_975 = get_bootstrap_ci(eig_z_list)
        mcd_k1_z_025, mcd_k1_z_975 = get_bootstrap_ci(mcd_k1_z_list)
        mcd_k3_z_025, mcd_k3_z_975 = get_bootstrap_ci(mcd_k3_z_list)
        mpd_k1_z_025, mpd_k1_z_975 = get_bootstrap_ci(mpd_k1_z_list)
        mpd_k3_z_025, mpd_k3_z_975 = get_bootstrap_ci(mpd_k3_z_list)

        # One output row per statistic for this covariance value.
        df_out.write('\t'.join([str(cov), 'Eig', str(eig_power), str(eig_power_025), str(eig_power_975), str(np.mean(eig_z_list)), str(eig_z_025), str(eig_z_975)]) + '\n')
        df_out.write('\t'.join([str(cov), 'MCD_k1', str(mcd_k1_power), str(mcd_k1_power_025), str(mcd_k1_power_975), str(np.mean(mcd_k1_z_list)), str(mcd_k1_z_025), str(mcd_k1_z_975)]) + '\n')
        df_out.write('\t'.join([str(cov), 'MCD_k3', str(mcd_k3_power), str(mcd_k3_power_025), str(mcd_k3_power_975), str(np.mean(mcd_k3_z_list)), str(mcd_k3_z_025), str(mcd_k3_z_975)]) + '\n')
        df_out.write('\t'.join([str(cov), 'MPD_k1', str(mpd_k1_power), str(mpd_k1_power_025), str(mpd_k1_power_975), str(np.mean(mpd_k1_z_list)), str(mpd_k1_z_025), str(mpd_k1_z_975)]) + '\n')
        df_out.write('\t'.join([str(cov), 'MPD_k3', str(mpd_k3_power), str(mpd_k3_power_025), str(mpd_k3_power_975), str(np.mean(mpd_k3_z_list)), str(mpd_k3_z_025), str(mpd_k3_z_975)]) + '\n')

    df_out.close()