Пример #1
0
def get_F_stat_mcd(pca_array, groups, k=3):
    X = pca_array[:, 0:k]
    mcd_overall = pt.get_mean_centroid_distance(X)
    centroid_overall = np.mean(X, axis=0)
    between_var = 0
    within_var = 0
    K = len(groups)
    N = np.shape(X)[0]

    for group in groups:
        #euc_dists_group = euc_dists[group[:, None], group]
        X_group = X[group, :]
        mcd_group = pt.get_mean_centroid_distance(X_group)
        groups_values = []
        centroid_group = np.mean(X_group, axis=0)
        #between_var += ((mcd_group - mcd_group) ** 2) * len(groups)
        between_var += ((centroid_group - centroid_overall)**2) * len(groups)

        #centroid_distances = np.sqrt(np.sum(np.square(X_group - np.mean(X_group, axis = 0)), axis=1))
        #within_var += sum( (centroid_distances - mcd_group) ** 2 )

        #within_var += sum(np.sqrt(np.sum(np.square(X_group - np.mean(X_group, axis = 0)), axis=1)))
        within_var += sum(
            np.sqrt(np.sum(np.square(X_group - centroid_group), axis=1)))

    return (between_var / (K - 1)) / (within_var / (N - K))
Пример #2
0
def run_pca_sample_size_permutation(iter=10000, analysis='PCA', k=3):
    df_path = pt.get_path() + '/data/Tenaillon_et_al/gene_by_pop.txt'
    df = pd.read_csv(df_path, sep='\t', header='infer', index_col=0)
    df_array = df.as_matrix()
    sample_sizes = np.linspace(2, df.shape[0], num=20, dtype=int)
    df_out = open(
        pt.get_path() + '/data/Tenaillon_et_al/sample_size_permute_' +
        analysis + '.txt', 'w')
    column_headers = [
        'Sample_size', 'Iteration', 'MCD', 'mean_angle', 'delta_L'
    ]
    df_out.write('\t'.join(column_headers) + '\n')
    for sample_size in sample_sizes:
        print("Sample size = " + str(sample_size))
        for i in range(iter):
            print("Sample size = " + str(sample_size) + ' Iteration = ' +
                  str(i))
            df_sample = df.sample(n=sample_size)
            #df_sample = df_sample.loc[:, (df_sample != 0).any(axis=0)]
            df_sample_delta = pt.likelihood_matrix(
                df_sample, 'Tenaillon_et_al').get_likelihood_matrix()
            df_sample_delta = df_sample_delta.loc[:,
                                                  (df_sample_delta != 0).any(
                                                      axis=0)]
            X = pt.hellinger_transform(df_sample_delta)
            pca = PCA()
            df_sample_delta_out = pca.fit_transform(X)
            mcd = pt.get_mean_centroid_distance(df_sample_delta_out, k=k)
            mean_angle = pt.get_mean_angle(df_sample_delta_out, k=k)
            mean_length = pt.get_euclidean_distance(df_sample_delta_out, k=k)

            df_out.write('\t'.join([
                str(sample_size),
                str(i),
                str(mcd),
                str(mean_angle),
                str(mean_length)
            ]) + '\n')

    df_out.close()
Пример #3
0
def run_pca_permutation(iter=10000, analysis='PCA', dataset='tenaillon'):
    if dataset == 'tenaillon':
        k = 3
        df_path = pt.get_path() + '/data/Tenaillon_et_al/gene_by_pop.txt'
        df = pd.read_csv(df_path, sep='\t', header='infer', index_col=0)
        df_array = df.as_matrix()
        df_out = open(
            pt.get_path() + '/data/Tenaillon_et_al/permute_' + analysis +
            '.txt', 'w')
        column_headers = [
            'Iteration', 'MCD', 'mean_angle', 'mean_dist', 'delta_L', 'x_stat'
        ]
        df_out.write('\t'.join(column_headers) + '\n')
        for i in range(iter):
            print(i)
            df_rndm = pd.DataFrame(data=pt.random_matrix(df_array),
                                   index=df.index,
                                   columns=df.columns)
            df_rndm_delta = pt.likelihood_matrix(
                df_rndm, 'Tenaillon_et_al').get_likelihood_matrix()
            if analysis == 'PCA':
                X = pt.hellinger_transform(df_rndm_delta)
                pca = PCA()
                df_rndm_delta_out = pca.fit_transform(X)
                #df_pca = pd.DataFrame(data=X_pca, index=df.index)
            mean_angle = pt.get_mean_angle(df_rndm_delta_out, k=k)
            mcd = pt.get_mean_centroid_distance(df_rndm_delta_out, k=k)
            mean_length = pt.get_euc_magnitude_diff(df_rndm_delta_out, k=k)
            mean_dist = pt.get_mean_pairwise_euc_distance(df_rndm_delta_out,
                                                          k=k)
            x_stat = pt.get_x_stat(pca.explained_variance_[:-1])
            df_out.write('\t'.join([
                str(i),
                str(mcd),
                str(mean_angle),
                str(mean_dist),
                str(mean_length),
                str(x_stat)
            ]) + '\n')
        df_out.close()

    elif dataset == 'good':
        k = 5
        df_path = pt.get_path() + '/data/Good_et_al/gene_by_pop.txt'
        df = pd.read_csv(df_path, sep='\t', header='infer', index_col=0)
        to_exclude = pt.complete_nonmutator_lines()
        to_exclude.append('p5')
        df_nonmut = df[df.index.str.contains('|'.join(to_exclude))]
        # remove columns with all zeros
        df_nonmut = df_nonmut.loc[:, (df_nonmut != 0).any(axis=0)]
        time_points = [int(x.split('_')[1]) for x in df_nonmut.index.values]
        time_points_set = sorted(
            list(set([int(x.split('_')[1]) for x in df_nonmut.index.values])))
        df_nonmut_array = df_nonmut.as_matrix()
        time_points_positions = {}
        for x in time_points_set:
            time_points_positions[x] = [
                i for i, j in enumerate(time_points) if j == x
            ]
        df_final = df_nonmut.iloc[time_points_positions[time_points_set[-1]]]

        df_out = open(
            pt.get_path() + '/data/Good_et_al/permute_' + analysis + '.txt',
            'w')
        #column_headers = ['Iteration', 'Generation', 'MCD']
        column_headers = [
            'Iteration', 'Generation', 'MCD', 'mean_angle', 'delta_L',
            'mean_dist'
        ]
        df_out.write('\t'.join(column_headers) + '\n')
        for i in range(iter):
            print("Iteration " + str(i))
            matrix_0 = df_nonmut.iloc[time_points_positions[
                time_points_set[0]]]
            matrix_0_rndm = pt.random_matrix(matrix_0.as_matrix())
            df_rndm_list = [
                pd.DataFrame(data=matrix_0_rndm,
                             index=matrix_0.index,
                             columns=matrix_0.columns)
            ]
            # skip first time step
            for j, tp in enumerate(time_points_set[0:]):
                if j == 0:
                    continue
                df_tp_minus1 = df_nonmut[df_nonmut.index.str.contains(
                    '_' + str(time_points_set[j - 1]))]
                df_tp = df_nonmut[df_nonmut.index.str.contains('_' + str(tp))]
                matrix_diff = df_tp.as_matrix() - df_tp_minus1.as_matrix()
                matrix_0_rndm = matrix_0_rndm + pt.random_matrix(matrix_diff)
                df_0_rndm = pd.DataFrame(data=matrix_0_rndm,
                                         index=df_tp.index,
                                         columns=df_tp.columns)
                df_rndm_list.append(df_0_rndm)

            df_rndm = pd.concat(df_rndm_list)
            df_rndm_delta = pt.likelihood_matrix(
                df_rndm, 'Good_et_al').get_likelihood_matrix()
            if analysis == 'PCA':
                X = pt.hellinger_transform(df_rndm_delta)
                pca = PCA()
                matrix_rndm_delta_out = pca.fit_transform(X)
            elif analysis == 'cMDS':
                matrix_rndm_delta_bc = np.sqrt(
                    pt.get_bray_curtis(df_rndm_delta.as_matrix()))
                matrix_rndm_delta_out = pt.cmdscale(matrix_rndm_delta_bc)[0]
            else:
                print("Analysis argument not accepted")
                continue

            df_rndm_delta_out = pd.DataFrame(data=matrix_rndm_delta_out,
                                             index=df_rndm_delta.index)
            for tp in time_points_set:
                df_rndm_delta_out_tp = df_rndm_delta_out[
                    df_rndm_delta_out.index.str.contains('_' + str(tp))]
                df_rndm_delta_out_tp_matrix = df_rndm_delta_out_tp.as_matrix()
                mean_angle = pt.get_mean_angle(df_rndm_delta_out_tp_matrix,
                                               k=k)
                mcd = pt.get_mean_centroid_distance(
                    df_rndm_delta_out_tp_matrix, k=k)
                mean_length = pt.get_euc_magnitude_diff(
                    df_rndm_delta_out_tp_matrix, k=k)
                mean_dist = pt.get_mean_pairwise_euc_distance(
                    df_rndm_delta_out_tp_matrix, k=k)
                df_out.write('\t'.join([
                    str(i),
                    str(tp),
                    str(mcd),
                    str(mean_angle),
                    str(mean_length),
                    str(mean_dist)
                ]) + '\n')

        df_out.close()
Пример #4
0
def run_ba_ntwk_cov_sims():
    df_out = open(pt.get_path() + '/data/simulations/cov_ba_ntwrk_ev.txt', 'w')
    n_pops = 100
    n_genes = 50
    ntwk = nx.barabasi_albert_graph(n_genes, 2)
    ntwk_np = nx.to_numpy_matrix(ntwk)
    lambda_genes = np.random.gamma(shape=3, scale=1, size=n_genes)
    df_out.write('\t'.join([
        'Cov', 'Iteration', 'euc_z_score', 'euc_percent', 'eig_percent',
        'mcd_percent_k1', 'mcd_percent_k3'
    ]) + '\n')
    covs = [0.05, 0.1, 0.15, 0.2]
    #covs = [0.2, 0.7]
    for cov in covs:
        C = ntwk_np * cov
        np.fill_diagonal(C, 1)
        #z_scores = []
        #eig_percents = []
        #euc_percents = []
        #centroid_percents_k1 = []
        #centroid_percents_k3 = []
        for i in range(1000):
            test_cov = np.stack(
                [get_count_pop(lambda_genes, cov=C) for x in range(n_pops)],
                axis=0)
            X = pt.hellinger_transform(test_cov)
            pca = PCA()
            pca_fit = pca.fit_transform(X)
            euc_dist = pt.get_mean_pairwise_euc_distance(pca_fit)
            euc_dists = []
            eig = pt.get_x_stat(pca.explained_variance_[:-1])
            mcd_k1 = pt.get_mean_centroid_distance(pca_fit, k=1)
            mcd_k3 = pt.get_mean_centroid_distance(pca_fit, k=3)
            eigs = []
            centroid_dists_k1 = []
            centroid_dists_k3 = []
            for j in range(1000):
                X_j = pt.hellinger_transform(pt.random_matrix(test_cov))
                #pca_j = PCA()
                #pca_fit_j = pca_j.fit_transform(X_j)
                pca_fit_j = pca.fit_transform(X_j)
                euc_dists.append(pt.get_mean_pairwise_euc_distance(pca_fit_j))
                centroid_dists_k1.append(
                    pt.get_mean_centroid_distance(pca_fit_j, k=1))
                centroid_dists_k3.append(
                    pt.get_mean_centroid_distance(pca_fit_j, k=3))
                eigs.append(pt.get_x_stat(pca.explained_variance_[:-1]))
                #eigs.append( pt.get_x_stat(pca_j.explained_variance_[:-1]) )
            z_score = (euc_dist - np.mean(euc_dists)) / np.std(euc_dists)
            euc_percent = len([k for k in euc_dists if k < euc_dist
                               ]) / len(euc_dists)
            eig_percent = len([k for k in eigs if k < eig]) / len(eigs)
            centroid_percent_k1 = len([
                k for k in centroid_dists_k1 if k < mcd_k1
            ]) / len(centroid_dists_k1)
            centroid_percent_k3 = len([
                k for k in centroid_dists_k3 if k < mcd_k3
            ]) / len(centroid_dists_k3)
            #eig_percents.append(eig_percent)
            #euc_percents.append(euc_percent)
            #z_scores.append(z_score)
            print(cov, i, z_score, euc_percent, eig_percent)
            df_out.write('\t'.join([
                str(cov),
                str(i),
                str(z_score),
                str(euc_percent),
                str(eig_percent),
                str(centroid_percent_k1),
                str(centroid_percent_k3)
            ]) + '\n')

        #print(cov, np.all(np.linalg.eigvals(C) > 0), np.mean(z_scores))

    df_out.close()
Пример #5
0
def hist_tenaillon_multi(k = 3):
    df_path = pt.get_path() + '/data/Tenaillon_et_al/gene_by_pop.txt'
    df = pd.read_csv(df_path, sep = '\t', header = 'infer', index_col = 0)
    df_delta = pt.likelihood_matrix(df, 'Tenaillon_et_al').get_likelihood_matrix()
    X = pt.hellinger_transform(df_delta)
    pca = PCA()
    df_out = pca.fit_transform(X)

    df_null_path = pt.get_path() + '/data/Tenaillon_et_al/permute_PCA.txt'
    df_null = pd.read_csv(df_null_path, sep = '\t', header = 'infer', index_col = 0)

    mean_angle = pt.get_mean_angle(df_out, k = k)
    mcd = pt.get_mean_centroid_distance(df_out, k=k)
    #mean_length = pt.get_euclidean_distance(df_out, k=k)
    mean_dist = pt.get_mean_pairwise_euc_distance(df_out, k=k)
    x_stat = pt.get_x_stat(pca.explained_variance_[:-1])

    fig = plt.figure()

    ax1 = plt.subplot2grid((2, 2), (0, 0), colspan=1)
    ax1.axhline(y=0, color='k', linestyle=':', alpha = 0.8, zorder=1)
    ax1.axvline(x=0, color='k', linestyle=':', alpha = 0.8, zorder=2)
    ax1.scatter(0, 0, marker = "o", edgecolors='none', c = 'darkgray', s = 120, zorder=3)
    ax1.scatter(df_out[:,0], df_out[:,1], marker = "o", edgecolors='#244162', c = '#175ac6', alpha = 0.4, s = 60, zorder=4)

    ax1.set_xlim([-0.75,0.75])
    ax1.set_ylim([-0.75,0.75])
    ax1.set_xlabel('PCA 1 (' + str(round(pca.explained_variance_ratio_[0],3)*100) + '%)' , fontsize = 14)
    ax1.set_ylabel('PCA 2 (' + str(round(pca.explained_variance_ratio_[1],3)*100) + '%)' , fontsize = 14)


    ax2 = plt.subplot2grid((2, 2), (0, 1), colspan=1)
    mcd_list = df_null.MCD.tolist()
    #ax2.hist(mcd_list, bins=30, histtype='stepfilled', normed=True, alpha=0.6, color='b')
    ax2.hist(mcd_list,bins=30, weights=np.zeros_like(mcd_list) + 1. / len(mcd_list), alpha=0.8, color = '#175ac6')
    ax2.axvline(mcd, color = 'red', lw = 3)
    ax2.set_xlabel("Mean centroid distance, " + r'$ \left \langle \delta_{c}  \right \rangle$', fontsize = 14)
    ax2.set_ylabel("Frequency", fontsize = 16)

    mcd_list.append(mcd)
    relative_position_mcd = sorted(mcd_list).index(mcd) / (len(mcd_list) -1)
    if relative_position_mcd > 0.5:
        p_score_mcd = 1 - relative_position_mcd
    else:
        p_score_mcd = relative_position_mcd
    print('mean centroid distance p-score = ' + str(round(p_score_mcd, 3)))
    ax2.text(0.366, 0.088, r'$p < 0.05$', fontsize = 10)

    ax3 = plt.subplot2grid((2, 2), (1, 0), colspan=1)
    delta_L_list = df_null.mean_dist.tolist()
    #ax3.hist(delta_L_list, bins=30, histtype='stepfilled', normed=True, alpha=0.6, color='b')
    ax3.hist(delta_L_list,bins=30, weights=np.zeros_like(delta_L_list) + 1. / len(delta_L_list), alpha=0.8, color = '#175ac6')
    ax3.axvline(mean_dist, color = 'red', lw = 3)
    ax3.set_xlabel("Mean pair-wise \n Euclidean distance, " + r'$   \left \langle   d \right  \rangle$', fontsize = 14)
    ax3.set_ylabel("Frequency", fontsize = 16)

    delta_L_list.append(mean_dist)
    relative_position_delta_L = sorted(delta_L_list).index(mean_dist) / (len(delta_L_list) -1)
    if relative_position_delta_L > 0.5:
        p_score_delta_L = 1 - relative_position_delta_L
    else:
        p_score_delta_L = relative_position_delta_L
    print('mean difference in distances p-score = ' + str(round(p_score_delta_L, 3)))
    ax3.text(0.50, 0.09, r'$p < 0.05$', fontsize = 10)



    ax4 = plt.subplot2grid((2, 2), (1, 1), colspan=1)
    ax4_values = df_null.x_stat.values
    ax4_values = ax4_values[np.logical_not(np.isnan(ax4_values))]
    #ax4.hist(ax4_values, bins=30, histtype='stepfilled', normed=True, alpha=0.6, color='b')
    ax4.hist(ax4_values, bins=30, weights=np.zeros_like(ax4_values) + 1. / len(ax4_values), alpha=0.8, color = '#175ac6')
    print(np.mean(ax4_values))
    print(stats.mode(ax4_values))

    ax4.axvline(x_stat, color = 'red', lw = 3)
    ax4.set_xlabel(r'$F_{1}$', fontsize = 14)
    ax4.set_ylabel("Frequency", fontsize = 16)

    mean_angle_list = ax4_values.tolist()
    mean_angle_list.append(mean_angle)
    relative_position_angle = sorted(mean_angle_list).index(mean_angle) / (len(mean_angle_list) -1)
    print(x_stat)
    print( len([x for x in mean_angle_list if x > x_stat])/  sum(mean_angle_list)  )
    if relative_position_angle > 0.5:
        p_score_angle = 1 - relative_position_angle
    else:
        p_score_angle = relative_position_angle
    print('F_{1} statistic p-score = ' + str(round(p_score_angle, 3)))
    ax4.text(19.1, 0.09, r'$p \nless  0.05$', fontsize = 10)

    plt.tight_layout()
    fig_name = pt.get_path() + '/figs/fig1.png'
    fig.savefig(fig_name, bbox_inches = "tight", pad_inches = 0.4, dpi = 600)
    plt.close()
Пример #6
0
def run_ba_ntwk_cluster_sims(iter1=1000, iter2=1000, cov=0.2):
    df_out = open(mydir + '/data/simulations/cov_ba_ntwrk_cluster_methods.txt', 'w')
    df_out.write('\t'.join(['Prob', 'CC_mean', 'CC_025', 'CC_975', 'Method', 'Power', 'Power_025', 'Power_975', 'Z_mean', 'Z_025', 'Z_975']) + '\n')

    n_pops=100
    n_genes=50
    #covs = [0.05, 0.1, 0.15, 0.2]
    ps = [0, 0.2, 0.4, 0.6, 0.8, 1]
    for p in ps:
        eig_p_list = []
        mcd_k1_p_list = []
        mcd_k3_p_list = []
        mpd_k1_p_list = []
        mpd_k3_p_list = []

        eig_z_list = []
        mcd_k1_z_list = []
        mcd_k3_z_list = []
        mpd_k1_z_list = []
        mpd_k3_z_list = []

        cc_list = []
        for i in range(iter1):
            if i %100 ==0:
                print(ps, i)
            lambda_genes = np.random.gamma(shape=3, scale=1, size=n_genes)
            C, cc = pt.get_ba_cov_matrix(n_genes, cov=cov,  p=p)
            test_cov = np.stack( [pt.get_count_pop(lambda_genes, cov= C) for x in range(n_pops)] , axis=0 )
            X = test_cov/test_cov.sum(axis=1)[:,None]
            X -= np.mean(X, axis = 0)
            pca = PCA()
            pca_fit = pca.fit_transform(X)
            mpd_k1 = pt.get_mean_pairwise_euc_distance(pca_fit,k=1)
            mpd_k3 = pt.get_mean_pairwise_euc_distance(pca_fit,k=3)

            eig = pt.get_x_stat(pca.explained_variance_[:-1], n_features=n_genes)
            mcd_k1 = pt.get_mean_centroid_distance(pca_fit, k = 1)
            mcd_k3 = pt.get_mean_centroid_distance(pca_fit, k = 3)

            eig_null_list = []
            mcd_k1_null_list = []
            mcd_k3_null_list = []
            mpd_k1_null_list = []
            mpd_k3_null_list = []
            for j in range(iter2):
                test_cov_rndm = pt.get_random_matrix(test_cov)
                X_j = test_cov_rndm/test_cov_rndm.sum(axis=1)[:,None]
                X_j -= np.mean(X_j, axis = 0)
                pca_j = PCA()
                pca_fit_j = pca_j.fit_transform(X_j)
                #pca_fit_j = pca.fit_transform(X_j)
                mpd_k1_null_list.append( pt.get_mean_pairwise_euc_distance(pca_fit_j, k = 1 ) )
                mpd_k3_null_list.append( pt.get_mean_pairwise_euc_distance(pca_fit_j, k = 3 ) )
                mcd_k1_null_list.append(pt.get_mean_centroid_distance(pca_fit_j, k = 1))
                mcd_k3_null_list.append(pt.get_mean_centroid_distance(pca_fit_j, k = 3))
                eig_null_list.append( pt.get_x_stat(pca_j.explained_variance_[:-1], n_features=n_genes) )

            #print(len( [k for k in eig_null_list if k > eig] ) / iter1)
            eig_p_list.append(len( [k for k in eig_null_list if k > eig] ) / iter1)
            mcd_k1_p_list.append( len( [k for k in mcd_k1_null_list if k > mcd_k1] ) / iter1 )
            mcd_k3_p_list.append( len( [k for k in mcd_k3_null_list if k > mcd_k3] ) / iter1 )

            mpd_k1_p_list.append( len( [k for k in mpd_k1_null_list if k > mpd_k1] ) / iter1 )
            mpd_k3_p_list.append( len( [k for k in mpd_k3_null_list if k > mpd_k3] ) / iter1 )

            cc_list.append(cc)

            eig_z_list.append( (eig - np.mean(eig_null_list)) / np.std(eig_null_list)  )
            mcd_k1_z_list.append( (mcd_k1 - np.mean(mcd_k1_null_list)) / np.std(mcd_k1_null_list)  )
            mcd_k3_z_list.append( (mcd_k3 - np.mean(mcd_k3_null_list)) / np.std(mcd_k3_null_list)  )
            mpd_k1_z_list.append( (mpd_k1 - np.mean(mpd_k1_null_list)) / np.std(mpd_k1_null_list)  )
            mpd_k3_z_list.append( (mpd_k3 - np.mean(mpd_k3_null_list)) / np.std(mpd_k3_null_list)  )


        # calculate
        cc_mean = np.mean(cc_list)
        cc_bs_mean_list = []
        for iter_i in range(10000):
            cc_bs_mean_list.append( np.mean( np.random.choice(cc_list, size=50, replace=True ) ))
        cc_bs_mean_list.sort()
        cc_975 = cc_bs_mean_list[ int(0.975 * 10000) ]
        cc_025 = cc_bs_mean_list[ int(0.025 * 10000) ]


        eig_power = len([n for n in eig_p_list if n < 0.05]) / iter1
        eig_power_025, eig_power_975 = get_bootstrap_power_ci(eig_p_list)

        mcd_k1_power = len([n for n in mcd_k1_p_list if n < 0.05]) / iter1
        mcd_k1_power_025, mcd_k1_power_975 = get_bootstrap_power_ci(mcd_k1_p_list)

        mcd_k3_power = len([n for n in mcd_k3_p_list if n < 0.05]) / iter1
        mcd_k3_power_025, mcd_k3_power_975 = get_bootstrap_power_ci(mcd_k3_p_list)

        mpd_k1_power = len([n for n in mpd_k1_p_list if n < 0.05]) / iter1
        mpd_k1_power_025, mpd_k1_power_975 = get_bootstrap_power_ci(mpd_k1_p_list)

        mpd_k3_power = len([n for n in mpd_k3_p_list if n < 0.05]) / iter1
        mpd_k3_power_025, mpd_k3_power_975 = get_bootstrap_power_ci(mpd_k3_p_list)


        eig_z_025, eig_z_975 = get_bootstrap_ci(eig_z_list)
        mcd_k1_z_025, mcd_k1_z_975 = get_bootstrap_ci(mcd_k1_z_list)
        mcd_k3_z_025, mcd_k3_z_975 = get_bootstrap_ci(mcd_k3_z_list)
        mpd_k1_z_025, mpd_k1_z_975 = get_bootstrap_ci(mpd_k1_z_list)
        mpd_k3_z_025, mpd_k3_z_975 = get_bootstrap_ci(mpd_k3_z_list)

        df_out.write('\t'.join([str(p), str(cc_mean), str(cc_025), str(cc_975), 'Eig', str(eig_power), str(eig_power_025), str(eig_power_975), str(np.mean(eig_z_list)), str(eig_z_025), str(eig_z_975)]) + '\n')
        df_out.write('\t'.join([str(p), str(cc_mean), str(cc_025), str(cc_975), 'MCD_k1', str(mcd_k1_power), str(mcd_k1_power_025), str(mcd_k1_power_975), str(np.mean(mcd_k1_z_list)), str(mcd_k1_z_025), str(mcd_k1_z_975)]) + '\n')
        df_out.write('\t'.join([str(p), str(cc_mean), str(cc_025), str(cc_975), 'MCD_k3', str(mcd_k3_power), str(mcd_k3_power_025), str(mcd_k3_power_975), str(np.mean(mcd_k3_z_list)), str(mcd_k3_z_025), str(mcd_k3_z_975)]) + '\n')
        df_out.write('\t'.join([str(p), str(cc_mean), str(cc_025), str(cc_975), 'MPD_k1', str(mpd_k1_power), str(mpd_k1_power_025), str(mpd_k1_power_975), str(np.mean(mpd_k1_z_list)), str(mpd_k1_z_025), str(mpd_k1_z_975)]) + '\n')
        df_out.write('\t'.join([str(p), str(cc_mean), str(cc_025), str(cc_975), 'MPD_k3', str(mpd_k3_power), str(mpd_k3_power_025), str(mpd_k3_power_975), str(np.mean(mpd_k3_z_list)), str(mpd_k3_z_025), str(mpd_k3_z_975)]) + '\n')

    df_out.close()
Пример #7
0
def run_ba_ntwk_cov_sims(iter1=1000, iter2=1000, n_pops=100, n_genes=50):
    df_out = open(mydir + '/data/simulations/cov_ba_ntwrk_methods.txt', 'w')
    df_out.write('\t'.join(['Cov', 'Method', 'Power', 'Power_025', 'Power_975', 'Z_mean', 'Z_025', 'Z_975']) + '\n')

    covs = [0.05, 0.1, 0.15, 0.2]
    #covs = [0.2]
    for cov in covs:
        eig_p_list = []
        mcd_k1_p_list = []
        mcd_k3_p_list = []
        mpd_k1_p_list = []
        mpd_k3_p_list = []

        eig_z_list = []
        mcd_k1_z_list = []
        mcd_k3_z_list = []
        mpd_k1_z_list = []
        mpd_k3_z_list = []
        for i in range(iter1):
            if i %100 ==0:
                print(cov, i)
            lambda_genes = np.random.gamma(shape=3, scale=1, size=n_genes)
            C = pt.get_ba_cov_matrix(n_genes, cov=cov)
            test_cov = np.stack( [pt.get_count_pop(lambda_genes, cov= C) for x in range(n_pops)] , axis=0 )
            X = test_cov/test_cov.sum(axis=1)[:,None]
            X -= np.mean(X, axis = 0)
            pca = PCA()
            pca_fit = pca.fit_transform(X)
            mpd_k1 = pt.get_mean_pairwise_euc_distance(pca_fit,k=1)
            mpd_k3 = pt.get_mean_pairwise_euc_distance(pca_fit,k=3)

            eig = pt.get_x_stat(pca.explained_variance_[:-1], n_features=n_genes)
            mcd_k1 = pt.get_mean_centroid_distance(pca_fit, k = 1)
            mcd_k3 = pt.get_mean_centroid_distance(pca_fit, k = 3)

            #print(pca.explained_variance_[:-1])
            #print(pt.get_x_stat(pca.explained_variance_[:-1]))
            eig_null_list = []
            mcd_k1_null_list = []
            mcd_k3_null_list = []
            mpd_k1_null_list = []
            mpd_k3_null_list = []
            for j in range(iter2):
                test_cov_rndm = pt.get_random_matrix(test_cov)
                X_j = test_cov_rndm/test_cov_rndm.sum(axis=1)[:,None]
                X_j -= np.mean(X_j, axis = 0)
                pca_j = PCA()
                pca_fit_j = pca_j.fit_transform(X_j)
                #pca_fit_j = pca.fit_transform(X_j)
                mpd_k1_null_list.append( pt.get_mean_pairwise_euc_distance(pca_fit_j, k = 1 ) )
                mpd_k3_null_list.append( pt.get_mean_pairwise_euc_distance(pca_fit_j, k = 3 ) )
                mcd_k1_null_list.append(pt.get_mean_centroid_distance(pca_fit_j, k = 1))
                mcd_k3_null_list.append(pt.get_mean_centroid_distance(pca_fit_j, k = 3))
                eig_null_list.append( pt.get_x_stat(pca_j.explained_variance_[:-1], n_features=n_genes) )

            eig_p_list.append(len( [k for k in eig_null_list if k > eig] ) / iter1)
            mcd_k1_p_list.append( len( [k for k in mcd_k1_null_list if k > mcd_k1] ) / iter1 )
            mcd_k3_p_list.append( len( [k for k in mcd_k3_null_list if k > mcd_k3] ) / iter1 )

            mpd_k1_p_list.append( len( [k for k in mpd_k1_null_list if k > mpd_k1] ) / iter1 )
            mpd_k3_p_list.append( len( [k for k in mpd_k3_null_list if k > mpd_k3] ) / iter1 )


            eig_z_list.append( (eig - np.mean(eig_null_list)) / np.std(eig_null_list)  )
            mcd_k1_z_list.append( (mcd_k1 - np.mean(mcd_k1_null_list)) / np.std(mcd_k1_null_list)  )
            mcd_k3_z_list.append( (mcd_k3 - np.mean(mcd_k3_null_list)) / np.std(mcd_k3_null_list)  )
            mpd_k1_z_list.append( (mpd_k1 - np.mean(mpd_k1_null_list)) / np.std(mpd_k1_null_list)  )
            mpd_k3_z_list.append( (mpd_k3 - np.mean(mpd_k3_null_list)) / np.std(mpd_k3_null_list)  )



        # calculate power
        eig_power = len([n for n in eig_p_list if n < 0.05]) / iter1
        eig_power_025, eig_power_975 = get_bootstrap_power_ci(eig_p_list)

        mcd_k1_power = len([n for n in mcd_k1_p_list if n < 0.05]) / iter1
        mcd_k1_power_025, mcd_k1_power_975 = get_bootstrap_power_ci(mcd_k1_p_list)

        mcd_k3_power = len([n for n in mcd_k3_p_list if n < 0.05]) / iter1
        mcd_k3_power_025, mcd_k3_power_975 = get_bootstrap_power_ci(mcd_k3_p_list)

        mpd_k1_power = len([n for n in mpd_k1_p_list if n < 0.05]) / iter1
        mpd_k1_power_025, mpd_k1_power_975 = get_bootstrap_power_ci(mpd_k1_p_list)

        mpd_k3_power = len([n for n in mpd_k3_p_list if n < 0.05]) / iter1
        mpd_k3_power_025, mpd_k3_power_975 = get_bootstrap_power_ci(mpd_k3_p_list)

        eig_z_025, eig_z_975 = get_bootstrap_ci(eig_z_list)
        mcd_k1_z_025, mcd_k1_z_975 = get_bootstrap_ci(mcd_k1_z_list)
        mcd_k3_z_025, mcd_k3_z_975 = get_bootstrap_ci(mcd_k3_z_list)
        mpd_k1_z_025, mpd_k1_z_975 = get_bootstrap_ci(mpd_k1_z_list)
        mpd_k3_z_025, mpd_k3_z_975 = get_bootstrap_ci(mpd_k3_z_list)

        df_out.write('\t'.join([str(cov), 'Eig', str(eig_power), str(eig_power_025), str(eig_power_975), str(np.mean(eig_z_list)), str(eig_z_025), str(eig_z_975)]) + '\n')
        df_out.write('\t'.join([str(cov), 'MCD_k1', str(mcd_k1_power), str(mcd_k1_power_025), str(mcd_k1_power_975), str(np.mean(mcd_k1_z_list)), str(mcd_k1_z_025), str(mcd_k1_z_975)]) + '\n')
        df_out.write('\t'.join([str(cov), 'MCD_k3', str(mcd_k3_power), str(mcd_k3_power_025), str(mcd_k3_power_975), str(np.mean(mcd_k3_z_list)), str(mcd_k3_z_025), str(mcd_k3_z_975)]) + '\n')
        df_out.write('\t'.join([str(cov), 'MPD_k1', str(mpd_k1_power), str(mpd_k1_power_025), str(mpd_k1_power_975), str(np.mean(mpd_k1_z_list)), str(mpd_k1_z_025), str(mpd_k1_z_975)]) + '\n')
        df_out.write('\t'.join([str(cov), 'MPD_k3', str(mpd_k3_power), str(mpd_k3_power_025), str(mpd_k3_power_975), str(np.mean(mpd_k3_z_list)), str(mpd_k3_z_025), str(mpd_k3_z_975)]) + '\n')

    df_out.close()