Пример #1
0
def run_all_sims():
    """Run the uniform-covariance simulations and write one z-score per run.

    For every covariance in ``[0.5, 0, -0.5]`` and each of 100 replicates:

    1. stack 20 results of ``get_count_pop(lambda_genes, cov=cov)`` into one
       matrix,
    2. pass it through ``pt.hellinger_transform`` and fit a PCA,
    3. compute ``pt.get_euclidean_distance`` on the PCA scores,
    4. repeat steps 2-3 on 1000 matrices from ``pt.random_matrix`` and
       express the observed value as a z-score against that null sample.

    Output is a tab-separated file with columns ``Covariance``,
    ``Iteration`` and ``z_score`` written to
    ``<pt.get_path()>/data/simulations/cov_euc.txt``.  Progress is printed
    to stdout as ``"<cov> <iteration>"``.

    Returns:
        None.
    """
    n_pops = 20
    n_genes = 50
    n_replicates = 100
    n_null = 1000
    covs = [0.5, 0, -0.5]
    out_path = pt.get_path() + '/data/simulations/cov_euc.txt'

    # The context manager guarantees the file is flushed and closed even if
    # one of the (long-running) simulation steps raises.
    with open(out_path, 'w') as df_out:
        # One set of per-gene rates is drawn and shared by every simulation.
        lambda_genes = np.random.gamma(shape=3, scale=1, size=n_genes)
        df_out.write('\t'.join(['Covariance', 'Iteration', 'z_score']) + '\n')
        for cov in covs:
            for i in range(n_replicates):
                print(str(cov) + ' ' + str(i))
                test_cov = np.stack(
                    [get_count_pop(lambda_genes, cov=cov)
                     for _ in range(n_pops)],
                    axis=0)
                X = pt.hellinger_transform(test_cov)
                pca = PCA()
                pca_fit = pca.fit_transform(X)
                euc_dist = pt.get_euclidean_distance(pca_fit)

                # Null distribution: same pipeline on randomized matrices.
                # The PCA object is simply refit on each randomized matrix.
                sim_eucs = []
                for _ in range(n_null):
                    X_j = pt.hellinger_transform(pt.random_matrix(test_cov))
                    pca_fit_j = pca.fit_transform(X_j)
                    sim_eucs.append(pt.get_euclidean_distance(pca_fit_j))
                z_score = (euc_dist - np.mean(sim_eucs)) / np.std(sim_eucs)

                df_out.write(
                    '\t'.join([str(cov), str(i), str(z_score)]) + '\n')
Пример #2
0
def run_ntwrk_cov_sims(var=1, cov=0.25):
    """Run simulations whose covariance matrix is derived from a network file.

    The matrix stored in ``<pt.get_path()>/data/modular_ntwrk_mu_010.txt``
    (tab-delimited) is multiplied element-wise by ``cov`` and its diagonal is
    then overwritten with ``var``.  For each of 100 replicates, 20 results of
    ``get_count_pop(lambda_genes, cov=C)`` are stacked, passed through
    ``pt.hellinger_transform`` and PCA, and the value of
    ``pt.get_mean_pairwise_euc_distance`` is converted to a z-score against
    1000 matrices produced by ``pt.random_matrix``.

    Rows (``Cov``, ``Iteration``, ``z_score``; tab-separated) are written to
    ``<pt.get_path()>/data/simulations/cov_ntwrk_euc_pos_only_010.txt`` and
    echoed to stdout.

    Args:
        var: Value placed on the diagonal of the covariance matrix.
        cov: Scalar multiplier applied to every entry of the loaded matrix
            before the diagonal is overwritten.

    Returns:
        None.
    """
    n_pops = 20
    n_genes = 250
    n_replicates = 100
    n_null = 1000
    lambda_genes = np.random.gamma(shape=3, scale=1, size=n_genes)

    # Load the input before opening the output in 'w' mode, so that a missing
    # or malformed network file does not truncate an existing results file.
    C = np.loadtxt(pt.get_path() + '/data/modular_ntwrk_mu_010.txt',
                   delimiter='\t')
    C = C * cov
    np.fill_diagonal(C, var)

    out_path = (pt.get_path() +
                '/data/simulations/cov_ntwrk_euc_pos_only_010.txt')
    # The context manager closes the file even if a simulation step raises.
    with open(out_path, 'w') as df_out:
        df_out.write('\t'.join(['Cov', 'Iteration', 'z_score']) + '\n')
        for i in range(n_replicates):
            test_cov = np.stack(
                [get_count_pop(lambda_genes, cov=C) for _ in range(n_pops)],
                axis=0)
            X = pt.hellinger_transform(test_cov)
            pca = PCA()
            pca_fit = pca.fit_transform(X)
            euc_dist = pt.get_mean_pairwise_euc_distance(pca_fit)

            # Null distribution from randomized versions of the same matrix.
            sim_eucs = []
            for _ in range(n_null):
                X_j = pt.hellinger_transform(pt.random_matrix(test_cov))
                pca_fit_j = pca.fit_transform(X_j)
                sim_eucs.append(pt.get_mean_pairwise_euc_distance(pca_fit_j))
            z_score = (euc_dist - np.mean(sim_eucs)) / np.std(sim_eucs)

            print(str(cov), ' ', str(i), ' ', str(z_score))
            df_out.write('\t'.join([str(cov), str(i), str(z_score)]) + '\n')
Пример #3
0
def run_block_cov_sims():
    """Run simulations with a block covariance matrix and write z-scores.

    For each covariance value in ``covs`` (currently only ``-0.9``) a matrix
    ``C`` is built with ``get_block_cov(n_genes, pos_cov=cov, neg_cov=cov)``.
    Whether all eigenvalues of ``C`` are positive, and ``C`` itself, are
    printed.  Then, for each of 100 replicates, 20 results of
    ``get_count_pop(lambda_genes, cov=C)`` are stacked, passed through
    ``pt.hellinger_transform`` and PCA, and the value of
    ``pt.get_mean_pairwise_euc_distance`` is converted to a z-score against
    1000 matrices produced by ``pt.random_matrix``.

    Rows (``Cov``, ``Iteration``, ``z_score``; tab-separated) are written to
    ``<pt.get_path()>/data/simulations/cov_block_euc_pos_only.txt`` and
    echoed to stdout.

    Returns:
        None.
    """
    n_pops = 20
    n_genes = 50
    n_replicates = 100
    n_null = 1000
    # Values explored previously: 0.1, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9.
    covs = [-0.9]
    out_path = pt.get_path() + '/data/simulations/cov_block_euc_pos_only.txt'

    # The context manager closes the file even if a simulation step raises.
    with open(out_path, 'w') as df_out:
        lambda_genes = np.random.gamma(shape=3, scale=1, size=n_genes)
        df_out.write('\t'.join(['Cov', 'Iteration', 'z_score']) + '\n')
        for cov in covs:
            C = get_block_cov(n_genes, pos_cov=cov, neg_cov=cov)
            # Diagnostic output: True when every eigenvalue of C is > 0.
            print(np.all(np.linalg.eigvals(C) > 0))
            print(C)
            for i in range(n_replicates):
                test_cov = np.stack(
                    [get_count_pop(lambda_genes, cov=C)
                     for _ in range(n_pops)],
                    axis=0)
                X = pt.hellinger_transform(test_cov)
                pca = PCA()
                pca_fit = pca.fit_transform(X)
                euc_dist = pt.get_mean_pairwise_euc_distance(pca_fit)

                # Null distribution from randomized versions of the matrix.
                sim_eucs = []
                for _ in range(n_null):
                    X_j = pt.hellinger_transform(pt.random_matrix(test_cov))
                    pca_fit_j = pca.fit_transform(X_j)
                    sim_eucs.append(
                        pt.get_mean_pairwise_euc_distance(pca_fit_j))
                z_score = (euc_dist - np.mean(sim_eucs)) / np.std(sim_eucs)

                print(str(cov), ' ', str(i), ' ', str(z_score))
                df_out.write(
                    '\t'.join([str(cov), str(i), str(z_score)]) + '\n')
Пример #4
0
def two_treats_sim(iter1=1000, iter2=1000, alpha=0.05):
    """Print the estimated power of a permutation F-test for two treatments.

    For each reshuffle size in ``[0, 5, 10, 15, 20]`` the simulation is
    repeated ``iter1`` times:

    * 10 gamma-distributed gene rates are drawn; treatment 1 uses them as
      drawn, treatment 2 uses them after ``shuffle`` is applied to the slice
      ``rates[:reshuf]``.
    * Each treatment gets 10 populations of 20 mutations sampled with
      replacement from the genes, with probability proportional to the rates.
    * The combined count matrix goes through ``pt.hellinger_transform`` and
      PCA, and ``get_F_stat_pairwise`` is evaluated for the two groups.
    * The same statistic is computed on ``iter2`` matrices from
      ``pt.random_matrix``; the p-value is
      ``(#null > observed + 1) / (iter2 + 1)``.

    The printed power is ``(#p-values < alpha + 1) / (iter1 + 1)``.

    NOTE(review): only 10 genes exist, so reshuffle sizes 10, 15 and 20 all
    select the whole rate vector (slicing clamps at the array length).

    Args:
        iter1: Number of simulated data sets per reshuffle size.
        iter2: Number of randomized matrices per simulated data set.
        alpha: Significance threshold applied to the p-values.

    Returns:
        None.  One summary line per reshuffle size is printed.
    """
    n_genes = 10
    n_pops_a = n_pops_b = 10
    gamma_shape = 1
    gamma_scale = 1
    n_muts_a = n_muts_b = 20

    for reshuf in [0, 5, 10, 15, 20]:
        p_values = []
        for _ in range(iter1):
            rates = np.random.gamma(gamma_shape,
                                    scale=gamma_scale,
                                    size=n_genes)
            rates_a = rates.copy()
            # Permute the leading ``reshuf`` rates in place for treatment 2.
            shuffle(rates[:reshuf])
            rates_b = rates.copy()

            probs_a = rates_a / sum(rates_a)
            probs_b = rates_b / sum(rates_b)

            # One Counter of gene hits per population, treatment 1 first.
            counts_a = []
            for _pop in range(n_pops_a):
                hits = np.random.choice(n_genes,
                                        size=n_muts_a,
                                        replace=True,
                                        p=probs_a)
                counts_a.append(Counter(hits))
            counts_b = []
            for _pop in range(n_pops_b):
                hits = np.random.choice(n_genes,
                                        size=n_muts_b,
                                        replace=True,
                                        p=probs_b)
                counts_b.append(Counter(hits))

            # Genes never hit in a population come out as NaN; make them 0.
            combined = pd.concat(
                [pd.DataFrame(counts_a), pd.DataFrame(counts_b)])
            count_matrix = combined.fillna(0).values

            # Row positions of each treatment within the combined matrix.
            groups = [
                np.arange(0, n_pops_a),
                np.arange(n_pops_a, n_pops_a + n_pops_b),
            ]

            X = pt.hellinger_transform(count_matrix)
            pca = PCA()
            F = get_F_stat_pairwise(pca.fit_transform(X), groups)

            F_list = []
            for _perm in range(iter2):
                X_n0 = pt.hellinger_transform(pt.random_matrix(count_matrix))
                F_list.append(
                    get_F_stat_pairwise(pca.fit_transform(X_n0), groups))

            n_greater = sum(1 for f_null in F_list if f_null > F)
            p_values.append((n_greater + 1) / (iter2 + 1))

        n_significant = sum(1 for p in p_values if p < alpha)
        power = (n_significant + 1) / (iter1 + 1)
        print('Reshuffle = ' + str(reshuf) + ', Power ' + str(power))
Пример #5
0
def _permute_tenaillon(n_iter, analysis):
    """Write permutation statistics for the Tenaillon et al. table (PCA only)."""
    if analysis != 'PCA':
        # Only the PCA path exists for this dataset; fail before any file is
        # opened instead of hitting a NameError after truncating the output.
        raise ValueError(
            "analysis must be 'PCA' for dataset 'tenaillon', got %r" %
            (analysis, ))

    k = 3
    df_path = pt.get_path() + '/data/Tenaillon_et_al/gene_by_pop.txt'
    df = pd.read_csv(df_path, sep='\t', header='infer', index_col=0)
    # ``DataFrame.as_matrix`` was removed in pandas 1.0; ``.values`` returns
    # the same ndarray on old and new versions.
    df_array = df.values
    out_path = (pt.get_path() + '/data/Tenaillon_et_al/permute_' + analysis +
                '.txt')
    column_headers = [
        'Iteration', 'MCD', 'mean_angle', 'mean_dist', 'delta_L', 'x_stat'
    ]
    # The context manager closes the file even if an iteration raises.
    with open(out_path, 'w') as df_out:
        df_out.write('\t'.join(column_headers) + '\n')
        for i in range(n_iter):
            print(i)
            df_rndm = pd.DataFrame(data=pt.random_matrix(df_array),
                                   index=df.index,
                                   columns=df.columns)
            df_rndm_delta = pt.likelihood_matrix(
                df_rndm, 'Tenaillon_et_al').get_likelihood_matrix()
            X = pt.hellinger_transform(df_rndm_delta)
            pca = PCA()
            df_rndm_delta_out = pca.fit_transform(X)
            mean_angle = pt.get_mean_angle(df_rndm_delta_out, k=k)
            mcd = pt.get_mean_centroid_distance(df_rndm_delta_out, k=k)
            mean_length = pt.get_euc_magnitude_diff(df_rndm_delta_out, k=k)
            mean_dist = pt.get_mean_pairwise_euc_distance(df_rndm_delta_out,
                                                          k=k)
            # The last explained-variance entry is left out of the statistic.
            x_stat = pt.get_x_stat(pca.explained_variance_[:-1])
            df_out.write('\t'.join([
                str(i),
                str(mcd),
                str(mean_angle),
                str(mean_dist),
                str(mean_length),
                str(x_stat)
            ]) + '\n')


def _permute_good(n_iter, analysis):
    """Write per-time-point permutation statistics for the Good et al. table."""
    k = 5
    df_path = pt.get_path() + '/data/Good_et_al/gene_by_pop.txt'
    df = pd.read_csv(df_path, sep='\t', header='infer', index_col=0)

    # Copy before appending so the list returned by the helper is not
    # modified in place.
    line_patterns = list(pt.complete_nonmutator_lines())
    line_patterns.append('p5')
    # Rows whose index matches any of the patterns are the ones retained.
    df_nonmut = df[df.index.str.contains('|'.join(line_patterns))]
    # Remove columns with all zeros.
    df_nonmut = df_nonmut.loc[:, (df_nonmut != 0).any(axis=0)]

    # The time point is the integer after the first underscore of the label.
    time_points = [int(x.split('_')[1]) for x in df_nonmut.index.values]
    time_points_set = sorted(set(time_points))
    first_tp_rows = [
        pos for pos, tp in enumerate(time_points) if tp == time_points_set[0]
    ]
    # Row selections per time point do not change between iterations, so they
    # are computed once.
    # NOTE(review): this is a substring/regex match, so '_<tp>' would also
    # match a label containing a longer time point with the same leading
    # digits -- confirm the label format rules that out.
    tp_frames = [
        df_nonmut[df_nonmut.index.str.contains('_' + str(tp))]
        for tp in time_points_set
    ]

    out_path = pt.get_path() + '/data/Good_et_al/permute_' + analysis + '.txt'
    column_headers = [
        'Iteration', 'Generation', 'MCD', 'mean_angle', 'delta_L', 'mean_dist'
    ]
    # The context manager closes the file even if an iteration raises.
    with open(out_path, 'w') as df_out:
        df_out.write('\t'.join(column_headers) + '\n')
        for i in range(n_iter):
            print("Iteration " + str(i))
            # Selected afresh each iteration so pt.random_matrix is always
            # handed a newly built array.
            matrix_0 = df_nonmut.iloc[first_tp_rows]
            matrix_0_rndm = pt.random_matrix(matrix_0.values)
            df_rndm_list = [
                pd.DataFrame(data=matrix_0_rndm,
                             index=matrix_0.index,
                             columns=matrix_0.columns)
            ]
            # Later time points: randomize the increment between consecutive
            # time points and accumulate it onto the running matrix.
            for df_tp_minus1, df_tp in zip(tp_frames, tp_frames[1:]):
                matrix_diff = df_tp.values - df_tp_minus1.values
                # Deliberately not ``+=``: the frames already appended may
                # share memory with the previous array.
                matrix_0_rndm = matrix_0_rndm + pt.random_matrix(matrix_diff)
                df_rndm_list.append(
                    pd.DataFrame(data=matrix_0_rndm,
                                 index=df_tp.index,
                                 columns=df_tp.columns))

            df_rndm = pd.concat(df_rndm_list)
            df_rndm_delta = pt.likelihood_matrix(
                df_rndm, 'Good_et_al').get_likelihood_matrix()
            if analysis == 'PCA':
                X = pt.hellinger_transform(df_rndm_delta)
                pca = PCA()
                matrix_rndm_delta_out = pca.fit_transform(X)
            elif analysis == 'cMDS':
                matrix_rndm_delta_bc = np.sqrt(
                    pt.get_bray_curtis(df_rndm_delta.values))
                matrix_rndm_delta_out = pt.cmdscale(matrix_rndm_delta_bc)[0]
            else:
                # Unsupported analysis: report it and skip this iteration.
                print("Analysis argument not accepted")
                continue

            df_rndm_delta_out = pd.DataFrame(data=matrix_rndm_delta_out,
                                             index=df_rndm_delta.index)
            for tp in time_points_set:
                df_rndm_delta_out_tp = df_rndm_delta_out[
                    df_rndm_delta_out.index.str.contains('_' + str(tp))]
                tp_matrix = df_rndm_delta_out_tp.values
                mean_angle = pt.get_mean_angle(tp_matrix, k=k)
                mcd = pt.get_mean_centroid_distance(tp_matrix, k=k)
                mean_length = pt.get_euc_magnitude_diff(tp_matrix, k=k)
                mean_dist = pt.get_mean_pairwise_euc_distance(tp_matrix, k=k)
                df_out.write('\t'.join([
                    str(i),
                    str(tp),
                    str(mcd),
                    str(mean_angle),
                    str(mean_length),
                    str(mean_dist)
                ]) + '\n')


def run_pca_permutation(iter=10000, analysis='PCA', dataset='tenaillon'):
    """Write ordination statistics for randomized gene-by-population tables.

    ``dataset='tenaillon'`` reads
    ``<pt.get_path()>/data/Tenaillon_et_al/gene_by_pop.txt`` and, for each
    iteration, randomizes the table with ``pt.random_matrix``, converts it
    with ``pt.likelihood_matrix``, runs PCA on the Hellinger-transformed
    result and writes ``Iteration, MCD, mean_angle, mean_dist, delta_L,
    x_stat`` (using the first ``k=3`` axes) to
    ``.../Tenaillon_et_al/permute_<analysis>.txt``.

    ``dataset='good'`` reads ``<pt.get_path()>/data/Good_et_al/gene_by_pop.txt``,
    keeps the rows matching ``pt.complete_nonmutator_lines()`` plus ``'p5'``,
    randomizes the first time point and each between-time-point increment,
    ordinates the result with PCA or cMDS, and writes ``Iteration,
    Generation, MCD, mean_angle, delta_L, mean_dist`` per time point (using
    ``k=5``) to ``.../Good_et_al/permute_<analysis>.txt``.

    Args:
        iter: Number of randomizations.  (The name shadows the builtin but is
            kept because it is part of the public signature.)
        analysis: ``'PCA'`` for either dataset; ``'cMDS'`` is additionally
            accepted for ``dataset='good'``.  It is also used in the output
            file name.
        dataset: ``'tenaillon'`` or ``'good'``.  Any other value does nothing.

    Raises:
        ValueError: If ``dataset='tenaillon'`` and ``analysis`` is not
            ``'PCA'``.

    Returns:
        None.
    """
    if dataset == 'tenaillon':
        _permute_tenaillon(iter, analysis)
    elif dataset == 'good':
        _permute_good(iter, analysis)
Пример #6
0
def run_ba_ntwk_cov_sims():
    """Run simulations whose covariance follows a Barabasi-Albert network.

    A graph from ``nx.barabasi_albert_graph(n_genes, 2)`` is converted to an
    adjacency matrix.  For each covariance in ``[0.05, 0.1, 0.15, 0.2]`` the
    adjacency matrix is scaled by that value and its diagonal set to 1.  For
    each of 1000 replicates, 100 results of
    ``get_count_pop(lambda_genes, cov=C)`` are stacked, passed through
    ``pt.hellinger_transform`` and PCA, and four statistics are computed on
    the observed data and on 1000 matrices from ``pt.random_matrix``:

    * ``pt.get_mean_pairwise_euc_distance`` (reported as a z-score and as the
      fraction of null values below the observed one),
    * ``pt.get_x_stat`` of the explained variances without the last entry,
    * ``pt.get_mean_centroid_distance`` with ``k=1`` and ``k=3``.

    Rows (``Cov, Iteration, euc_z_score, euc_percent, eig_percent,
    mcd_percent_k1, mcd_percent_k3``; tab-separated) are written to
    ``<pt.get_path()>/data/simulations/cov_ba_ntwrk_ev.txt``; a progress line
    is printed per replicate.

    Returns:
        None.
    """

    def _fraction_below(null_values, observed):
        # Share of null values strictly smaller than the observed statistic.
        return len([v for v in null_values if v < observed
                    ]) / len(null_values)

    n_pops = 100
    n_genes = 50
    n_replicates = 1000
    n_null = 1000
    covs = [0.05, 0.1, 0.15, 0.2]
    out_path = pt.get_path() + '/data/simulations/cov_ba_ntwrk_ev.txt'

    # The context manager closes the file even if a simulation step raises.
    with open(out_path, 'w') as df_out:
        ntwk = nx.barabasi_albert_graph(n_genes, 2)
        # NOTE(review): ``to_numpy_matrix`` was removed in networkx 3.0.
        # ``to_numpy_array`` is the replacement, but it returns an ndarray
        # rather than np.matrix -- confirm get_count_pop accepts that before
        # switching.
        ntwk_np = nx.to_numpy_matrix(ntwk)
        lambda_genes = np.random.gamma(shape=3, scale=1, size=n_genes)
        df_out.write('\t'.join([
            'Cov', 'Iteration', 'euc_z_score', 'euc_percent', 'eig_percent',
            'mcd_percent_k1', 'mcd_percent_k3'
        ]) + '\n')
        for cov in covs:
            C = ntwk_np * cov
            np.fill_diagonal(C, 1)
            for i in range(n_replicates):
                test_cov = np.stack(
                    [get_count_pop(lambda_genes, cov=C)
                     for _ in range(n_pops)],
                    axis=0)
                X = pt.hellinger_transform(test_cov)
                pca = PCA()
                pca_fit = pca.fit_transform(X)

                # Observed statistics.  ``eig`` must be read here because the
                # same PCA object is refit inside the null loop below.
                euc_dist = pt.get_mean_pairwise_euc_distance(pca_fit)
                eig = pt.get_x_stat(pca.explained_variance_[:-1])
                mcd_k1 = pt.get_mean_centroid_distance(pca_fit, k=1)
                mcd_k3 = pt.get_mean_centroid_distance(pca_fit, k=3)

                euc_dists = []
                eigs = []
                centroid_dists_k1 = []
                centroid_dists_k3 = []
                for _ in range(n_null):
                    X_j = pt.hellinger_transform(pt.random_matrix(test_cov))
                    pca_fit_j = pca.fit_transform(X_j)
                    euc_dists.append(
                        pt.get_mean_pairwise_euc_distance(pca_fit_j))
                    centroid_dists_k1.append(
                        pt.get_mean_centroid_distance(pca_fit_j, k=1))
                    centroid_dists_k3.append(
                        pt.get_mean_centroid_distance(pca_fit_j, k=3))
                    # Explained variances of the fit on the randomized matrix.
                    eigs.append(pt.get_x_stat(pca.explained_variance_[:-1]))

                z_score = (euc_dist - np.mean(euc_dists)) / np.std(euc_dists)
                euc_percent = _fraction_below(euc_dists, euc_dist)
                eig_percent = _fraction_below(eigs, eig)
                centroid_percent_k1 = _fraction_below(centroid_dists_k1,
                                                      mcd_k1)
                centroid_percent_k3 = _fraction_below(centroid_dists_k3,
                                                      mcd_k3)

                print(cov, i, z_score, euc_percent, eig_percent)
                df_out.write('\t'.join([
                    str(cov),
                    str(i),
                    str(z_score),
                    str(euc_percent),
                    str(eig_percent),
                    str(centroid_percent_k1),
                    str(centroid_percent_k3)
                ]) + '\n')