def run_ba_cov_neutral_sims(shape=1, scale=1, G=50, N=50, iter1=1000, iter2=1000):
    df_out = open(pt.get_path() + '/data/simulations/ba_cov_neutral_sims.txt', 'w')
    df_out.write('\t'.join(['N', 'G', 'lambda_mean', 'lambda_neutral', 'Cov',
                            'Iteration', 'dist_percent']) + '\n')
    covs = [0.2]
    mean_gamma = shape * scale
    neutral_range = np.logspace(-2, 1, num=20, endpoint=True, base=10.0)
    neutral_range = neutral_range[::-1]
    for neutral_ in neutral_range:
        for cov in covs:
            for i in range(iter1):
                C = pt.get_ba_cov_matrix(G, cov)
                lambda_genes = np.random.gamma(shape=shape, scale=scale, size=G)
                lambda_genes_null = np.asarray([neutral_] * G)
                test_cov_adapt = np.stack(
                    [pt.get_count_pop(lambda_genes, cov=C) for x in range(N)], axis=0)
                # identity covariance matrix (diagonal values equal to one)
                test_cov_neutral = np.stack(
                    [pt.get_count_pop(lambda_genes_null, cov=np.identity(G)) for x in range(N)],
                    axis=0)
                test_cov = test_cov_adapt + test_cov_neutral
                X = pt.hellinger_transform(test_cov)
                pca = PCA()
                pca_fit = pca.fit_transform(X)
                euc_dist = pt.get_mean_pairwise_euc_distance(pca_fit)
                euc_dists = []
                for j in range(iter2):
                    X_j = pt.hellinger_transform(pt.get_random_matrix(test_cov))
                    pca_fit_j = pca.fit_transform(X_j)
                    euc_dists.append(pt.get_mean_pairwise_euc_distance(pca_fit_j))
                euc_percent = len([k for k in euc_dists if k < euc_dist]) / len(euc_dists)
                print(neutral_, cov, i, euc_percent)
                df_out.write('\t'.join([str(N), str(G), str(mean_gamma), str(neutral_),
                                        str(cov), str(i), str(euc_percent)]) + '\n')
    df_out.close()
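# All of the simulations in this module share the same permutation-test core:
# an observed statistic is compared against a null distribution built from
# pt.get_random_matrix() draws, and the reported "percent" is a tail fraction.
# A minimal, self-contained sketch of that core (the helper name
# `permutation_percentile` is illustrative, not part of pt):
import numpy as np

def permutation_percentile(observed, null_draws, greater=False):
    # fraction of null draws below (or above) the observed statistic
    null_draws = np.asarray(null_draws)
    return np.mean(null_draws > observed) if greater else np.mean(null_draws < observed)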
def wannier_hist(iter=10000):
    data_dir = os.path.expanduser("~/GitHub/ParEvol")
    df1 = pd.read_csv(data_dir + '/data/Wannier_et_al/C321.deltaA_mutation_table_clean.txt',
                      sep='\t', index_col=0)
    df2 = pd.read_csv(data_dir + '/data/Wannier_et_al/C321_mutation_table_clean.txt',
                      sep='\t', index_col=0)
    df = pd.concat([df1, df2], sort=False)
    df = df.fillna(0)
    df_np = df.values
    gene_names = df.columns.values
    N1 = df1.shape[0]
    N2 = df2.shape[0]
    df_np_delta = cd.likelihood_matrix_array(df_np, gene_names, 'Wannier_et_al').get_likelihood_matrix()
    F2_all = pt.get_F_2(df_np_delta, N1, N2)
    print(F2_all)
    F2 = F2_all[0]
    V1 = F2_all[1]
    V2 = F2_all[2]
    F2_null = []
    V1_null = []
    V2_null = []
    for i in range(iter):
        if i % 1000 == 0:
            print(i)
        df_np_i = pt.get_random_matrix(df_np)
        np.seterr(divide='ignore')
        df_np_i_delta = cd.likelihood_matrix_array(df_np_i, gene_names, 'Wannier_et_al').get_likelihood_matrix()
        F2_all_iter = pt.get_F_2(df_np_i_delta, N1, N2)
        F2_null.append(F2_all_iter[0])
        V1_null.append(F2_all_iter[1])
        V2_null.append(F2_all_iter[2])
    fig = plt.figure()
    plt.hist(F2_null, bins=30, alpha=0.8, color='#175ac6')
    plt.axvline(F2, color='red', lw=3)
    plt.xlabel(r'$F_{2}$', fontsize=20)
    plt.ylabel("Frequency", fontsize=12)
    fig.tight_layout()
    fig.savefig(data_dir + '/figs/test_hist_F.png',
                bbox_inches="tight", pad_inches=0.4, dpi=600)
    plt.close()
    fig = plt.figure()
    plt.hist(V1_null, bins=30, alpha=0.8, color='#175ac6')
    plt.axvline(V1, color='red', lw=3)
    plt.xlabel(r'$V_{1}$', fontsize=20)
    plt.ylabel("Frequency", fontsize=12)
    fig.tight_layout()
    fig.savefig(data_dir + '/figs/test_hist_F_V1.png',
                bbox_inches="tight", pad_inches=0.4, dpi=600)
    plt.close()
    fig = plt.figure()
    plt.hist(V2_null, bins=30, alpha=0.8, color='#175ac6')
    plt.axvline(V2, color='red', lw=3)
    plt.xlabel(r'$V_{2}$', fontsize=20)
    plt.ylabel("Frequency", fontsize=12)
    fig.tight_layout()
    fig.savefig(data_dir + '/figs/test_hist_F_V2.png',
                bbox_inches="tight", pad_inches=0.4, dpi=600)
    plt.close()
def rndm_sample_tenaillon(iter1=1000, iter2=1000):
    df_path = pt.get_path() + '/data/Tenaillon_et_al/gene_by_pop.txt'
    df = pd.read_csv(df_path, sep='\t', header='infer', index_col=0)
    df_np = df.values
    gene_names = df.columns.values
    n_rows = df_np.shape[0]
    df_out = open(pt.get_path() + '/data/Tenaillon_et_al/sample_size_sim.txt', 'w')
    df_out.write('\t'.join(['N', 'G', 'Iteration', 'dist_percent', 'z_score']) + '\n')
    Ns = list(range(2, 40, 2))
    for N in Ns:
        for i in range(iter1):
            df_np_i = df_np[np.random.randint(n_rows, size=N), :]
            # flip the boolean mask around to select genes hit in the sample
            gene_bool = np.all(df_np_i == 0, axis=0)
            gene_names_i = list(compress(gene_names, list(map(operator.not_, gene_bool))))
            df_np_i = df_np_i[:, ~np.all(df_np_i == 0, axis=0)]
            np.seterr(divide='ignore')
            df_np_i_delta = pt.likelihood_matrix_array(
                df_np_i, gene_names_i, 'Tenaillon_et_al').get_likelihood_matrix()
            X = pt.hellinger_transform(df_np_i_delta)
            pca = PCA()
            pca_fit = pca.fit_transform(X)
            euc_dist = pt.get_mean_pairwise_euc_distance(pca_fit)
            euc_dists = []
            for j in range(iter2):
                df_np_i_j = pt.get_random_matrix(df_np_i)
                np.seterr(divide='ignore')
                df_np_i_j_delta = pt.likelihood_matrix_array(
                    df_np_i_j, gene_names_i, 'Tenaillon_et_al').get_likelihood_matrix()
                X_j = pt.hellinger_transform(df_np_i_j_delta)
                pca_fit_j = pca.fit_transform(X_j)
                euc_dists.append(pt.get_mean_pairwise_euc_distance(pca_fit_j))
            G = df_np_i.shape[1]
            euc_percent = len([k for k in euc_dists if k < euc_dist]) / len(euc_dists)
            z_score = (euc_dist - np.mean(euc_dists)) / np.std(euc_dists)
            print(str(N), str(i), str(G), str(euc_percent), str(z_score))
            df_out.write('\t'.join([str(N), str(G), str(i), str(euc_percent),
                                    str(z_score)]) + '\n')
    df_out.close()
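# For reference, pt.hellinger_transform is assumed to follow the standard
# Hellinger transformation (Legendre & Gallagher): the square root of
# row-relative abundances. A minimal sketch of that assumption:
import numpy as np

def hellinger_transform(counts):
    counts = np.asarray(counts, dtype=float)
    return np.sqrt(counts / counts.sum(axis=1, keepdims=True))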
def rndm_sample_tenaillon(iter1=1000, iter2=10000):
    df_path = os.path.expanduser("~/GitHub/ParEvol") + '/data/Tenaillon_et_al/gene_by_pop.txt'
    df = pd.read_csv(df_path, sep='\t', header='infer', index_col=0)
    df_np = df.values
    gene_names = df.columns.values
    n_rows = list(range(df_np.shape[0]))
    df_out = open(os.path.expanduser("~/GitHub/ParEvol") + '/data/Tenaillon_et_al/dist_sample_size.txt', 'w')
    df_out.write('\t'.join(['N', 'G', 'Iteration', 'dist_percent', 'z_score']) + '\n')
    #Ns = list(range(4, 40 + 2, 2))
    Ns = [40]
    pca = PCA()
    for N in Ns:
        for i in range(iter1):
            df_np_i = df_np[np.random.choice(n_rows, size=N, replace=False, p=None), :]
            # flip the boolean mask around to select genes hit in the sample
            gene_bool = np.all(df_np_i == 0, axis=0)
            gene_names_i = list(compress(gene_names, list(map(operator.not_, gene_bool))))
            df_np_i = df_np_i[:, ~np.all(df_np_i == 0, axis=0)]
            np.seterr(divide='ignore')
            df_np_i_delta = cd.likelihood_matrix_array(
                df_np_i, gene_names_i, 'Tenaillon_et_al').get_likelihood_matrix()
            df_np_i_delta = df_np_i_delta / df_np_i_delta.sum(axis=1)[:, None]
            X = pt.get_mean_center(df_np_i_delta)
            pca_fit = pca.fit_transform(X)
            euc_dist = pt.get_mean_pairwise_euc_distance(pca_fit)
            euc_dists = []
            for j in range(iter2):
                df_np_i_j = pt.get_random_matrix(df_np_i)
                np.seterr(divide='ignore')
                df_np_i_j_delta = cd.likelihood_matrix_array(
                    df_np_i_j, gene_names_i, 'Tenaillon_et_al').get_likelihood_matrix()
                df_np_i_j_delta = df_np_i_j_delta / df_np_i_j_delta.sum(axis=1)[:, None]
                X_j = pt.get_mean_center(df_np_i_j_delta)
                pca_fit_j = pca.fit_transform(X_j)
                euc_dists.append(pt.get_mean_pairwise_euc_distance(pca_fit_j))
            G = df_np_i.shape[1]
            euc_percent = len([k for k in euc_dists if k > euc_dist]) / len(euc_dists)
            z_score = (euc_dist - np.mean(euc_dists)) / np.std(euc_dists)
            print(str(N), str(i), str(G), str(euc_percent), str(z_score))
            df_out.write('\t'.join([str(N), str(G), str(i), str(euc_percent),
                                    str(z_score)]) + '\n')
    df_out.close()
def rndm_sample_tenaillon(k_eval=3, iter1=20, iter2=1000, sample_bs=10, iter_bs=10000):
    df_path = mydir + '/data/Tenaillon_et_al/gene_by_pop.txt'
    df = pd.read_csv(df_path, sep='\t', header='infer', index_col=0)
    df_np = df.values
    gene_names = df.columns.values
    n_rows = list(range(df_np.shape[0]))
    df_out = open(mydir + '/data/Tenaillon_et_al/power_sample_size.txt', 'w')
    df_out.write('\t'.join(['N', 'G', 'Power', 'Power_025', 'Power_975']) + '\n')
    Ns = [20, 30]
    #Ns = list(range(20, n_rows, 4))
    for N in Ns:
        p_values = []
        G_list = []
        for i in range(iter1):
            df_np_i = df_np[np.random.choice(n_rows, size=N, replace=False, p=None), :]
            # flip the boolean mask around to select genes hit in the sample
            gene_bool = np.all(df_np_i == 0, axis=0)
            gene_names_i = list(compress(gene_names, list(map(operator.not_, gene_bool))))
            G_list.append(len(gene_names_i))
            df_np_i = df_np_i[:, ~np.all(df_np_i == 0, axis=0)]
            np.seterr(divide='ignore')
            df_np_i_delta = cd.likelihood_matrix_array(
                df_np_i, gene_names_i, 'Tenaillon_et_al').get_likelihood_matrix()
            X = df_np_i_delta / df_np_i_delta.sum(axis=1)[:, None]
            X -= np.mean(X, axis=0)
            pca = PCA()
            pca_X = pca.fit_transform(X)
            mpd = pt.get_mean_pairwise_euc_distance(pca_X, k=k_eval)
            mpd_null = []
            for j in range(iter2):
                df_np_i_j = pt.get_random_matrix(df_np_i)
                np.seterr(divide='ignore')
                df_np_i_j_delta = cd.likelihood_matrix_array(
                    df_np_i_j, gene_names_i, 'Tenaillon_et_al').get_likelihood_matrix()
                X_j = df_np_i_j_delta / df_np_i_j_delta.sum(axis=1)[:, None]
                X_j -= np.mean(X_j, axis=0)
                pca_X_j = pca.fit_transform(X_j)
                mpd_null.append(pt.get_mean_pairwise_euc_distance(pca_X_j, k=k_eval))
            p_values.append(len([m for m in mpd_null if m > mpd]) / len(mpd_null))
        print(p_values)
        power = len([n for n in p_values if n < 0.05]) / len(p_values)
        power_bootstrap = []
        for p in range(iter_bs):
            p_values_sample = random.sample(p_values, sample_bs)
            power_sample = len([n for n in p_values_sample if n < 0.05]) / len(p_values_sample)
            power_bootstrap.append(power_sample)
        power_bootstrap.sort()
        # write N, mean number of genes, power, and the bootstrapped 95% CI on power
        df_out.write('\t'.join([str(N), str(np.mean(G_list)), str(power),
                                str(power_bootstrap[int(iter_bs * 0.025)]),
                                str(power_bootstrap[int(iter_bs * 0.975)])]) + '\n')
    df_out.close()
def rndm_sample_tenaillon(N, df_np, gene_names, n_rows, k=3, iter1=100, iter2=1000,
                          sample_bs=10, iter_bs=10000):
    p_values = []
    G_list = []
    for i in range(iter1):
        df_np_i = df_np[np.random.choice(n_rows, size=N, replace=False, p=None), :]
        # flip the boolean mask around to select genes hit in the sample
        gene_bool = np.all(df_np_i == 0, axis=0)
        gene_names_i = list(compress(gene_names, list(map(operator.not_, gene_bool))))
        G_list.append(len(gene_names_i))
        df_np_i = df_np_i[:, ~np.all(df_np_i == 0, axis=0)]
        np.seterr(divide='ignore')
        df_np_i_delta = cd.likelihood_matrix_array(
            df_np_i, gene_names_i, 'Tenaillon_et_al').get_likelihood_matrix()
        X = df_np_i_delta / df_np_i_delta.sum(axis=1)[:, None]
        e_vals, a_mat = pt.pca_np(X)
        euc_dist = pt.get_mean_pairwise_euc_distance(a_mat, k=k)
        print(euc_dist)
        euc_dists = []
        for j in range(iter2):
            df_np_i_j = pt.get_random_matrix(df_np_i)
            np.seterr(divide='ignore')
            df_np_i_j_delta = cd.likelihood_matrix_array(
                df_np_i_j, gene_names_i, 'Tenaillon_et_al').get_likelihood_matrix()
            X_j = df_np_i_j_delta / df_np_i_j_delta.sum(axis=1)[:, None]
            e_vals_j, a_mat_j = pt.pca_np(X_j)
            euc_dists.append(pt.get_mean_pairwise_euc_distance(a_mat_j, k=k))
        p_values.append(len([m for m in euc_dists if m > euc_dist]) / len(euc_dists))
    power = len([n for n in p_values if n < 0.05]) / len(p_values)
    print(p_values)
    power_bootstrap = []
    for p in range(iter_bs):
        p_values_sample = random.sample(p_values, sample_bs)
        power_sample = len([n for n in p_values_sample if n < 0.05]) / len(p_values_sample)
        power_bootstrap.append(power_sample)
    power_bootstrap.sort()
    # return N, mean number of genes, power, and the bootstrapped 95% CI on power
    return (N, np.mean(G_list), power,
            power_bootstrap[int(iter_bs * 0.025)],
            power_bootstrap[int(iter_bs * 0.975)])
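# pt.pca_np is not defined in this section; from how it is used above it is
# assumed to be an eigendecomposition-based PCA returning (eigenvalues, scores),
# with eigenvalues in ascending order (hence the `[:, -1 - k:-1]` slices in the
# time-partition functions below). A minimal sketch consistent with that
# assumption:
import numpy as np

def pca_np(X):
    X_c = X - X.mean(axis=0)                        # mean-center the samples
    e_vals, e_vecs = np.linalg.eigh(np.cov(X_c.T))  # ascending eigenvalues
    return e_vals, X_c @ e_vecs                     # scores in the eigenbasis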
def gene_svd_tenaillon(iter=10000):
    df_path = os.path.expanduser("~/GitHub/ParEvol") + '/data/Tenaillon_et_al/gene_by_pop.txt'
    df = pd.read_csv(df_path, sep='\t', header='infer', index_col=0)
    gene_names = df.columns.tolist()
    df_np = df.values
    df_np_delta = cd.likelihood_matrix_array(
        df_np, gene_names, 'Tenaillon_et_al').get_likelihood_matrix()
    df_np_delta = df_np_delta / df_np_delta.sum(axis=1)[:, None]
    X = pt.get_mean_center(df_np_delta)
    # scipy's svds returns the V matrix in transposed form,
    # so transpose it back to calculate the basis matrix
    U, s, V_T = svds(X, k=3)
    F = (V_T.T @ np.diag(s)) / np.sqrt(X.shape[0] - 1)
    gene_vars = np.linalg.norm(F, axis=1) ** 2
    vars_null_list = []
    for i in range(iter):
        if i % 1000 == 0:
            print("Iteration " + str(i))
        df_np_i = pt.get_random_matrix(df_np)
        np.seterr(divide='ignore')
        df_np_i_delta = cd.likelihood_matrix_array(
            df_np_i, gene_names, 'Tenaillon_et_al').get_likelihood_matrix()
        df_np_i_delta = df_np_i_delta / df_np_i_delta.sum(axis=1)[:, None]
        X_i = pt.get_mean_center(df_np_i_delta)
        U_i, s_i, V_i_T = svds(X_i, k=3)
        F_i = (V_i_T.T @ np.diag(s_i)) / np.sqrt(X_i.shape[0] - 1)
        vars_null_list.append(np.linalg.norm(F_i, axis=1) ** 2)
    vars_null = np.stack(vars_null_list)
    vars_null_mean = np.mean(vars_null, axis=0)
    vars_null_std = np.std(vars_null, axis=0)
    z_scores = (gene_vars - vars_null_mean) / vars_null_std
    # calculate p-values: the fraction of null draws exceeding each gene's variance
    p_values = []
    for k, column in enumerate(vars_null.T):
        column_greater = [x for x in column if x > gene_vars[k]]
        p_values.append(len(column_greater) / iter)
    label_z_scores = list(zip(gene_names, z_scores, p_values))
    label_sig_z_scores = [x for x in label_z_scores if x[2] < 0.05]
    print(label_sig_z_scores)
    df_out = open(os.path.expanduser("~/GitHub/ParEvol") + '/data/Tenaillon_et_al/gene_z_scores.txt', 'w')
    df_out.write('\t'.join(['Gene', 'z_score', 'p_score']) + '\n')
    for label_z_score in label_z_scores:
        df_out.write('\t'.join([str(label_z_score[0]), str(label_z_score[1]),
                                str(label_z_score[2])]) + '\n')
    df_out.close()
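# The gene scores above are squared row norms of the loading matrix
# F = V S / sqrt(n - 1), whose rows give each gene's contribution to the top
# k axes. A compact sanity check on random data (a sketch, not part of the
# analysis pipeline):
import numpy as np
from scipy.sparse.linalg import svds

X_demo = np.random.randn(20, 10)
X_demo -= X_demo.mean(axis=0)
U_demo, s_demo, V_T_demo = svds(X_demo, k=3)
F_demo = (V_T_demo.T @ np.diag(s_demo)) / np.sqrt(X_demo.shape[0] - 1)
gene_vars_demo = np.linalg.norm(F_demo, axis=1) ** 2  # per-gene variance along the k axes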
def run_ba_cov_sims(gene_list, pop_list, out_name, iter1=1000, iter2=1000):
    df_out = open(pt.get_path() + '/data/simulations/' + out_name + '.txt', 'w')
    df_out.write('\t'.join(['N', 'G', 'Cov', 'Iteration', 'dist_percent']) + '\n')
    covs = [0.1, 0.15, 0.2]
    for G in gene_list:
        for N in pop_list:
            for cov in covs:
                for i in range(iter1):
                    C = pt.get_ba_cov_matrix(G, cov)
                    # resample until every population carries at least one mutation
                    while True:
                        lambda_genes = np.random.gamma(shape=1, scale=1, size=G)
                        test_cov = np.stack(
                            [pt.get_count_pop(lambda_genes, cov=C) for x in range(N)],
                            axis=0)
                        if not np.any(test_cov.sum(axis=1) == 0):
                            break
                    X = pt.hellinger_transform(test_cov)
                    pca = PCA()
                    pca_fit = pca.fit_transform(X)
                    euc_dist = pt.get_mean_pairwise_euc_distance(pca_fit)
                    euc_dists = []
                    for j in range(iter2):
                        X_j = pt.hellinger_transform(pt.get_random_matrix(test_cov))
                        pca_fit_j = pca.fit_transform(X_j)
                        euc_dists.append(pt.get_mean_pairwise_euc_distance(pca_fit_j))
                    euc_percent = len([k for k in euc_dists if k < euc_dist]) / len(euc_dists)
                    print(N, G, cov, i, euc_percent)
                    df_out.write('\t'.join([str(N), str(G), str(cov), str(i),
                                            str(euc_percent)]) + '\n')
    df_out.close()
def gene_svd_tenaillon_sample_size(iter1=1000, iter2=10000, k=3):
    df_path = os.path.expanduser("~/GitHub/ParEvol") + '/data/Tenaillon_et_al/gene_by_pop.txt'
    df = pd.read_csv(df_path, sep='\t', header='infer', index_col=0)
    df_np = df.values
    gene_names = df.columns.values
    n_rows = list(range(df_np.shape[0]))
    df_out = open(os.path.expanduser("~/GitHub/ParEvol") + '/data/Tenaillon_et_al/gene_z_scores_sample_size.txt', 'w')
    df_out.write('\t'.join(['N', 'G', 'Iteration', 'set_percent']) + '\n')
    Ns = list(range(4, 40, 2))
    # get genes with an absolute z-score greater than 1.96
    df_gene_path = os.path.expanduser("~/GitHub/ParEvol") + '/data/Tenaillon_et_al/gene_z_scores.txt'
    df_genes = pd.read_csv(df_gene_path, sep='\t', header='infer')
    df_genes_sig = df_genes.loc[(df_genes['z_score'] > 1.96) | (df_genes['z_score'] < -1.96)]
    genes = df_genes_sig.Gene.tolist()
    for N in Ns:
        for i in range(iter1):
            df_np_i = df_np[np.random.choice(n_rows, size=N, replace=False, p=None), :]
            # flip the boolean mask around to select genes hit in the sample
            gene_bool = np.all(df_np_i == 0, axis=0)
            gene_names_i = list(compress(gene_names, list(map(operator.not_, gene_bool))))
            df_np_i = df_np_i[:, ~np.all(df_np_i == 0, axis=0)]
            np.seterr(divide='ignore')
            #df_np_i_delta = cd.likelihood_matrix_array(df_np_i, gene_names_i, 'Tenaillon_et_al').get_likelihood_matrix()
            X = pt.get_mean_center(df_np_i)
            U, s, V_T = svds(X, k=k)
            # transpose V back to calculate the basis matrix
            F = (V_T.T @ np.diag(s)) / np.sqrt(X.shape[0] - 1)
            gene_vars = np.linalg.norm(F, axis=1) ** 2
            vars_null_list = []
            for j in range(iter2):
                df_np_i_j = pt.get_random_matrix(df_np_i)
                np.seterr(divide='ignore')
                #df_np_i_j_delta = cd.likelihood_matrix_array(df_np_i_j, gene_names_i, 'Tenaillon_et_al').get_likelihood_matrix()
                X_j = pt.get_mean_center(df_np_i_j)
                U_j, s_j, V_j_T = svds(X_j, k=k)
                F_j = (V_j_T.T @ np.diag(s_j)) / np.sqrt(X_j.shape[0] - 1)
                vars_null_list.append(np.linalg.norm(F_j, axis=1) ** 2)
            vars_null_i = np.stack(vars_null_list)
            vars_null_i_mean = np.mean(vars_null_i, axis=0)
            vars_null_i_std = np.std(vars_null_i, axis=0)
            z_scores = (gene_vars - vars_null_i_mean) / vars_null_i_std
            label_z_scores = list(zip(gene_names_i, z_scores))
            label_sig_z_scores = [x for x in label_z_scores if abs(x[1]) > 1.96]
            label_sig_z_scores_label = [x[0] for x in label_sig_z_scores]
            gene_inter = set(label_sig_z_scores_label) & set(genes)
            union_fract = len(gene_inter) / len(genes)
            print(N, i, union_fract)
            G = df_np_i.shape[1]
            df_out.write('\t'.join([str(N), str(G), str(i), str(union_fract)]) + '\n')
    df_out.close()
def time_partition_ltee(k=5, iter=100):
    df_path = os.path.expanduser("~/GitHub/ParEvol") + '/data/Good_et_al/gene_by_pop.txt'
    df = pd.read_csv(df_path, sep='\t', header='infer', index_col=0)
    to_include = pt.complete_nonmutator_lines()
    df_nonmut = df[df.index.str.contains('|'.join(to_include))]
    # remove columns with all zeros
    df_nonmut = df_nonmut.loc[:, (df_nonmut != 0).any(axis=0)]
    # make sure it's sorted
    df_nonmut.sort_index(inplace=True)
    time_points = [int(x.split('_')[1]) for x in df_nonmut.index.values]
    time_points_set = sorted(list(set(time_points)))
    time_points_positions = {}
    for x in time_points_set:
        time_points_positions[x] = [i for i, j in enumerate(time_points) if j == x]
    t_final_df = df_nonmut.iloc[time_points_positions[max(time_points_set)]]
    t_final_np = t_final_df.values
    gene_names = df_nonmut.columns.tolist()
    df_out = open(os.path.expanduser("~/GitHub/ParEvol") + '/data/Good_et_al/time_partition_z_scores.txt', 'w')
    df_out.write('\t'.join(['Time', 'Time_less_z_score', 'Time_greater_z_score']) + '\n')
    for time_point in time_points_set:
        # very few mutations after generation 50000
        if time_point > 50000:
            continue
        print("Time point " + str(time_point))
        t_i_df = df_nonmut.iloc[time_points_positions[time_point]]
        t_i_np = t_i_df.values
        # remove rows with all zeros
        t_i_np_zeros = np.where(~t_i_np.any(axis=1))[0]
        n_zeros_t_i_np = len(t_i_np_zeros)
        if n_zeros_t_i_np > 0:
            t_i_np = np.delete(t_i_np, t_i_np_zeros, axis=0)
        # drop the same rows from the final time point so the subtraction lines up
        t_i_to_final_np = np.delete(t_final_np, t_i_np_zeros, axis=0) - t_i_np
        # remove rows with all zeros
        t_i_to_final_np_zeros = np.where(~t_i_to_final_np.any(axis=1))[0]
        n_zeros_t_i_to_final_np = len(t_i_to_final_np_zeros)
        if n_zeros_t_i_to_final_np > 0:
            t_i_to_final_np = np.delete(t_i_to_final_np, t_i_to_final_np_zeros, axis=0)
        t_concat = np.concatenate((t_i_np, t_i_to_final_np), axis=0)
        t_norm = cd.likelihood_matrix_array(t_concat, gene_names, 'Good_et_al').get_likelihood_matrix()
        t_norm_rel = t_norm / t_norm.sum(axis=1, keepdims=True)
        # the column v[:, i] is the normalized eigenvector corresponding to the
        # eigenvalue w[i]; take the first five axes
        e_vals, e_vecs = pt.pca_np(t_norm_rel)
        e_vecs_k5 = e_vecs[:, -1 - k:-1]
        # account for rows with zero mutations: the first (5 - n_zeros_t_i_np) rows
        # belong to the time-point block, the rest to the final-minus-t block
        e_vec_t_less = e_vecs_k5[:5 - n_zeros_t_i_np, :]
        e_vec_t_greater = e_vecs_k5[5 - n_zeros_t_i_np:, :]
        dist_t_less = pt.get_mean_pairwise_euc_distance(e_vec_t_less, k=k)
        dist_t_greater = pt.get_mean_pairwise_euc_distance(e_vec_t_greater, k=k)
        dist_t_less_list = []
        dist_t_greater_list = []
        for i in range(iter):
            t_i_np_rndm = pt.get_random_matrix(t_i_np)
            t_i_to_final_np_rndm = pt.get_random_matrix(t_i_to_final_np)
            t_rndm_concat = np.concatenate((t_i_np_rndm, t_i_to_final_np_rndm), axis=0)
            t_rndm_norm = cd.likelihood_matrix_array(t_rndm_concat, gene_names, 'Good_et_al').get_likelihood_matrix()
            t_rndm_norm_rel = t_rndm_norm / t_rndm_norm.sum(axis=1, keepdims=True)
            e_vals_rndm, e_vecs_rndm = pt.pca_np(t_rndm_norm_rel)
            e_vecs_rndm_k5 = e_vecs_rndm[:, -1 - k:-1]
            # split the null scores at the same row boundary as the observed data
            e_vec_t_less_rndm = e_vecs_rndm_k5[:5 - n_zeros_t_i_np, :]
            e_vec_t_greater_rndm = e_vecs_rndm_k5[5 - n_zeros_t_i_np:, :]
            dist_t_less_rndm = pt.get_mean_pairwise_euc_distance(e_vec_t_less_rndm, k=k)
            dist_t_greater_rndm = pt.get_mean_pairwise_euc_distance(e_vec_t_greater_rndm, k=k)
            dist_t_less_list.append(dist_t_less_rndm)
            dist_t_greater_list.append(dist_t_greater_rndm)
        z_score_less = (dist_t_less - np.mean(dist_t_less_list)) / np.std(dist_t_less_list)
        z_score_greater = (dist_t_greater - np.mean(dist_t_greater_list)) / np.std(dist_t_greater_list)
        df_out.write('\t'.join([str(time_point), str(z_score_less),
                                str(z_score_greater)]) + '\n')
    df_out.close()
def time_partition_ltee(k=5, iter=1000):
    df_path = mydir + '/data/Good_et_al/gene_by_pop.txt'
    df = pd.read_csv(df_path, sep='\t', header='infer', index_col=0)
    to_include = pt.complete_nonmutator_lines()
    df_nonmut = df[df.index.str.contains('|'.join(to_include))]
    # remove columns with all zeros
    df_nonmut = df_nonmut.loc[:, (df_nonmut != 0).any(axis=0)]
    # make sure it's sorted
    df_nonmut.sort_index(inplace=True)
    time_points = [int(x.split('_')[1]) for x in df_nonmut.index.values]
    time_points_set = sorted(list(set(time_points)))
    time_points_positions = {}
    for x in time_points_set:
        time_points_positions[x] = [i for i, j in enumerate(time_points) if j == x]
    t_final_df = df_nonmut.iloc[time_points_positions[max(time_points_set)]]
    t_final_np = t_final_df.values
    gene_names = df_nonmut.columns.tolist()
    df_out = open(mydir + '/data/Good_et_al/time_partition_z_scores.txt', 'w')
    df_out.write('\t'.join(['Time', 'less_mpd', 'greater_mpd', 'delta_mpd',
                            'less_mpd_025', 'less_mpd_975',
                            'greater_mpd_025', 'greater_mpd_975',
                            'delta_mpd_025', 'delta_mpd_975']) + '\n')
    for time_point in time_points_set:
        # very few mutations after generation 50000
        if time_point > 50000:
            continue
        print("Time point " + str(time_point))
        t_i_df = df_nonmut.iloc[time_points_positions[time_point]]
        t_i_np = t_i_df.values
        # remove rows with all zeros
        t_i_np_zeros = np.where(~t_i_np.any(axis=1))[0]
        n_zeros_t_i_np = len(t_i_np_zeros)
        if n_zeros_t_i_np > 0:
            t_i_np = np.delete(t_i_np, t_i_np_zeros, axis=0)
        # drop the same rows from the final time point so the subtraction lines up
        t_i_to_final_np = np.delete(t_final_np, t_i_np_zeros, axis=0) - t_i_np
        # remove rows with all zeros
        t_i_to_final_np_zeros = np.where(~t_i_to_final_np.any(axis=1))[0]
        n_zeros_t_i_to_final_np = len(t_i_to_final_np_zeros)
        if n_zeros_t_i_to_final_np > 0:
            t_i_to_final_np = np.delete(t_i_to_final_np, t_i_to_final_np_zeros, axis=0)
        t_concat = np.concatenate((t_i_np, t_i_to_final_np), axis=0)
        t_norm = cd.likelihood_matrix_array(t_concat, gene_names, 'Good_et_al').get_likelihood_matrix()
        t_norm_rel = t_norm / t_norm.sum(axis=1, keepdims=True)
        t_norm_rel -= np.mean(t_norm_rel, axis=0)
        pca = PCA()
        t_norm_rel_pca = pca.fit_transform(t_norm_rel)
        t_norm_rel_pca_k5 = t_norm_rel_pca[:, -1 - k:-1]
        # account for rows with zero mutations: the first (5 - n_zeros_t_i_np) rows
        # belong to the time-point block, the rest to the final-minus-t block
        dist_t_less = pt.get_mean_pairwise_euc_distance(
            t_norm_rel_pca_k5[:5 - n_zeros_t_i_np, :], k=k)
        dist_t_greater = pt.get_mean_pairwise_euc_distance(
            t_norm_rel_pca_k5[5 - n_zeros_t_i_np:, :], k=k)
        dist_t_change = dist_t_greater - dist_t_less
        dist_t_less_list = []
        dist_t_greater_list = []
        dist_t_change_list = []
        for i in range(iter):
            if i % 1000 == 0:
                print("Iteration " + str(i))
            t_i_np_rndm = pt.get_random_matrix(t_i_np)
            t_i_to_final_np_rndm = pt.get_random_matrix(t_i_to_final_np)
            t_rndm_concat = np.concatenate((t_i_np_rndm, t_i_to_final_np_rndm), axis=0)
            t_rndm_norm = cd.likelihood_matrix_array(t_rndm_concat, gene_names, 'Good_et_al').get_likelihood_matrix()
            t_rndm_norm_rel = t_rndm_norm / t_rndm_norm.sum(axis=1, keepdims=True)
            t_rndm_norm_rel -= np.mean(t_rndm_norm_rel, axis=0)
            t_rndm_norm_rel_pca = pca.fit_transform(t_rndm_norm_rel)
            # first five axes
            t_rndm_norm_rel_pca_k5 = t_rndm_norm_rel_pca[:, -1 - k:-1]
            # split the null scores at the same row boundary as the observed data
            dist_t_less_rndm = pt.get_mean_pairwise_euc_distance(
                t_rndm_norm_rel_pca_k5[:5 - n_zeros_t_i_np, :], k=k)
            dist_t_greater_rndm = pt.get_mean_pairwise_euc_distance(
                t_rndm_norm_rel_pca_k5[5 - n_zeros_t_i_np:, :], k=k)
            dist_t_change_list.append(dist_t_greater_rndm - dist_t_less_rndm)
            dist_t_less_list.append(dist_t_less_rndm)
            dist_t_greater_list.append(dist_t_greater_rndm)
        dist_t_change_list.sort()
        dist_t_greater_list.sort()
        dist_t_less_list.sort()
        # get 95% CIs
        dist_t_change_025 = dist_t_change_list[int(iter * 0.025)]
        dist_t_change_975 = dist_t_change_list[int(iter * 0.975)]
        dist_t_greater_025 = dist_t_greater_list[int(iter * 0.025)]
        dist_t_greater_975 = dist_t_greater_list[int(iter * 0.975)]
        dist_t_less_025 = dist_t_less_list[int(iter * 0.025)]
        dist_t_less_975 = dist_t_less_list[int(iter * 0.975)]
        df_out.write('\t'.join([str(time_point), str(dist_t_less), str(dist_t_greater),
                                str(dist_t_change), str(dist_t_less_025), str(dist_t_less_975),
                                str(dist_t_greater_025), str(dist_t_greater_975),
                                str(dist_t_change_025), str(dist_t_change_975)]) + '\n')
    df_out.close()
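# pt.get_random_matrix is the null model used throughout this module: it
# randomizes a count matrix while preserving its marginals. The actual
# algorithm lives in the pt module; as a loudly labeled stand-in assumption,
# a minimal sketch that preserves row totals exactly (and column totals only
# in expectation) would be:
import numpy as np

def get_random_matrix_sketch(counts):
    counts = np.asarray(counts)
    col_probs = counts.sum(axis=0) / counts.sum()  # column totals as probabilities
    # redistribute each row's mutation count multinomially across genes
    return np.stack([np.random.multinomial(int(row_total), col_probs)
                     for row_total in counts.sum(axis=1)])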
n_rows = list(range(df_np.shape[0]))
df_np_delta = cd.likelihood_matrix_array(df_np, gene_names, "Tenaillon_et_al").get_likelihood_matrix()
X = df_np_delta / df_np_delta.sum(axis=1)[:, None]
X = X - np.mean(X, axis=0)
# cov = np.cov(X.T)
# ev, eig = np.linalg.eig(cov)
pca = PCA()
pca_fit = pca.fit_transform(X)
# L = pt.get_L_stat(max(ev), N, cov.shape[0])
eig = pt.get_x_stat(pca.explained_variance_[:-1], n_features=X.shape[1])
eig_null = []
for j in range(iter):
    df_np_j = pt.get_random_matrix(df_np)
    np.seterr(divide="ignore")
    df_np_j_delta = cd.likelihood_matrix_array(df_np_j, gene_names, "Tenaillon_et_al").get_likelihood_matrix()
    X_j = df_np_j_delta / df_np_j_delta.sum(axis=1)[:, None]
    X_j -= np.mean(X_j, axis=0)
    pca_j = PCA()
    pca_X_j = pca_j.fit_transform(X_j)
    eig_null.append(pt.get_x_stat(pca_j.explained_variance_[:-1], n_features=X.shape[1]))
eig_null = np.asarray(eig_null)
P_eig = len(eig_null[eig_null > eig]) / len(eig_null)
eig_power = open(pt.get_path() + "/data/Tenaillon_et_al/power_sample_size_l_stat.txt", "r")
def run_ba_ntwk_cluster_sims(iter1=1000, iter2=1000, cov=0.2):
    df_out = open(mydir + '/data/simulations/cov_ba_ntwrk_cluster_methods.txt', 'w')
    df_out.write('\t'.join(['Prob', 'CC_mean', 'CC_025', 'CC_975', 'Method',
                            'Power', 'Power_025', 'Power_975',
                            'Z_mean', 'Z_025', 'Z_975']) + '\n')
    n_pops = 100
    n_genes = 50
    ps = [0, 0.2, 0.4, 0.6, 0.8, 1]
    for p in ps:
        eig_p_list = []
        mcd_k1_p_list = []
        mcd_k3_p_list = []
        mpd_k1_p_list = []
        mpd_k3_p_list = []
        eig_z_list = []
        mcd_k1_z_list = []
        mcd_k3_z_list = []
        mpd_k1_z_list = []
        mpd_k3_z_list = []
        cc_list = []
        for i in range(iter1):
            if i % 100 == 0:
                print(p, i)
            lambda_genes = np.random.gamma(shape=3, scale=1, size=n_genes)
            C, cc = pt.get_ba_cov_matrix(n_genes, cov=cov, p=p)
            test_cov = np.stack([pt.get_count_pop(lambda_genes, cov=C) for x in range(n_pops)], axis=0)
            X = test_cov / test_cov.sum(axis=1)[:, None]
            X -= np.mean(X, axis=0)
            pca = PCA()
            pca_fit = pca.fit_transform(X)
            mpd_k1 = pt.get_mean_pairwise_euc_distance(pca_fit, k=1)
            mpd_k3 = pt.get_mean_pairwise_euc_distance(pca_fit, k=3)
            eig = pt.get_x_stat(pca.explained_variance_[:-1], n_features=n_genes)
            mcd_k1 = pt.get_mean_centroid_distance(pca_fit, k=1)
            mcd_k3 = pt.get_mean_centroid_distance(pca_fit, k=3)
            eig_null_list = []
            mcd_k1_null_list = []
            mcd_k3_null_list = []
            mpd_k1_null_list = []
            mpd_k3_null_list = []
            for j in range(iter2):
                test_cov_rndm = pt.get_random_matrix(test_cov)
                X_j = test_cov_rndm / test_cov_rndm.sum(axis=1)[:, None]
                X_j -= np.mean(X_j, axis=0)
                pca_j = PCA()
                pca_fit_j = pca_j.fit_transform(X_j)
                mpd_k1_null_list.append(pt.get_mean_pairwise_euc_distance(pca_fit_j, k=1))
                mpd_k3_null_list.append(pt.get_mean_pairwise_euc_distance(pca_fit_j, k=3))
                mcd_k1_null_list.append(pt.get_mean_centroid_distance(pca_fit_j, k=1))
                mcd_k3_null_list.append(pt.get_mean_centroid_distance(pca_fit_j, k=3))
                eig_null_list.append(pt.get_x_stat(pca_j.explained_variance_[:-1], n_features=n_genes))
            # p-values are the fraction of the iter2 null draws exceeding the observed statistic
            eig_p_list.append(len([k for k in eig_null_list if k > eig]) / iter2)
            mcd_k1_p_list.append(len([k for k in mcd_k1_null_list if k > mcd_k1]) / iter2)
            mcd_k3_p_list.append(len([k for k in mcd_k3_null_list if k > mcd_k3]) / iter2)
            mpd_k1_p_list.append(len([k for k in mpd_k1_null_list if k > mpd_k1]) / iter2)
            mpd_k3_p_list.append(len([k for k in mpd_k3_null_list if k > mpd_k3]) / iter2)
            cc_list.append(cc)
            eig_z_list.append((eig - np.mean(eig_null_list)) / np.std(eig_null_list))
            mcd_k1_z_list.append((mcd_k1 - np.mean(mcd_k1_null_list)) / np.std(mcd_k1_null_list))
            mcd_k3_z_list.append((mcd_k3 - np.mean(mcd_k3_null_list)) / np.std(mcd_k3_null_list))
            mpd_k1_z_list.append((mpd_k1 - np.mean(mpd_k1_null_list)) / np.std(mpd_k1_null_list))
            mpd_k3_z_list.append((mpd_k3 - np.mean(mpd_k3_null_list)) / np.std(mpd_k3_null_list))
        # bootstrap the mean clustering coefficient
        cc_mean = np.mean(cc_list)
        cc_bs_mean_list = []
        for iter_i in range(10000):
            cc_bs_mean_list.append(np.mean(np.random.choice(cc_list, size=50, replace=True)))
        cc_bs_mean_list.sort()
        cc_975 = cc_bs_mean_list[int(0.975 * 10000)]
        cc_025 = cc_bs_mean_list[int(0.025 * 10000)]
        # calculate power and bootstrapped CIs
        eig_power = len([n for n in eig_p_list if n < 0.05]) / iter1
        eig_power_025, eig_power_975 = get_bootstrap_power_ci(eig_p_list)
        mcd_k1_power = len([n for n in mcd_k1_p_list if n < 0.05]) / iter1
        mcd_k1_power_025, mcd_k1_power_975 = get_bootstrap_power_ci(mcd_k1_p_list)
        mcd_k3_power = len([n for n in mcd_k3_p_list if n < 0.05]) / iter1
        mcd_k3_power_025, mcd_k3_power_975 = get_bootstrap_power_ci(mcd_k3_p_list)
        mpd_k1_power = len([n for n in mpd_k1_p_list if n < 0.05]) / iter1
        mpd_k1_power_025, mpd_k1_power_975 = get_bootstrap_power_ci(mpd_k1_p_list)
        mpd_k3_power = len([n for n in mpd_k3_p_list if n < 0.05]) / iter1
        mpd_k3_power_025, mpd_k3_power_975 = get_bootstrap_power_ci(mpd_k3_p_list)
        eig_z_025, eig_z_975 = get_bootstrap_ci(eig_z_list)
        mcd_k1_z_025, mcd_k1_z_975 = get_bootstrap_ci(mcd_k1_z_list)
        mcd_k3_z_025, mcd_k3_z_975 = get_bootstrap_ci(mcd_k3_z_list)
        mpd_k1_z_025, mpd_k1_z_975 = get_bootstrap_ci(mpd_k1_z_list)
        mpd_k3_z_025, mpd_k3_z_975 = get_bootstrap_ci(mpd_k3_z_list)
        df_out.write('\t'.join([str(p), str(cc_mean), str(cc_025), str(cc_975), 'Eig',
                                str(eig_power), str(eig_power_025), str(eig_power_975),
                                str(np.mean(eig_z_list)), str(eig_z_025), str(eig_z_975)]) + '\n')
        df_out.write('\t'.join([str(p), str(cc_mean), str(cc_025), str(cc_975), 'MCD_k1',
                                str(mcd_k1_power), str(mcd_k1_power_025), str(mcd_k1_power_975),
                                str(np.mean(mcd_k1_z_list)), str(mcd_k1_z_025), str(mcd_k1_z_975)]) + '\n')
        df_out.write('\t'.join([str(p), str(cc_mean), str(cc_025), str(cc_975), 'MCD_k3',
                                str(mcd_k3_power), str(mcd_k3_power_025), str(mcd_k3_power_975),
                                str(np.mean(mcd_k3_z_list)), str(mcd_k3_z_025), str(mcd_k3_z_975)]) + '\n')
        df_out.write('\t'.join([str(p), str(cc_mean), str(cc_025), str(cc_975), 'MPD_k1',
                                str(mpd_k1_power), str(mpd_k1_power_025), str(mpd_k1_power_975),
                                str(np.mean(mpd_k1_z_list)), str(mpd_k1_z_025), str(mpd_k1_z_975)]) + '\n')
        df_out.write('\t'.join([str(p), str(cc_mean), str(cc_025), str(cc_975), 'MPD_k3',
                                str(mpd_k3_power), str(mpd_k3_power_025), str(mpd_k3_power_975),
                                str(np.mean(mpd_k3_z_list)), str(mpd_k3_z_025), str(mpd_k3_z_975)]) + '\n')
    df_out.close()
def run_ba_ntwk_cov_sims(iter1=1000, iter2=1000, n_pops=100, n_genes=50):
    df_out = open(mydir + '/data/simulations/cov_ba_ntwrk_methods.txt', 'w')
    df_out.write('\t'.join(['Cov', 'Method', 'Power', 'Power_025', 'Power_975',
                            'Z_mean', 'Z_025', 'Z_975']) + '\n')
    covs = [0.05, 0.1, 0.15, 0.2]
    for cov in covs:
        eig_p_list = []
        mcd_k1_p_list = []
        mcd_k3_p_list = []
        mpd_k1_p_list = []
        mpd_k3_p_list = []
        eig_z_list = []
        mcd_k1_z_list = []
        mcd_k3_z_list = []
        mpd_k1_z_list = []
        mpd_k3_z_list = []
        for i in range(iter1):
            if i % 100 == 0:
                print(cov, i)
            lambda_genes = np.random.gamma(shape=3, scale=1, size=n_genes)
            C = pt.get_ba_cov_matrix(n_genes, cov=cov)
            test_cov = np.stack([pt.get_count_pop(lambda_genes, cov=C) for x in range(n_pops)], axis=0)
            X = test_cov / test_cov.sum(axis=1)[:, None]
            X -= np.mean(X, axis=0)
            pca = PCA()
            pca_fit = pca.fit_transform(X)
            mpd_k1 = pt.get_mean_pairwise_euc_distance(pca_fit, k=1)
            mpd_k3 = pt.get_mean_pairwise_euc_distance(pca_fit, k=3)
            eig = pt.get_x_stat(pca.explained_variance_[:-1], n_features=n_genes)
            mcd_k1 = pt.get_mean_centroid_distance(pca_fit, k=1)
            mcd_k3 = pt.get_mean_centroid_distance(pca_fit, k=3)
            eig_null_list = []
            mcd_k1_null_list = []
            mcd_k3_null_list = []
            mpd_k1_null_list = []
            mpd_k3_null_list = []
            for j in range(iter2):
                test_cov_rndm = pt.get_random_matrix(test_cov)
                X_j = test_cov_rndm / test_cov_rndm.sum(axis=1)[:, None]
                X_j -= np.mean(X_j, axis=0)
                pca_j = PCA()
                pca_fit_j = pca_j.fit_transform(X_j)
                mpd_k1_null_list.append(pt.get_mean_pairwise_euc_distance(pca_fit_j, k=1))
                mpd_k3_null_list.append(pt.get_mean_pairwise_euc_distance(pca_fit_j, k=3))
                mcd_k1_null_list.append(pt.get_mean_centroid_distance(pca_fit_j, k=1))
                mcd_k3_null_list.append(pt.get_mean_centroid_distance(pca_fit_j, k=3))
                eig_null_list.append(pt.get_x_stat(pca_j.explained_variance_[:-1], n_features=n_genes))
            # p-values are the fraction of the iter2 null draws exceeding the observed statistic
            eig_p_list.append(len([k for k in eig_null_list if k > eig]) / iter2)
            mcd_k1_p_list.append(len([k for k in mcd_k1_null_list if k > mcd_k1]) / iter2)
            mcd_k3_p_list.append(len([k for k in mcd_k3_null_list if k > mcd_k3]) / iter2)
            mpd_k1_p_list.append(len([k for k in mpd_k1_null_list if k > mpd_k1]) / iter2)
            mpd_k3_p_list.append(len([k for k in mpd_k3_null_list if k > mpd_k3]) / iter2)
            eig_z_list.append((eig - np.mean(eig_null_list)) / np.std(eig_null_list))
            mcd_k1_z_list.append((mcd_k1 - np.mean(mcd_k1_null_list)) / np.std(mcd_k1_null_list))
            mcd_k3_z_list.append((mcd_k3 - np.mean(mcd_k3_null_list)) / np.std(mcd_k3_null_list))
            mpd_k1_z_list.append((mpd_k1 - np.mean(mpd_k1_null_list)) / np.std(mpd_k1_null_list))
            mpd_k3_z_list.append((mpd_k3 - np.mean(mpd_k3_null_list)) / np.std(mpd_k3_null_list))
        # calculate power and bootstrapped CIs
        eig_power = len([n for n in eig_p_list if n < 0.05]) / iter1
        eig_power_025, eig_power_975 = get_bootstrap_power_ci(eig_p_list)
        mcd_k1_power = len([n for n in mcd_k1_p_list if n < 0.05]) / iter1
        mcd_k1_power_025, mcd_k1_power_975 = get_bootstrap_power_ci(mcd_k1_p_list)
        mcd_k3_power = len([n for n in mcd_k3_p_list if n < 0.05]) / iter1
        mcd_k3_power_025, mcd_k3_power_975 = get_bootstrap_power_ci(mcd_k3_p_list)
        mpd_k1_power = len([n for n in mpd_k1_p_list if n < 0.05]) / iter1
        mpd_k1_power_025, mpd_k1_power_975 = get_bootstrap_power_ci(mpd_k1_p_list)
        mpd_k3_power = len([n for n in mpd_k3_p_list if n < 0.05]) / iter1
        mpd_k3_power_025, mpd_k3_power_975 = get_bootstrap_power_ci(mpd_k3_p_list)
        eig_z_025, eig_z_975 = get_bootstrap_ci(eig_z_list)
        mcd_k1_z_025, mcd_k1_z_975 = get_bootstrap_ci(mcd_k1_z_list)
        mcd_k3_z_025, mcd_k3_z_975 = get_bootstrap_ci(mcd_k3_z_list)
        mpd_k1_z_025, mpd_k1_z_975 = get_bootstrap_ci(mpd_k1_z_list)
        mpd_k3_z_025, mpd_k3_z_975 = get_bootstrap_ci(mpd_k3_z_list)
        df_out.write('\t'.join([str(cov), 'Eig', str(eig_power), str(eig_power_025),
                                str(eig_power_975), str(np.mean(eig_z_list)),
                                str(eig_z_025), str(eig_z_975)]) + '\n')
        df_out.write('\t'.join([str(cov), 'MCD_k1', str(mcd_k1_power), str(mcd_k1_power_025),
                                str(mcd_k1_power_975), str(np.mean(mcd_k1_z_list)),
                                str(mcd_k1_z_025), str(mcd_k1_z_975)]) + '\n')
        df_out.write('\t'.join([str(cov), 'MCD_k3', str(mcd_k3_power), str(mcd_k3_power_025),
                                str(mcd_k3_power_975), str(np.mean(mcd_k3_z_list)),
                                str(mcd_k3_z_025), str(mcd_k3_z_975)]) + '\n')
        df_out.write('\t'.join([str(cov), 'MPD_k1', str(mpd_k1_power), str(mpd_k1_power_025),
                                str(mpd_k1_power_975), str(np.mean(mpd_k1_z_list)),
                                str(mpd_k1_z_025), str(mpd_k1_z_975)]) + '\n')
        df_out.write('\t'.join([str(cov), 'MPD_k3', str(mpd_k3_power), str(mpd_k3_power_025),
                                str(mpd_k3_power_975), str(np.mean(mpd_k3_z_list)),
                                str(mpd_k3_z_025), str(mpd_k3_z_975)]) + '\n')
    df_out.close()
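# get_bootstrap_power_ci and get_bootstrap_ci are called above but not defined
# in this section; a minimal sketch of what they are assumed to do (a
# percentile bootstrap, with power defined as the fraction of p-values < 0.05):
import numpy as np

def get_bootstrap_ci(values, n_boot=10000, alpha=0.05):
    values = np.asarray(values)
    boot_means = np.sort([np.mean(np.random.choice(values, size=len(values), replace=True))
                          for _ in range(n_boot)])
    return boot_means[int(n_boot * alpha / 2)], boot_means[int(n_boot * (1 - alpha / 2))]

def get_bootstrap_power_ci(p_values, n_boot=10000, alpha=0.05):
    p_values = np.asarray(p_values)
    boot_powers = np.sort([np.mean(np.random.choice(p_values, size=len(p_values), replace=True) < 0.05)
                           for _ in range(n_boot)])
    return boot_powers[int(n_boot * alpha / 2)], boot_powers[int(n_boot * (1 - alpha / 2))]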