def permute_ltee(k=5, n_blocks=2, iter=1000, analysis='PCA'):
    """Null-model permutation test for the LTEE gene-by-population matrix.

    For each iteration, randomizes the mutation-count increments between
    consecutive sampled generations (preserving the cumulative structure),
    ordinates the randomized matrix (PCA or classical MDS), and writes four
    per-generation dispersion statistics to
    data/Good_et_al/permute_<analysis>.txt.

    Parameters
    ----------
    k : int
        Number of ordination axes passed to the distance/angle statistics.
    n_blocks : int
        Unused; kept for backward compatibility with existing callers.
    iter : int
        Number of permutation iterations (new parameter; default preserves
        a sensible permutation count).
    analysis : str
        'PCA' or 'cMDS'; selects the ordination method (new parameter).
    """
    df_path = os.path.expanduser(
        "~/GitHub/ParEvol") + '/data/Good_et_al/gene_by_pop.txt'
    df = pd.read_csv(df_path, sep='\t', header='infer', index_col=0)
    # BUG FIX: original referenced undefined names `to_keep`/`to_exclude`.
    # Keep the complete nonmutator lines minus population p5, matching the
    # filtering used elsewhere in this file (see ltee_convergence).
    to_keep = pt.complete_nonmutator_lines()
    to_keep.remove('p5')
    df_nonmut = df[df.index.str.contains('|'.join(to_keep))]
    # remove columns with all zeros
    df_nonmut = df_nonmut.loc[:, (df_nonmut != 0).any(axis=0)]
    # row labels look like '<pop>_<generation>'; collect sampled generations
    time_points = [int(x.split('_')[1]) for x in df_nonmut.index.values]
    time_points_set = sorted(set(time_points))
    # map each generation to the row positions sampled at that generation
    time_points_positions = {}
    for x in time_points_set:
        time_points_positions[x] = [
            i for i, j in enumerate(time_points) if j == x
        ]
    out_path = pt.get_path() + '/data/Good_et_al/permute_' + analysis + '.txt'
    with open(out_path, 'w') as df_out:
        # BUG FIX: the header must name all six columns written per row below
        # (the original wrote three header names for six value columns, which
        # misaligned the file for plot_permutation's pandas reader).
        column_headers = ['Iteration', 'Generation', 'MCD', 'mean_angle',
                          'mean_length', 'mean_dist']
        df_out.write('\t'.join(column_headers) + '\n')
        for i in range(iter):
            # BUG FIX: the original loop began with a bare `continue`,
            # making the entire permutation body dead code.
            print("Iteration " + str(i))
            # randomize the first time point's counts
            matrix_0 = df_nonmut.iloc[time_points_positions[time_points_set[0]]]
            matrix_0_rndm = pt.random_matrix(matrix_0.as_matrix())
            df_rndm_list = [
                pd.DataFrame(data=matrix_0_rndm,
                             index=matrix_0.index,
                             columns=matrix_0.columns)
            ]
            # skip first time step; randomize each increment and accumulate
            for j, tp in enumerate(time_points_set):
                if j == 0:
                    continue
                df_tp_minus1 = df_nonmut[df_nonmut.index.str.contains(
                    '_' + str(time_points_set[j - 1]))]
                df_tp = df_nonmut[df_nonmut.index.str.contains('_' + str(tp))]
                matrix_diff = df_tp.as_matrix() - df_tp_minus1.as_matrix()
                matrix_0_rndm = matrix_0_rndm + pt.random_matrix(matrix_diff)
                df_0_rndm = pd.DataFrame(data=matrix_0_rndm,
                                         index=df_tp.index,
                                         columns=df_tp.columns)
                df_rndm_list.append(df_0_rndm)
            df_rndm = pd.concat(df_rndm_list)
            df_rndm_delta = pt.likelihood_matrix(
                df_rndm, 'Good_et_al').get_likelihood_matrix()
            if analysis == 'PCA':
                X = pt.hellinger_transform(df_rndm_delta)
                pca = PCA()
                matrix_rndm_delta_out = pca.fit_transform(X)
            elif analysis == 'cMDS':
                matrix_rndm_delta_bc = np.sqrt(
                    pt.get_bray_curtis(df_rndm_delta.as_matrix()))
                matrix_rndm_delta_out = pt.cmdscale(matrix_rndm_delta_bc)[0]
            else:
                print("Analysis argument not accepted")
                continue
            df_rndm_delta_out = pd.DataFrame(data=matrix_rndm_delta_out,
                                             index=df_rndm_delta.index)
            # per-generation dispersion statistics on the ordination
            for tp in time_points_set:
                df_rndm_delta_out_tp = df_rndm_delta_out[
                    df_rndm_delta_out.index.str.contains('_' + str(tp))]
                df_rndm_delta_out_tp_matrix = df_rndm_delta_out_tp.as_matrix()
                mean_angle = pt.get_mean_angle(df_rndm_delta_out_tp_matrix, k=k)
                mcd = pt.get_mean_centroid_distance(df_rndm_delta_out_tp_matrix,
                                                    k=k)
                mean_length = pt.get_euc_magnitude_diff(
                    df_rndm_delta_out_tp_matrix, k=k)
                mean_dist = pt.get_mean_pairwise_euc_distance(
                    df_rndm_delta_out_tp_matrix, k=k)
                df_out.write('\t'.join([
                    str(i), str(tp), str(mcd), str(mean_angle),
                    str(mean_length), str(mean_dist)
                ]) + '\n')
def time_partition_ltee(k=5, iter=100):
    # Partitions each nonmutator population's mutation trajectory at a given
    # generation into "before" (t_i) and "after" (t_final - t_i) halves,
    # ordinates both halves together, and writes a z-score (vs. a permutation
    # null) of the mean pairwise distance for each half to
    # data/Good_et_al/time_partition_z_scores.txt.
    # NOTE(review): `iter` shadows the builtin; also a second function with
    # this same name appears later in this file and will shadow this one at
    # import time — confirm which definition is intended to be live.
    df_path = os.path.expanduser(
        "~/GitHub/ParEvol") + '/data/Good_et_al/gene_by_pop.txt'
    df = pd.read_csv(df_path, sep='\t', header='infer', index_col=0)
    # keep only the complete nonmutator populations
    to_include = pt.complete_nonmutator_lines()
    df_nonmut = df[df.index.str.contains('|'.join(to_include))]
    # remove columns with all zeros
    df_nonmut = df_nonmut.loc[:, (df_nonmut != 0).any(axis=0)]
    # make sure it's sorted
    df_nonmut.sort_index(inplace=True)
    # row labels look like '<pop>_<generation>'
    time_points = [int(x.split('_')[1]) for x in df_nonmut.index.values]
    time_points_set = sorted(
        list(set([int(x.split('_')[1]) for x in df_nonmut.index.values])))
    # map each generation to the row positions sampled at that generation
    time_points_positions = {}
    for x in time_points_set:
        time_points_positions[x] = [
            i for i, j in enumerate(time_points) if j == x
        ]
    # final-generation counts; one row per population
    t_final_df = df_nonmut.iloc[time_points_positions[max(time_points_set)]]
    t_final_np = t_final_df.values
    gene_names = df_nonmut.columns.tolist()
    df_out = open(
        os.path.expanduser("~/GitHub/ParEvol") +
        '/data/Good_et_al/time_partition_z_scores.txt', 'w')
    df_out.write(
        '\t'.join(['Time', 'Time_less_z_score', 'Time_greater_z_score']) +
        '\n')
    for time_point in time_points_set:
        # very few mutations after generation 50000
        if time_point > 50000:
            continue
        print("Time point " + str(time_point))
        t_i_df = df_nonmut.iloc[time_points_positions[time_point]]
        t_i_np = t_i_df.values
        # remove rows with all zeros
        t_i_np_zeros = np.where(~t_i_np.any(axis=1))[0]
        n_zeros_t_i_np = len(t_i_np_zeros)
        if n_zeros_t_i_np > 0:
            t_i_np = np.delete(t_i_np, t_i_np_zeros, axis=0)
        # mutations accumulated after time_point
        # NOTE(review): if rows were deleted above, t_final_np and t_i_np no
        # longer have the same number of rows and this subtraction would
        # raise — confirm whether all-zero rows can actually occur here.
        t_i_to_final_np = t_final_np - t_i_np
        # remove rows with all zeros
        t_i_to_final_np_zeros = np.where(~t_i_to_final_np.any(axis=1))[0]
        n_zeros_t_i_to_final_np = len(t_i_to_final_np_zeros)
        if n_zeros_t_i_to_final_np > 0:
            t_i_to_final_np = np.delete(t_i_to_final_np,
                                        t_i_to_final_np_zeros,
                                        axis=0)
        # stack "before" rows on top of "after" rows for joint ordination
        t_concat = np.concatenate((t_i_np, t_i_to_final_np), axis=0)
        t_norm = cd.likelihood_matrix_array(
            t_concat, gene_names, 'Good_et_al').get_likelihood_matrix()
        # row-normalize to relative contributions
        t_norm_rel = t_norm / t_norm.sum(axis=1, keepdims=True)
        # first five axes
        e_vals, e_vecs = pt.pca_np(t_norm_rel)
        # The column v[:, i] is the normalized eigenvector corresponding to the eigenvalue w[i]
        # NOTE(review): this selects k columns but excludes the very last one;
        # assumes pt.pca_np returns eigenvalues in ascending order (eigh
        # convention) — confirm the intended top-k axes are selected.
        e_vecs_k5 = e_vecs[:, -1 - k:-1]
        # account for rows with zero mutations
        # NOTE(review): the first block of t_concat has (5 - n_zeros_t_i_np)
        # rows, so the "greater" slice arguably should start at
        # 5 - n_zeros_t_i_np rather than 5 - n_zeros_t_i_to_final_np whenever
        # the two zero-counts differ — confirm.
        e_vec_t_less = e_vecs_k5[:5 - n_zeros_t_i_np, :]
        e_vec_t_greater = e_vecs_k5[5 - n_zeros_t_i_to_final_np:, :]
        dist_t_less = pt.get_mean_pairwise_euc_distance(e_vec_t_less, k=k)
        dist_t_greater = pt.get_mean_pairwise_euc_distance(e_vec_t_greater,
                                                           k=k)
        # permutation null distributions for both halves
        dist_t_less_list = []
        dist_t_greater_list = []
        for i in range(iter):
            t_i_np_rndm = pt.get_random_matrix(t_i_np)
            t_i_to_final_np_rndm = pt.get_random_matrix(t_i_to_final_np)
            t_rndm_concat = np.concatenate((t_i_np_rndm,
                                            t_i_to_final_np_rndm),
                                           axis=0)
            t_rndm_norm = cd.likelihood_matrix_array(
                t_rndm_concat, gene_names,
                'Good_et_al').get_likelihood_matrix()
            t_rndm_norm_rel = t_rndm_norm / t_rndm_norm.sum(axis=1,
                                                            keepdims=True)
            # first five axes
            e_vals_rndm, e_vecs_rndm = pt.pca_np(t_rndm_norm_rel)
            # The column v[:, i] is the normalized eigenvector corresponding to the eigenvalue w[i]
            e_vecs_rndm_k5 = e_vecs_rndm[:, -1 - k:-1]
            # NOTE(review): unlike the observed split above, the permuted
            # split ignores the deleted zero-rows ([:5]/[5:]) — confirm this
            # asymmetry is intentional.
            e_vec_t_less_rndm = e_vecs_rndm_k5[:5, :]
            e_vec_t_greater_rndm = e_vecs_rndm_k5[5:, :]
            dist_t_less_rndm = pt.get_mean_pairwise_euc_distance(
                e_vec_t_less_rndm, k=k)
            dist_t_greater_rndm = pt.get_mean_pairwise_euc_distance(
                e_vec_t_greater_rndm, k=k)
            dist_t_less_list.append(dist_t_less_rndm)
            dist_t_greater_list.append(dist_t_greater_rndm)
        # standardize observed distances against the permutation nulls
        z_score_less = (dist_t_less -
                        np.mean(dist_t_less_list)) / np.std(dist_t_less_list)
        z_score_greater = (dist_t_greater - np.mean(dist_t_greater_list)
                           ) / np.std(dist_t_greater_list)
        df_out.write('\t'.join(
            [str(time_point), str(z_score_less), str(z_score_greater)]) +
            '\n')
    df_out.close()
def plot_permutation(dataset = 'good', analysis = 'PCA', alpha = 0.05):
    # Plots observed per-generation mean pairwise Euclidean distance against
    # the permutation null produced by permute_ltee, as (top) raw values with
    # null CIs and (bottom) z-scores; saves figs/permutation_scatter_good.png.
    # `dataset` is currently unused; `alpha` sets the CI tail fraction.
    df_path = pt.get_path() + '/data/Good_et_al/gene_by_pop.txt'
    df = pd.read_csv(df_path, sep = '\t', header = 'infer', index_col = 0)
    # NOTE(review): despite the name, `to_exclude` is used as an *inclusion*
    # filter below, and here p5 is appended whereas other functions in this
    # file remove p5 — confirm which population set is intended.
    to_exclude = pt.complete_nonmutator_lines()
    to_exclude.append('p5')
    df_nonmut = df[df.index.str.contains('|'.join( to_exclude))]
    # remove columns with all zeros
    df_nonmut = df_nonmut.loc[:, (df_nonmut != 0).any(axis=0)]
    df_delta = pt.likelihood_matrix(df_nonmut,
                                    'Good_et_al').get_likelihood_matrix()
    # ordinate the observed matrix with the same method used for the null
    if analysis == 'PCA':
        X = pt.hellinger_transform(df_delta)
        pca = PCA()
        df_out = pca.fit_transform(X)
    elif analysis == 'cMDS':
        df_delta_bc = np.sqrt(pt.get_scipy_bray_curtis(df_delta.as_matrix()))
        df_out = pt.cmdscale(df_delta_bc)[0]
    # row labels look like '<pop>_<generation>'
    time_points = [ int(x.split('_')[1]) for x in df_nonmut.index.values]
    time_points_set = sorted(list(set([ int(x.split('_')[1]) for x in
                                        df_nonmut.index.values])))
    df_rndm_delta_out = pd.DataFrame(data=df_out, index=df_delta.index)
    # observed mean pairwise distance per generation (first 3 axes)
    mcds = []
    for tp in time_points_set:
        df_rndm_delta_out_tp = df_rndm_delta_out[
            df_rndm_delta_out.index.str.contains('_' + str(tp))]
        mcds.append(pt.get_mean_pairwise_euc_distance(
            df_rndm_delta_out_tp.as_matrix(), k=3))
    # load the permutation null written by permute_ltee
    mcd_perm_path = pt.get_path() + '/data/Good_et_al/permute_' + analysis + '.txt'
    mcd_perm = pd.read_csv(mcd_perm_path, sep = '\t', header = 'infer',
                           index_col = 0)
    mcd_perm_x = np.sort(list(set(mcd_perm.Generation.tolist())))
    lower_ci = []
    upper_ci = []
    mean_mcds = []
    std_mcds = []
    lower_z_ci = []
    upper_z_ci = []
    for x in mcd_perm_x:
        # null distribution of mean_dist at this generation
        mcd_perm_y = mcd_perm.loc[mcd_perm['Generation'] == x]
        mcd_perm_y_sort = np.sort(mcd_perm_y.mean_dist.tolist())
        mean_mcd_perm_y = np.mean(mcd_perm_y_sort)
        std_mcd_perm_y = np.std(mcd_perm_y_sort)
        mean_mcds.append(mean_mcd_perm_y)
        std_mcds.append(std_mcd_perm_y)
        # empirical alpha / (1 - alpha) quantiles, expressed as offsets from
        # the null mean for errorbar()
        lower_ci.append(mean_mcd_perm_y -
                        mcd_perm_y_sort[int(len(mcd_perm_y_sort) * alpha)])
        upper_ci.append(abs(mean_mcd_perm_y -
                            mcd_perm_y_sort[int(len(mcd_perm_y_sort) *
                                                (1 - alpha))]))
        # z-scores
        mcd_perm_y_sort_z = [
            ((i - mean_mcd_perm_y) / std_mcd_perm_y) for i in mcd_perm_y_sort]
        lower_z_ci.append(abs(mcd_perm_y_sort_z[int(len(mcd_perm_y_sort_z) *
                                                    alpha)]))
        upper_z_ci.append(abs(mcd_perm_y_sort_z[int(len(mcd_perm_y_sort_z) *
                                                    (1 - alpha))]))
    fig = plt.figure()
    # top panel: raw distances, null mean with CI bars behind observed points
    plt.figure(1)
    plt.subplot(211)
    plt.errorbar(mcd_perm_x, mean_mcds, yerr = [lower_ci, upper_ci], fmt = 'o', alpha = 0.5, \
        barsabove = True, marker = '.', mfc = 'k', mec = 'k', c = 'k', zorder=1)
    plt.scatter(time_points_set, mcds, c='#175ac6', marker = 'o', s = 70, \
        edgecolors='#244162', linewidth = 0.6, alpha = 0.5, zorder=2)#, edgecolors='none')
    #plt.xlabel("Time (generations)", fontsize = 16)
    #plt.ylabel("Mean \n Euclidean distance", fontsize = 14)
    plt.ylabel("Mean pair-wise \n Euclidean \n distance, " +
               r'$ \left \langle d \right \rangle$', fontsize = 14)
    # bottom panel: standardized (z-score) version of the same comparison
    plt.figure(1)
    plt.subplot(212)
    plt.errorbar(mcd_perm_x, [0] * len(mcd_perm_x),
                 yerr = [lower_z_ci, upper_z_ci], fmt = 'o', alpha = 0.5, \
        barsabove = True, marker = '.', mfc = 'k', mec = 'k', c = 'k', zorder=1)
    # zip mean, std, and measured values to make z-scores
    zip_list = list(zip(mean_mcds, std_mcds, mcds))
    z_scores = [((i[2] - i[0]) / i[1]) for i in zip_list ]
    plt.scatter(time_points_set, z_scores, c='#175ac6', marker = 'o', s = 70, \
        edgecolors='#244162', linewidth = 0.6, alpha = 0.5, zorder=2)#, edgecolors='none')
    plt.ylim(-2.2, 2.2)
    #plt.axhline(0, color = 'k', lw = 2, ls = '-')
    #plt.axhline(-1, color = 'dimgrey', lw = 2, ls = '--')
    #plt.axhline(-2, color = 'dimgrey', lw = 2, ls = ':')
    plt.xlabel("Time (generations)", fontsize = 16)
    plt.ylabel("Standardized mean \n pair-wise Euclidean \n distance, " +
               r'$ z_{\left \langle d \right \rangle}$', fontsize = 14)
    #plt.ylabel("Standardized mean \n Euclidean distance", fontsize = 14)
    fig.tight_layout()
    fig.savefig(pt.get_path() + '/figs/permutation_scatter_good.png',
                bbox_inches = "tight", pad_inches = 0.4, dpi = 600)
    plt.close()
def time_partition_ltee(k=5, iter=1000):
    # Second definition of time_partition_ltee in this file; it shadows the
    # earlier one at import time. This variant uses sklearn PCA instead of
    # pt.pca_np and writes observed values plus permutation 95% CIs (rather
    # than z-scores) to data/Good_et_al/time_partition_z_scores.txt.
    # NOTE(review): `iter` shadows the builtin.
    df_path = mydir + '/data/Good_et_al/gene_by_pop.txt'
    df = pd.read_csv(df_path, sep='\t', header='infer', index_col=0)
    # keep only the complete nonmutator populations
    to_include = pt.complete_nonmutator_lines()
    df_nonmut = df[df.index.str.contains('|'.join(to_include))]
    # remove columns with all zeros
    df_nonmut = df_nonmut.loc[:, (df_nonmut != 0).any(axis=0)]
    # make sure it's sorted
    df_nonmut.sort_index(inplace=True)
    # row labels look like '<pop>_<generation>'
    time_points = [int(x.split('_')[1]) for x in df_nonmut.index.values]
    time_points_set = sorted(
        list(set([int(x.split('_')[1]) for x in df_nonmut.index.values])))
    # map each generation to the row positions sampled at that generation
    time_points_positions = {}
    for x in time_points_set:
        time_points_positions[x] = [
            i for i, j in enumerate(time_points) if j == x
        ]
    # final-generation counts; one row per population
    t_final_df = df_nonmut.iloc[time_points_positions[max(time_points_set)]]
    t_final_np = t_final_df.values
    gene_names = df_nonmut.columns.tolist()
    df_out = open(mydir + '/data/Good_et_al/time_partition_z_scores.txt', 'w')
    df_out.write('\t'.join([
        'Time', 'less_mbd', 'greater_mpd', 'delta_mpd', 'less_mbd_025',
        'less_mbd_975', 'greater_mpd_025', 'greater_mpd_975', 'delta_mpd_025',
        'delta_mpd_975'
    ]) + '\n')
    for time_point in time_points_set:
        # very few mutations after generation 50000
        if time_point > 50000:
            continue
        print("Time point " + str(time_point))
        t_i_df = df_nonmut.iloc[time_points_positions[time_point]]
        t_i_np = t_i_df.values
        # remove rows with all zeros
        t_i_np_zeros = np.where(~t_i_np.any(axis=1))[0]
        n_zeros_t_i_np = len(t_i_np_zeros)
        if n_zeros_t_i_np > 0:
            t_i_np = np.delete(t_i_np, t_i_np_zeros, axis=0)
        # mutations accumulated after time_point
        # NOTE(review): if rows were deleted above, t_final_np and t_i_np no
        # longer share a row count and this subtraction would raise — confirm
        # all-zero rows cannot occur here.
        t_i_to_final_np = t_final_np - t_i_np
        # remove rows with all zeros
        t_i_to_final_np_zeros = np.where(~t_i_to_final_np.any(axis=1))[0]
        n_zeros_t_i_to_final_np = len(t_i_to_final_np_zeros)
        if n_zeros_t_i_to_final_np > 0:
            t_i_to_final_np = np.delete(t_i_to_final_np,
                                        t_i_to_final_np_zeros,
                                        axis=0)
        # stack "before" rows on top of "after" rows for joint ordination
        t_concat = np.concatenate((t_i_np, t_i_to_final_np), axis=0)
        t_norm = cd.likelihood_matrix_array(
            t_concat, gene_names, 'Good_et_al').get_likelihood_matrix()
        # row-normalize, then mean-center for PCA
        t_norm_rel = t_norm / t_norm.sum(axis=1, keepdims=True)
        t_norm_rel -= np.mean(t_norm_rel, axis=0)
        pca = PCA()
        t_norm_rel_pca = pca.fit_transform(t_norm_rel)
        # NOTE(review): sklearn PCA orders components by *descending*
        # explained variance, so [:, -1 - k:-1] selects the lowest-variance
        # axes, not the "first five axes" — presumably [:, :k] was intended;
        # confirm before changing.
        t_norm_rel_pca_k5 = t_norm_rel_pca[:, -1 - k:-1]
        # account for rows with zero mutations
        # NOTE(review): the first block of t_concat has (5 - n_zeros_t_i_np)
        # rows, so the "greater" slice arguably should start at
        # 5 - n_zeros_t_i_np rather than 5 - n_zeros_t_i_to_final_np whenever
        # the two zero-counts differ — confirm.
        dist_t_less = pt.get_mean_pairwise_euc_distance(
            t_norm_rel_pca_k5[:5 - n_zeros_t_i_np, :], k=k)
        dist_t_greater = pt.get_mean_pairwise_euc_distance(
            t_norm_rel_pca_k5[5 - n_zeros_t_i_to_final_np:, :], k=k)
        dist_t_change = dist_t_greater - dist_t_less
        #F_t = pt.get_F_2(t_norm_rel_pca_k5, 5-n_zeros_t_i_np, 5-n_zeros_t_i_to_final_np)[0]
        # permutation null distributions
        dist_t_less_list = []
        dist_t_greater_list = []
        dist_t_change_list = []
        #F_t_list = []
        for i in range(iter):
            # progress marker (with iter=1000 this only fires at i == 0)
            if i % 1000 == 0:
                print("Iteration " + str(i))
            t_i_np_rndm = pt.get_random_matrix(t_i_np)
            t_i_to_final_np_rndm = pt.get_random_matrix(t_i_to_final_np)
            t_rndm_concat = np.concatenate((t_i_np_rndm,
                                            t_i_to_final_np_rndm),
                                           axis=0)
            t_rndm_norm = cd.likelihood_matrix_array(
                t_rndm_concat, gene_names,
                'Good_et_al').get_likelihood_matrix()
            t_rndm_norm_rel = t_rndm_norm / t_rndm_norm.sum(axis=1,
                                                            keepdims=True)
            t_rndm_norm_rel -= np.mean(t_rndm_norm_rel, axis=0)
            t_rndm_norm_rel_pca = pca.fit_transform(t_rndm_norm_rel)
            # first five axes
            t_rndm_norm_rel_pca_k5 = t_rndm_norm_rel_pca[:, -1 - k:-1]
            dist_t_less_rndm = pt.get_mean_pairwise_euc_distance(
                t_rndm_norm_rel_pca_k5[:5 - n_zeros_t_i_np, :], k=k)
            dist_t_greater_rndm = pt.get_mean_pairwise_euc_distance(
                t_rndm_norm_rel_pca_k5[5 - n_zeros_t_i_to_final_np:, :], k=k)
            dist_t_change_list.append(dist_t_greater_rndm - dist_t_less_rndm)
            dist_t_less_list.append(dist_t_less_rndm)
            dist_t_greater_list.append(dist_t_greater_rndm)
            #F_t_list.append(pt.get_F_2(t_rndm_norm_rel_pca, 5-n_zeros_t_i_np, 5-n_zeros_t_i_to_final_np)[0])
        # sort each null distribution so quantiles can be read by index
        dist_t_change_list.sort()
        dist_t_greater_list.sort()
        dist_t_less_list.sort()
        #F_t_list.sort()
        # get 95% CIs
        dist_t_change_025 = dist_t_change_list[int(iter * 0.025)]
        dist_t_change_975 = dist_t_change_list[int(iter * 0.975)]
        dist_t_greater_025 = dist_t_greater_list[int(iter * 0.025)]
        dist_t_greater_975 = dist_t_greater_list[int(iter * 0.975)]
        dist_t_less_025 = dist_t_less_list[int(iter * 0.025)]
        dist_t_less_975 = dist_t_less_list[int(iter * 0.975)]
        #F_t_025 = F_t_list[int(iter*0.025)]
        #F_t_975 = F_t_list[int(iter*0.975)]
        df_out.write('\t'.join([str(time_point), str(dist_t_less), str(dist_t_greater), \
            str(dist_t_change), str(dist_t_less_025), str(dist_t_less_975), \
            str(dist_t_greater_025), str(dist_t_greater_975), \
            str(dist_t_change_025), str(dist_t_change_975)]) + '\n')
    df_out.close()
def ltee_convergence(alpha = 0.05, k = 5):
    """Plot convergence of the nonmutator LTEE populations over time.

    Ordinate the gene-by-population matrix with PCA and plot the mean
    pairwise Euclidean distance among populations (first k axes) at each
    sampled generation; save the figure to figs/ltee_convergence.png.

    Parameters
    ----------
    alpha : float
        Unused; kept for backward compatibility with existing callers.
    k : int
        Number of PCA axes used for the pairwise-distance statistic.
    """
    df_path = os.path.expanduser("~/GitHub/ParEvol") + '/data/Good_et_al/gene_by_pop.txt'
    df = pd.read_csv(df_path, sep = '\t', header = 'infer', index_col = 0)
    # complete nonmutator populations, excluding p5
    to_keep = pt.complete_nonmutator_lines()
    to_keep.remove('p5')
    df_nonmut = df[df.index.str.contains('|'.join( to_keep))]
    # remove columns with all zeros
    df_nonmut = df_nonmut.loc[:, (df_nonmut != 0).any(axis=0)]
    gene_names = df_nonmut.columns.tolist()
    sample_names = df_nonmut.index.tolist()
    df_delta = cd.likelihood_matrix_array(df_nonmut, gene_names,
                                          'Good_et_al').get_likelihood_matrix()
    # row-normalize to relative contributions, then mean-center for PCA
    df_delta = df_delta/df_delta.sum(axis=1)[:,None]
    X = pt.get_mean_center(df_delta)
    pca = PCA()
    df_out = pca.fit_transform(X)
    # row labels look like '<pop>_<generation>'
    time_points_set = sorted(list(set([ int(x.split('_')[1]) for x in
                                        df_nonmut.index.values])))
    df_pca = pd.DataFrame(data=df_out, index=sample_names)
    # mean pairwise distance among populations at each generation
    mean_dist = []
    for tp in time_points_set:
        df_pca_tp = df_pca[df_pca.index.str.contains('_' + str(tp))]
        mean_dist.append(pt.get_mean_pairwise_euc_distance(df_pca_tp.values,
                                                           k = k))
    # BUG FIX: the original called fig.tight_layout()/fig.savefig() without
    # ever creating `fig` (the plt.figure() call was commented out), which
    # raised a NameError. Restore the figure and the distance-vs-time scatter
    # that the surrounding (removed) commented-out code clearly intended.
    fig = plt.figure()
    plt.scatter(time_points_set, mean_dist, marker = "o",
                edgecolors='#244162', c = '#175ac6', alpha = 0.4, s = 60,
                zorder=4)
    plt.xlabel("Time", fontsize = 14)
    plt.ylabel("Mean euclidean distance", fontsize = 12)
    fig.tight_layout()
    fig.savefig(os.path.expanduser("~/GitHub/ParEvol") +
                '/figs/ltee_convergence.png', bbox_inches = "tight",
                pad_inches = 0.4, dpi = 600)
    plt.close()