def get_sites_to_remove(strain): if strain == 'minimal': acnestor_path = mt.get_path( ) + '/data/syn3B_minimal/mmW_3B.ancestor/output.gd' elif strain == 'wildtype': acnestor_path = mt.get_path( ) + '/data/syn1.0_wildtype/mm8_syn1.0.ancestor/output.gd' sites_to_remove = [] for i, line in enumerate(open(acnestor_path, 'r')): line_split = line.strip().split('\t') if line_split[0] in output_to_keep: sites_to_remove.append(line_split[3] + '_' + str(line_split[4])) return sites_to_remove
def plot_pcoa(bs_iter = 10000): # get F stat and p value df = pd.read_csv(mt.get_path() + '/data/mult_by_pop.txt', sep = '\t', index_col=0) #mt.get_F_2(df,4,4) df = df/df.sum(axis=1)[:,None] df_bc = pairwise_distances(df, metric='braycurtis') df_pcoa = pcoa(df_bc , number_of_dimensions=3) ord_matrix = df_pcoa.samples F = mt.get_F_2(ord_matrix, 4,4) F_nulls = [] for i in range(bs_iter): F_nulls.append(mt.get_F_2(ord_matrix.sample(frac=1), 4,4)[0]) p_value = len([F_null for F_null in F_nulls if F_null > F[0]]) / bs_iter print("F = " + str(round(F[0], 4))) print("p = " + str(round(p_value, 4))) #fig = plt.figure() fig, ax = plt.subplots(figsize=(6, 6)) # Scatterplot on main ax ax.axhline(y=0, color='k', linestyle=':', alpha = 0.8, zorder=1) ax.axvline(x=0, color='k', linestyle=':', alpha = 0.8, zorder=2) ax.scatter(0, 0, marker = "o", edgecolors='none', c = 'darkgray', s = 120, zorder=3) ax.scatter(ord_matrix.ix[0:4,0],ord_matrix.ix[0:4,1], marker = "o", edgecolors='#244162', c = 'blue', alpha = 0.8, s = 120, zorder=4, label='Wildtype') ax.scatter(ord_matrix.ix[4:,0],ord_matrix.ix[4:,1], marker = "o", edgecolors='#244162', c = 'r', alpha = 0.8, s = 120, zorder=4, label='Minimal cell') confidence_ellipse(ord_matrix.ix[0:4,0],ord_matrix.ix[0:4,1], ax, n_std=2, edgecolor='blue', linestyle='--', lw=3) confidence_ellipse(ord_matrix.ix[4:,0],ord_matrix.ix[4:,1], ax, n_std=2, edgecolor='red', linestyle='--', lw=3) #ax1.xlim([-0.7,0.7]) #ax1.set_ylim([-0.7,0.7]) ax.set_xlabel('PCo 1 (' + str(round(df_pcoa.proportion_explained[0],3)*100) + '%)' , fontsize = 14) ax.set_ylabel('PCo 2 (' + str(round(df_pcoa.proportion_explained[1],3)*100) + '%)' , fontsize = 14) plt.legend(loc="upper right") fig_name = mt.get_path() + '/figures/pcoa.png' fig.savefig(fig_name, bbox_inches = "tight", pad_inches = 0.4, dpi = 600) plt.close()
def plot_logpvalue_survival(): fig = plt.figure() fig.subplots_adjust(hspace=0.35, wspace=0.35) pstar_dict = pickle.load(open(mt.get_path() + '/data/p_star.txt', 'rb')) for i in range(0, len(strains)): strain = strains[i] pstar_i = pstar_dict[strain][1] num_significant_i = pstar_dict[strain][0] -1 df = pd.read_csv(mt.get_path() + '/data/logpvalues_' + strain + '.txt', sep = '\t', index_col=0) new_x = df.P_value.tolist() new_obs_y = df.Obs_num.tolist() new_null_y = df.Null_num.tolist() ax = fig.add_subplot(2, 1, i+1) ax.plot(new_x, new_null_y, '-', c='dimgrey', lw=4, alpha = 0.8, zorder=0) ax.plot(new_x, new_obs_y, '-', c='royalblue', lw=4, alpha = 0.8, zorder=1) if pstar_i <0: y_range = [f[1] for f in list(zip(new_x, new_obs_y)) if f[0] > 0] ax.plot([1, 1],[5e-02,max(y_range)],'k-',linewidth=0.5, zorder=2) ax.plot([-3,1],[max(y_range), max(y_range)],'k-',linewidth=0.5, zorder=3) ax.plot([1], [max(y_range)], c='r', marker='o', zorder=4) else: ax.plot([pstar_i, pstar_i],[5e-02,num_significant_i],'k-',linewidth=0.5, zorder=2) ax.plot([-3,pstar_i],[num_significant_i, num_significant_i],'k-',linewidth=0.5, zorder=3) ax.plot([pstar_i], [num_significant_i], c='r', marker='o', zorder=4) ax.set_xlim([0.25, 8]) ax.title.set_text(strain) ax.title.set_fontsize(12) fig.text(0.5, 0.02, '$-\mathrm{log}_{10}P$', ha='center', fontsize=16) fig.text(0.02, 0.5, 'Number of genes', va='center', rotation='vertical', fontsize=16) fig_name = mt.get_path() + '/figures/logpvalue_survival.png' fig.savefig(fig_name, bbox_inches = "tight", pad_inches = 0.4, dpi = 600) plt.close()
def plot_multiplicity_survival(): df_par = pd.read_csv(mt.get_path() + '/data/total_parallelism.txt', sep = '\t' ) fig = plt.figure() fig.subplots_adjust(hspace=0.35, wspace=0.35) for i in range(0, len(strains)): strain = strains[i] df_path = mt.get_path() + '/data/mult_survival_curves_' + strain + '.txt' df = pd.read_csv(df_path, sep = '\t', index_col=0) new_x = [1.0] + df.Mult.tolist() + [df.Mult.tolist()[-1]] new_obs_y =[1.0] + df.Obs_fract.tolist() + [ 0.0001] new_null_y = [1.0] + df.Null_fract.tolist() + [ 0.0001] ax = fig.add_subplot(2, 1, i+1) ax.plot(new_x, new_obs_y, '-', c='royalblue', lw=4, alpha = 0.8, zorder=1) ax.plot(new_x, new_null_y, '-', c='dimgrey', lw=4, alpha = 0.8, zorder=0) ax.set_xlim([0.9, 9]) taxon_par = df_par.loc[df_par['Strain'] == strain] ax.annotate(r'$\Delta \ell= $'+ str(round(float(taxon_par.G_score), 3)), (0.6 *9, 0.9), fontsize=8) if np.log10(float(taxon_par.p_value)) < -3: ax.annotate(r'$\mathrm{p} = $'+ str('%.2E' % Decimal(float(taxon_par.p_value))), (0.6 *9, 0.75), fontsize=8) else: ax.annotate(r'$\mathrm{p} = $'+ str(round(float(taxon_par.p_value),3)), (0.6 *9, 0.75), fontsize=8) if strain == 'wildtype': ax.title.set_text('Wildtype') elif strain == 'minimal': ax.title.set_text('Minimal') ax.title.set_fontsize(12) ax.xaxis.set_major_formatter(FormatStrFormatter('%d')) fig.text(0.5, 0.02, 'Gene multiplicity, ' + '$m$', ha='center', fontsize=16) fig.text(0.02, 0.5, 'Fraction mutations ' + '$\geq m$', va='center', rotation='vertical', fontsize=16) fig_name = mt.get_path() + '/figures/mult_survival.png' fig.savefig(fig_name, bbox_inches = "tight", pad_inches = 0.4, dpi = 600) plt.close()
def get_multiplicity(nmin=2, FDR=0.05): p_star_dict = {} G_score_list = [] gene_by_pop_dict = {} for strain in strains: sites_to_remove = get_sites_to_remove(strain) gene_count_dict = {} if strain == 'minimal': dirs = [ 'syn3B_minimal/mm13', 'syn3B_minimal/mm11', 'syn3B_minimal/mm10', 'syn3B_minimal/mm9' ] ref_path = mt.get_path( ) + '/data/syn3B_minimal/reference/Synthetic.bacterium_JCVI-Syn3A.gb' elif strain == 'wildtype': dirs = [ 'syn1.0_wildtype/mm6', 'syn1.0_wildtype/mm4', 'syn1.0_wildtype/mm3', 'syn1.0_wildtype/mm1' ] ref_path = mt.get_path( ) + '/data/syn1.0_wildtype/reference/Synthetic.Mycoplasma.mycoides.JCVI-syn1.0_CP002027.1.gb' effective_gene_lengths, effective_gene_lengths_syn, Lsyn, Lnon, substitution_specific_synonymous_fraction = mt.calculate_synonymous_nonsynonymous_target_sizes( ref_path) for dir in dirs: for i, line in enumerate( open(mt.get_path() + '/data/' + dir + '/annotated.gd', 'r')): line_split = line.strip().split('\t') if line_split[0] not in output_to_keep: continue if line_split[3] + '_' + line_split[4] in sites_to_remove: continue frequency = float([s for s in line_split if 'frequency=' in s][0].split('=')[1]) if frequency != 1: continue if line_split[0] == 'SNP': if [s for s in line_split if 'snp_type=' in s ][0].split('=')[1] == 'nonsynonymous': locus_tag = [ s for s in line_split if 'locus_tag=' in s ][0].split('=')[1] frequency = float([ s for s in line_split if 'frequency=' in s ][0].split('=')[1]) if ';' in locus_tag: for locus_tag_j in locus_tag.split(';'): if locus_tag_j not in gene_count_dict: gene_count_dict[locus_tag_j] = 0 gene_count_dict[locus_tag_j] += 1 else: if locus_tag not in gene_count_dict: gene_count_dict[locus_tag] = 0 gene_count_dict[locus_tag] += 1 else: continue else: if len( [s for s in line_split if 'gene_position=coding' in s ]) >= 1: locus_tag = [ s for s in line_split if 'locus_tag=' in s ][0].split('=')[1] frequency = float([ s for s in line_split if 'frequency=' in s ][0].split('=')[1]) if ';' in locus_tag: for locus_tag_j in locus_tag.split(';'): if locus_tag_j not in gene_count_dict: gene_count_dict[locus_tag_j] = 0 gene_count_dict[locus_tag_j] += 1 else: if locus_tag not in gene_count_dict: gene_count_dict[locus_tag] = 0 gene_count_dict[locus_tag] += 1 # get multiplicity scores gene_parallelism_statistics = {} for gene_i, length_i in effective_gene_lengths.items(): gene_parallelism_statistics[gene_i] = {} gene_parallelism_statistics[gene_i]['length'] = length_i gene_parallelism_statistics[gene_i]['observed'] = 0 gene_parallelism_statistics[gene_i]['multiplicity'] = 0 # save number of mutations for multiplicity for locus_tag_i, n_muts_i in gene_count_dict.items(): gene_parallelism_statistics[locus_tag_i]['observed'] = n_muts_i L_mean = np.mean(list(effective_gene_lengths.values())) L_tot = sum(list(effective_gene_lengths.values())) n_tot = sum(gene_count_dict.values()) # don't include taxa with less than 20 mutations print("N_total = " + str(n_tot)) # go back over and calculate multiplicity for locus_tag_i in gene_parallelism_statistics.keys(): # double check the measurements from this gene_parallelism_statistics[locus_tag_i][ 'multiplicity'] = gene_parallelism_statistics[locus_tag_i][ 'observed'] * 1.0 / effective_gene_lengths[ locus_tag_i] * L_mean gene_parallelism_statistics[locus_tag_i][ 'expected'] = n_tot * gene_parallelism_statistics[locus_tag_i][ 'length'] / L_tot pooled_multiplicities = np.array([ gene_parallelism_statistics[gene_name]['multiplicity'] for gene_name in gene_parallelism_statistics.keys() if gene_parallelism_statistics[gene_name]['multiplicity'] >= 1 ]) pooled_multiplicities.sort() pooled_tupe_multiplicities = np.array([ (gene_parallelism_statistics[gene_name]['multiplicity'], gene_parallelism_statistics[gene_name]['observed']) for gene_name in gene_parallelism_statistics.keys() if gene_parallelism_statistics[gene_name]['multiplicity'] >= 1 ]) pooled_tupe_multiplicities = sorted(pooled_tupe_multiplicities, key=lambda x: x[0]) pooled_tupe_multiplicities_x = [ i[0] for i in pooled_tupe_multiplicities ] pooled_tupe_multiplicities_y = [ i[1] for i in pooled_tupe_multiplicities ] pooled_tupe_multiplicities_y = [ sum(pooled_tupe_multiplicities_y[i:]) / sum(pooled_tupe_multiplicities_y) for i in range(len(pooled_tupe_multiplicities_y)) ] null_multiplicity_survival = mt.NullGeneMultiplicitySurvivalFunction.from_parallelism_statistics( gene_parallelism_statistics) null_multiplicity_survival_copy = null_multiplicity_survival( pooled_multiplicities) null_multiplicity_survival_copy = [ sum(null_multiplicity_survival_copy[i:]) / sum(null_multiplicity_survival_copy) for i in range(len(null_multiplicity_survival_copy)) ] #threshold_idx = numpy.nonzero((null_multiplicity_survival(observed_ms)*1.0/observed_multiplicity_survival)<FDR)[0][0] mult_survival_dict = { 'Mult': pooled_multiplicities, 'Obs_fract': pooled_tupe_multiplicities_y, 'Null_fract': null_multiplicity_survival_copy } mult_survival_df = pd.DataFrame(mult_survival_dict) mult_survival_df_out = mt.get_path( ) + '/data/mult_survival_curves_' + strain + '.txt' mult_survival_df.to_csv(mult_survival_df_out, sep='\t', index=True) # get likelihood score and null test observed_G, pvalue = mt.calculate_total_parallelism( gene_parallelism_statistics) G_score_list.append((strain, observed_G, pvalue)) print(strain, observed_G, pvalue) # Give each gene a p-value, get distribution gene_logpvalues = mt.calculate_parallelism_logpvalues( gene_parallelism_statistics) pooled_pvalues = [] for gene_name in gene_logpvalues.keys(): if (gene_parallelism_statistics[gene_name]['observed'] >= nmin) and (float(gene_logpvalues[gene_name]) >= 0): pooled_pvalues.append(gene_logpvalues[gene_name]) pooled_pvalues = np.array(pooled_pvalues) pooled_pvalues.sort() if len(pooled_pvalues) == 0: continue null_pvalue_survival = mt.NullGeneLogpSurvivalFunction.from_parallelism_statistics( gene_parallelism_statistics, nmin=nmin) observed_ps, observed_pvalue_survival = mt.calculate_unnormalized_survival_from_vector( pooled_pvalues, min_x=-4) # Pvalue version # remove negative minus log p values. neg_p_idx = np.where(observed_ps >= 0) observed_ps_copy = observed_ps[neg_p_idx] observed_pvalue_survival_copy = observed_pvalue_survival[neg_p_idx] pvalue_pass_threshold = np.nonzero( null_pvalue_survival(observed_ps_copy) * 1.0 / observed_pvalue_survival_copy < FDR)[0] if len(pvalue_pass_threshold) == 0: continue threshold_idx = pvalue_pass_threshold[0] pstar = observed_ps_copy[ threshold_idx] # lowest value where this is true num_significant = observed_pvalue_survival[threshold_idx] # make it log base 10 logpvalues_dict = { 'P_value': observed_ps / math.log(10), 'Obs_num': observed_pvalue_survival, 'Null_num': null_pvalue_survival(observed_ps) } logpvalues_df = pd.DataFrame(logpvalues_dict) logpvalues_df_out = mt.get_path( ) + '/data/logpvalues_' + strain + '.txt' logpvalues_df.to_csv(logpvalues_df_out, sep='\t', index=True) p_star_dict[strain] = (num_significant, pstar / math.log(10)) output_mult_gene_filename = mt.get_path( ) + '/data/mult_genes_sig_' + strain + '.txt' output_mult_gene = open(output_mult_gene_filename, "w") output_mult_gene.write(",".join([ "Gene", "Length", "Observed", "Expected", "Multiplicity", "-log10(P)" ])) for gene_name in sorted( gene_parallelism_statistics, key=lambda x: gene_parallelism_statistics.get(x)['observed'], reverse=True): if gene_logpvalues[ gene_name] >= pstar and gene_parallelism_statistics[ gene_name]['observed'] >= nmin: output_mult_gene.write("\n") # log base 10 transform the p-values here as well output_mult_gene.write( "%s, %0.1f, %d, %0.2f, %0.2f, %g" % (gene_name, gene_parallelism_statistics[gene_name]['length'], gene_parallelism_statistics[gene_name]['observed'], gene_parallelism_statistics[gene_name]['expected'], gene_parallelism_statistics[gene_name]['multiplicity'], abs(gene_logpvalues[gene_name]) / math.log(10))) output_mult_gene.close() total_parallelism_path = mt.get_path() + '/data/total_parallelism.txt' total_parallelism = open(total_parallelism_path, "w") total_parallelism.write("\t".join(["Strain", "G_score", "p_value"])) for i in range(len(G_score_list)): taxon_i = G_score_list[i][0] G_score_i = G_score_list[i][1] p_value_i = G_score_list[i][2] total_parallelism.write("\n") total_parallelism.write("\t".join( [taxon_i, str(G_score_i), str(p_value_i)])) total_parallelism.close() with open(mt.get_path() + '/data/p_star.txt', 'wb') as file: file.write( pickle.dumps(p_star_dict)) # use `pickle.loads` to do the reverse
def get_multiplicity_matrix(): gene_by_pop_dict = {} for strain in strains: sites_to_remove = get_sites_to_remove(strain) if strain == 'minimal': dirs = [ 'syn3B_minimal/mm13', 'syn3B_minimal/mm11', 'syn3B_minimal/mm10', 'syn3B_minimal/mm9' ] ref_path = mt.get_path( ) + '/data/syn3B_minimal/reference/Synthetic.bacterium_JCVI-Syn3A.gb' elif strain == 'wildtype': dirs = [ 'syn1.0_wildtype/mm6', 'syn1.0_wildtype/mm4', 'syn1.0_wildtype/mm3', 'syn1.0_wildtype/mm1' ] ref_path = mt.get_path( ) + '/data/syn1.0_wildtype/reference/Synthetic.Mycoplasma.mycoides.JCVI-syn1.0_CP002027.1.gb' effective_gene_lengths, effective_gene_lengths_syn, Lsyn, Lnon, substitution_specific_synonymous_fraction = mt.calculate_synonymous_nonsynonymous_target_sizes( ref_path) for dir in dirs: pop = dir.split('/')[1] gene_count_dict_pop = {} gene_by_pop_dict[pop] = {} for i, line in enumerate( open(mt.get_path() + '/data/' + dir + '/annotated.gd', 'r')): line_split = line.strip().split('\t') if line_split[0] not in output_to_keep: continue if line_split[3] + '_' + line_split[4] in sites_to_remove: continue frequency = float([s for s in line_split if 'frequency=' in s][0].split('=')[1]) if frequency != 1: continue if line_split[0] == 'SNP': if [s for s in line_split if 'snp_type=' in s ][0].split('=')[1] == 'nonsynonymous': locus_tag = [ s for s in line_split if 'locus_tag=' in s ][0].split('=')[1] frequency = float([ s for s in line_split if 'frequency=' in s ][0].split('=')[1]) if ';' in locus_tag: for locus_tag_j in locus_tag.split(';'): if locus_tag_j not in gene_count_dict_pop: gene_count_dict_pop[locus_tag_j] = 0 gene_count_dict_pop[locus_tag_j] += 1 else: if locus_tag not in gene_count_dict_pop: gene_count_dict_pop[locus_tag] = 0 gene_count_dict_pop[locus_tag] += 1 else: continue else: if len( [s for s in line_split if 'gene_position=coding' in s ]) >= 1: locus_tag = [ s for s in line_split if 'locus_tag=' in s ][0].split('=')[1] frequency = float([ s for s in line_split if 'frequency=' in s ][0].split('=')[1]) if ';' in locus_tag: for locus_tag_j in locus_tag.split(';'): if locus_tag_j not in gene_count_dict_pop: gene_count_dict_pop[locus_tag_j] = 0 gene_count_dict_pop[locus_tag_j] += 1 else: if locus_tag not in gene_count_dict_pop: gene_count_dict_pop[locus_tag] = 0 gene_count_dict_pop[locus_tag] += 1 gene_parallelism_statistics = {} for gene_i, length_i in effective_gene_lengths.items(): gene_parallelism_statistics[gene_i] = {} gene_parallelism_statistics[gene_i]['length'] = length_i gene_parallelism_statistics[gene_i]['observed'] = 0 gene_parallelism_statistics[gene_i]['multiplicity'] = 0 # save number of mutations for multiplicity for locus_tag_i, n_muts_i in gene_count_dict_pop.items(): gene_parallelism_statistics[locus_tag_i]['observed'] = n_muts_i # save number of mutations for multiplicity L_mean = np.mean(list(effective_gene_lengths.values())) L_tot = sum(list(effective_gene_lengths.values())) n_tot = sum(gene_count_dict_pop.values()) # go back over and calculate multiplicity for locus_tag_i in gene_parallelism_statistics.keys(): # double check the measurements from this gene_parallelism_statistics[locus_tag_i][ 'multiplicity'] = gene_parallelism_statistics[locus_tag_i][ 'observed'] * 1.0 / effective_gene_lengths[ locus_tag_i] * L_mean gene_parallelism_statistics[locus_tag_i][ 'expected'] = n_tot * gene_parallelism_statistics[ locus_tag_i]['length'] / L_tot # split locus tags for locus_tag_i in gene_parallelism_statistics.keys(): mult_i = gene_parallelism_statistics[locus_tag_i][ 'multiplicity'] if mult_i > 0: locus_tag_i_num = locus_tag_i.split('_')[1] gene_by_pop_dict[pop][locus_tag_i_num] = mult_i gene_by_pop_df = pd.DataFrame(gene_by_pop_dict) gene_by_pop_df = gene_by_pop_df.T gene_by_pop_df.fillna(0, inplace=True) gene_by_pop_df_out = mt.get_path() + '/data/mult_by_pop.txt' gene_by_pop_df.to_csv(gene_by_pop_df_out, sep='\t', index=True)