def annotate_significant_genes():
    total_parallelism = open(lt.get_path() + '/data/breseq/gene_annotation.txt', "w")
    total_parallelism.write("\t".join(["Species", "locus_tag", "refseq_id", "annotation"]) + "\n")
    taxa = ['ATCC13985', 'KBS0702', 'KBS0707', 'KBS0711', 'KBS0715',
            'KBS0721', 'KBS0722', 'KBS0724', 'KBS0801']
    for taxon in taxa:
        locus_tags = []
        for line in open(lt.get_path() + '/data/breseq/mult_genes_nonsyn_sig/' + taxon + '.txt', 'r'):
            line_split = line.strip().split(',')
            if line_split[0] == 'Gene':
                continue
            locus_tags.append(line_split[0])
        # the RefSeq annotations don't map onto the KEGG-annotated genes in the
        # MAPLE pathways, so that analysis can't be completed; focus on RefSeq
        # annotations instead
        # make refseq => KEGG dict
        #refseq_kegg_dict = {}
        #for line in open(lt.get_path() + '/data/genomes/genomes_ncbi_maple/' + taxon + '_MAPLE_result/query.fst.ko', 'r'):
        #    line_split = line.strip().split('\t')
        #    refseq_kegg_dict[line_split[0]] = line_split[1]
        ## get list of MAPLE modules to keep
        #df_modules = pd.read_csv(lt.get_path() + '/data/genomes/genomes_ncbi_maple_clean/' + taxon + '_maple_modules.txt', sep='\t')
        #df_modules_mcr = df_modules.loc[df_modules['query(coverage)'] >= MCR]
        #modules_to_keep = df_modules_mcr.Pathway_ID.tolist()
        ## make KEGG => MAPLE dict
        #kegg_maple_dict = KO_to_module(taxon, modules_to_keep)
        # make locus tag => refseq dict
        locus_tag_refseq_dict = {}
        for subdir, dirs, files in os.walk(lt.get_path() + '/data/genomes/genomes_ncbi/' + taxon):
            for file in files:
                if file.endswith('.gbff'):
                    with open(os.path.join(subdir, file), "r") as input_handle:
                        for record in SeqIO.parse(input_handle, "genbank"):
                            for feature in record.features:
                                if feature.type != 'CDS':
                                    continue
                                if 'incomplete' in feature.qualifiers['note'][0]:
                                    continue
                                if 'frameshifted' in feature.qualifiers['note'][0]:
                                    continue
                                if 'internal stop' in feature.qualifiers['note'][0]:
                                    continue
                                gene_name = feature.qualifiers['locus_tag'][0]
                                inference = feature.qualifiers['inference'][0]
                                product = feature.qualifiers['product'][0]
                                if 'RefSeq' in inference:
                                    locus_tag_refseq_dict[gene_name] = [inference.split(':')[-1], product]
        # finally, write the RefSeq annotation for the genes with significant mutations
        for locus_tag in locus_tags:
            if locus_tag not in locus_tag_refseq_dict:
                continue
            refseq_annotation = locus_tag_refseq_dict[locus_tag]
            refseq_name = refseq_annotation[0].replace("_", "")
            total_parallelism.write("\t".join([taxon, locus_tag, refseq_name, refseq_annotation[1]]) + '\n')
    total_parallelism.close()
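# A minimal, self-contained sketch of the GenBank-parsing step above, using a
# defensive .get() lookup since not every CDS carries a 'note' qualifier.
# 'example.gbff' is a placeholder path, not a file in this repository.
from Bio import SeqIO

def locus_tag_to_refseq(gbff_path):
    mapping = {}
    for record in SeqIO.parse(gbff_path, "genbank"):
        for feature in record.features:
            if feature.type != 'CDS':
                continue
            # skip CDS features flagged as incomplete or broken in the note qualifier
            note = feature.qualifiers.get('note', [''])[0]
            if any(flag in note for flag in ('incomplete', 'frameshifted', 'internal stop')):
                continue
            inference = feature.qualifiers.get('inference', [''])[0]
            if 'RefSeq' in inference:
                locus_tag = feature.qualifiers['locus_tag'][0]
                product = feature.qualifiers.get('product', [''])[0]
                mapping[locus_tag] = (inference.split(':')[-1], product)
    return mapping

#mapping = locus_tag_to_refseq('example.gbff')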
def get_sites_to_remove(taxon):
    to_keep_samples = get_breseq_samples_to_keep()
    taxon_sites = []
    taxon_samples = [x for x in to_keep_samples if x.startswith(taxon)]
    fixed = []
    # first, list all sites that are fixed in every replicate population;
    # these were most likely already fixed in the ancestor
    for taxon_sample in taxon_samples:
        taxon_sample_sites = []
        for i, line in enumerate(open(lt.get_path() + '/data/breseq/annotated/' + taxon_sample + '.gd', 'r')):
            line_split = line.strip().split('\t')
            if line_split[0] in output_to_keep:
                # many mutations are called at the first base of each contig; ignore these
                if line_split[4] == '1':
                    continue
                freq = float([x for x in line_split if 'frequency=' in x][0].split('=')[1])
                if freq == 1:
                    fixed.append(line_split[3] + '_' + str(line_split[4]))
                taxon_sample_sites.append(line_split[3] + '_' + str(line_split[4]))
        taxon_sites.extend(list(set(taxon_sample_sites)))
    count_fixed = Counter(fixed)
    count_fixed_all_reps = dict((k, v) for k, v in count_fixed.items() if v == len(taxon_samples))
    sites_to_remove_all_fixed = list(count_fixed_all_reps.keys())
    # see how many fixations carry the VARIANT_STRAND_COVERAGE flag
    # copy dict
    flag_fixed = copy.deepcopy(count_fixed)
    flag_fixed = {key: val for key, val in flag_fixed.items() if val < len(taxon_samples) - 1}
    for taxon_sample in taxon_samples:
        for i, line in enumerate(open(lt.get_path() + '/data/breseq/annotated/' + taxon_sample + '.gd', 'r')):
            line_split = line.strip().split('\t')
            if line_split[0] == 'RA':
                freq = float([x for x in line_split if 'frequency=' in x][0].split('=')[1])
                if ('VARIANT_STRAND_COVERAGE' in line) or ('SURROUNDING_HOMOPOLYMER' in line):
                    contig_site = line_split[3] + '_' + str(line_split[4])
                    if contig_site in flag_fixed:
                        del flag_fixed[contig_site]
    # also remove any site that appears in more than one replicate population
    counts_all = Counter(taxon_sites)
    count_dict_to_remove = dict((k, v) for k, v in counts_all.items() if v > 1)
    sites_to_remove = list(count_dict_to_remove.keys())
    sites_to_remove_all = list(set(sites_to_remove + sites_to_remove_all_fixed))
    #print(taxon + ' proportion sites removed ' + str(round(len(sites_to_remove) / len(counts_all.keys()), 3)))
    return sites_to_remove_all
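# A toy illustration (made-up site IDs) of the Counter logic above: a site is
# flagged as ancestral only if it is fixed in every replicate population.
from collections import Counter

fixed_example = ['contig1_100', 'contig1_100', 'contig1_100', 'contig2_55']
n_reps_example = 3
counts_example = Counter(fixed_example)
ancestral_sites = [site for site, n in counts_example.items() if n == n_reps_example]
assert ancestral_sites == ['contig1_100']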
def piecewise_regression():
    df = pd.read_csv(lt.get_path() + '/data/demography/longtermdormancy_20190528_nocomments.csv', sep=',')
    df['N'] = (df['Colonies'] + 1) * (1000 / df['Inoculum']) * (10**(df['Dilution']))
    df['Dormstart_date'] = pd.to_datetime(df['Dormstart_date'], format='%d-%b-%y')
    df['Firstread_date'] = pd.to_datetime(df['Firstread_date'], format='%d-%b-%y')
    df['Days'] = df['Firstread_date'].sub(df['Dormstart_date'], axis=0)
    df['Days'] = df['Days'].dt.days.astype('int')
    # KBS0721 rep 3
    df_test = df[(df["Strain"] == 'KBS0721') & (df["Rep"] == 3)]
    x = df_test.Days.values
    y = np.log10(df_test.N.values)
    my_pwlf = pwlf.PiecewiseLinFit(x, y)
    # fit the data with two line segments
    res = my_pwlf.fit(2)
    # predict at the fitted breakpoints
    xHat = np.linspace(min(x), max(x), num=10000)
    yHat = my_pwlf.predict(xHat)
def clean_iRep(cutoff=2.5):
    # coverage is very low for these taxa
    to_remove = ['KBS0705', 'KBS0706']
    directory = os.fsencode(lt.get_path() + '/data/iRep')
    df_out = open(lt.get_path() + '/data/iRep_clean.txt', 'w')
    header = ['Sample', 'Species', 'rep', 'iRep']
    df_out.write('\t'.join(header) + '\n')
    iRep_corrected_dict = {}
    iRep_uncorrected_dict = {}
    for file in os.listdir(directory):
        filename = os.fsdecode(file)
        if filename.endswith('.tsv'):
            iRep_path = os.path.join(str(directory, 'utf-8'), filename)
            strain = re.split(r'[.-]+', filename)[0]
            strain_rep = re.split(r'[.]+', filename)[0]
            if strain in to_remove:
                continue
            if 'W' in strain_rep:
                continue
            strain_rep = strain_rep[:-1] + str(lt.rename_rep()[strain_rep[-1]])
            if strain_rep == 'ATCC13985-4':
                continue
            # rows 3 and 7 of the iRep .tsv hold the corrected and uncorrected estimates
            for i, line in enumerate(open(iRep_path, 'r')):
                if i == 2:
                    last_item = line.strip().split()[-1]
                    if last_item == 'n/a':
                        iRep_corrected = float('nan')
                    else:
                        iRep_corrected = float(last_item)
                    iRep_corrected_dict[strain_rep] = [iRep_corrected]
                elif i == 6:
                    iRep_uncorrected = float(line.strip().split()[-1])
                    iRep_uncorrected_dict[strain_rep] = [iRep_uncorrected]
    for key, value in iRep_corrected_dict.items():
        value.extend(iRep_uncorrected_dict[key])
    for key, value in iRep_corrected_dict.items():
        # drop implausibly large uncorrected estimates
        if value[1] > 11:
            continue
        # use the corrected estimate when it is available
        if math.isnan(value[0]):
            iRep = value[1]
        else:
            iRep = value[0]
        out_line = [key, key.split('-')[0], key.split('-')[1], str(iRep)]
        df_out.write('\t'.join(out_line) + '\n')
    df_out.close()
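# How the filename parsing above behaves, on a hypothetical iRep output name:
# re.split(r'[.-]+', ...) yields the strain, re.split(r'[.]+', ...) keeps strain-rep.
import re

fname = 'KBS0702-B.fasta.tsv'  # hypothetical example filename
assert re.split(r'[.-]+', fname)[0] == 'KBS0702'
assert re.split(r'[.]+', fname)[0] == 'KBS0702-B'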
def get_assembly_coverage():
    df_out = open(lt.get_path() + '/data/genomes/assembly_coverage.txt', 'w')
    df_out.write('\t'.join(['Species', 'mean_coverage']) + '\n')
    assembly_path = lt.get_path() + '/data/genomes/nanopore_hybrid/'
    for file in os.listdir(assembly_path):
        filename = os.fsdecode(file)
        if filename.endswith('.fasta'):
            strain = filename.split('.')[0]
            print(strain)
            fa = lt.classFASTA(assembly_path + filename).readFASTA()
            # header fields: x[3] is contig length, x[5] is contig coverage
            fa_headers = [x[0].split('_') for x in fa]
            # keep contigs longer than 200 bp
            fa_headers = [x for x in fa_headers if int(x[3]) > 200]
            size = sum(int(x[3]) for x in fa_headers)
            # length-weighted mean coverage across contigs
            weighted_mean_cov = sum(int(x[3]) * float(x[5]) for x in fa_headers) / size
            df_out.write('\t'.join([strain, str(round(weighted_mean_cov, 3))]) + '\n')
    df_out.close()
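# The weighted-mean step above, isolated. Headers here are made-up SPAdes-style
# examples (NODE_<n>_length_<L>_cov_<c>), matching the x[3]/x[5] indexing used above.
headers = [['NODE', '1', 'length', '500000', 'cov', '80.5'],
           ['NODE', '2', 'length', '100000', 'cov', '40.1']]
size = sum(int(h[3]) for h in headers)
weighted_mean_cov = sum(int(h[3]) * float(h[5]) for h in headers) / size
# (500000*80.5 + 100000*40.1) / 600000 ~= 73.77: long contigs dominate the mean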
def get_16S_copy_number():
    genome_path = lt.get_path() + '/data/genomes/genomes_ncbi/'
    df_out = open(lt.get_path() + '/data/count_16S.txt', 'w')
    header = ['Species', 'Number_16S']
    df_out.write('\t'.join(header) + '\n')
    for subdir, dirs, files in os.walk(genome_path):
        for file in files:
            if file.endswith('.gbff'):
                strain = subdir.split('/')[-1]
                count_16S = 0
                with open(os.path.join(subdir, file), "r") as input_handle:
                    for record in SeqIO.parse(input_handle, "genbank"):
                        for feature in record.features:
                            if feature.type == 'rRNA':
                                if feature.qualifiers['product'][0] == '16S ribosomal RNA':
                                    count_16S += 1
                df_out.write('\t'.join([strain, str(count_16S)]) + '\n')
    df_out.close()
def merge_maple(strain):
    maple_path = lt.get_path() + '/data/genomes/genomes_ncbi_maple/'
    IN_maple_sign_path = maple_path + strain + '_MAPLE_result/' + 'module_signature.tsv'
    IN_maple_sign = pd.read_csv(IN_maple_sign_path, sep='\t')
    IN_maple_cmplx_path = maple_path + strain + '_MAPLE_result/' + 'module_complex.tsv'
    IN_maple_cmplx = pd.read_csv(IN_maple_cmplx_path, sep='\t')
    IN_maple_pthwy_path = maple_path + strain + '_MAPLE_result/' + 'module_pathway.tsv'
    IN_maple_pthwy = pd.read_csv(IN_maple_pthwy_path, sep='\t')
    IN_maple_fxn_path = maple_path + strain + '_MAPLE_result/' + 'module_function.tsv'
    IN_maple_fxn = pd.read_csv(IN_maple_fxn_path, sep='\t')
    df_merged = pd.concat([IN_maple_fxn, IN_maple_cmplx, IN_maple_pthwy, IN_maple_sign])
    # add a column with the pathway ID
    df_merged['Pathway_ID'] = df_merged['ID'].apply(lambda x: x.split('_')[0])
    df_merged_no_dup = df_merged.drop_duplicates(subset='Pathway_ID', keep="last")
    df_merged_no_dup = df_merged_no_dup.reset_index(drop=True)
    OUT_path = lt.get_path() + '/data/genomes/genomes_ncbi_maple_clean/' + strain + '_maple_modules.txt'
    df_merged_no_dup.to_csv(OUT_path, sep='\t', index=False)
def merge_maple_all_strains(MCR=0.8):
    dfs = []
    maple_path = lt.get_path() + '/data/genomes/genomes_ncbi_maple_clean/'
    for filename in os.listdir(maple_path):
        if filename.endswith("_maple_modules.txt"):
            df = pd.read_csv(maple_path + filename, sep='\t')
            strain = filename.split('_')[0]
            df['Strain'] = strain
            dfs.append(df)
    dfs_concat = pd.concat(dfs)
    dfs_concat = dfs_concat.reset_index(drop=True)
    # remove modules below the MCR completeness threshold (80% by default)
    # query(coverage) = MCR % (ITR)
    # query(coverage/max) = MCR % (WC)
    # query(coverage/mode) = Q-value
    dfs_concat_mcr = dfs_concat.loc[dfs_concat['query(coverage)'] >= MCR]
    module_by_taxon = pd.crosstab(dfs_concat_mcr.Pathway_ID, dfs_concat_mcr.Strain)
    module_by_taxon_no_redundant = module_by_taxon[(module_by_taxon.T != 1).any()]
    OUT_path = lt.get_path() + '/data/genomes/genomes_ncbi_maple.txt'
    module_by_taxon.to_csv(OUT_path, sep='\t', index=True)
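# What the pd.crosstab call above produces, on a tiny made-up frame:
# a module-by-strain table counting qualifying rows.
import pandas as pd

toy = pd.DataFrame({'Pathway_ID': ['M00001', 'M00001', 'M00002'],
                    'Strain': ['KBS0702', 'KBS0711', 'KBS0702']})
table = pd.crosstab(toy.Pathway_ID, toy.Strain)
# Strain      KBS0702  KBS0711
# Pathway_ID
# M00001            1        1
# M00002            1        0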
def KO_to_module(strain, modules_to_keep=None):
    kaas_directory = lt.get_path() + '/data/genomes/genomes_ncbi_maple/' + strain + '_MAPLE_result/KAAS'
    bad_chars = '()-+,-'
    rgx = re.compile('[%s]' % bad_chars)
    kegg_maple_dict = {}
    for filename in os.listdir(kaas_directory):
        if filename.endswith("_matrix.txt"):
            for line in open(os.path.join(kaas_directory, filename), 'r'):
                line_strip_split = line.strip().split()
                if len(line_strip_split) > 2 and 'M' in line_strip_split[0]:
                    if '_' in line_strip_split[0]:
                        pathway = line_strip_split[0].split('_')[0]
                    else:
                        pathway = line_strip_split[0]
                    # ignore modules that don't meet the MCR threshold
                    if modules_to_keep is not None:
                        if pathway not in modules_to_keep:
                            continue
                    ko_genes = line_strip_split[2:]
                    for ko_gene in ko_genes:
                        test_set_member = [bad_char for bad_char in bad_chars if bad_char in ko_gene]
                        if len(test_set_member) > 0:
                            ko_gene_clean = rgx.sub('', ko_gene)
                            ko_gene_clean_split = ['K' + e for e in ko_gene_clean.split('K') if e]
                            for split_gene in ko_gene_clean_split:
                                if 'M' in split_gene:
                                    continue
                                if split_gene in kegg_maple_dict:
                                    kegg_maple_dict[split_gene].append(pathway)
                                else:
                                    kegg_maple_dict[split_gene] = [pathway]
                        else:
                            if 'K' in ko_gene:
                                if ko_gene in kegg_maple_dict:
                                    kegg_maple_dict[ko_gene].append(pathway)
                                else:
                                    kegg_maple_dict[ko_gene] = [pathway]
    return kegg_maple_dict
def get_breseq_samples_to_keep(cov_min=50):
    json_path = lt.get_path() + '/data/breseq/summary/'
    to_keep = []
    for filename in os.listdir(json_path):
        if not filename.endswith(".json"):
            continue
        if 'ATCC43928' in filename:
            continue
        if 'KBS0727' in filename:
            continue
        with open(json_path + filename) as f:
            data = json.load(f)
        contigs = list(data['references']['reference'].keys())
        coverages = []
        for contig in contigs:
            if data['references']['reference'][contig]['length'] < 300:
                continue
            coverages.append(data['references']['reference'][contig]['coverage_average'])
        mean_cov = np.mean(coverages)
        if mean_cov > cov_min:
            to_keep.append(filename.split('.')[0])
    return to_keep
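# The breseq summary JSON is assumed (from the keys accessed above) to nest
# per-contig stats under references/reference; a toy check of the coverage filter:
import numpy as np

data_example = {'references': {'reference': {
    'contig_1': {'length': 4000000, 'coverage_average': 85.0},
    'contig_2': {'length': 120, 'coverage_average': 12.0}}}}  # short contig ignored
covs = [v['coverage_average'] for v in data_example['references']['reference'].values()
        if v['length'] >= 300]
assert np.mean(covs) == 85.0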
ax_mttd.text(-0.1, 1.07, 'c', fontsize=13, fontweight='bold',
             ha='center', va='center', transform=ax_mttd.transAxes)
ax_ext.text(-0.1, 1.07, 'd', fontsize=13, fontweight='bold',
            ha='center', va='center', transform=ax_ext.transAxes)

df_weibull = pd.read_csv(lt.get_path() + '/data/demography/weibull_results_clean.csv', sep=',')
df_CIs = pd.read_csv(lt.get_path() + '/data/demography/model_CIs.csv', sep=',')
model_features = open(lt.get_path() + '/data/demography/model_features.csv', 'r')
# skip the header, then build a feature => value dict
model_features.readline()
model_features_dict = {}
for line in model_features:
    line = line.strip().replace('"', '').split(',')
    model_features_dict[line[0]] = float(line[1])
model_features.close()
taxa = list(set(df_weibull.strain.to_list()))
import matplotlib.ticker
import datetime as dt
#from sklearn.model_selection import GridSearchCV
#from sklearn.neighbors import KernelDensity
import statsmodels.stats.multitest as mt
import statsmodels.formula.api as smf
from Bio import SeqIO
from statsmodels.base.model import GenericLikelihoodModel

# only plot taxa w/ significant G scores and at least 100 mutations
df_irep = pd.read_csv(lt.get_path() + '/data/iRep_clean.txt', sep='\t')
df_irep = df_irep.rename(columns={'Species': 'strain'})
df_weib = pd.read_csv(lt.get_path() + '/data/demography/weibull_results_clean.csv', sep=',')
df_merged = df_weib.merge(df_irep, on=['strain', 'rep'])
taxa = list(set(df_merged.strain.to_list()))
df_merged['alpha_log10'] = np.log10(df_merged.alpha)
# mixed-effects model: log10(alpha) ~ iRep, with strain as a random effect
mf = smf.mixedlm("alpha_log10 ~ iRep", df_merged, groups=df_merged["strain"])
mf_fit = mf.fit()
print(mf_fit.summary())
irep_mean_list = []
shape_mean_list = []
t_list = list(range(1000))
N_list = []
for t in t_list:
    N_list.append(calculate_bi_exponential(N1, N2, d1, d2, t))
print(colors[d1_idx])
plt.plot(t_list, N_list, zorder=2, ls='--',
         label=r'$d_{1}/d_{2}=$' + str(d1 / d2), c=colors[d1_idx], lw=2)
plt.xlabel('Days, ' + r'$t$', fontsize=16)
plt.ylabel('Population size, ' + r'$N(t)$', fontsize=16)
plt.yscale('log', base=10)
plt.legend(loc='upper right', prop={'size': 8})
#fig.savefig(lt.get_path() + '/figs/spoiie_death_curve.pdf', format='pdf', bbox_inches="tight", pad_inches=0.4, dpi=600)
fig.savefig(lt.get_path() + '/figs/test_exponential.pdf', format='pdf',
            bbox_inches="tight", pad_inches=0.4, dpi=600)
plt.close()
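# calculate_bi_exponential() is defined elsewhere in the repository; a sketch
# consistent with its use above is the two-phase decay model
# N(t) = N1*exp(-d1*t) + N2*exp(-d2*t): the fast-dying subpopulation (d1)
# dominates early and the slow-dying one (d2) dominates late.
import numpy as np

def calculate_bi_exponential_sketch(N1, N2, d1, d2, t):
    return N1 * np.exp(-d1 * t) + N2 * np.exp(-d2 * t)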
import matplotlib.ticker
import datetime as dt
#from sklearn.model_selection import GridSearchCV
#from sklearn.neighbors import KernelDensity
import statsmodels.stats.multitest as mt
import statsmodels.formula.api as smf
from Bio import SeqIO
from statsmodels.base.model import GenericLikelihoodModel

# only plot taxa w/ significant G scores and at least 100 mutations
df = pd.read_csv(lt.get_path() + '/data/staining.all.new.txt', sep='\t')
df = df[df.strain != "KBS0725"]
taxa = list(set(df.strain.to_list()))
to_remove = ['KBS0711W', 'KBS0727', 'KBS0714', 'KBS0701']
taxa = [x for x in taxa if x not in to_remove]
df_anc = df.loc[df['hist'] == 'anc']
df_der = df.loc[df['hist'] == 'der']
fig = plt.figure()
plt.axvline(1, color='dimgrey', lw=2, ls='--', zorder=1)
# first sort by mean
mean_list = []
for taxon in taxa:
    df_der_dead = df_der.loc[df_der['strain'] == taxon].dead.values
    delta_dead = df_der_dead - df_anc.loc[df_anc['strain'] == taxon].dead.values[0]
    delta_dead_mean = np.mean(delta_dead)
import matplotlib.lines as mlines
import matplotlib.ticker
import datetime as dt
#from sklearn.model_selection import GridSearchCV
#from sklearn.neighbors import KernelDensity
import statsmodels.stats.multitest as mt
import statsmodels.formula.api as smf
from Bio import SeqIO
from statsmodels.base.model import GenericLikelihoodModel

df_colors = pd.read_csv(lt.get_path() + '/data/colors.csv', sep=',')

import sys
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

# Weibull survival function, S(t) = exp(-(d_0 * t)^k)
def log_weibull(t, d_0, k):
    t = np.asarray(t)
    return np.exp(-1 * ((t * d_0)**k))

class log_weibull_model(GenericLikelihoodModel):
    def __init__(self, endog, exog, **kwds):
import statsmodels.stats.multitest as mt
import statsmodels.formula.api as smf
from Bio import SeqIO
from statsmodels.base.model import GenericLikelihoodModel

to_remove_KBS0711 = [10, 11, 12]
all_taxa = ['KBS0703', 'ATCC13985', 'ATCC43928', 'KBS0701', 'KBS0702',
            'KBS0705', 'KBS0706', 'KBS0707', 'KBS0710', 'KBS0711',
            'KBS0712', 'KBS0713', 'KBS0714', 'KBS0715', 'KBS0721',
            'KBS0722', 'KBS0724', 'KBS0725', 'KBS0801', 'KBS0802', 'KBS0812']
df_counts = pd.read_csv(lt.get_path() + '/data/demography/longtermdormancy_20190528_nocomments.csv', sep=',')
df_counts['Abund'] = (df_counts.Colonies.values + 1) * (1000 / df_counts.Inoculum.values) * (10**df_counts.Dilution.values)
df_counts['Dormstart_date'] = pd.to_datetime(df_counts['Dormstart_date'])
df_counts['Firstread_date'] = pd.to_datetime(df_counts['Firstread_date'])
df_counts['Days'] = df_counts['Firstread_date'] - df_counts['Dormstart_date'] + dt.timedelta(days=1)
fig, ax = plt.subplots(figsize=(4, 4))
fig.subplots_adjust(hspace=0.35, wspace=0.35)
for taxon in all_taxa:
    #taxon = 'KBS0812'
    taxon_color = lt.df_colors.loc[lt.df_colors['strain'] == taxon].Color.to_list()[0]
        'KBS0722', 'KBS0724', 'KBS0801']
maple_types = ['signature', 'complex', 'pathway', 'function']
MCR = 0.8
kegg_dict_count = {}
maple_dict_count = {}
maple_annotation_dict = {}
treatment_count_dict = {}
for taxon in taxa:
    # make a RefSeq => protein ID dict
    refseq_to_protein_dict = {}
    for subdir, dirs, files in os.walk(lt.get_path() + '/data/genomes/genomes_ncbi/' + taxon):
        for file in files:
            if file.endswith('.gbff'):
                with open(os.path.join(subdir, file), "r") as input_handle:
                    for record in SeqIO.parse(input_handle, "genbank"):
                        for feature in record.features:
                            if feature.type != 'CDS':
                                continue
                            if 'incomplete' in feature.qualifiers['note'][0]:
                                continue
                            if 'frameshifted' in feature.qualifiers['note'][0]:
                                continue
                            if 'internal stop' in feature.qualifiers['note'][0]:
                                continue
#from sklearn.model_selection import GridSearchCV
#from sklearn.neighbors import KernelDensity
import statsmodels.stats.multitest as mt
import statsmodels.formula.api as smf
from Bio import SeqIO
from statsmodels.base.model import GenericLikelihoodModel

# only plot taxa w/ significant G scores and at least 100 mutations
n_bins = 20
afs_taxa_reps = [x.split('.')[0]
                 for x in os.listdir(lt.get_path() + '/data/breseq/allele_freq_spec/')
                 if x.endswith(".txt")]
afs_taxa = list(set([x.split('.')[0].split('-')[0]
                     for x in os.listdir(lt.get_path() + '/data/breseq/allele_freq_spec/')
                     if x.endswith(".txt")]))

_nsre = re.compile('([0-9]+)')

def natural_sort_key(s):
    return [int(text) if text.isdigit() else text.lower()
            for text in re.split(_nsre, s)]
ax_KBS0812.text(-0.1, 1.07, 'c', fontsize=13, fontweight='bold',
                ha='center', va='center', transform=ax_KBS0812.transAxes)
ax_likelihood.text(-0.1, 1.07, 'd', fontsize=13, fontweight='bold',
                   ha='center', va='center', transform=ax_likelihood.transAxes)

df_counts = pd.read_csv(lt.get_path() + '/data/demography/longtermdormancy_20190528_nocomments.csv', sep=',')
df_counts['Abund'] = (df_counts.Colonies.values + 1) * (1000 / df_counts.Inoculum.values) * (10**df_counts.Dilution.values)
df_counts['Dormstart_date'] = pd.to_datetime(df_counts['Dormstart_date'])
df_counts['Firstread_date'] = pd.to_datetime(df_counts['Firstread_date'])
df_counts['Days'] = df_counts['Firstread_date'] - df_counts['Dormstart_date'] + dt.timedelta(days=1)
df_stats = pd.read_csv(lt.get_path() + '/data/demography/weibull_results_clean.csv', sep=',')
df_counts_KBS0714 = df_counts.loc[((df_counts['Strain'] == 'KBS0714') & (df_counts['Rep'] == 4))]
df_counts_KBS0703 = df_counts.loc[((df_counts['Strain'] == 'KBS0703') &
def get_diversity_stats(afs_cutoff=30, mean_mut_cutoff=20):
    df_out = open(lt.get_path() + '/data/breseq/genetic_diversity.txt', 'w')
    df_out_header = ['Species', 'sample', 'rep', 'mean_freq', 'max_freq',
                     'pi', 'theta', 'tajimas_d', 'dn_ds_total',
                     'mean_N_mut', 'mean_binary_divisions', 'mean_gen_per_day', 'mean_birth_per_death',
                     'max_N_mut', 'max_binary_divisions', 'max_gen_per_day', 'max_birth_per_death']
    df_out.write('\t'.join(df_out_header) + '\n')
    # freq_list entries hold [frequency, mutation_category]; the coverage-based
    # fields are commented out below
    #output_to_keep = ['INS', 'DEL', 'SNP']
    to_keep_samples = get_breseq_samples_to_keep()
    to_keep_taxa = get_breseq_taxa_to_keep()
    # all the diversity measures
    taxa_all = []
    n_muts_all = []
    mean_freq_list_all = []
    max_freq_list_all = []
    pi_list_all = []
    theta_list_all = []
    TD_list_all = []
    dnds_total_list_all = []
    tt_all = []
    p_value_all = []
    n_reps_all = []
    n_syn_non_muts_all = []
    # for the Tajima's D file
    n_reps_td_all = []
    tt_td_all = []
    p_value_td_all = []
    mean_N_mut_all = []
    max_N_mut_all = []
    binary_divisions_mean_all = []
    binary_divisions_max_all = []
    b_div_d_mean_all = []
    b_div_d_max_all = []
    mean_gen_per_day_all = []
    max_gen_per_day_all = []
    for taxon in to_keep_taxa:
        if taxon == 'KBS0727':
            continue
        print(taxon)
        #effective_gene_lengths, Lsyn, Lnon, substitution_specific_synonymous_fraction = lt.calculate_synonymous_nonsynonymous_target_sizes(taxon)
        effective_gene_lengths, effective_gene_lengths_syn, Lsyn, Lnon, substitution_specific_synonymous_fraction = lt.calculate_synonymous_nonsynonymous_target_sizes(taxon)
        taxon_samples = [x for x in to_keep_samples if x.startswith(taxon)]
        sites_to_remove = get_sites_to_remove(taxon)
        genome_size = lt.get_genome_size_dict()[taxon]
        # per-replicate lists of diversity statistics
        mean_freq_list = []
        max_freq_list = []
        pi_list = []
        theta_list = []
        TD_list = []
        dnds_total_list = []
        n_muts_list = []
        n_syn_non_muts_list = []
        mean_N_mut_list = []
        max_N_mut_list = []
        binary_divisions_mean_list = []
        binary_divisions_max_list = []
        b_div_mean_d_list = []
        b_div_max_d_list = []
        mean_gen_per_day_list = []
        max_gen_per_day_list = []
        for taxon_sample in taxon_samples:
            if taxon_sample == 'KBS0711-K':
                continue
            n_0_c, n_c = lt.get_init_final_pop_size(taxon_sample)
            # get SNP identifiers
            SNP_IDs = []
            fixed_SNP_IDs = []
            for i, line in enumerate(open(lt.get_path() + '/data/breseq/output/' + taxon_sample + '.gd', 'r')):
                line_split = line.strip().split('\t')
                if line_split[0] == 'SNP':
                    if line_split[3] + '_' + line_split[4] in sites_to_remove:
                        continue
                    # sites fixed in the ancestor don't count as real fixations,
                    # and fixed mutations don't count towards polymorphisms
                    if float(line_split[6].split('=')[1]) == float(1):
                        fixed_SNP_IDs.append(line_split[2])
                    else:
                        SNP_IDs.append(line_split[2])
            # go back through the file and pull frequency and category info
            freq_list = []
            n_muts = 0
            print(taxon_sample, len(fixed_SNP_IDs), fixed_SNP_IDs)
            #for i, line in enumerate(open(lt.get_path() + '/data/breseq/output/' + taxon_sample + '.gd', 'r')):
            for i, line in enumerate(open(lt.get_path() + '/data/breseq/annotated/' + taxon_sample + '.gd', 'r')):
                line_split = line.strip().split('\t')
                #if (line_split[0] == 'RA') and (line_split[1] in SNP_IDs):
                if (line_split[0] in output_to_keep) and (line_split[2] in SNP_IDs):
                    #major_cov = int(line_split[15].split('=')[1].split('/')[0]) + int(line_split[15].split('=')[1].split('/')[1])
                    #minor_cov = int(line_split[18].split('=')[1].split('/')[0]) + int(line_split[18].split('=')[1].split('/')[1])
                    #total_cov = int(line_split[-1].split('=')[1].split('/')[0]) + int(line_split[-1].split('=')[1].split('/')[1])
                    #freq = float(line_split[20].split('=')[1])
                    #freq_list.append([freq, mut_type, total_cov, major_cov, minor_cov])
                    freq = float([j for j in line_split if 'frequency=' in j][0].split('=')[1])
                    mut_type = [j for j in line_split if 'mutation_category=' in j][0].split('=')[1]
                    freq_list.append([freq, mut_type])
                    n_muts += 1
            # only write the AFS for populations with at least afs_cutoff mutations
            if len(freq_list) >= afs_cutoff:
                df_out_freq_taxa = open(lt.get_path() + '/data/breseq/allele_freq_spec/' + str(taxon_sample) + '.txt', 'w')
                #df_out_freq_taxa.write('\t'.join(['freq', 'total_cov', 'major_cov', 'minor_cov']) + '\n')
                df_out_freq_taxa.write('\t'.join(['frequency', 'mutation_category']) + '\n')
                for freq_list_i in freq_list:
                    #df_out_freq_taxa.write('\t'.join([str(freq_list_i[0]), str(freq_list_i[1]), str(freq_list_i[2]), str(freq_list_i[3])]) + '\n')
                    df_out_freq_taxa.write('\t'.join([str(freq_list_i[0]), str(freq_list_i[1])]) + '\n')
                df_out_freq_taxa.close()
            # only compute summary statistics for populations with at least mean_mut_cutoff mutations
            if len(freq_list) < mean_mut_cutoff:
                continue
            n_muts_list.append(n_muts)
            pi = lt.get_pi(freq_list, n_c=n_c, size=genome_size)
            theta = lt.get_theta(freq_list, n_c=n_c, size=genome_size)
            mean_freq = np.mean([float(i[0]) for i in freq_list])
            max_freq = max([float(i[0]) for i in freq_list])
            # genome size cancels out during the Tajima's D calculation
            tajimas_d = lt.get_TD(freq_list=freq_list, pi=pi * genome_size, theta=theta * genome_size, n_c=n_c)
            non_total = 0
            syn_total = 0
            non_fixed = 0
            syn_fixed = 0
            n_syn_non_muts = 0
            for i, line in enumerate(open(lt.get_path() + '/data/breseq/annotated/' + taxon_sample + '.gd', 'r')):
                line_split = line.strip().split('\t')
                # don't count mutations that may be ancestral,
                # or mutations in non-coding or pseudogene regions
                if (line_split[0] != 'SNP') or ('frequency' in line_split[6]) or (line_split[3] + '_' + line_split[4] in sites_to_remove):
                    continue
                freq = float([s for s in line_split if 'frequency=' in s][0].split('=')[1])
                n_syn_non_muts += 1
                if freq == float(1):
                    if line_split[6].split('=')[1] == line_split[8].split('=')[1]:
                        syn_fixed += 1
                    else:
                        non_fixed += 1
                if line_split[6].split('=')[1] == line_split[8].split('=')[1]:
                    syn_total += 1
                else:
                    non_total += 1
            n_syn_non_muts_list.append(n_syn_non_muts)
            # add a pseudocount of 1
            dnds_total = ((non_total + 1) / (syn_total + 1)) / ((Lnon + 1) / (Lsyn + 1))
            dnds_fixed = ((non_fixed + 1) / (syn_fixed + 1)) / ((Lnon + 1) / (Lsyn + 1))
            mean_freq_list.append(mean_freq)
            max_freq_list.append(max_freq)
            pi_list.append(pi)
            theta_list.append(theta)
            TD_list.append(tajimas_d)
            dnds_total_list.append(dnds_total)
            # number of divisions
            mean_N_mut = n_c * mean_freq
            max_N_mut = n_c * max_freq
            binary_divisions_mean = sum([2**i for i in range(int(math.floor(np.log2(mean_N_mut))))]) / 2
            binary_divisions_max = sum([2**i for i in range(int(math.floor(np.log2(max_N_mut))))]) / 2
            binary_divisions_mean_list.append(binary_divisions_mean)
            binary_divisions_max_list.append(binary_divisions_max)
            mean_N_mut_list.append(mean_N_mut)
            max_N_mut_list.append(max_N_mut)
            #b_div_mean_d = binary_divisions_mean / (n_0_c - n_c)
            b_div_mean_d = binary_divisions_mean / n_c
            b_div_mean_d_list.append(b_div_mean_d)
            #b_div_max_d = binary_divisions_max / (n_0_c - n_c)
            b_div_max_d = binary_divisions_max / n_c
            b_div_max_d_list.append(b_div_max_d)
            rep_num = lt.rename_rep()[taxon_sample.split('-')[1]]
            time = lt.get_total_time(taxon_sample)
            mean_gens_per_day = np.log2(mean_N_mut) / time
            max_gens_per_day = np.log2(max_N_mut) / time
            mean_gen_per_day_list.append(mean_gens_per_day)
            max_gen_per_day_list.append(max_gens_per_day)
            df_out_data_list = [taxon, taxon_sample, str(rep_num), str(mean_freq), str(max_freq),
                                str(pi), str(theta), str(tajimas_d), str(dnds_total),
                                str(mean_N_mut), str(binary_divisions_mean), str(mean_gens_per_day), str(b_div_mean_d),
                                str(max_N_mut), str(binary_divisions_max), str(max_gens_per_day), str(b_div_max_d)]
            df_out.write('\t'.join(df_out_data_list) + '\n')
        # get taxon-level stats
        print(str(len(dnds_total_list)) + " reps")
        # only examine dN/dS for taxa with at least three reps
        if len(dnds_total_list) < 3:
            continue
        n_muts_all.append(np.mean(n_muts_list))
        n_syn_non_muts_all.append(np.mean(n_syn_non_muts_list))
        mean_freq_list_all.append(np.mean(mean_freq_list))
        max_freq_list_all.append(np.mean(max_freq_list))
        mean_N_mut_all.append(np.mean(mean_N_mut_list))
        max_N_mut_all.append(np.mean(max_N_mut_list))
        pi_list_all.append(np.mean(pi_list))
        theta_list_all.append(np.mean(theta_list))
        TD_list_all.append(np.mean(TD_list))
        mean_dnds_total = np.mean(dnds_total_list)
        dnds_total_list_all.append(mean_dnds_total)
        binary_divisions_mean_all.append(np.mean(binary_divisions_mean_list))
        binary_divisions_max_all.append(np.mean(binary_divisions_max_list))
        b_div_d_mean_all.append(np.mean(b_div_mean_d_list))
        b_div_d_max_all.append(np.mean(b_div_max_d_list))
        mean_gen_per_day_all.append(np.mean(mean_gen_per_day_list))
        max_gen_per_day_all.append(np.mean(max_gen_per_day_list))
        taxa_all.append(taxon)
        # t > 0: right-tailed test, use the survival function
        # t < 0: left-tailed test, use the CDF
        # or just take the absolute value of t and use the survival function
        tt = (mean_dnds_total - 1) / (np.std(dnds_total_list) / np.sqrt(float(len(dnds_total_list))))
        p_val = t.sf(np.abs(tt), len(dnds_total_list) - 1)  # one-tailed p-value from |t|
        n_reps_all.append(len(dnds_total_list))
        tt_all.append(tt)
        p_value_all.append(p_val)
        tt_td = np.mean(TD_list) / (np.std(TD_list) / np.sqrt(float(len(TD_list))))
        p_val_td = t.sf(np.abs(tt_td), len(TD_list) - 1)  # one-tailed p-value from |t|
        n_reps_td_all.append(len(dnds_total_list))
        tt_td_all.append(tt_td)
        p_value_td_all.append(p_val_td)
    df_out.close()
    reject, pvals_corrected, alphacSidak, alphacBonf = mt.multipletests(p_value_all, alpha=0.05, method='fdr_bh')
    reject_td, pvals_corrected_td, alphacSidak_td, alphacBonf_td = mt.multipletests(p_value_td_all, alpha=0.05, method='fdr_bh')
    # two files: one for dN/dS, one for the rest of the diversity stats
    df_out_taxa = open(lt.get_path() + '/data/breseq/birth_estimate_taxa.txt', 'w')
    df_out_taxa_header = ['Species', 'mean_n_muts', 'mean_freq', 'max_freq', 'Theta', 'Pi', 'Tajimas_D',
                          'mean_N_mut', 'mean_binary_divisions', 'mean_gen_per_day', 'mean_birth_per_death',
                          'max_N_mut', 'max_binary_divisions', 'max_gen_per_day', 'max_birth_per_death']
    df_out_taxa.write('\t'.join(df_out_taxa_header) + '\n')
    for i in range(len(taxa_all)):
        out_list_i = [taxa_all[i], str(n_muts_all[i]), str(mean_freq_list_all[i]),
                      str(max_freq_list_all[i]), str(theta_list_all[i]), str(pi_list_all[i]), str(TD_list_all[i]),
                      str(mean_N_mut_all[i]), str(binary_divisions_mean_all[i]), str(mean_gen_per_day_all[i]), str(b_div_d_mean_all[i]),
                      str(max_N_mut_all[i]), str(binary_divisions_max_all[i]), str(max_gen_per_day_all[i]), str(b_div_d_max_all[i])]
        df_out_taxa.write('\t'.join(out_list_i) + '\n')
    df_out_taxa.close()
    df_dNdS_taxa = open(lt.get_path() + '/data/breseq/dN_dS_taxa.txt', 'w')
    df_dNdS_taxa.write('\t'.join(['Species', 'n_reps', 'n_syn_non_muts', 'dN_dS_total', 't_stat', 'p_BH']) + '\n')
    for i in range(len(taxa_all)):
        df_dNdS_taxa.write('\t'.join([taxa_all[i], str(n_reps_all[i]), str(n_syn_non_muts_all[i]),
                                      str(dnds_total_list_all[i]), str(tt_all[i]), str(pvals_corrected[i])]) + '\n')
    df_dNdS_taxa.close()
    df_td_taxa = open(lt.get_path() + '/data/breseq/tajimas_d_taxa.txt', 'w')
    df_td_taxa.write('\t'.join(['Species', 'n_reps', 'n_muts', 'tajimas_d', 't_stat', 'p_BH']) + '\n')
    for i in range(len(taxa_all)):
        df_td_taxa.write('\t'.join([taxa_all[i], str(n_reps_td_all[i]), str(n_muts_all[i]),
                                    str(TD_list_all[i]), str(tt_td_all[i]), str(pvals_corrected_td[i])]) + '\n')
    df_td_taxa.close()
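# The Benjamini-Hochberg step above, isolated on made-up p-values:
# multipletests returns the rejection mask and the adjusted p-values.
import statsmodels.stats.multitest as mt

p_example = [0.001, 0.02, 0.04, 0.8]
reject_ex, p_adj_ex, _, _ = mt.multipletests(p_example, alpha=0.05, method='fdr_bh')
# p_adj_ex == [0.004, 0.04, 0.0533..., 0.8]; only the first two survive FDR control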
import matplotlib.ticker
import datetime as dt
#from sklearn.model_selection import GridSearchCV
#from sklearn.neighbors import KernelDensity
import statsmodels.stats.multitest as mt
import statsmodels.formula.api as smf
from Bio import SeqIO
from statsmodels.base.model import GenericLikelihoodModel

# only plot taxa w/ significant G scores and at least 100 mutations
df_taxa = pd.read_csv(lt.get_path() + '/data/breseq/tajimas_d_taxa.txt', sep='\t')
df_taxa = df_taxa.sort_values(by=['tajimas_d'])
taxa_to_keep = df_taxa.Species.to_list()
df_taxa_samples = pd.read_csv(lt.get_path() + '/data/breseq/genetic_diversity.txt', sep='\t', index_col=None)
fig = plt.figure()
# iterate over a copy so taxa with fewer than three reps can be removed safely
for i, taxon in enumerate(list(taxa_to_keep)):
    #print(taxon)
    #print(df_taxa_samples.loc[df_taxa_samples['Species'] == taxon])
    x_i = df_taxa_samples.loc[df_taxa_samples['Species'] == taxon].tajimas_d.values
    if len(x_i) < 3:
        taxa_to_keep.remove(taxon)
        d_0_start = 0.01
        k_start = 1
        z_start = 0.8
        start_params = np.array([d_0_start, k_start, z_start])
        return super(log_weibull_model, self).fit(start_params=start_params,
                                                  maxiter=maxiter, method=method,
                                                  maxfun=maxfun, **kwds)

# inocula of 100 uL
inoccula = 100
df = pd.read_csv(lt.get_path() + '/data/demography/spoIIE_DC_assay.csv', sep=',')
#df = pd.read_csv(lt.get_path() + '/data/demography/spo0IIE_assay.csv', sep=',')
df['N_spores'] = df['HT'] * (1000 / inoccula) * (10**(df['dilution_S'])) * 50  # (mL)
df['N_total'] = df['NT'] * (1000 / inoccula) * (10**(df['dilution_V'])) * 50  # (mL)
df['N_viable'] = df['N_total'] - df['N_spores']
df['days'] = df['hours'] / 24
df = df.sort_values('days')
df_wt = df[(df['strain'] == 'wt')]
df_spoiie = df[(df['strain'] == 'SpoIIE')]
def run_parallelism_analysis(nmin_reps=3, nmin=2, FDR=0.05, n_nonsyn_min=50):
    #output_to_keep = ['INS', 'DEL', 'SNP', 'SUB']
    to_keep_samples = get_breseq_samples_to_keep()
    to_keep_taxa = get_breseq_taxa_to_keep()
    p_star_dict = {}
    G_score_list = []
    for taxon in to_keep_taxa:
        print(taxon)
        effective_gene_lengths, effective_gene_lengths_syn, Lsyn, Lnon, substitution_specific_synonymous_fraction = lt.calculate_synonymous_nonsynonymous_target_sizes(taxon)
        taxon_sites = []
        taxon_samples = [x for x in to_keep_samples if x.startswith(taxon)]
        sites_to_remove = get_sites_to_remove(taxon)
        # keep insertions, deletions, and nonsynonymous SNPs
        gene_count_dict = {}
        gene_count_syn_dict = {}
        #print(sites_to_remove)
        for taxon_sample in taxon_samples:
            for i, line in enumerate(open(lt.get_path() + '/data/breseq/annotated/' + taxon_sample + '.gd', 'r')):
                line_split = line.strip().split('\t')
                # skip GenomeDiff header lines
                if line_split[0].startswith('#='):
                    continue
                if line_split[3] + '_' + line_split[4] in sites_to_remove:
                    continue
                if line_split[0] not in output_to_keep:
                    #or ('frequency' in line_split[6]) or (line_split[3] + '_' + line_split[4] in sites_to_remove)
                    continue
                if line_split[0] == 'SNP':
                    if [s for s in line_split if 'snp_type=' in s][0].split('=')[1] == 'nonsynonymous':
                        locus_tag = [s for s in line_split if 'locus_tag=' in s][0].split('=')[1]
                        frequency = float([s for s in line_split if 'frequency=' in s][0].split('=')[1])
                        if ';' in locus_tag:
                            for locus_tag_j in locus_tag.split(';'):
                                if locus_tag_j not in gene_count_dict:
                                    gene_count_dict[locus_tag_j] = {}
                                    gene_count_dict[locus_tag_j]['freqs'] = []
                                    gene_count_dict[locus_tag_j]['n_mut'] = 0
                                gene_count_dict[locus_tag_j]['n_mut'] += 1
                                gene_count_dict[locus_tag_j]['freqs'].append(frequency)
                        else:
                            if locus_tag not in gene_count_dict:
                                gene_count_dict[locus_tag] = {}
                                gene_count_dict[locus_tag]['freqs'] = []
                                gene_count_dict[locus_tag]['n_mut'] = 0
                            gene_count_dict[locus_tag]['n_mut'] += 1
                            gene_count_dict[locus_tag]['freqs'].append(frequency)
                    elif [s for s in line_split if 'snp_type=' in s][0].split('=')[1] == 'synonymous':
                        locus_tag = [s for s in line_split if 'locus_tag=' in s][0].split('=')[1]
                        frequency = float([s for s in line_split if 'frequency=' in s][0].split('=')[1])
                        if ';' in locus_tag:
                            for locus_tag_j in locus_tag.split(';'):
                                if locus_tag_j not in gene_count_syn_dict:
                                    gene_count_syn_dict[locus_tag_j] = {}
                                    gene_count_syn_dict[locus_tag_j]['freqs'] = []
                                    gene_count_syn_dict[locus_tag_j]['n_mut'] = 0
                                gene_count_syn_dict[locus_tag_j]['n_mut'] += 1
                                gene_count_syn_dict[locus_tag_j]['freqs'].append(frequency)
                        else:
                            if locus_tag not in gene_count_syn_dict:
                                gene_count_syn_dict[locus_tag] = {}
                                gene_count_syn_dict[locus_tag]['freqs'] = []
                                gene_count_syn_dict[locus_tag]['n_mut'] = 0
                            gene_count_syn_dict[locus_tag]['n_mut'] += 1
                            gene_count_syn_dict[locus_tag]['freqs'].append(frequency)
                    else:
                        continue
                else:
                    if len([s for s in line_split if 'gene_position=coding' in s]) >= 1:
                        locus_tag = [s for s in line_split if 'locus_tag=' in s][0].split('=')[1]
                        frequency = float([s for s in line_split if 'frequency=' in s][0].split('=')[1])
                        if ';' in locus_tag:
                            for locus_tag_j in locus_tag.split(';'):
                                if locus_tag_j not in gene_count_dict:
                                    gene_count_dict[locus_tag_j] = {}
                                    gene_count_dict[locus_tag_j]['freqs'] = []
                                    gene_count_dict[locus_tag_j]['n_mut'] = 0
                                gene_count_dict[locus_tag_j]['freqs'].append(frequency)
                                gene_count_dict[locus_tag_j]['n_mut'] += 1
                        else:
                            if locus_tag not in gene_count_dict:
                                gene_count_dict[locus_tag] = {}
                                gene_count_dict[locus_tag]['freqs'] = []
                                gene_count_dict[locus_tag]['n_mut'] = 0
                            gene_count_dict[locus_tag]['freqs'].append(frequency)
                            gene_count_dict[locus_tag]['n_mut'] += 1
        gene_parallelism_statistics = {}
        for gene_i, length_i in effective_gene_lengths.items():
            gene_parallelism_statistics[gene_i] = {}
            gene_parallelism_statistics[gene_i]['length'] = length_i
            gene_parallelism_statistics[gene_i]['observed'] = 0
            gene_parallelism_statistics[gene_i]['multiplicity'] = 0
        gene_parallelism_statistics_syn = {}
        for gene_i, length_i in effective_gene_lengths_syn.items():
            gene_parallelism_statistics_syn[gene_i] = {}
            gene_parallelism_statistics_syn[gene_i]['length'] = length_i
            gene_parallelism_statistics_syn[gene_i]['observed'] = 0
            gene_parallelism_statistics_syn[gene_i]['multiplicity'] = 0
        # save the number of mutations for multiplicity
        for locus_tag_i, locus_tag_i_dict in gene_count_dict.items():
            gene_parallelism_statistics[locus_tag_i]['observed'] = locus_tag_i_dict['n_mut']
            gene_parallelism_statistics[locus_tag_i]['mean_freq'] = np.mean(locus_tag_i_dict['freqs'])
        # same thing for synonymous mutations
        for locus_tag_i, locus_tag_i_dict in gene_count_syn_dict.items():
            gene_parallelism_statistics_syn[locus_tag_i]['observed'] = locus_tag_i_dict['n_mut']
            gene_parallelism_statistics_syn[locus_tag_i]['mean_freq'] = np.mean(locus_tag_i_dict['freqs'])
        L_mean = np.mean(list(effective_gene_lengths.values()))
        L_tot = sum(list(effective_gene_lengths.values()))
        n_tot = sum([x['n_mut'] for x in gene_count_dict.values()])
        # don't include taxa with fewer than n_nonsyn_min mutations
        print("N_total = " + str(n_tot))
        if n_tot < n_nonsyn_min:
            continue
        # go back over and calculate multiplicity
        for locus_tag_i in gene_parallelism_statistics.keys():
            gene_parallelism_statistics[locus_tag_i]['multiplicity'] = gene_parallelism_statistics[locus_tag_i]['observed'] * 1.0 / effective_gene_lengths[locus_tag_i] * L_mean
            gene_parallelism_statistics[locus_tag_i]['expected'] = n_tot * gene_parallelism_statistics[locus_tag_i]['length'] / L_tot
        # get multiplicity for synonymous mutations
        L_mean_syn = np.mean(list(effective_gene_lengths_syn.values()))
        L_tot_syn = sum(list(effective_gene_lengths_syn.values()))
        n_tot_syn = sum([x['n_mut'] for x in gene_count_syn_dict.values()])
        for locus_tag_i in gene_parallelism_statistics_syn.keys():
            gene_parallelism_statistics_syn[locus_tag_i]['multiplicity'] = gene_parallelism_statistics_syn[locus_tag_i]['observed'] * 1.0 / effective_gene_lengths_syn[locus_tag_i] * L_mean_syn
            gene_parallelism_statistics_syn[locus_tag_i]['expected'] = n_tot_syn * gene_parallelism_statistics_syn[locus_tag_i]['length'] / L_tot_syn
        pooled_multiplicities = np.array([gene_parallelism_statistics[gene_name]['multiplicity']
                                          for gene_name in gene_parallelism_statistics.keys()
                                          if gene_parallelism_statistics[gene_name]['multiplicity'] >= 1])
        pooled_multiplicities.sort()
        pooled_tuple_multiplicities = [(gene_parallelism_statistics[gene_name]['multiplicity'],
                                        gene_parallelism_statistics[gene_name]['observed'])
                                       for gene_name in gene_parallelism_statistics.keys()
                                       if gene_parallelism_statistics[gene_name]['multiplicity'] >= 1]
        pooled_tuple_multiplicities = sorted(pooled_tuple_multiplicities, key=lambda x: x[0])
        pooled_tuple_multiplicities_x = [i[0] for i in pooled_tuple_multiplicities]
        pooled_tuple_multiplicities_y = [i[1] for i in pooled_tuple_multiplicities]
        pooled_tuple_multiplicities_y = [sum(pooled_tuple_multiplicities_y[i:]) / sum(pooled_tuple_multiplicities_y)
                                         for i in range(len(pooled_tuple_multiplicities_y))]
        null_multiplicity_survival = lt.NullGeneMultiplicitySurvivalFunction.from_parallelism_statistics(gene_parallelism_statistics)
        #observed_ms_test, observed_multiplicity_survival_test = lt.calculate_unnormalized_survival_from_vector(pooled_multiplicities)
        null_multiplicity_survival_copy = null_multiplicity_survival(pooled_multiplicities)
        null_multiplicity_survival_copy = [sum(null_multiplicity_survival_copy[i:]) / sum(null_multiplicity_survival_copy)
                                           for i in range(len(null_multiplicity_survival_copy))]
        #threshold_idx = numpy.nonzero((null_multiplicity_survival(observed_ms) * 1.0 / observed_multiplicity_survival) < FDR)[0][0]
        mult_survival_dict = {'Mult': pooled_multiplicities,
                              'Obs_fract': pooled_tuple_multiplicities_y,
                              'Null_fract': null_multiplicity_survival_copy}
        mult_survival_df = pd.DataFrame(mult_survival_dict)
        mult_survival_df_out = lt.get_path() + '/data/breseq/mult_survival_curves/' + taxon + '.txt'
        mult_survival_df.to_csv(mult_survival_df_out, sep='\t', index=True)
        # get the likelihood score and test against the null
        observed_G, pvalue = lt.calculate_total_parallelism(gene_parallelism_statistics)
        G_score_list.append((taxon, observed_G, pvalue))
        print(observed_G, pvalue)
        if pvalue >= 0.05:
            continue
        # give each gene a p-value, get the distribution
        gene_logpvalues = lt.calculate_parallelism_logpvalues(gene_parallelism_statistics)
        pooled_pvalues = []
        for gene_name in gene_logpvalues.keys():
            if (gene_parallelism_statistics[gene_name]['observed'] >= nmin) and (float(gene_logpvalues[gene_name]) >= 0):
                pooled_pvalues.append(gene_logpvalues[gene_name])
        pooled_pvalues = np.array(pooled_pvalues)
        pooled_pvalues.sort()
        if len(pooled_pvalues) == 0:
            continue
        null_pvalue_survival = lt.NullGeneLogpSurvivalFunction.from_parallelism_statistics(gene_parallelism_statistics, nmin=nmin)
        observed_ps, observed_pvalue_survival = lt.calculate_unnormalized_survival_from_vector(pooled_pvalues, min_x=-4)
        # p-value version: remove negative minus-log p-values
        neg_p_idx = np.where(observed_ps >= 0)
        observed_ps_copy = observed_ps[neg_p_idx]
        observed_pvalue_survival_copy = observed_pvalue_survival[neg_p_idx]
        pvalue_pass_threshold = np.nonzero(null_pvalue_survival(observed_ps_copy) * 1.0 / observed_pvalue_survival_copy < FDR)[0]
        if len(pvalue_pass_threshold) == 0:
            continue
        threshold_idx = pvalue_pass_threshold[0]
        pstar = observed_ps_copy[threshold_idx]  # lowest value where this is true
        num_significant = observed_pvalue_survival[threshold_idx]
        # make it log base 10
        logpvalues_dict = {'P_value': observed_ps / math.log(10),
                           'Obs_num': observed_pvalue_survival,
                           'Null_num': null_pvalue_survival(observed_ps)}
        logpvalues_df = pd.DataFrame(logpvalues_dict)
        logpvalues_df_out = lt.get_path() + '/data/breseq/logpvalues/' + taxon + '.txt'
        logpvalues_df.to_csv(logpvalues_df_out, sep='\t', index=True)
        p_star_dict[taxon] = (num_significant, pstar / math.log(10))
        output_mult_gene_filename = lt.get_path() + '/data/breseq/mult_genes_nonsyn_sig/' + taxon + '.txt'
        output_mult_gene = open(output_mult_gene_filename, "w")
        output_mult_gene.write(",".join(["Gene", "Length", "Observed", "Expected", "Multiplicity", "-log10(P)"]))
        for gene_name in sorted(gene_parallelism_statistics,
                                key=lambda x: gene_parallelism_statistics.get(x)['observed'],
                                reverse=True):
            if gene_logpvalues[gene_name] >= pstar and gene_parallelism_statistics[gene_name]['observed'] >= nmin:
                output_mult_gene.write("\n")
                # log base 10 transform the p-values here as well
                output_mult_gene.write("%s, %0.1f, %d, %0.2f, %0.2f, %g" % (gene_name,
                                       gene_parallelism_statistics[gene_name]['length'],
                                       gene_parallelism_statistics[gene_name]['observed'],
                                       gene_parallelism_statistics[gene_name]['expected'],
                                       gene_parallelism_statistics[gene_name]['multiplicity'],
                                       abs(gene_logpvalues[gene_name]) / math.log(10)))
        output_mult_gene.close()
        output_mult_syn_filename = lt.get_path() + '/data/breseq/mult_genes_all/' + taxon + '.txt'
        output_mult_syn = open(output_mult_syn_filename, "w")
        output_mult_syn.write(",".join(["Gene", "mult", "mult_syn", "mean_freq", "mean_freq_syn"]))
        for locus_tag_i in gene_parallelism_statistics.keys():
            mult_i = gene_parallelism_statistics[locus_tag_i]['multiplicity']
            mult_i_syn = gene_parallelism_statistics_syn[locus_tag_i]['multiplicity']
            if (mult_i > 0) and (mult_i_syn > 0):
                freq_i = gene_parallelism_statistics[locus_tag_i]['mean_freq']
                freq_i_syn = gene_parallelism_statistics_syn[locus_tag_i]['mean_freq']
                output_mult_syn.write("\n")
                output_mult_syn.write("%s, %f, %f, %f, %f" % (locus_tag_i, mult_i, mult_i_syn, freq_i, freq_i_syn))
        output_mult_syn.close()
    G_score_list_p_values = [i[2] for i in G_score_list]
    reject, pvals_corrected, alphacSidak, alphacBonf = mt.multipletests(G_score_list_p_values, alpha=0.05, method='fdr_bh')
    total_parallelism_path = lt.get_path() + '/data/breseq/total_parallelism.txt'
    total_parallelism = open(total_parallelism_path, "w")
    total_parallelism.write("\t".join(["Taxon", "G_score", "p_value", "p_value_BH"]))
    for i in range(len(pvals_corrected)):
        taxon_i = G_score_list[i][0]
        G_score_i = G_score_list[i][1]
        p_value_i = G_score_list[i][2]
        pvals_corrected_i = pvals_corrected[i]
        total_parallelism.write("\n")
        total_parallelism.write("\t".join([taxon_i, str(G_score_i), str(p_value_i), str(pvals_corrected_i)]))
    total_parallelism.close()
    with open(lt.get_path() + '/data/breseq/p_star.txt', 'wb') as file:
        file.write(pickle.dumps(p_star_dict))  # use `pickle.loads` to do the reverse
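# The multiplicity definition computed above, isolated (as used in LTEE-style
# parallelism analyses): m_i = n_i * (mean gene length / L_i), i.e. observed
# hits scaled by target size. Numbers here are made up.
n_i = 6          # nonsynonymous mutations observed in gene i
L_i = 900.0      # effective length of gene i
L_mean = 1000.0  # mean effective gene length for the genome
multiplicity_i = n_i * 1.0 / L_i * L_mean  # ~6.67: more hits than its size predicts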
from scipy import stats
from scipy.stats import t
from scipy.integrate import odeint
from decimal import Decimal
import _pickle as pickle
#from sklearn.model_selection import GridSearchCV
#from sklearn.neighbors import KernelDensity
import statsmodels.stats.multitest as mt
import statsmodels.formula.api as smf
from Bio import SeqIO
from statsmodels.base.model import GenericLikelihoodModel

filepath = lt.get_path() + '/data/demography/weibull_results_clean.csv'
half_life_dict = {}
count = 0
for line in open(filepath, 'r'):
    # skip the header line
    if count == 0:
        count += 1
        continue
    line_split = line.strip().split(',')
    taxon = line_split[1]
    #print(line_split[3])
import os
import pwlf
import numpy as np
import pandas as pd
import ltde_tools as lt

df_colors = pd.read_csv(lt.get_path() + '/data/colors.csv', sep=',')
def piecewise_regression():
    df = pd.read_csv(os.path.expanduser("~/GitHub/LTDE") + '/data/demography/longtermdormancy_20190528_nocomments.csv', sep=',')
    df['N'] = (df['Colonies'] + 1) * (1000 / df['Inoculum']) * (10**(df['Dilution']))
    df['Dormstart_date'] = pd.to_datetime(df['Dormstart_date'], format='%d-%b-%y')
    df['Firstread_date'] = pd.to_datetime(df['Firstread_date'], format='%d-%b-%y')
    df['Days'] = df['Firstread_date'].sub(df['Dormstart_date'], axis=0)
    df['Days'] = df['Days'].dt.days.astype('int')
    N_dead_list = []
    delta_slope_list = []
    time_split = []
    slope1_list = []
    slope2_list = []
    taxa = list(set(df.Strain.to_list()))
    #fig_all = plt.figure()
    flux_list = []
    slope2_scale_list = []
    df_out = open(lt.get_path() + '/data/demography/piecewise_regression.txt', 'w')
    df_out.write('\t'.join(['Species', 'rep', 'N0', 'slope1', 'slope2', 'time_split', 'N_split']) + '\n')
    taxa_to_skip = ['KBS0714', 'KBS0719', 'KBS0711W', 'KBS0816', 'KBS0727', 'KBS0704']
    #taxa_to_skip += ['KBS0725', 'KBS0702', 'KBS0712', 'KBS0703', 'KBS0706']
    for taxon in taxa:
        print(taxon)
        if taxon in taxa_to_skip:
            continue
        taxon_color = df_colors.loc[df_colors['strain'] == taxon].Color.to_list()[0]
        #if taxon != 'KBS0710':
        #    continue
        #fig = plt.figure()
        df_taxon = df[(df["Strain"] == taxon)]
        reps = list(set(df_taxon.Rep.to_list()))
        for rep in reps:
            df_taxon_rep = df_taxon[(df_taxon["Rep"] == rep)]
            df_taxon_rep = df_taxon_rep.sort_values('Days')
            x = df_taxon_rep.Days.values
            # require at least 20 time points
            if len(x) < 20:
                continue
            y = np.log10(df_taxon_rep.N.values)
            N0 = df_taxon_rep.N.values[0]
            my_pwlf = pwlf.PiecewiseLinFit(x, y)
            # fit the data with two line segments
            res = my_pwlf.fit(2)
            # predict at the fitted breakpoints
            xHat = np.linspace(min(x), max(x), num=10000)
            yHat = my_pwlf.predict(xHat)
            N_switch = my_pwlf.intercepts[0] + (my_pwlf.calc_slopes()[0] * res[1])
            slopes = my_pwlf.calc_slopes()
            N_dead = (10**max(y)) - (10**N_switch)
            N_dead_flux = (10**max(y)) * slopes[0]
            #angle_switch = np.arctan(np.absolute((slopes[1] - slopes[0]) / (1 + (slopes[0] * slopes[1]))))
            delta_slope = slopes[1] - slopes[0]
            #print(taxon, rep, N_switch, delta_slope)
            N_dead_list.append(N_dead / res[1])
            slope1_list.append(slopes[0])
            slope2_list.append(slopes[1])
            delta_slope_list.append(delta_slope)
            time_split.append(res[1])
            flux = np.abs((10**my_pwlf.intercepts[0]) * slopes[0])
            flux_list.append(flux)
            slope2_scale_list.append(slopes[1] - slopes[0])
            df_out.write('\t'.join([taxon, str(rep), str(N0), str(slopes[0]), str(slopes[1]),
                                    str(res[1]), str(10**N_switch)]) + '\n')
            #plt.scatter(np.log10(np.abs(N_dead_flux / res[1])), np.log10(delta_slope), c=taxon_color)
            #plt.scatter(np.log10(np.abs(slopes[0])), np.log10(slopes[1]), c=taxon_color)
            #plt.scatter(np.log10(flux), np.log10(slopes[1]), c=taxon_color)
            #plt.scatter(x, y)
            #plt.plot(xHat, yHat, '-')
            #plt.xscale('log', basex=10)
            #plt.yscale('log', basey=10)
            #plt.xlim(10**-2, 10**10)
            #plt.xlabel('Number of dead cells', fontsize=12)
            #plt.ylabel('absolute value of second slope, log10', fontsize=12)
        #fig.savefig(lt.get_path() + '/figs/taxon_piece/' + taxon + '.png', bbox_inches="tight", pad_inches=0.4, dpi=600)
        #plt.close()
    df_out.close()
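# A self-contained pwlf sketch on synthetic data, mirroring the fit above:
# two segments, breakpoints returned by fit(2), slopes from calc_slopes().
import numpy as np
import pwlf

rng = np.random.default_rng(1)
x_demo = np.linspace(0, 100, 50)
# piecewise-linear decay with a slope change at x = 40, plus noise
y_demo = np.where(x_demo < 40, -0.05 * x_demo, -2 - 0.005 * (x_demo - 40))
y_demo = y_demo + rng.normal(0, 0.05, 50)
demo_pwlf = pwlf.PiecewiseLinFit(x_demo, y_demo)
breaks = demo_pwlf.fit(2)              # [x_min, breakpoint, x_max]
slopes_demo = demo_pwlf.calc_slopes()  # slope of each segment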
import statsmodels.stats.multitest as mt
import statsmodels.formula.api as smf
from Bio import SeqIO
from statsmodels.base.model import GenericLikelihoodModel

# only plot taxa w/ significant G scores and at least 100 mutations
fig = plt.figure()
fig.subplots_adjust(hspace=0.35, wspace=0.35)
for i in range(0, len(lt.taxa_to_plot)):
    taxon = lt.taxa_to_plot[i]
    taxon_color = lt.df_colors.loc[lt.df_colors['strain'] == taxon].Color.to_list()[0]
    df = pd.read_csv(lt.get_path() + '/data/breseq/mult_genes_all/' + taxon + '.txt', sep=',')
    ax = fig.add_subplot(3, 3, i + 1)
    x = df.mult_syn.values
    y = df.mult.values
    x_log10 = np.log10(x)
    y_log10 = np.log10(y)
    ax.scatter(x, y, c=taxon_color, marker='o', s=70,
               linewidth=0.6, alpha=0.5, zorder=1, edgecolors='none')
    min_range = min([min(x), min(y)]) * 0.5
    max_range = max([max(x), max(y)]) * 2
    ax.set_xlim([min_range, max_range])
    ax.set_ylim([min_range, max_range])
import matplotlib.ticker
import datetime as dt
#from sklearn.model_selection import GridSearchCV
#from sklearn.neighbors import KernelDensity
import statsmodels.stats.multitest as mt
import statsmodels.formula.api as smf
from Bio import SeqIO
from statsmodels.base.model import GenericLikelihoodModel

df = pd.read_csv(lt.get_path() + '/data/mzML_Files_forR/Bacillus_AA_Conc_1000d.csv', sep=',')
df = df.set_index('Name')
aa_list = ['Ala', 'Gly', 'Val', 'Leu', 'Ile', 'Pro', 'Met', 'Ser', 'Thr',
           'Phe', 'Asp', 'Glu', 'Orn', 'Lys', 'His', 'Tyr', 'Cys-Cys']
aa_dict = {'Ala': 'Alanine', 'Gly': 'Glycine', 'Val': 'Valine', 'Leu': 'Leucine',
           'Ile': 'Isoleucine', 'Pro': 'Proline', 'Met': 'Methionine', 'Ser': 'Serine',
           'Thr': 'Threonine', 'Phe': 'Phenylalanine', 'Asp': 'Aspartic acid',
           'Glu': 'Glutamic acid', 'Orn': 'Arginine', 'Lys': 'Lysine',
           'His': 'Histidine', 'Tyr': 'Tyrosine', 'Cys-Cys': 'Cystine'}
molar_mass_dict = {'Ala': 89.094, 'Gly': 75.07, 'Val': 117.2, 'Leu': 113.2,
                   'Ile': 113.2, 'Pro': 115.1, 'Met': 149.2, 'Ser': 105.1,
                   'Thr': 119.1, 'Phe': 165.2, 'Asp': 133.1, 'Glu': 147.1,
                   'Orn': 174.2, 'Lys': 146.2, 'His': 155.2, 'Tyr': 181.2,
                   'Cys-Cys': 240.1}
bio_reps = ['KBS0812A', 'KBS0812B', 'KBS0812C', 'KBS0812D']
import matplotlib.lines as mlines
import matplotlib.ticker
import datetime as dt
#from sklearn.model_selection import GridSearchCV
#from sklearn.neighbors import KernelDensity
import statsmodels.stats.multitest as mt
import statsmodels.formula.api as smf
from Bio import SeqIO
from statsmodels.base.model import GenericLikelihoodModel

df_taxa = pd.read_csv(lt.get_path() + '/data/breseq/dN_dS_taxa.txt', sep='\t')
df_taxa = df_taxa.sort_values(by=['dN_dS_total'])
taxa_to_keep = df_taxa.Species.to_list()
df_taxa_samples = pd.read_csv(lt.get_path() + '/data/breseq/genetic_diversity.txt', sep='\t', index_col=None)
fig = plt.figure()
for i, taxon in enumerate(taxa_to_keep):
    #print(taxon)
    #print(df_taxa_samples.loc[df_taxa_samples['Species'] == taxon])
    dn_ds = df_taxa_samples.loc[df_taxa_samples['Species'] == taxon].dn_ds_total.values
    dn_ds_mean = np.mean(dn_ds)
    #print(dn_ds)
    dn_ds_sem = np.std(dn_ds) / np.sqrt(len(dn_ds))
def fig2():
    df = lt.get_mean_time_death()
    fig = plt.figure()
    # alpha KDE
    alpha = df.alpha.values
    grid_alpha = GridSearchCV(KernelDensity(),
                              {'bandwidth': np.linspace(0.1, 10, 50)},
                              cv=20)  # 20-fold cross-validation
    grid_alpha.fit(alpha[:, None])
    x_grid_alpha = np.linspace(0, 2.5, 1000)
    kde_alpha = grid_alpha.best_estimator_
    pdf_alpha = np.exp(kde_alpha.score_samples(x_grid_alpha[:, None]))
    ax1 = plt.subplot2grid((2, 2), (0, 0), colspan=1)
    pdf_alpha = [x / sum(pdf_alpha) for x in pdf_alpha]
    ax1.plot(x_grid_alpha, pdf_alpha, alpha=0.8, lw=2, color='#1f77b4')  #, marker='o')
    ax1.axvline(x=1, color='darkgrey', linestyle='--', lw=2.5)
    ax1.axvline(x=np.mean(alpha), color='#1f77b4', linestyle='--', lw=2.5)
    ax1.set_xlim([-0.1, 2.6])
    ax1.set_xlabel("Scale parameter, " + r'$\alpha$', fontsize=14)
    ax1.set_ylabel("Probability density", fontsize=14)
    # half-life KDE
    half_life = np.log10(df.half_life.values)
    grid_half_life = GridSearchCV(KernelDensity(),
                                  {'bandwidth': np.linspace(0.1, 10, 50)},
                                  cv=20)  # 20-fold cross-validation
    grid_half_life.fit(half_life[:, None])
    x_grid_half_life = np.linspace(-10, 15, 1000)
    kde_half_life = grid_half_life.best_estimator_
    pdf_half_life = np.exp(kde_half_life.score_samples(x_grid_half_life[:, None]))
    pdf_half_life = [x / sum(pdf_half_life) for x in pdf_half_life]
    ax2 = plt.subplot2grid((2, 2), (1, 0), colspan=1)
    ax2.plot(x_grid_half_life, pdf_half_life, color="orange", alpha=0.8)
    ax2.axvline(x=np.mean(half_life), color='orange', linestyle='--', lw=2.5)
    ax2.set_xlim([-11, 11])
    ax2.set_xlabel("Half-life " + r'$\mathrm{(d^{-1}), \, log_{10}}$', fontsize=14)
    ax2.set_ylabel("Probability density", fontsize=14)
    ax3 = plt.subplot2grid((2, 2), (0, 1), rowspan=2)
    strains = list(set(df.strain.values))
    half_lives = [np.log10(df.loc[df['strain'] == strain].half_life.values)
                  for strain in strains]
    median_half_life = [np.median(np.log10(df.loc[df['strain'] == strain].half_life.values))
                        for strain in strains]
    # sort strains by their median half-life
    zipped_half_lives = list(zip(strains, half_lives, median_half_life))
    zipped_half_lives_sorted = sorted(zipped_half_lives, key=lambda x: x[2])
    strains = [x[0] for x in zipped_half_lives_sorted]
    half_lives = [x[1] for x in zipped_half_lives_sorted]
    median_half_life = [x[2] for x in zipped_half_lives_sorted]
    strain_dict = lt.get_strain_genus_dict()
    genera_labels = list(reversed([strain_dict[x] for x in strains]))
    strain_labels = list(reversed([' sp. ' + x for x in strains]))
    ax3.boxplot(half_lives, vert=False)
    ax3.yaxis.set_major_formatter(plt.NullFormatter())
    ax3.set_xlim([-6, 11])
    #ax3.yaxis.set_ticks_position('none')
    #ax3.axis('off')
    ax3.set_xlabel("Half-life " + r'$\mathrm{(d^{-1}), \, log_{10}}$', fontsize=14)
    for i in range(len(genera_labels)):
        genera_label = genera_labels[i]
        strain_label = strain_labels[i]
        y = len(genera_labels) - i - 0.1
        if i == 0:
            ax3.text(-5, y, r"${" + genera_label + r"} \, \mathrm{" + strain_label + "}$",
                     fontsize=5.5)
        else:
            if genera_label == 'Janthinobacterium':
                fontsize = 5.2
            else:
                fontsize = 5.5
            ax3.text(3.2, y, r"${" + genera_label + r"} \, \mathrm{" + strain_label + "}$",
                     fontsize=fontsize)
    plt.tight_layout()
    fig_name = lt.get_path() + '/figs/fig2.png'
    fig.savefig(fig_name, bbox_inches="tight", pad_inches=0.4, dpi=600)
    plt.close()
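# The bandwidth selection used in fig2(), isolated on synthetic draws.
# Note that elsewhere in these scripts the sklearn imports appear commented
# out; they must be enabled for fig2() to run.
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity

samples = np.random.default_rng(0).normal(1.0, 0.3, 200)
grid = GridSearchCV(KernelDensity(), {'bandwidth': np.linspace(0.1, 10, 50)}, cv=20)
grid.fit(samples[:, None])
best_kde = grid.best_estimator_
log_dens = best_kde.score_samples(np.linspace(0, 2, 100)[:, None])  # log-density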