def make1csv_errfree(output_csv_filename, sim_args_tsv):
    """
    Puts all the collate_dnds, full population csv, expected dnds info into 1 csv
    for checking what causes inaccurate inferred dn/ds.

    Instead of putting read sequencing umberjack results in the csv, it puts umberjack results
    from the full population.  Ie the windows are made on the sequences from the full population
    instead of the ART reads.

    One row is written per row of each population's error-free window dnds tsv.
    Columns ending in ".Act" come from that window, columns ending in ".Exp" come from the
    full population tables under the simulation data directory.

    :param str output_csv_filename: csv to write.  Opened in 'w' mode, so an existing file is overwritten.
    :param str sim_args_tsv: simulation arguments tsv, handed to run_sliding_window_tree.parse_sim_args_tsv()
    :raises ValueError: if the full population dnds tsv and conservation csv do not both hold exactly
        popn_group.codonsites codon sites, or if a window codon site is missing from either of them
    :return: None
    """
    LOGGER.debug("Writing all collated inferred, expected dnds to " + output_csv_filename)
    with open(output_csv_filename, 'w') as fh_out:
        writer = csv.DictWriter(
            fh_out,
            fieldnames=[
                "Window_Start",
                "Window_End",
                "CodonSite",
                "File",
                "Is_Break",  # whether the site is a recombinant breakpoint (start of new strand)
                "BreakRatio.Act",  # sum across breakpoints (ratio of bases on either side of breakpoint)
                "Reads.Act",  # max read depth for entire slice
                "UnambigCodonRate.Act",  # Total unambiguous codon (depth) at the codon site / max read depth for entire slice
                "AADepth.Act",  # Total codons that code for only 1 amino acid at the codon site
                "PopSize.Act",  # Population size
                "ConserveCodon.Act",
                "EntropyCodon.Act",  # Excludes codons with N's and gaps
                "UnknownPerCodon.Act",  # Average N or gaps per codon at this site
                "ErrPerCodon.Act",  # Average erroneous bases per codon at this site
                "N.Act",
                "S.Act",
                "EN.Act",
                "ES.Act",
                "dNdS.Act",
                "dN_minus_dS.Act",
                "TreeLen.Act",  # length of window tree in nucleotide subs/site
                "TreeDepth.Act",  # depth of longest branch in nucleotide subs/site
                "Polytomy.Act",  # total polytomies reported by get_tree_len_depth()
                "TreeDistPerRead.Act",  # distance from actual to expected tree in Robinson Foulds-branch lengths /reads
                "ConserveCodon.Exp",
                "EntropyCodon.Exp",
                "N.Exp",
                "S.Exp",
                "EN.Exp",
                "ES.Exp",
                "dNdS.Exp",
                "dN_minus_dS.Exp",
            ])
        writer.writeheader()

        # Only the umberjack group -> population groups mapping is needed here.
        _popn_groups, umberjack_group_to_args = run_sliding_window_tree.parse_sim_args_tsv(sim_args_tsv)
        for _umberjackgroup, popn_groups_per_ugroup in umberjack_group_to_args.iteritems():
            for popn_group in popn_groups_per_ugroup:
                _write_errfree_popn_rows(writer, popn_group)


def _write_errfree_popn_rows(writer, popn_group):
    """
    Writes the error-free window rows of one simulated population through the csv.DictWriter.

    :param csv.DictWriter writer: writer whose header has already been written by make1csv_errfree()
    :param popn_group: one population group from run_sliding_window_tree.parse_sim_args_tsv();
        its dataset, config_file, indiv and codonsites attributes are read
    :raises ValueError: see make1csv_errfree()
    """
    hyphy_cols = hyphy.hyphy_handler  # shorthand for the HYPHY_TSV_* column name constants

    sim_popn_name = popn_group.dataset
    sim_data = SimData(popn_group.config_file)
    sim_data_dir = sim_data.sim_data_dir

    # <sim_data_dir>/subs/<dataset>.dnds.tsv
    full_popn_dnds_tsv = os.sep.join([sim_data_dir, "subs", sim_popn_name + ".dnds.tsv"])
    # <sim_data_dir>/fullpopn/<dataset>_TRUE.conserve.csv
    full_popn_conserve_csv = os.sep.join([sim_data_dir, "fullpopn", sim_popn_name + "_TRUE.conserve.csv"])

    # Instead of using umberjack window fasta made from reads,
    # we use window made from error free ART reads with perfect alignment.
    # There is a single window spanning the whole genome.
    window_start = 1
    window_end = NUCSITES
    umberjack_output_prefix = os.sep.join([
        DATASET_OUT_DIR,
        sim_popn_name,
        "errfree",
        sim_popn_name + ".repro.errfree.{}_{}".format(window_start, window_end)])
    window_fasta = umberjack_output_prefix + ".fasta"
    window_treefile = umberjack_output_prefix + ".nwk"
    window_dnds_tsv = umberjack_output_prefix + ".dnds.tsv"

    LOGGER.debug("Merge sim_name=" + sim_popn_name + " full popn window dnds tsv =" + window_dnds_tsv)

    total_indiv = popn_group.indiv
    total_codon_sites = popn_group.codonsites

    # Columns:  CodonSite ConserveCodon Entropy NucDepth CodonDepth
    codonsite_2_full_cons = collect_training.read_codon_csv(csv_file=full_popn_conserve_csv,
                                                            codon_site_field="CodonSite",
                                                            is_base0=False)
    # Columns:  Site, Observed S Changes, Observed NS Changes, E[S Sites], E[NS Sites], dS, dN, dN-dS, Scaled dN-dS
    codonsite_2_full_dnds = collect_training.read_codon_csv(csv_file=full_popn_dnds_tsv,
                                                            codon_site_field="Site",
                                                            is_base0=True,
                                                            delimiter="\t")

    # Both full population tables must cover exactly the simulated codon sites.
    if not (len(codonsite_2_full_dnds) == len(codonsite_2_full_cons) == total_codon_sites):
        raise ValueError("full population dnds does not have same number of codon sites as conservation: " +
                         full_popn_dnds_tsv + ", " + full_popn_conserve_csv)

    aln = Utility.Consensus()
    aln.parse(msa_fasta_filename=window_fasta)
    window_reads = aln.get_total_seqs()

    window_tree_dist = TestTopology.calc_window_tree_dist(sim_data=sim_data,
                                                          window_fasta=window_fasta,
                                                          window_treefile=window_treefile,
                                                          win_start=window_start,
                                                          win_end=window_end)

    full_popn_breaks = sim_data.get_recombo_breaks()
    # If there are no recombination breaks, full_popn_breaks still contains the full genome
    # as a contiguous section, so a lone section never yields a breakpoint.
    has_recombo_breaks = len(full_popn_breaks) > 1

    break_ratio = collect_training.get_break_ratio(sim_data=sim_data, win_start=window_start, win_end=window_end)

    # branch length threshold below which node is considered polytomy
    polytomy_brlen_thresh = 1.0 / (3 * total_codon_sites)
    # The tree metrics are read from the window tree file, as collect_dnds() does.
    # (This call used to be handed the window fasta.)
    window_treelen, window_treedepth, total_polytomies = collect_training.get_tree_len_depth(
        window_treefile, polytomy_brlen_thresh=polytomy_brlen_thresh)

    with open(window_dnds_tsv, 'rU') as fh_actual:
        reader_act = csv.DictReader(fh_actual, delimiter="\t")
        for row_act in reader_act:
            # Site column of the window dnds tsv is used as a 0-based codon offset into the window
            act_codonsite_offset_base0 = int(row_act["Site"])
            act_codonsite_base0 = act_codonsite_offset_base0 + window_start - 1
            # NOTE(review): this is only a valid nucleotide offset into the window because window_start == 1
            act_nucsite_offset_base0 = act_codonsite_base0 * 3
            codonsite_base1 = act_codonsite_base0 + 1

            unambig_codon_depth = aln.get_codon_depth(codon_pos_0based=act_codonsite_offset_base0,
                                                      is_count_ambig=False,
                                                      is_count_gaps=False,
                                                      is_count_pad=False)

            outrow = dict()
            outrow["Window_Start"] = window_start
            outrow["Window_End"] = window_end
            outrow["CodonSite"] = codonsite_base1
            outrow["File"] = "ErrFree_" + sim_data.name
            outrow["Reads.Act"] = window_reads
            outrow["UnambigCodonRate.Act"] = float(unambig_codon_depth) / window_reads
            outrow["AADepth.Act"] = aln.get_unambig_codon2aa_depth(codon_pos_0based=act_codonsite_offset_base0)
            outrow["PopSize.Act"] = total_indiv
            outrow["ConserveCodon.Act"] = aln.get_codon_conserve(codon_pos_0based=act_codonsite_offset_base0,
                                                                 is_count_ambig=False,
                                                                 is_count_gaps=False,
                                                                 is_count_pad=False)
            outrow["EntropyCodon.Act"] = aln.get_codon_shannon_entropy(codon_pos_0based=act_codonsite_offset_base0,
                                                                       is_count_ambig=False,
                                                                       is_count_gaps=False,
                                                                       is_count_pad=False)
            # NOTE(review): only the first nucleotide position of the codon is counted here, whereas
            # collect_dnds() sums all 3 positions of the codon.  Left unchanged - confirm which is intended.
            outrow["UnknownPerCodon.Act"] = float(aln.get_gap_count(pos_0based=act_nucsite_offset_base0) +
                                                  aln.get_ambig_count(pos_0based=act_nucsite_offset_base0) +
                                                  aln.get_pad_count(pos_0based=act_nucsite_offset_base0)) / window_reads
            outrow["ErrPerCodon.Act"] = 0  # error-free sequences

            # If it never made it past FastTree into hyphy, then the substitutions will be empty string
            if row_act[hyphy_cols.HYPHY_TSV_N_COL] and row_act[hyphy_cols.HYPHY_TSV_S_COL]:
                outrow["N.Act"] = float(row_act[hyphy_cols.HYPHY_TSV_N_COL])
                outrow["S.Act"] = float(row_act[hyphy_cols.HYPHY_TSV_S_COL])
                outrow["EN.Act"] = float(row_act[hyphy_cols.HYPHY_TSV_EXP_N_COL])
                outrow["ES.Act"] = float(row_act[hyphy_cols.HYPHY_TSV_EXP_S_COL])
                # dN/dS is left blank when there are no synonymous substitutions
                if row_act["dS"] and float(row_act[hyphy_cols.HYPHY_TSV_S_COL]) != 0:
                    outrow["dNdS.Act"] = (float(row_act[hyphy_cols.HYPHY_TSV_DN_COL]) /
                                          float(row_act[hyphy_cols.HYPHY_TSV_DS_COL]))
                outrow["dN_minus_dS.Act"] = row_act[hyphy_cols.HYPHY_TSV_SCALED_DN_MINUS_DS_COL]

            outrow["TreeLen.Act"] = window_treelen
            outrow["TreeDepth.Act"] = window_treedepth
            outrow["TreeDistPerRead.Act"] = float(window_tree_dist) / window_reads

            # The site is a breakpoint if a strand starts on it.
            # Don't consider first position as breakpoint.
            nuc_pos_wrt_ref_base1 = window_start + act_nucsite_offset_base0
            is_break = has_recombo_breaks and any(
                nuc_strand_start_wrt_ref_base1 == nuc_pos_wrt_ref_base1 and nuc_strand_start_wrt_ref_base1 > 1
                for nuc_strand_start_wrt_ref_base1, _nuc_strand_end_wrt_ref_base1 in full_popn_breaks)
            outrow["Is_Break"] = 1 if is_break else 0

            outrow["BreakRatio.Act"] = break_ratio
            outrow["Polytomy.Act"] = total_polytomies

            full_cons_row = codonsite_2_full_cons.get(codonsite_base1)
            if not full_cons_row:
                raise ValueError("Missing codon site" + str(codonsite_base1) + " in " + full_popn_conserve_csv)
            outrow["ConserveCodon.Exp"] = full_cons_row["ConserveCodon"]
            outrow["EntropyCodon.Exp"] = full_cons_row["EntropyCodon"]

            full_dnds_row = codonsite_2_full_dnds.get(codonsite_base1)
            if not full_dnds_row:
                # The lookup is into the full population dnds tsv, so that is the file to report
                raise ValueError("Missing codon site" + str(codonsite_base1) + " in " + full_popn_dnds_tsv)
            outrow["N.Exp"] = full_dnds_row[hyphy_cols.HYPHY_TSV_N_COL]
            outrow["S.Exp"] = full_dnds_row[hyphy_cols.HYPHY_TSV_S_COL]
            outrow["EN.Exp"] = full_dnds_row[hyphy_cols.HYPHY_TSV_EXP_N_COL]
            outrow["ES.Exp"] = full_dnds_row[hyphy_cols.HYPHY_TSV_EXP_S_COL]
            # dN/dS is left blank when there are no synonymous substitutions
            if (full_dnds_row[hyphy_cols.HYPHY_TSV_S_COL] and
                    float(full_dnds_row[hyphy_cols.HYPHY_TSV_S_COL]) != 0):
                outrow["dNdS.Exp"] = (float(full_dnds_row[hyphy_cols.HYPHY_TSV_DN_COL]) /
                                      float(full_dnds_row[hyphy_cols.HYPHY_TSV_DS_COL]))
            outrow["dN_minus_dS.Exp"] = full_dnds_row[hyphy_cols.HYPHY_TSV_SCALED_DN_MINUS_DS_COL]

            writer.writerow(outrow)
def collect_dnds(output_dir, output_csv_filename, sim_data_config, comments=None):
    """
    Collects everything related to dnds into 1 table.  Does not do any aggregation of values.
    Useful for debugging.

    Every window fasta matching <output_dir>/*.*_*.fasta (except *.anc.fasta, *.msa.fasta and files
    with "expected" in their path) contributes one row per complete codon in the window.
    The tree columns are filled only if the window's .nwk file exists.
    The dN/dS columns are filled only if the window's .dnds.tsv file exists and is non-empty.

    :param str output_dir: directory holding the window fasta, .nwk, .dnds.tsv and .subst.tsv files
    :param str output_csv_filename: csv to write.  Opened in 'w' mode, so an existing file is overwritten.
    :param str sim_data_config: simulation config, handed to SimData()
    :param str comments: optional text written verbatim to the csv before the header row
    :raises ValueError: if the window dnds tsv is not numbered 0,1,2... by codon site,
        if it has more rows than the window has codons,
        or if its N + S total disagrees with the substitution counts from the .subst.tsv
    :return: None
    """
    LOGGER.debug("Collect dnds for " + output_csv_filename)
    with open(output_csv_filename, 'w') as fh_out:
        sim_data = SimData(sim_data_config)
        full_popn_fasta = sim_data.get_fasta()
        full_popn_breaks = sim_data.get_recombo_breaks()
        # If there are no recombination breaks, full_popn_breaks still contains the full genome
        # as a contiguous section, so a lone section never yields a breakpoint.
        has_recombo_breaks = len(full_popn_breaks) > 1

        # The full population alignment is the same for every window.
        # Parse it once, and only if there is at least 1 window to compare against it.
        full_popn_aln = None

        if comments:
            fh_out.write(comments)

        writer = csv.DictWriter(
            fh_out,
            fieldnames=[
                "Window_Start",
                "Window_End",
                "Reads",  # Max read depth for the window (not necessary for the codon site)
                "CodonSite",  # 1-based codon site
                "CodonDepth",  # Total unambiguous codon (depth) at the codon site
                "AADepth",  # Total depth of codons that code unambiguously for 1 AA.
                "ConserveCodon",  # Average per-base fraction of conservation across the codon.  Excludes N's and gaps
                "EntropyCodon",  # Average per-base fraction of entropy across the codon.  Excludes N's and gaps
                "N",  # Observed Nonsynonymous substitutions
                "S",  # Observed Synonymous substitutions
                "EN",  # Expected Nonsynonymous substitutions
                "ES",  # Expected Synonymous substitutions
                "dN",
                "dS",
                "dN_minus_dS",  # dN-dS scaled by the tree length
                "unscaled_dN_minus_dS",  # dN-dS
                "Ambig",  # N nucleotide
                "Pad",  # left or right pad gap
                "Gap",  # internal gap between true bases on both sides
                "Err",  # Nucleotide errors within the codon
                "Err_N",  # nonsynonymous AA change due to sequence error
                "Err_S",  # synonymous AA change due to sequence error
                "Ambig_N",  # Ambiguous base changes the AA.  Should be always 0
                "Ambig_S",  # ambiguous base does not change the AA.
                "TreeLen",  # Tree length
                "TreeDepth",  # deepest tip to root distance
                "TreeDist",  # distance from actual to expected tree
                "Is_Break",  # Whether a strand switch starts on this codon site
                "BreakRatio",  # sum across window breakpoints (ratio of bases on either side of breakpoint)
                "Polytomy",  # total polytomies in tree
                "P_SameCodonFreq",  # log10 probability that sliced codon frequency distro is same as full population distro
                "ResolvedPerSub",  # Total substitutions that were resolved vs observed
            ])
        writer.writeheader()

        for slice_fasta_filename in glob.glob(output_dir + os.sep + "*.*_*.fasta"):

            # don't use hyphy ancestral fasta or fullgene msa fasta or expected files
            if slice_fasta_filename.endswith((".anc.fasta", ".msa.fasta")) or "expected" in slice_fasta_filename:
                continue

            # *.{start bp}_{end bp}.fasta filenames use 1-based nucleotide position numbering
            slice_fasta_fileprefix = slice_fasta_filename.split('.fasta')[0]
            win_nuc_range = slice_fasta_fileprefix.split('.')[-1]
            if win_nuc_range.find("_") <= 0:  # the full genome msa.fasta file won't have a window range
                continue

            # Window starts, ends at these 1-based nucleotide positions with respect to the reference
            win_start_nuc_pos_1based_wrt_ref, win_end_nuc_pos_1based_wrt_ref = [
                int(x) for x in win_nuc_range.split('_')]

            # Window starts at this 1-based codon position with respect to the reference.
            # Floor division keeps this an integer codon index.
            win_start_codon_1based_wrt_ref = win_start_nuc_pos_1based_wrt_ref // Utility.NUC_PER_CODON + 1

            break_ratio = get_break_ratio(sim_data=sim_data,
                                          win_start=win_start_nuc_pos_1based_wrt_ref,
                                          win_end=win_end_nuc_pos_1based_wrt_ref)

            slice_aln = Utility.Consensus()
            slice_aln.parse(slice_fasta_filename)
            window_reads = slice_aln.get_total_seqs()
            # if the last codon doesn't have enuf chars, then hyphy ignores it
            codon_width = slice_aln.get_alignment_len() // Utility.NUC_PER_CODON

            # Tree columns stay empty if the window never got a tree
            tree_len = None
            tree_depth = None
            tree_dist = None
            total_polytomies = None
            slice_tree_filename = slice_fasta_fileprefix + ".nwk"
            if os.path.exists(slice_tree_filename):
                # NB:  FastTree tree length in nucleotide substitutions / site.
                # HyPhy converts trees to codon substitution/site to count codon substitutions along phylogeny

                # branch length threshold below which node is considered polytomy
                polytomy_brlen_thresh = 1.0 / (3 * codon_width)
                tree_len, tree_depth, total_polytomies = get_tree_len_depth(
                    slice_tree_filename, polytomy_brlen_thresh=polytomy_brlen_thresh)

                # If there is recombination, there may be multiple trees.
                # Use the full population tree corresponding to slice portion of the genome.
                tree_dist = TestTopology.calc_window_tree_dist(sim_data=sim_data,
                                                               window_fasta=slice_fasta_filename,
                                                               window_treefile=slice_tree_filename,
                                                               win_start=win_start_nuc_pos_1based_wrt_ref,
                                                               win_end=win_end_nuc_pos_1based_wrt_ref)

            # Each result is indexed below by 0-based codon offset into the window
            (seq_err, err_aa_change, err_aa_nochange,
             ambig_aa_change, ambig_aa_nochange) = error_by_codonpos(slice_fasta_filename,
                                                                     win_start_nuc_pos_1based_wrt_ref,
                                                                     full_popn_fasta)

            if full_popn_aln is None:
                full_popn_aln = Utility.Consensus()
                full_popn_aln.parse(full_popn_fasta)

            dnds_tsv_filename = slice_fasta_fileprefix + ".dnds.tsv"
            subs_tsv_filename = slice_fasta_fileprefix + ".subst.tsv"
            fh_dnds_tsv = None
            reader = None  # stays None if the window has no usable dnds tsv
            site_to_subcounts = dict()
            try:
                if os.path.exists(dnds_tsv_filename) and os.path.getsize(dnds_tsv_filename):
                    fh_dnds_tsv = open(dnds_tsv_filename, 'rU')
                    site_to_subcounts = count_resolved(subs_tsv_filename)
                    reader = csv.DictReader(fh_dnds_tsv, delimiter='\t')

                for codonoffset_0based in xrange(codon_width):
                    nucoffset_0based = codonoffset_0based * Utility.NUC_PER_CODON
                    # 0-based window offsets of the 3 nucleotides of this codon
                    codon_nuc_offsets_0based = (nucoffset_0based, nucoffset_0based + 1, nucoffset_0based + 2)

                    outrow = dict()
                    outrow["Window_Start"] = win_start_nuc_pos_1based_wrt_ref
                    outrow["Window_End"] = win_end_nuc_pos_1based_wrt_ref
                    outrow["Reads"] = window_reads
                    outrow["CodonSite"] = win_start_codon_1based_wrt_ref + codonoffset_0based
                    outrow["CodonDepth"] = slice_aln.get_codon_depth(codon_pos_0based=codonoffset_0based,
                                                                     is_count_ambig=False,
                                                                     is_count_gaps=False,
                                                                     is_count_pad=False)
                    outrow["AADepth"] = slice_aln.get_unambig_codon2aa_depth(codon_pos_0based=codonoffset_0based)
                    outrow["ConserveCodon"] = slice_aln.get_codon_conserve(codonoffset_0based,
                                                                           is_count_ambig=False,
                                                                           is_count_gaps=False,
                                                                           is_count_pad=False)
                    outrow["EntropyCodon"] = slice_aln.get_codon_shannon_entropy(codonoffset_0based,
                                                                                 is_count_ambig=False,
                                                                                 is_count_gaps=False,
                                                                                 is_count_pad=False)
                    outrow["Ambig"] = sum(slice_aln.get_ambig_count(pos_0based=pos)
                                          for pos in codon_nuc_offsets_0based)
                    outrow["Pad"] = sum(slice_aln.get_pad_count(pos_0based=pos)
                                        for pos in codon_nuc_offsets_0based)
                    outrow["Gap"] = sum(slice_aln.get_gap_count(pos_0based=pos)
                                        for pos in codon_nuc_offsets_0based)
                    outrow["Err"] = seq_err[codonoffset_0based]
                    outrow["Err_N"] = err_aa_change[codonoffset_0based]
                    outrow["Err_S"] = err_aa_nochange[codonoffset_0based]
                    outrow["Ambig_N"] = ambig_aa_change[codonoffset_0based]
                    outrow["Ambig_S"] = ambig_aa_nochange[codonoffset_0based]
                    outrow["TreeLen"] = tree_len
                    outrow["TreeDepth"] = tree_depth
                    outrow["TreeDist"] = tree_dist

                    # The site is a breakpoint if a strand starts on the first base of the codon.
                    # Don't consider first position as breakpoint.
                    nuc_pos_wrt_ref_base1 = win_start_nuc_pos_1based_wrt_ref + nucoffset_0based
                    is_break = has_recombo_breaks and any(
                        nuc_strand_start_wrt_ref_base1 == nuc_pos_wrt_ref_base1 and nuc_strand_start_wrt_ref_base1 > 1
                        for nuc_strand_start_wrt_ref_base1, _nuc_strand_end_wrt_ref_base1 in full_popn_breaks)
                    outrow["Is_Break"] = 1 if is_break else 0

                    outrow["BreakRatio"] = break_ratio
                    outrow["Polytomy"] = total_polytomies

                    # log-likelihood ratio test that codon count distributions are similar
                    # between window and full population
                    full_popn_codon_freq = full_popn_aln.get_codon_freq(
                        codon_pos_0based=win_start_codon_1based_wrt_ref + codonoffset_0based - 1,
                        is_count_pad=False,
                        is_count_gaps=False,
                        is_count_ambig=False)
                    slice_codon_freq = slice_aln.get_codon_freq(codon_pos_0based=codonoffset_0based,
                                                                is_count_pad=False,
                                                                is_count_gaps=False,
                                                                is_count_ambig=False)
                    outrow["P_SameCodonFreq"] = cmp_freq_distro(slice_codon_freq, full_popn_codon_freq,
                                                                is_scale=False)

                    if reader is not None:
                        resolved_ns, resolved_s, obs_ns, obs_s = site_to_subcounts[codonoffset_0based]
                        total_subs = resolved_ns + resolved_s + obs_ns + obs_s
                        if total_subs:
                            outrow["ResolvedPerSub"] = (resolved_ns + resolved_s) / float(total_subs)
                        else:
                            outrow["ResolvedPerSub"] = 0

                        # Every codon site is a row in the *.dnds.tsv file
                        dnds_info = next(reader)
                        # dnds tsv specified the codon site in 0-based coordinates in Site field wrt Slice
                        if codonoffset_0based != int(dnds_info["Site"]):
                            raise ValueError("Inconsistent site numbering " + str(codonoffset_0based) +
                                             " in " + dnds_tsv_filename)

                        outrow["N"] = dnds_info[hyphy_handler.HYPHY_TSV_N_COL]
                        outrow["S"] = dnds_info[hyphy_handler.HYPHY_TSV_S_COL]
                        outrow["ES"] = dnds_info[hyphy_handler.HYPHY_TSV_EXP_S_COL]
                        outrow["EN"] = dnds_info[hyphy_handler.HYPHY_TSV_EXP_N_COL]
                        outrow["dN"] = dnds_info[hyphy_handler.HYPHY_TSV_DN_COL]
                        outrow["dS"] = dnds_info[hyphy_handler.HYPHY_TSV_DS_COL]
                        outrow["dN_minus_dS"] = dnds_info[hyphy_handler.HYPHY_TSV_SCALED_DN_MINUS_DS_COL]
                        outrow["unscaled_dN_minus_dS"] = dnds_info[hyphy_handler.HYPHY_TSV_DN_MINUS_DS_COL]

                        # Substitution counts from the subst tsv must agree with the dnds tsv
                        dnds_total_subs = (float(dnds_info[hyphy_handler.HYPHY_TSV_N_COL]) +
                                           float(dnds_info[hyphy_handler.HYPHY_TSV_S_COL]))
                        if abs(total_subs - dnds_total_subs) > 1e-2:
                            raise ValueError("Inconsitent total subs at 0-based site " + str(codonoffset_0based) +
                                             " wrt " + subs_tsv_filename + " and " + dnds_tsv_filename +
                                             " " + str(total_subs) + " " + str(dnds_total_subs))

                    writer.writerow(outrow)

                if reader is not None:
                    try:
                        dnds_info = next(reader)
                        if dnds_info and len(dnds_info) > 0:
                            raise ValueError("dnds TSV has more codons than expected " + dnds_tsv_filename)
                    except StopIteration:  # We want the reader to have no more rows
                        pass

            finally:
                if fh_dnds_tsv and not fh_dnds_tsv.closed:
                    fh_dnds_tsv.close()