def add_hap_markers(self, hap_var):
    """
    Fill self.markers with the derived bases that define each haplogroup.

    For every haplogroup key in hap_var a fresh dict is stored in
    self.markers. Each variant of that haplogroup contributes a
    position -> derived allele entry, but only when the derived allele
    differs from the base self.refseq holds at that position.

    Args:
        hap_var: Mapping of haplogroup ID to an iterable of variant strings.

    Returns:
        nothing
    """
    for haplogroup, variant_list in hap_var.items():
        # Register the (initially empty) table first, then fill it in place.
        site_alleles = self.markers[haplogroup] = {}
        for variant in variant_list:
            position = phylotree.pos_from_var(variant)
            derived = phylotree.der_allele(variant)
            if self.refseq[position] == derived:
                # Derived allele matches the reference: nothing to record.
                continue
            site_alleles[position] = derived
def test_pos_from_var(self):
    # Each pair is (variant string, expected result of pos_from_var).
    # In every case the expected value is one less than the number
    # embedded in the variant string, regardless of the surrounding
    # parentheses, trailing '!' marks, or a lowercase derived base.
    expectations = (
        ("T182C!", 181),
        ("(T195C!)", 194),
        ("C182T!!", 181),
        ("(T195C)", 194),
        ("T10454c", 10453),
        ("T14034C", 14033),
    )
    for variant, expected_pos in expectations:
        self.assertEqual(phylotree.pos_from_var(variant), expected_pos)
def write_variants(out, phylo, contribs, obs_tab, args):
    """
    Write a table of the variants used in this analysis and note whether
    the position is expected to be polymorphic in the sample given the set
    of identified contributors.

    One tab-separated row is written per reference position:
    1-based position, A/C/G/T observation counts, phylogeny status
    ("polymorphic" or "fixed"), sample status ("variant" or "sample_fixed"),
    and a comma-separated list of "haplogroup:variant" labels at that site.

    Args:
        out: File handle to write output to.
        phylo: The Phylotree object used in EM analysis
        contribs: Table of identified contributors with fields
                  hap#, haplogroup, fraction
        obs_tab: Table of base observations; queried here through
                 obs_at(pos) and total_obs(pos).
        args: The argparse namespace; min_var_reads and frac_var_reads
              are read from it.

    Returns:
        nothing
    """
    haplogroups = [con[1] for con in contribs]

    # Group the defining variants of all contributors by reference position.
    variants = collections.defaultdict(list)
    for hap in haplogroups:
        for var in phylo.hap_var[hap]:
            variants[phylotree.pos_from_var(var)].append("%s:%s" % (hap, var))

    polymorphic = set(phylo.polymorphic_sites(haplogroups))

    for ref_pos in range(len(phylo.refseq)):
        obs = obs_tab.obs_at(ref_pos)
        # The read threshold must be derived from the depth at *this*
        # position. Using the loop variable left over from the variant
        # collection above would apply one unrelated site's depth to every
        # row (and fail outright when no variants were collected).
        threshold = max(args.min_var_reads,
                        obs_tab.total_obs(ref_pos) * args.frac_var_reads)
        bases_seen = sum(obs[base] >= threshold for base in 'ACGT')
        samp_status = "variant" if bases_seen > 1 else "sample_fixed"
        phy_status = "polymorphic" if ref_pos in polymorphic else "fixed"
        out.write("%d\t%s\t%s\t%s\t%s\n"
                  % (ref_pos + 1,
                     '\t'.join([str(obs[base]) for base in 'ACGT']),
                     phy_status,
                     samp_status,
                     # .get() avoids inserting an empty list for every site.
                     ','.join(variants.get(ref_pos, ()))))
    return
def _check_contrib_phy_vars(phylo, obs_tab, contrib_prop, args):
    """
    Checks if each candidate contributor from contrib_prop passes our
    variant base check.

    The strategy for this is to start with the highest estimated
    contributors and an empty list of variant positions. For each
    contributor, we identify the variant bases that are unique from the
    previous candidates. We check the observation table to verify that
    those bases are observed in the sample.

    Args:
        phylo: the phylotree object that holds the variant information for
               these haplogroups
        obs_tab: Table of base observations for positions in the reference.
        contrib_prop: A list of lists, one for each contributor, containing:
                      - The haplogroup ID for the contributor.
                      - The proportion estimate from EM
        args: argparse Namespace with user specified values for:
              verbose: Write per-variant diagnostics to stderr (bool)
              min_var_reads: The minimum number of observations required
                             to call a base as present in the mixture
                             sample (int)
              frac_var_reads: The minimum fraction of observations
                              required to call a base as present in the
                              mixture sample (float)
              var_fraction: The minimum fraction of defining variants
                            required to be observed to call a haplogroup
                            a contributor (float)
              var_count: Call a haplogroup a contributor if the number of
                         observed variants is equal to or greater than
                         var_count; ignored when None (int)

    Returns:
        contrib_prop, with haplogroups that do not pass filters removed.
    """
    used_vars = set()
    ignore_haps = set()
    if args.verbose:
        sys.stderr.write("Checking diagnostic variants:\n")
    for hap, _ in contrib_prop:
        # Variant bases of this haplogroup not already claimed by an
        # earlier (higher ranked) contributor.
        uniq_vars = {(phylotree.pos_from_var(var), phylotree.der_allele(var))
                     for var in phylo.hap_var[hap]}
        uniq_vars -= used_vars
        if args.verbose:
            sys.stderr.write("%s (%d unique variants)\n"
                             % (hap, len(uniq_vars)))

        # Reset for every haplogroup: the "Keeping" message below reports
        # this value, and without a reset it would be unbound (or stale
        # from the previous haplogroup) when there are no unique variants.
        threshold = args.min_var_reads
        found_vars = set()
        for pos, der in sorted(uniq_vars):
            der_obs = obs_tab.obs_at(pos, der)
            depth = obs_tab.total_obs(pos)
            if args.verbose:
                var = "%d%s" % (pos + 1, der)
                sys.stderr.write(" %s: %d/%d\n"
                                 % (var.rjust(6), der_obs, depth))
            threshold = max(args.min_var_reads, depth * args.frac_var_reads)
            if der_obs >= threshold:
                found_vars.add((pos, der))

        # The emptiness test comes first so the fraction below is never
        # evaluated with a zero denominator.
        passes = (
            not uniq_vars
            or (args.var_count is not None
                and len(found_vars) >= args.var_count)
            or float(len(found_vars)) / len(uniq_vars) >= args.var_fraction
        )
        if passes:
            if args.verbose:
                sys.stderr.write(
                    "Keeping '%s': "
                    "%d/%d unique variant bases observed at "
                    "least %d times.\n"
                    % (hap, len(found_vars), len(uniq_vars), threshold))
            # Looks good, these variants can't be used again.
            used_vars.update(found_vars)
            # Also add the ancestral bases for this haplogroup so we do not
            # mistake backmutations in another haplogroup as a novel allele.
            used_vars.update(phylo.get_ancestral(hap))
        else:
            if args.verbose:
                sys.stderr.write(
                    "Ignoring '%s': "
                    "only %d/%d unique variant bases observed.\n"
                    % (hap, len(found_vars), len(uniq_vars)))
            ignore_haps.add(hap)

    pass_contribs = [con for con in contrib_prop if con[0] not in ignore_haps]
    return pass_contribs