def test_observed_bases_obs_at_with_base_stranded_symmetry(self):
    """
    Stranded per-base lookups must not depend on how the base is spelled.

    With the second fixture alignment flagged as reverse, asking for an
    upper-case base or its lower-case form at position 15 has to give the
    same answer, and so does asking for '-' or '+' at position 17.
    """
    self.alns[1].is_reverse = True
    observed = observe.ObservedBases(self.alns, 20, 10)
    for upper in 'ACGTN':
        lower = upper.lower()
        self.assertEqual(observed.obs_at(15, upper, stranded=True),
                         observed.obs_at(15, lower, stranded=True))
    self.assertEqual(observed.obs_at(17, '-', True),
                     observed.obs_at(17, '+', True))
def write_statistics(phylo, all_obs, contribs, contrib_reads, args):
    """
    Write the tab-delimited files used to plot the EM and assembly results.

    Two files are created, both named from args.stats_prefix:
      <prefix>.pos.tab  the phylotree sites used to estimate the mixture
                        contributions (produced by write_variants).
      <prefix>.obs.tab  the base observations of every contributor
                        (produced by write_base_obs), followed by the
                        observations of the whole sample when
                        contrib_reads holds more than one entry.

    Args:
        phylo: The phylotree object these assignments are based on; its
               refseq attribute is passed on as the reference sequence.
        all_obs: ObservedBases object of observations per reference
                 position for the whole sample.
        contribs: The contributor table returned by
                  assembly.get_contributors, a list of
                  (hap#, haplogroup, proportion) tuples.
        contrib_reads: a dictionary mapping hap#s to lists of pysam
                       AlignedSegments.
        args: The argparse namespace; stats_prefix, min_mq and min_bq
              are read from it.
    Returns:
        nothing
    """
    # hap# -> haplogroup name, for labelling rows of the observation table.
    hap_names = {entry[0]: entry[1] for entry in contribs}
    prefix = args.stats_prefix

    with open("%s.pos.tab" % prefix, 'w') as pos_file:
        write_variants(pos_file, phylo, contribs, all_obs, args)

    with open("%s.obs.tab" % prefix, 'w') as obs_file:
        for hap_id in sorted(contrib_reads):
            hap_obs = observe.ObservedBases(contrib_reads[hap_id],
                                            args.min_mq, args.min_bq)
            # Read groups missing from the contributor table are still
            # written, labelled "unassigned".
            label = "%s\t%s" % (hap_id, hap_names.get(hap_id, "unassigned"))
            write_base_obs(obs_file, hap_obs, phylo.refseq, label)
        if len(contrib_reads) > 1:
            # The pooled table is only added when there is more than one
            # group of reads.
            write_base_obs(obs_file, all_obs, phylo.refseq, "all\tmix")
def test_observed_bases_update_after_init(self):
    """
    update() adds observations on top of those counted at construction.

    The fixture alignments are fed in twice (once through the
    constructor, once through update), and the resulting table is
    checked against the expected per-position counts.
    """
    observed = observe.ObservedBases(self.alns, 20, 10)
    observed.update(self.alns)
    expected = {
        10: {'A': 2}, 11: {'A': 2},
        12: {'A': 4}, 13: {'A': 4}, 14: {'A': 4},
        15: {'G': 2, 'T': 2},
        16: {'A': 4},
        17: {'A': 2, '-': 2}, 18: {'A': 2, '-': 2},
        19: {'A': 4},
        20: {'G': 2, 'T': 2},
        21: {'A': 4}, 22: {'A': 4}, 23: {'A': 4}, 24: {'A': 4},
        25: {'G': 2, 'T': 2},
    }
    self.assertEqual(observed.obs_tab, expected)
def test_observed_bases_obs_at_with_base_stranded(self):
    """
    obs_at(pos, base, True) returns a pair of counts for the base.

    The second fixture alignment is flagged as reverse first. The pair
    looks like (forward, reverse) counts; that reading is inferred from
    the expected values below.
    """
    self.alns[1].is_reverse = True
    observed = observe.ObservedBases(self.alns, 20, 10)
    cases = (
        (14, 'A', (1, 1)),
        (14, 'G', (0, 0)),
        (15, 'G', (0, 1)),
        (17, '-', (0, 1)),
        (17, '+', (0, 1)),
    )
    for pos, base, expected in cases:
        self.assertEqual(observed.obs_at(pos, base, True), expected)
def test_observed_bases_obs_at_with_base(self):
    """
    obs_at(pos, base) without strandedness returns one combined count.

    The second fixture alignment is flagged as reverse first, so the
    expected totals include its observations; '-' and '+' at position 17
    are expected to give the same total.
    """
    self.alns[1].is_reverse = True
    observed = observe.ObservedBases(self.alns, 20, 10)
    cases = (
        (14, 'A', 2),
        (14, 'G', 0),
        (15, 'G', 1),
        (17, '-', 1),
        (17, '+', 1),
    )
    for pos, base, expected in cases:
        self.assertEqual(observed.obs_at(pos, base), expected)
def test_observed_bases_init_from_alns(self):
    """
    Building ObservedBases from the fixture alignments fills obs_tab.

    The expected table covers positions 10-25: most positions in 12-24
    carry two 'A' observations, with the exceptions listed explicitly.
    """
    observed = observe.ObservedBases(self.alns, 20, 10)
    # Start from the common case, then overwrite or add the positions
    # that differ from it.
    expected = {pos: {'A': 2} for pos in range(12, 25)}
    expected.update({
        10: {'A': 1},
        11: {'A': 1},
        15: {'G': 1, 'T': 1},
        17: {'A': 1, '-': 1},
        18: {'A': 1, '-': 1},
        20: {'G': 1, 'T': 1},
        25: {'G': 1, 'T': 1},
    })
    self.assertEqual(observed.obs_tab, expected)
def setUp(self):
    """
    Build the fixtures shared by the tests of this case.

    Creates an argparse namespace with the option values the code under
    test reads, a small phylotree over an all-'A' reference of length 9,
    a contributor list, a hand-filled observation table, and arrays
    standing in for EM results (proportions and a mixture matrix).
    """
    args = argparse.ArgumentParser().parse_args([])
    args.verbose = False
    args.min_reads = 1
    args.min_var_reads = 1
    args.frac_var_reads = 0.02
    args.var_fraction = 0.5
    args.var_count = None
    args.var_check = False
    args.contributors = None
    self.args = args

    tree_lines = [
        'I, A1G ,,',
        ',H, A3T A5T ,,',
        ',,F, A6T ,,',
        ',,,B, A8T ,,',
        ',,,C, T5A ,,',
        ',,G, A7T ,,',
        ',,,D, A9T ,,',
        ',,,E, A4T ,,',
        ',A, A2T A4T ,,',
    ]
    self.ref = "AAAAAAAAA"
    self.phy = phylotree.Phylotree(tree_lines, refseq=self.ref)
    self.cons = [['A', 0.4], ['E', 0.3]]

    self.obs = observe.ObservedBases()
    # (position, base, count) triples, applied in this exact order so the
    # table's insertion order is unchanged.
    seeded = (
        (1, 'T', 1),
        (3, 'T', 2),
        (0, 'G', 1),
        (6, 'T', 1),
        (2, 'T', 1),
        (4, 'T', 1),
    )
    for pos, base, count in seeded:
        self.obs.obs_tab[pos][base] = count

    self.wts = numpy.array([1, 1, 1])
    self.haps = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I']
    self.props = numpy.array(
        [0.40, 0.01, 0.01, 0.01, 0.3, 0.01, 0.01, 0.01, 0.01])
    self.mix_mat = numpy.array([
        [0.91, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01],
        [0.91, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01],
        [0.01, 0.01, 0.01, 0.01, 0.91, 0.01, 0.01, 0.01, 0.01],
    ])
    self.em_results = (self.props, self.mix_mat)
def call_consensus(refseq, alns, min_cov, args, strict=True):
    """
    Build a consensus sequence from a list of AlignedSegments.

    One character is produced for each position of refseq, based on the
    base observations collected from alns.

    Args:
        refseq: The reference sequence to which the fragments were
                aligned; only its length is used here.
        alns: A list of pysam AlignedSegments.
        min_cov: minimum coverage required to call a base.
        args: The argument values from mixemt's argparse results;
              min_mq and min_bq are read from it.
        strict: When true, a base is only called if every observation at
                the position agrees; otherwise the most common base is
                called.
    Returns:
        A string representing the consensus of the alignments in alns,
        or the empty string when alns is empty.
    """
    def consensus_base(base_counts):
        """
        Pick the consensus character for one position's Counter.

        Returns 'N' when coverage is below min_cov, or when strict is
        set and the observations disagree; otherwise returns the most
        common base.
        """
        # Missing observations ('N') never count towards coverage.
        base_counts['N'] = 0
        depth = sum(base_counts.values())
        if depth < min_cov:
            return 'N'
        top_base, top_count = base_counts.most_common(1)[0]
        unanimous = top_count == depth
        return top_base if (unanimous or not strict) else 'N'

    # Sometimes alns can be empty; there is nothing to call then.
    if not alns:
        return ""

    obs_tab = observe.ObservedBases(alns, args.min_mq, args.min_bq)
    return str(''.join(consensus_base(obs_tab.obs_at(pos))
                       for pos in range(len(refseq))))
def test_observed_bases_init_empty(self):
    """An ObservedBases built without arguments has an empty obs_tab."""
    self.assertEqual(observe.ObservedBases().obs_tab, {})
def test_observed_bases_obs_at_bad_base(self):
    """obs_at raises ValueError when asked about the base 'Q'."""
    observed = observe.ObservedBases(self.alns, 20, 10)
    self.assertRaises(ValueError, observed.obs_at, 15, 'Q')
def test_observed_bases_obs_at_basic_no_base_stranded(self):
    """
    obs_at(pos, stranded=True) returns the strand-aware counts for pos.

    With the second fixture alignment flagged as reverse, the expected
    tables keep its observations under separate keys ('a', '+').
    """
    self.alns[1].is_reverse = True
    observed = observe.ObservedBases(self.alns, 20, 10)
    cases = (
        (10, {'A': 1}),
        (14, {'A': 1, 'a': 1}),
        (17, {'A': 1, '+': 1}),
    )
    for pos, expected in cases:
        self.assertEqual(observed.obs_at(pos, stranded=True), expected)
def test_observed_bases_obs_at_basic_no_base(self):
    """
    obs_at(pos) without a base returns the combined counts for pos.

    The second fixture alignment is flagged as reverse, yet the expected
    tables only use upper-case bases and '-'.
    """
    self.alns[1].is_reverse = True
    observed = observe.ObservedBases(self.alns, 20, 10)
    cases = (
        (10, {'A': 1}),
        (14, {'A': 2}),
        (17, {'A': 1, '-': 1}),
    )
    for pos, expected in cases:
        self.assertEqual(observed.obs_at(pos), expected)
def test_observed_bases_init_reverse_strand(self):
    """
    Observations from a reverse alignment are stored under distinct keys.

    After flagging the second fixture alignment as reverse, the expected
    table holds its bases in lower case and its '-' observations as '+'.
    """
    self.alns[1].is_reverse = True
    observed = observe.ObservedBases(self.alns, 20, 10)
    expected = {
        10: {'A': 1}, 11: {'A': 1},
        12: {'A': 1, 'a': 1}, 13: {'A': 1, 'a': 1}, 14: {'A': 1, 'a': 1},
        15: {'g': 1, 'T': 1},
        16: {'A': 1, 'a': 1},
        17: {'A': 1, '+': 1}, 18: {'A': 1, '+': 1},
        19: {'A': 1, 'a': 1},
        20: {'g': 1, 'T': 1},
        21: {'A': 1, 'a': 1}, 22: {'A': 1, 'a': 1},
        23: {'A': 1, 'a': 1}, 24: {'A': 1, 'a': 1},
        25: {'g': 1, 'T': 1},
    }
    self.assertEqual(observed.obs_tab, expected)