def test_002_test_filtered_yield(self):
    """Check how many samples are yielded under various region filters."""
    index = datastore.DataIndex([self.file.name])
    # Each entry pairs a `regions` argument with the expected sample count.
    cases = [
        (None, 1),
        ([Region('contig1', 0, 5)], 1),
        ([Region('contig1', 5, 10)], 0),
        ([Region('contig2', None, None)], 0),
    ]
    for regions, n_expected in cases:
        n_yielded = sum(
            1 for _ in index.yield_from_feature_files(regions=regions))
        self.assertEqual(n_yielded, n_expected)
def test_050_read_groups(self):
    """Check that `pileup_counts` honours the `read_group` argument.

    The simple calls are duplicated under two read groups. Without a
    filter the counts should be doubled, with either read group selected
    they should match the single-copy expectation, and an unknown read
    group should give an empty result.
    """
    bam = tempfile.NamedTemporaryFile(suffix='.bam').name
    region = Region('ref', start=0, end=8)
    read_groups = ('first', 'second')

    reads = []
    for rg in read_groups:
        rg_reads = copy.deepcopy(simple_data['calls'])
        for read in rg_reads:
            read['tags']['RG'] = rg
            # keep query names unique across the two copies
            read['query_name'] += '_{}'.format(rg)
        reads.extend(rg_reads)
    create_simple_bam(bam, reads)

    # use everything
    counts, positions = medaka.features.pileup_counts(region, bam)[0]
    # np.testing reports the mismatching elements on failure, which a
    # bare assertTrue(np.array_equal(...)) does not.
    np.testing.assert_equal(counts, 2 * self.expected_counts)
    np.testing.assert_equal(positions, self.expected_positions)

    # use one or other
    for rg in read_groups:
        counts, positions = medaka.features.pileup_counts(
            region, bam, read_group=rg)[0]
        np.testing.assert_equal(counts, self.expected_counts)
        np.testing.assert_equal(positions, self.expected_positions)

    # use a missing one
    result = medaka.features.pileup_counts(
        region, bam, read_group='nonsense')
    self.assertEqual(len(result), 0)
def setUpClass(cls):
    """Write the simple calls to a BAM and store its path and a region."""
    # Only the generated name is kept; the BAM itself is written by
    # create_simple_bam at that path.
    cls.bam = tempfile.NamedTemporaryFile(suffix='.bam').name
    create_simple_bam(cls.bam, simple_data['calls'])
    cls.region = Region('ref', start=0, end=8)
def test_005_unchunked(self):
    """A region_split of 1000 with no overlap should give one chunk."""
    region = Region('ref', start=0, end=9)
    chunks = medaka.features.get_trimmed_reads(
        region, self.bam, region_split=1000, chunk_overlap=0)
    n_chunks = sum(1 for _ in chunks)
    self.assertEqual(n_chunks, 1)
def setUpClass(cls):
    """Encode an RLE BAM with known alignments and keep the first sample."""
    encoder = medaka.features.HardRLEFeatureEncoder(normalise=None)

    # Create a bam file where we know the alignments
    rle_bam_path = tempfile.NamedTemporaryFile(suffix='.bam').name
    create_rle_bam(rle_bam_path)

    samples = encoder.bam_to_sample(rle_bam_path, Region('ref', 0, 8))
    cls.sample = samples[0]
    cls.num_qstrat = encoder.num_qstrat
def _quarantine_sample(self, sample):
    """Record a sample's region and size in the quarantine list."""
    # Note: only the major coordinates are used, which assumes the pileup
    # was not split on a minor position. This should be the case:
    # chunking on minor positions only occurs for larger regions.
    first_major, _ = sample.first_pos
    last_major, _ = sample.last_pos
    # The region end is exclusive, hence the +1 on the last position.
    region = Region(sample.ref_name, first_major, last_major + 1)
    self._quarantined.append((region, sample.size))
def test_labels_trimmed_back(self):
    """Partially overlapping truth alignments are made non-overlapping."""
    # We should have two alignments which partially overlap:
    #     (318288, 417741)
    #     (417732, 422799)
    # The first is >2 x longer than the second, so the second is trimmed
    # back; check the resulting spans do not overlap.
    region = Region(__ref_name__, start=318288, end=422799)
    alignments = TruthAlignment.bam_to_alignments(__truth_bam__, region)

    longer = alignments[0][0]
    self.assertEqual(longer.start, 318288)
    self.assertEqual(longer.end, 417741)

    shorter = alignments[1][0]
    self.assertEqual(shorter.start, 417741)
    self.assertEqual(shorter.end, 422799)
def setUpClass(cls):
    """Compute pileup counts with one and two quality strata."""
    # Keep the file object referenced for the duration of this method.
    temp_file = tempfile.NamedTemporaryFile(suffix='.bam')
    bam_fname = temp_file.name
    region = Region('ref', 0, 100)
    create_simple_bam(bam_fname, simple_data['calls'])

    def first_pileup(num_qstrat):
        # First (counts, positions) entry for the given number of strata.
        return medaka.features.pileup_counts(
            region, bam_fname,
            num_qstrat=num_qstrat, weibull_summation=True)[0]

    counts_one, positions_one = first_pileup(1)
    counts_two, positions_two = first_pileup(2)

    cls.counts_strat1, cls.positions_strat1 = counts_one, positions_one
    cls.counts_strat2, cls.positions_strat2 = counts_two, positions_two
def test_031_bams_to_training_samples_regression(self):
    """Regression check on feature and label shapes of a training sample."""
    region = Region('utg000001l', 149744, 318288)
    encoder = medaka.features.CountsFeatureEncoder(normalise='total')
    label_scheme = medaka.labels.HaploidLabelScheme()

    samples = encoder.bams_to_training_samples(
        __reads_truth__, __reads_bam__, region, label_scheme)
    first = samples[0]

    n_positions = 177981
    self.assertEqual((n_positions, 10), first.features.shape)
    self.assertEqual((n_positions, ), first.labels.shape)
def test_005_check_fwd_rev(self):
    """Check features produced with `normalise='fwd_rev'`.

    Every listed (row, column) cell must hold the given value and all
    other cells must be zero.
    """
    encoder = medaka.features.HardRLEFeatureEncoder(normalise='fwd_rev')
    sample = encoder.bam_to_sample(self.bam_fname, Region('ref', 0, 11))[0]

    cells_at_half = [
        (0, 4), (0, 14), (3, 1), (3, 9), (3, 43), (3, 47),
        (4, 0), (7, 7), (7, 17), (8, 6), (8, 16)]
    cells_at_one = [
        (0, 10), (1, 1), (1, 5), (2, 30), (2, 34), (5, 2),
        (5, 6), (6, 0), (6, 4), (7, 13), (8, 2)]

    expected = np.zeros_like(sample.features)
    for value, cells in ((0.5, cells_at_half), (1.0, cells_at_one)):
        rows, cols = zip(*cells)
        expected[list(rows), list(cols)] = value

    np.testing.assert_equal(sample.features, expected)
def test_004_check_specifics(self):
    """Check which column is non-zero in each row for a single read."""
    temp_file = tempfile.NamedTemporaryFile(suffix='.bam', delete=False)
    bam_fname = temp_file.name
    region = Region('ref', 0, 8)
    # only the first call is written to the BAM
    create_simple_bam(bam_fname, simple_data['calls'][0:1])

    counts, _ = medaka.features.pileup_counts(
        region, bam_fname, num_qstrat=6, weibull_summation=True)[0]

    # should have only one non-zero per row
    non_zeroes = np.nonzero(counts)
    expected_by_axis = (
        np.arange(8),
        # acgtACGTdD...
        # 2A 1C 4A 5T 1G 1A 2T 1G
        np.array([14, 5, 34, 47, 6, 4, 17, 6]),
    )
    for axis, expected in enumerate(expected_by_axis):
        np.testing.assert_equal(non_zeroes[axis], expected)
def setUpClass(cls):
    """Write the simple BAM and store the expected counts and positions."""
    cls.bam = tempfile.NamedTemporaryFile(suffix='.bam').name
    create_simple_bam(cls.bam, simple_data['calls'])
    cls.region = Region('ref', start=0, end=8)

    # One row of counts per entry in `expected_positions` below.
    count_rows = [
        [2, 0, 0, 0, 2, 0, 0, 0, 0, 0],
        [0, 2, 0, 0, 0, 2, 0, 0, 0, 0],
        [2, 0, 0, 0, 2, 0, 0, 0, 0, 0],
        [0, 1, 0, 1, 0, 0, 0, 1, 0, 1],
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 2, 0, 0, 0, 2, 0, 0, 0],
        [2, 0, 0, 0, 2, 0, 0, 0, 0, 0],
        [0, 0, 0, 2, 0, 0, 0, 2, 0, 0],
        [0, 0, 2, 0, 0, 0, 2, 0, 0, 0],
    ]
    cls.expected_counts = np.array(count_rows, dtype=np.uint64)

    # (major, minor) pairs; (3, 1) is the only entry with minor != 0.
    major_minor = [
        (0, 0), (1, 0), (2, 0), (3, 0), (3, 1),
        (4, 0), (5, 0), (6, 0), (7, 0)]
    cls.expected_positions = np.array(
        major_minor, dtype=[('major', '<i8'), ('minor', '<i8')])
def test_030_bams_to_training_samples_simple(self):
    """Check labels, positions and features of a small training sample."""
    reads_bam = tempfile.NamedTemporaryFile(suffix='.bam').name
    truth_bam = tempfile.NamedTemporaryFile(suffix='.bam').name

    # we had a bug caused by missing qualities and bad indexing...
    calls = copy.deepcopy(simple_data['calls'])
    calls[0]['quality'] = None
    create_simple_bam(reads_bam, calls)
    create_simple_bam(truth_bam, [simple_data['truth']])

    encoder = medaka.features.CountsFeatureEncoder(normalise='total')
    label_scheme = medaka.labels.HaploidLabelScheme()
    region = Region('ref', 0, 100)
    result = encoder.bams_to_training_samples(
        truth_bam, reads_bam, region, label_scheme, min_length=0)[0]

    expected_features = np.array(
        [[0.5, 0., 0., 0., 0.5, 0., 0., 0., 0., 0.],
         [0., 0.5, 0., 0., 0., 0.5, 0., 0., 0., 0.],
         [0.5, 0., 0., 0., 0.5, 0., 0., 0., 0., 0.],
         [0., 0.25, 0., 0.25, 0., 0., 0., 0.25, 0., 0.25],
         [0.25, 0., 0., 0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 0.5, 0., 0., 0., 0.5, 0., 0., 0.],
         [0.5, 0., 0., 0., 0.5, 0., 0., 0., 0., 0.],
         [0., 0., 0., 0.5, 0., 0., 0., 0.5, 0., 0.],
         [0., 0., 0.5, 0., 0., 0., 0.5, 0., 0., 0.]],
        dtype='float32')
    # the two insertions with respect to the draft are dropped
    #                           A  C  A  T  A  G  A  T  C
    expected_labels = np.array([1, 2, 1, 4, 1, 3, 1, 4, 3])
    expected_positions = np.array(
        [(0, 0), (1, 0), (2, 0), (3, 0), (3, 1),
         (4, 0), (5, 0), (6, 0), (7, 0)],
        dtype=[('major', '<i8'), ('minor', '<i8')])

    expected = Sample(
        ref_name='ref',
        features=expected_features,
        labels=expected_labels,
        ref_seq=None,
        positions=expected_positions,
        label_probs=None)

    for field in ('labels', 'positions', 'features'):
        np.testing.assert_equal(
            getattr(result, field), getattr(expected, field))
def test_004_trim_mid(self):
    """Reads for [1, 7) equal the stored reads without both end elements."""
    fetched = self.get_reads(Region('ref', start=1, end=7))
    expected = [read[1:-1] for read in self.reads]
    self.assertEqual(fetched, expected)
def bams_to_training_samples(self, truth_bam, bam, region, reference=None,
                             read_fraction=None):
    """Prepare training data chunks.

    :param truth_bam: .bam file of truth aligned to ref to generate labels.
    :param bam: input .bam file.
    :param region: `Region` obj. the reference will be parsed.
    :param reference: reference `.fasta`, should correspond to `bam`.
    :param read_fraction: passed through unchanged to `bam_to_sample`.

    :returns: tuple of `Sample` objects with their `labels` field filled in.

    .. note:: Chunks might be missing if `truth_bam` is provided and
        regions with multiple mappings were encountered.

    """
    ref_rle = self.process_ref_seq(region.ref_name, reference)

    # filter truth alignments to restrict ourselves to regions of the ref
    # where the truth is unambiguous
    alignments = TruthAlignment.bam_to_alignments(
        truth_bam, region.ref_name, start=region.start, end=region.end)
    filtered_alignments = TruthAlignment.filter_alignments(
        alignments, start=region.start, end=region.end)
    if len(filtered_alignments) == 0:
        self.logger.info(
            "Filtering removed all alignments of truth to ref from {}.".
            format(region))

    # Depends only on encoder settings, so compute it once.
    mock_compr = self.max_hp_len > 1 and not self.is_compressed

    samples = []
    for aln in filtered_alignments:
        truth_pos, truth_labels = aln.get_positions_and_labels(
            ref_compr_rle=ref_rle, mock_compr=mock_compr,
            is_compressed=self.is_compressed, rle_dtype=True)
        aln_samples = self.bam_to_sample(
            bam, Region(region.ref_name, aln.start, aln.end), ref_rle,
            read_fraction=read_fraction)

        # Pileup positions absent from the truth are labelled as gaps.
        # Structured label dtypes get a (gap, 1) tuple, plain ones the
        # bare gap code.
        if len(truth_labels.dtype) > 0:
            pad = (encoding[_gap_], 1)
        else:
            pad = encoding[_gap_]
        # The lookup depends only on the alignment, so build it once per
        # alignment rather than once per sample.
        position_to_label = defaultdict(
            itertools.repeat(pad).__next__,
            zip(map(tuple, truth_pos), truth_labels))

        for sample in aln_samples:
            # Create labels according to positions in pileup
            padded_labels = np.fromiter(
                (position_to_label[tuple(p)] for p in sample.positions),
                dtype=truth_labels.dtype, count=len(sample.positions))
            fields = sample._asdict()
            fields['labels'] = padded_labels
            samples.append(Sample(**fields))
    return tuple(samples)
ref_start=None, ref_end=None, ref_len=None, query_seq='ACATGCAAGACACGAT', ref_seq='AAAGGCAAGACACGAT'): self.reference_start = ref_start self.reference_end = ref_end self.reference_length = ref_len self.query_sequence = query_seq self.reference_sequence = ref_seq def get_reference_sequence(self): return self.reference_sequence full_region = Region('Mock', 0, float('inf')) class TruthAlignmentTest(unittest.TestCase): def test_case1(self): # case 1: longer < 2 x len shorter and >= 50% of shorter overlaps longer both should be removed starts_ends = [(2000, 2999), (2500, 3000)] expected = [] alignments = [ TruthAlignment(MockAlignment(start, end, end - start)) for start, end in starts_ends ] filtered = [ (f.start, f.end) for f in TruthAlignment._filter_alignments(alignments, full_region)
import numpy as np import os import unittest from medaka.features import FeatureEncoder, pileup_counts from medaka.common import Region __reads_bam__ = os.path.join(os.path.dirname(__file__), 'data', 'test_reads.bam') __two_type_bam__ = os.path.join(os.path.dirname(__file__), 'data', 'test_two_type.bam') __gapped_bam__ = os.path.join(os.path.dirname(__file__), 'data', 'reads_gapped.bam') __region__ = Region('Consensus_Consensus_Consensus_Consensus_utg000001l', start=50000, end=100000) __region_start__ = Region('Consensus_Consensus_Consensus_Consensus_utg000001l', start=0, end=200) __kwargs__ = { 'consensus_as_ref': False, 'is_compressed': False, 'log_min': None, 'max_hp_len': 1, 'normalise': 'total', 'ref_mode': None, 'with_depth': False } class CountsTest(unittest.TestCase):
import numpy as np
import pysam

from .mock_data import simple_data, create_simple_bam
import libmedaka
import medaka.features
from medaka.common import Region, Sample
import medaka.labels

# Test data shipped alongside this module.
__reads_bam__ = os.path.join(os.path.dirname(__file__), 'data', 'test_reads.bam')
__reads_truth__ = os.path.join(os.path.dirname(__file__), 'data', 'truth_to_ref.bam')
__gapped_bam__ = os.path.join(os.path.dirname(__file__), 'data', 'reads_gapped.bam')
__region__ = Region('utg000001l', start=50000, end=100000)
__region_start__ = Region('utg000001l', start=0, end=200)


class CountsTest(unittest.TestCase):
    """Tests of `CountsFeatureEncoder` against the bundled reads BAM."""

    @classmethod
    def setUpClass(cls):
        """Store the number of pileup positions expected for `__region__`."""
        cls.expected_width = 86294

    def test_001_basic_counting(self):
        """One sample is returned and it has the expected number of positions."""
        kwargs = {'normalise': None}
        encoder = medaka.features.CountsFeatureEncoder(**kwargs)
        sample = encoder.bam_to_sample(__reads_bam__, __region__)
        self.assertEqual(len(sample), 1)
        sample = sample[0]
        # assertEqual rather than a bare assert: it is not stripped under
        # `python -O` and reports both values on failure.
        self.assertEqual(
            tuple(sample.positions.shape), (self.expected_width, ))
def test_003_trim_end(self):
    """Reads for [6, 8) equal the last two elements of each stored read."""
    fetched = self.get_reads(Region('ref', start=6, end=8))
    expected = [read[-2:] for read in self.reads]
    self.assertEqual(fetched, expected)
def test_002_trim_start(self):
    """Reads for [0, 2) equal the first two elements of each stored read."""
    fetched = self.get_reads(Region('ref', start=0, end=2))
    expected = [read[0:2] for read in self.reads]
    self.assertEqual(fetched, expected)
def test_001_full_region(self):
    """Reads for a region of [0, 100000) equal the stored reads unchanged."""
    whole_ref = Region('ref', start=0, end=100000)
    fetched = self.get_reads(whole_ref)
    self.assertEqual(fetched, self.reads)