Example No. 1
    def test_002_test_filtered_yield(self):
        index = datastore.DataIndex([self.file.name])
        region_specs = [
            (None, 1),
            ([Region('contig1', 0, 5)], 1),
            ([Region('contig1', 5, 10)], 0),
            ([Region('contig2', None, None)], 0),
        ]
        for regs, exp_len in region_specs:
            samples = list(index.yield_from_feature_files(regions=regs))
            self.assertEqual(len(samples), exp_len)
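
A minimal sketch of the region filtering above, assuming `Region` coordinates are half-open (end-exclusive) and that the single stored sample lies within contig1:0-5; the helper `overlaps` below is illustrative only, not part of medaka.

def overlaps(a_start, a_end, b_start, b_end):
    # Half-open interval overlap: [a_start, a_end) intersects [b_start, b_end).
    return a_start < b_end and b_start < a_end

print(overlaps(0, 5, 0, 5))    # True  -> contig1:0-5 yields the stored sample
print(overlaps(5, 10, 0, 5))   # False -> contig1:5-10 yields nothing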
Example No. 2
    def test_050_read_groups(self):
        bam = tempfile.NamedTemporaryFile(suffix='.bam').name
        reads = list()
        region = Region('ref', start=0, end=8)
        for rg in ('first', 'second'):
            rg_reads = copy.deepcopy(simple_data['calls'])
            for read in rg_reads:
                read['tags']['RG'] = rg
                read['query_name'] += '_{}'.format(rg)
            reads.extend(rg_reads)
        create_simple_bam(bam, reads)

        # use everything
        counts, positions = medaka.features.pileup_counts(region, bam)[0]
        self.assertTrue(np.array_equal(counts, 2 * self.expected_counts))
        self.assertTrue(np.array_equal(positions, self.expected_positions))

        # use one or other
        for rg in ('first', 'second'):
            counts, positions = medaka.features.pileup_counts(region,
                                                              bam,
                                                              read_group=rg)[0]
            self.assertTrue(np.array_equal(counts, self.expected_counts))
            self.assertTrue(np.array_equal(positions, self.expected_positions))

        # use a missing one
        result = medaka.features.pileup_counts(region,
                                               bam,
                                               read_group='nonsense')
        self.assertTrue(len(result) == 0)
Example No. 3
    @classmethod
    def setUpClass(cls):
        bam_fname = tempfile.NamedTemporaryFile(suffix='.bam').name
        create_simple_bam(bam_fname, simple_data['calls'])
        cls.region = Region('ref', start=0, end=8)
        cls.bam = bam_fname
Example No. 4
    def test_005_unchunked(self):
        region = Region('ref', start=0, end=9)
        n_chunks = len(list(
            medaka.features.get_trimmed_reads(
                region, self.bam, region_split=1000, chunk_overlap=0)))
        self.assertEqual(n_chunks, 1)
Example No. 5
    @classmethod
    def setUpClass(cls):
        kwargs = {'normalise': None}
        encoder = medaka.features.HardRLEFeatureEncoder(**kwargs)

        # Create a bam file where we know the alignments
        RLE_bam = tempfile.NamedTemporaryFile(suffix='.bam').name
        create_rle_bam(RLE_bam)
        sample = encoder.bam_to_sample(RLE_bam, Region('ref', 0, 8))
        cls.sample = sample[0]
        cls.num_qstrat = encoder.num_qstrat
Example No. 6
    def _quarantine_sample(self, sample):
        """Add sample name and pileup width to a list."""
        # Note: the below assumes we haven't split a pileup on minor positions.
        # This should be the case: chunking on minor positions only occurs for
        # larger regions.
        start, _ = sample.first_pos
        end, _ = sample.last_pos
        end += 1  # end exclusive
        self._quarantined.append(
            (Region(sample.ref_name, start, end), sample.size))
Example No. 7
    def test_labels_trimmed_back(self):
        # We should have two alignments which partially overlap:
        #   (318288, 417741)
        #   (417732, 422799)
        # The first is more than 2x longer than the second, so the second is
        # trimmed back; check the resulting positions and labels do not overlap.
        alignments = TruthAlignment.bam_to_alignments(
            __truth_bam__, Region(__ref_name__, start=318288, end=422799))
        self.assertEqual(alignments[0][0].start, 318288)
        self.assertEqual(alignments[0][0].end, 417741)
        self.assertEqual(alignments[1][0].start, 417741)
        self.assertEqual(alignments[1][0].end, 422799)
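
A small sketch of the trimming rule described in the comment above, written as a standalone helper under that stated assumption rather than as medaka's actual implementation: when the longer truth alignment is more than 2x the shorter, the shorter is trimmed so the two no longer overlap.

def trim_back(longer, shorter):
    # The trimmed alignment starts where the longer one ends (if they overlap).
    l_start, l_end = longer
    s_start, s_end = shorter
    return (max(s_start, l_end), s_end)

print(trim_back((318288, 417741), (417732, 422799)))  # (417741, 422799)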
Example No. 8
    @classmethod
    def setUpClass(cls):
        temp_file = tempfile.NamedTemporaryFile(suffix='.bam')
        bam_fname = temp_file.name
        region = Region('ref', 0, 100)
        create_simple_bam(bam_fname, simple_data['calls'])
        (counts_strat1, positions_strat1) = medaka.features.pileup_counts(
            region, bam_fname, num_qstrat=1, weibull_summation=True)[0]
        (counts_strat2, positions_strat2) = medaka.features.pileup_counts(
            region, bam_fname, num_qstrat=2, weibull_summation=True)[0]

        cls.counts_strat1 = counts_strat1
        cls.positions_strat1 = positions_strat1
        cls.counts_strat2 = counts_strat2
        cls.positions_strat2 = positions_strat2
Example No. 9
    def test_031_bams_to_training_samples_regression(self):
        encoder = medaka.features.CountsFeatureEncoder(normalise='total')
        label_scheme = medaka.labels.HaploidLabelScheme()
        region = Region('utg000001l', 149744, 318288)
        result = encoder.bams_to_training_samples(__reads_truth__,
                                                  __reads_bam__, region,
                                                  label_scheme)[0]

        expected_feature_shape = (177981, 10)
        got_feature_shape = result.features.shape
        self.assertEqual(expected_feature_shape, got_feature_shape)

        expected_label_shape = (177981, )
        got_label_shape = result.labels.shape
        self.assertEqual(expected_label_shape, got_label_shape)
Example No. 10
    def test_005_check_fwd_rev(self):
        """Split normalisation between fwd and rev reads."""
        encoder = medaka.features.HardRLEFeatureEncoder(normalise='fwd_rev')
        region = Region('ref', 0, 11)
        sample = encoder.bam_to_sample(self.bam_fname, region)[0]
        values_for_positions = {
            0.5: [(0, 4), (0, 14), (3, 1), (3, 9), (3, 43), (3, 47), (4, 0),
                  (7, 7), (7, 17), (8, 6), (8, 16)],
            1.0: [(0, 10), (1, 1), (1, 5), (2, 30), (2, 34), (5, 2), (5, 6),
                  (6, 0), (6, 4), (7, 13), (8, 2)],
        }
        expected = np.zeros_like(sample.features)
        for value, positions in values_for_positions.items():
            for pos in positions:
                expected[pos[0], pos[1]] = value
        np.testing.assert_equal(sample.features, expected)
Example No. 11
    def test_004_check_specifics(self):
        temp_file = tempfile.NamedTemporaryFile(suffix='.bam', delete=False)
        bam_fname = temp_file.name
        region = Region('ref', 0, 8)
        create_simple_bam(
            bam_fname, simple_data['calls'][0:1])
        counts, _ = medaka.features.pileup_counts(
            region, bam_fname, num_qstrat=6, weibull_summation=True)[0]

        # should have only one non-zero per row
        non_zeroes = np.nonzero(counts)
        np.testing.assert_equal(non_zeroes[0], np.arange(0, 8))
        np.testing.assert_equal(
            non_zeroes[1],
            # acgtACGTdD...
            #         2A  1C  4A  5T  1G  1A  2T  1G
            np.array([14,  5, 34, 47,  6,  4, 17,  6]))
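
A tiny sketch reproducing the column indices quoted in the comment above, under the assumption (inferred from those indices, not from medaka's documentation) that each run-length stratum occupies 10 feature columns ordered 'acgtACGTdD'.

def rle_column(base, run_length, order='acgtACGTdD', slots_per_stratum=10):
    # Column index for a base call of a given run length under the assumed layout.
    return (run_length - 1) * slots_per_stratum + order.index(base)

calls = [('A', 2), ('C', 1), ('A', 4), ('T', 5), ('G', 1), ('A', 1), ('T', 2), ('G', 1)]
print([rle_column(b, n) for b, n in calls])  # [14, 5, 34, 47, 6, 4, 17, 6]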
Example No. 12
    @classmethod
    def setUpClass(cls):
        bam_fname = tempfile.NamedTemporaryFile(suffix='.bam').name
        create_simple_bam(bam_fname, simple_data['calls'])
        cls.region = Region('ref', start=0, end=8)
        cls.bam = bam_fname
        cls.expected_counts = np.array(
            [[2, 0, 0, 0, 2, 0, 0, 0, 0, 0],
             [0, 2, 0, 0, 0, 2, 0, 0, 0, 0],
             [2, 0, 0, 0, 2, 0, 0, 0, 0, 0],
             [0, 1, 0, 1, 0, 0, 0, 1, 0, 1],
             [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
             [0, 0, 2, 0, 0, 0, 2, 0, 0, 0],
             [2, 0, 0, 0, 2, 0, 0, 0, 0, 0],
             [0, 0, 0, 2, 0, 0, 0, 2, 0, 0],
             [0, 0, 2, 0, 0, 0, 2, 0, 0, 0]],
            dtype=np.uint64)
        cls.expected_positions = np.array(
            [(0, 0), (1, 0), (2, 0), (3, 0), (3, 1),
             (4, 0), (5, 0), (6, 0), (7, 0)],
            dtype=[('major', '<i8'), ('minor', '<i8')])
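
A short illustrative snippet for the structured positions array above (the reading of 'minor' as an insertion column is an assumption, consistent with the insertion comments elsewhere in these examples): 'major' indexes a reference position and 'minor' counts inserted pileup columns after it, so (3, 1) is the first insertion column following reference position 3.

import numpy as np

positions = np.array([(3, 0), (3, 1), (4, 0)],
                     dtype=[('major', '<i8'), ('minor', '<i8')])
print(positions['major'])  # [3 3 4]
print(positions['minor'])  # [0 1 0]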
Example No. 13
    def test_030_bams_to_training_samples_simple(self):
        reads_bam = tempfile.NamedTemporaryFile(suffix='.bam').name
        truth_bam = tempfile.NamedTemporaryFile(suffix='.bam').name

        # we had a bug caused by missing qualities and bad indexing...
        data = copy.deepcopy(simple_data['calls'])
        data[0]['quality'] = None

        create_simple_bam(reads_bam, data)
        create_simple_bam(truth_bam, [simple_data['truth']])
        encoder = medaka.features.CountsFeatureEncoder(normalise='total')
        label_scheme = medaka.labels.HaploidLabelScheme()
        region = Region('ref', 0, 100)
        result = encoder.bams_to_training_samples(truth_bam,
                                                  reads_bam,
                                                  region,
                                                  label_scheme,
                                                  min_length=0)[0]

        expected = Sample(
            ref_name='ref',
            features=np.array(
                [[0.5, 0., 0., 0., 0.5, 0., 0., 0., 0., 0.],
                 [0., 0.5, 0., 0., 0., 0.5, 0., 0., 0., 0.],
                 [0.5, 0., 0., 0., 0.5, 0., 0., 0., 0., 0.],
                 [0., 0.25, 0., 0.25, 0., 0., 0., 0.25, 0., 0.25],
                 [0.25, 0., 0., 0., 0., 0., 0., 0., 0., 0.],
                 [0., 0., 0.5, 0., 0., 0., 0.5, 0., 0., 0.],
                 [0.5, 0., 0., 0., 0.5, 0., 0., 0., 0., 0.],
                 [0., 0., 0., 0.5, 0., 0., 0., 0.5, 0., 0.],
                 [0., 0., 0.5, 0., 0., 0., 0.5, 0., 0., 0.]],
                dtype='float32'),
            # the two insertions with respect to the draft are dropped
            labels=np.array([1, 2, 1, 4, 1, 3, 1, 4, 3]),  # A C A T A G A T C
            ref_seq=None,
            positions=np.array([(0, 0), (1, 0), (2, 0), (3, 0), (3, 1), (4, 0),
                                (5, 0), (6, 0), (7, 0)],
                               dtype=[('major', '<i8'), ('minor', '<i8')]),
            label_probs=None)

        np.testing.assert_equal(result.labels, expected.labels)
        np.testing.assert_equal(result.positions, expected.positions)
        np.testing.assert_equal(result.features, expected.features)
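
The fractional feature values above are consistent with total-depth normalisation. A short sketch, assuming normalise='total' divides each raw count by the read depth spanning the corresponding major position (4 reads here, with insertion columns sharing their parent position's depth):

import numpy as np

# Raw counts for three pileup rows, taken from expected_counts in the
# setUpClass example above.
rows = np.array([
    [2, 0, 0, 0, 2, 0, 0, 0, 0, 0],   # major position 0
    [0, 1, 0, 1, 0, 0, 0, 1, 0, 1],   # major position 3
    [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],   # insertion column (3, 1)
], dtype=float)
depth = 4.0
print(rows / depth)   # rows of 0.5 / 0.25 matching the expected features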
Example No. 14
    def test_004_trim_mid(self):
        region = Region('ref', start=1, end=7)
        reads = self.get_reads(region)
        orig = [x[1:-1] for x in self.reads]
        self.assertEqual(reads, orig)
Example No. 15
    def bams_to_training_samples(self,
                                 truth_bam,
                                 bam,
                                 region,
                                 reference=None,
                                 read_fraction=None):
        """Prepare training data chunks.

        :param truth_bam: .bam file of truth aligned to ref to generate labels.
        :param bam: input .bam file.
        :param region: `Region` obj; the reference will be parsed over this
            region.
        :param reference: reference `.fasta`, should correspond to `bam`.

        :returns: tuple of `Sample` objects.

        .. note:: Chunks might be missing if `truth_bam` is provided and
            regions with multiple mappings were encountered.

        """
        ref_rle = self.process_ref_seq(region.ref_name, reference)

        # filter truth alignments to restrict ourselves to regions of the
        # ref where the truth is unambiguous
        alignments = TruthAlignment.bam_to_alignments(truth_bam,
                                                      region.ref_name,
                                                      start=region.start,
                                                      end=region.end)
        filtered_alignments = TruthAlignment.filter_alignments(
            alignments, start=region.start, end=region.end)
        if len(filtered_alignments) == 0:
            self.logger.info(
                "Filtering removed all alignments of truth to ref from {}.".
                format(region))

        samples = []
        for aln in filtered_alignments:
            mock_compr = self.max_hp_len > 1 and not self.is_compressed
            truth_pos, truth_labels = aln.get_positions_and_labels(
                ref_compr_rle=ref_rle,
                mock_compr=mock_compr,
                is_compressed=self.is_compressed,
                rle_dtype=True)
            aln_samples = self.bam_to_sample(bam,
                                             Region(region.ref_name, aln.start,
                                                    aln.end),
                                             ref_rle,
                                             read_fraction=read_fraction)
            for sample in aln_samples:
                # Create labels according to positions in pileup
                if len(truth_labels.dtype) > 0:
                    pad = (encoding[_gap_], 1)
                else:
                    pad = encoding[_gap_]
                padder = itertools.repeat(pad)
                position_to_label = defaultdict(
                    padder.__next__,
                    zip([tuple(p) for p in truth_pos],
                        [a for a in truth_labels]))
                padded_labels = np.fromiter(
                    (position_to_label[tuple(p)] for p in sample.positions),
                    dtype=truth_labels.dtype,
                    count=len(sample.positions))

                sample = sample._asdict()
                sample['labels'] = padded_labels
                samples.append(Sample(**sample))
        return tuple(samples)
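
The defaultdict/itertools.repeat construction above is the non-obvious part of the label padding: any pileup position without a truth label silently draws the gap label. A minimal standalone sketch with made-up positions and labels (names and values are illustrative only):

import itertools
from collections import defaultdict

import numpy as np

truth_pos = [(0, 0), (1, 0), (3, 0)]
truth_labels = np.array([1, 2, 4], dtype=np.int64)
sample_positions = [(0, 0), (1, 0), (2, 0), (3, 0)]

gap_label = 0  # stands in for encoding[_gap_]
padder = itertools.repeat(gap_label)
# Missing keys call padder.__next__, which always yields the gap label.
position_to_label = defaultdict(padder.__next__, zip(truth_pos, truth_labels))
padded = np.fromiter((position_to_label[p] for p in sample_positions),
                     dtype=truth_labels.dtype, count=len(sample_positions))
print(padded)  # [1 2 0 4] -- position (2, 0) has no truth label, so it is padded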
Example No. 16
class MockAlignment:
    def __init__(self,
                 ref_start=None,
                 ref_end=None,
                 ref_len=None,
                 query_seq='ACATGCAAGACACGAT',
                 ref_seq='AAAGGCAAGACACGAT'):
        self.reference_start = ref_start
        self.reference_end = ref_end
        self.reference_length = ref_len
        self.query_sequence = query_seq
        self.reference_sequence = ref_seq

    def get_reference_sequence(self):
        return self.reference_sequence


full_region = Region('Mock', 0, float('inf'))


class TruthAlignmentTest(unittest.TestCase):
    def test_case1(self):
        # case 1: the longer alignment is less than 2x the length of the shorter
        # and >= 50% of the shorter overlaps the longer; both should be removed
        starts_ends = [(2000, 2999), (2500, 3000)]
        expected = []

        alignments = [
            TruthAlignment(MockAlignment(start, end, end - start))
            for start, end in starts_ends
        ]
        filtered = [
            (f.start, f.end)
            for f in TruthAlignment._filter_alignments(alignments, full_region)
        ]
        self.assertEqual(filtered, expected)
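
A minimal sketch of the case-1 rule from the comment above, written as a standalone helper under that stated assumption rather than as medaka's actual implementation:

def both_removed(a, b):
    # Sort so the longer interval comes first, then apply the case-1 rule.
    (l_start, l_end), (s_start, s_end) = sorted(
        [a, b], key=lambda x: x[1] - x[0], reverse=True)
    longer, shorter = l_end - l_start, s_end - s_start
    overlap = max(0, min(l_end, s_end) - max(l_start, s_start))
    return longer < 2 * shorter and overlap >= 0.5 * shorter

print(both_removed((2000, 2999), (2500, 3000)))  # True -> expected == []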
Example No. 17
import os
import unittest

import numpy as np

from medaka.common import Region
from medaka.features import FeatureEncoder, pileup_counts

__reads_bam__ = os.path.join(os.path.dirname(__file__), 'data',
                             'test_reads.bam')
__two_type_bam__ = os.path.join(os.path.dirname(__file__), 'data',
                                'test_two_type.bam')
__gapped_bam__ = os.path.join(os.path.dirname(__file__), 'data',
                              'reads_gapped.bam')
__region__ = Region('Consensus_Consensus_Consensus_Consensus_utg000001l',
                    start=50000,
                    end=100000)
__region_start__ = Region('Consensus_Consensus_Consensus_Consensus_utg000001l',
                          start=0,
                          end=200)

__kwargs__ = {
    'consensus_as_ref': False,
    'is_compressed': False,
    'log_min': None,
    'max_hp_len': 1,
    'normalise': 'total',
    'ref_mode': None,
    'with_depth': False
}


class CountsTest(unittest.TestCase):
Example No. 18
import os
import unittest

import numpy as np
import pysam

from .mock_data import simple_data, create_simple_bam
import libmedaka
import medaka.features
from medaka.common import Region, Sample
import medaka.labels

__reads_bam__ = os.path.join(os.path.dirname(__file__), 'data',
                             'test_reads.bam')
__reads_truth__ = os.path.join(os.path.dirname(__file__), 'data',
                               'truth_to_ref.bam')
__gapped_bam__ = os.path.join(os.path.dirname(__file__), 'data',
                              'reads_gapped.bam')
__region__ = Region('utg000001l', start=50000, end=100000)
__region_start__ = Region('utg000001l', start=0, end=200)


class CountsTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.expected_width = 86294

    def test_001_basic_counting(self):
        kwargs = {'normalise': None}
        encoder = medaka.features.CountsFeatureEncoder(**kwargs)
        sample = encoder.bam_to_sample(__reads_bam__, __region__)
        self.assertEqual(len(sample), 1)
        sample = sample[0]
        assert tuple(sample.positions.shape) == (self.expected_width, )
Example No. 19
    def test_003_trim_end(self):
        region = Region('ref', start=6, end=8)
        reads = self.get_reads(region)
        orig = [x[-2:] for x in self.reads]
        self.assertEqual(reads, orig)
Example No. 20
    def test_002_trim_start(self):
        region = Region('ref', start=0, end=2)
        reads = self.get_reads(region)
        orig = [x[0:2] for x in self.reads]
        self.assertEqual(reads, orig)
Example No. 21
    def test_001_full_region(self):
        region = Region('ref', start=0, end=100000)
        reads = self.get_reads(region)
        self.assertEqual(reads, self.reads)