Example #1
    def test_030_bams_to_training_samples_simple(self):
        reads_bam = tempfile.NamedTemporaryFile(suffix='.bam').name
        truth_bam = tempfile.NamedTemporaryFile(suffix='.bam').name

        # we had a bug caused by missing qualities and bad indexing...
        data = copy.deepcopy(simple_data['calls'])
        data[0]['quality'] = None

        create_simple_bam(reads_bam, data)
        create_simple_bam(
            truth_bam, [simple_data['truth']])
        encoder = medaka.features.CountsFeatureEncoder(normalise='total')
        label_scheme = medaka.labels.HaploidLabelScheme()
        region = Region('ref', 0, 100)
        result = encoder.bams_to_training_samples(
            truth_bam, reads_bam, region, label_scheme, min_length=0)[0]

        expected = Sample(
            ref_name='ref',
            features=np.array([
                [0.5 , 0.  , 0.  , 0.  , 0.5 , 0.  , 0.  , 0.  , 0.  , 0.  ],
                [0.  , 0.5 , 0.  , 0.  , 0.  , 0.5 , 0.  , 0.  , 0.  , 0.  ],
                [0.5 , 0.  , 0.  , 0.  , 0.5 , 0.  , 0.  , 0.  , 0.  , 0.  ],
                [0.  , 0.25, 0.  , 0.25, 0.  , 0.  , 0.  , 0.25, 0.  , 0.25],
                [0.25, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ],
                [0.  , 0.  , 0.5 , 0.  , 0.  , 0.  , 0.5 , 0.  , 0.  , 0.  ],
                [0.5 , 0.  , 0.  , 0.  , 0.5 , 0.  , 0.  , 0.  , 0.  , 0.  ],
                [0.  , 0.  , 0.  , 0.5 , 0.  , 0.  , 0.  , 0.5 , 0.  , 0.  ],
                [0.  , 0.  , 0.5 , 0.  , 0.  , 0.  , 0.5 , 0.  , 0.  , 0.  ]],
                dtype='float32'),
            # the two insertions with respect to the draft are dropped
            labels=np.array([1, 2, 1, 4, 1, 3, 1, 4, 3]),  # A C A T A G A T C
            ref_seq=None,
            positions=np.array([
                (0, 0), (1, 0), (2, 0), (3, 0), (3, 1), (4, 0), (5, 0), (6, 0), (7, 0)],
                dtype=[('major', '<i8'), ('minor', '<i8')]),
            label_probs=None
        )

        np.testing.assert_equal(result.labels, expected.labels)
        np.testing.assert_equal(result.positions, expected.positions)
        np.testing.assert_equal(result.features, expected.features)
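The expected labels above follow directly from the truth sequence under the base-to-integer mapping implied by the test data; the sketch below simply restates that mapping (the value a gap encodes to is an assumption, since gaps are not exercised in this test).

# Mapping read off the expected labels above; the gap value (presumably 0) is an assumption.
base_to_label = {'A': 1, 'C': 2, 'G': 3, 'T': 4}
truth_seq = 'ACATAGATC'
assert [base_to_label[b] for b in truth_seq] == [1, 2, 1, 4, 1, 3, 1, 4, 3]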
Example #2
    def test_size(self):
        a = Region('contig1', 50, 100)
        self.assertEqual(a.size, 50)
Example #3
    def bams_to_training_samples(self,
                                 truth_bam,
                                 bam,
                                 region,
                                 reference=None,
                                 read_fraction=None):
        """Prepare training data chunks.

        :param truth_bam: .bam file of truth aligned to ref to generate labels.
        :param bam: input .bam file.
        :param region: `Region` obj specifying the region of the reference
            to be parsed.
        :param reference: reference `.fasta`, should correspond to `bam`.
        :param read_fraction: if given, fraction of reads to use.

        :returns: tuple of `Sample` objects.

        .. note:: Chunks might be missing if `truth_bam` is provided and
            regions with multiple mappings were encountered.

        """
        ref_rle = self.process_ref_seq(region.ref_name, reference)

        # Filter truth alignments to restrict ourselves to regions of the
        # ref where the truth is unambiguous.
        alignments = TruthAlignment.bam_to_alignments(truth_bam,
                                                      region.ref_name,
                                                      start=region.start,
                                                      end=region.end)
        filtered_alignments = TruthAlignment.filter_alignments(
            alignments, start=region.start, end=region.end)
        if len(filtered_alignments) == 0:
            self.logger.info(
                "Filtering removed all alignments of truth to ref from {}.".
                format(region))

        samples = []
        for aln in filtered_alignments:
            mock_compr = self.max_hp_len > 1 and not self.is_compressed
            truth_pos, truth_labels = aln.get_positions_and_labels(
                ref_compr_rle=ref_rle,
                mock_compr=mock_compr,
                is_compressed=self.is_compressed,
                rle_dtype=True)
            aln_samples = self.bam_to_sample(bam,
                                             Region(region.ref_name, aln.start,
                                                    aln.end),
                                             ref_rle,
                                             read_fraction=read_fraction)
            for sample in aln_samples:
                # Create labels according to positions in pileup
                pad = (encoding[_gap_],
                       1) if len(truth_labels.dtype) > 0 else encoding[_gap_]
                padder = itertools.repeat(pad)
                position_to_label = defaultdict(
                    padder.__next__,
                    zip([tuple(p) for p in truth_pos],
                        [a for a in truth_labels]))
                padded_labels = np.fromiter(
                    (position_to_label[tuple(p)] for p in sample.positions),
                    dtype=truth_labels.dtype,
                    count=len(sample.positions))

                sample = sample._asdict()
                sample['labels'] = padded_labels
                samples.append(Sample(**sample))
        return tuple(samples)
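For context, here is a minimal usage sketch of the method above. It assumes the FeatureEncoder API shown in this example, constructed with the keyword arguments from the __kwargs__ dict in Example #5; the .bam and .fasta paths are placeholders rather than files shipped with the tests.

from medaka.common import Region
from medaka.features import FeatureEncoder

# Keyword arguments mirror the __kwargs__ dict from Example #5.
encoder = FeatureEncoder(
    consensus_as_ref=False, is_compressed=False, log_min=None,
    max_hp_len=1, normalise='total', ref_mode=None, with_depth=False)
region = Region('utg000001l', start=0, end=100000)

# 'truth_to_ref.bam', 'reads.bam' and 'ref.fasta' are placeholder paths.
samples = encoder.bams_to_training_samples(
    'truth_to_ref.bam', 'reads.bam', region, reference='ref.fasta')
for sample in samples:
    print(sample.ref_name, sample.positions.shape, sample.labels.shape)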
Example #4
class MockAlignment:
    def __init__(self,
                 ref_start=None,
                 ref_end=None,
                 ref_len=None,
                 query_seq='ACATGCAAGACACGAT',
                 ref_seq='AAAGGCAAGACACGAT'):
        self.reference_start = ref_start
        self.reference_end = ref_end
        self.reference_length = ref_len
        self.query_sequence = query_seq
        self.reference_sequence = ref_seq

    def get_reference_sequence(self):
        return self.reference_sequence


full_region = Region('Mock', 0, float('inf'))


class TruthAlignmentTest(unittest.TestCase):
    def test_case1(self):
        # case 1: the longer alignment is less than 2x the length of the shorter,
        # and >= 50% of the shorter overlaps the longer; both should be removed.
        starts_ends = [(2000, 2999), (2500, 3000)]
        expected = []

        alignments = [
            TruthAlignment(MockAlignment(start, end, end - start))
            for start, end in starts_ends
        ]
        filtered = [
            (f.start, f.end)
            for f in TruthAlignment._filter_alignments(alignments, full_region)
        ]
        self.assertEqual(filtered, expected)
Example #5
import numpy as np
import os
import unittest
from medaka.features import FeatureEncoder, pileup_counts
from medaka.common import Region

__reads_bam__ = os.path.join(os.path.dirname(__file__), 'data',
                             'test_reads.bam')
__two_type_bam__ = os.path.join(os.path.dirname(__file__), 'data',
                                'test_two_type.bam')
__gapped_bam__ = os.path.join(os.path.dirname(__file__), 'data',
                              'reads_gapped.bam')
__region__ = Region('Consensus_Consensus_Consensus_Consensus_utg000001l',
                    start=50000,
                    end=100000)
__region_start__ = Region('Consensus_Consensus_Consensus_Consensus_utg000001l',
                          start=0,
                          end=200)

__kwargs__ = {
    'consensus_as_ref': False,
    'is_compressed': False,
    'log_min': None,
    'max_hp_len': 1,
    'normalise': 'total',
    'ref_mode': None,
    'with_depth': False
}


class CountsTest(unittest.TestCase):
Example #6
    def test_004_trim_mid(self):
        region = Region('ref', start=1, end=7)
        reads = self.get_reads(region)
        orig = [x[1:-1] for x in self.reads]
        self.assertEqual(reads, orig)
Example #7
    def test_003_trim_end(self):
        region = Region('ref', start=6, end=8)
        reads = self.get_reads(region)
        orig = [x[-2:] for x in self.reads]
        self.assertEqual(reads, orig)
Example #8
    def test_002_trim_start(self):
        region = Region('ref', start=0, end=2)
        reads = self.get_reads(region)
        orig = [x[0:2] for x in self.reads]
        self.assertEqual(reads, orig)
Example #9
    def test_001_full_region(self):
        region = Region('ref', start=0, end=100000)
        reads = self.get_reads(region)
        self.assertEqual(reads, self.reads)
Example #10
import os
import unittest

import numpy as np
import pysam

from .mock_data import simple_data, create_simple_bam
import libmedaka
import medaka.features
from medaka.common import Region, Sample
import medaka.labels

__reads_bam__ = os.path.join(os.path.dirname(__file__), 'data',
                             'test_reads.bam')
__reads_truth__ = os.path.join(os.path.dirname(__file__), 'data',
                               'truth_to_ref.bam')
__gapped_bam__ = os.path.join(os.path.dirname(__file__), 'data',
                              'reads_gapped.bam')
__region__ = Region('utg000001l', start=50000, end=100000)
__region_start__ = Region('utg000001l', start=0, end=200)


class CountsTest(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.expected_width = 86294

    def test_001_basic_counting(self):
        kwargs = {'normalise': None}
        encoder = medaka.features.CountsFeatureEncoder(**kwargs)
        sample = encoder.bam_to_sample(__reads_bam__, __region__)
        self.assertEqual(len(sample), 1)
        sample = sample[0]
        self.assertEqual(sample.positions.shape, (self.expected_width,))
Example #11
import argparse
import logging
from timeit import default_timer as now  # 'now' is undefined in the original snippet; a wall-clock timer is assumed

import numpy as np

from medaka.common import Region
from medaka.features import FeatureEncoder


def main():
    # Entry point for testing/checking.
    logging.basicConfig(
        format='[%(asctime)s - %(name)s] %(message)s',
        datefmt='%H:%M:%S', level=logging.INFO)
    np.set_printoptions(precision=4, linewidth=100)

    parser = argparse.ArgumentParser('medaka', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('bam', help='alignment file.')
    parser.add_argument('region', help='alignment region to sample.')
    parser.add_argument('--print', action='store_true', help='print counts.')
    parser.add_argument('--dtypes', nargs='+', help='perform a multi-datatype test.')
    parser.add_argument('--norm', nargs='+', help='additional normalisation tests (total, fwd_rev).')

    args = parser.parse_args()
    
    region = Region.from_string(args.region)
    
    kwargs = {
        'log_min': None,
        'max_hp_len': 1,
        'is_compressed': False,
        'consensus_as_ref': False,
        'ref_mode': None,
        'with_depth': False,
    }
    
    def _print(samples):
        if args.print:
            for p, f in zip(samples.positions, samples.features):
                print('{}\t{}\t0\t{}\t{}'.format(
                    p[0], p[1],
                    '\t'.join('{:.3f}'.format(x) if x > 0.0 else '-' for x in f),
                    sum(f)))
    
    dtype_options = [('',)]
    if args.dtypes is not None:
        dtype_options.append(args.dtypes)
    norm_options = [None, ]
    if args.norm is not None:
        norm_options.extend(args.norm)
    
    for dtypes in dtype_options:
        kwargs['dtypes'] = dtypes
        for norm in norm_options:
            kwargs['normalise'] = norm
    
            print("###########################################################")
            print(kwargs)
            encoder = FeatureEncoder(**kwargs)
        
            # py-style
            t0 = now()
            samples = encoder.bam_to_sample(args.bam, region, force_py=True)[0]
            t1 = now()
            if not samples.is_empty:
                print(samples.features.shape)
                _print(samples)
            else:
                print("Samples is empty")
            print("---------------------")
        
            # C-style
            t2 = now()
            samples = encoder.bam_to_sample(args.bam, region)[0]
            t3 = now()
            if not samples.is_empty:
                print(samples.features.shape)
                _print(samples)
            else:
                print("Samples is empty")
        
            print("pysam time:", t1 - t0)
            print("hts time:", t3 - t2)