def test_case2(self): # case 2: longer < 2 x len shorter and < 50% shorter overlaps longer, trim both starts_ends = [(5000, 5999), (5900, 6401)] alignments = [ TruthAlignment(MockAlignment(start, end, end - start)) for start, end in starts_ends ] filtered = [(f.start, f.end) for f in TruthAlignment._filter_alignments( alignments, full_region, min_length=401)] expected = [(5000, 5900), (5999, 6401)] if filtered != expected: raise AssertionError('got {}, expected {}'.format( filtered, expected)) filtered = [(f.start, f.end) for f in TruthAlignment._filter_alignments( alignments, full_region, min_length=899)] expected = [(5000, 5900)] if filtered != expected: raise AssertionError('got {}, expected {}'.format( filtered, expected)) filtered = [(f.start, f.end) for f in TruthAlignment._filter_alignments( alignments, full_region, min_length=1000)] expected = [] if filtered != expected: raise AssertionError('got {}, expected {}'.format( filtered, expected))
def test_case3(self): # case 3: longer >= 2 x len shorter and < 50% shorter overlaps longer, remove shorter starts_ends = [(7000, 8000), (7501, 8000)] expected = [(7000, 8000)] alignments = [ TruthAlignment(MockAlignment(start, end, end - start)) for start, end in starts_ends ] filtered = [ (f.start, f.end) for f in TruthAlignment._filter_alignments(alignments, full_region) ] if filtered != expected: raise AssertionError('got {}, expected {}'.format( filtered, expected)) # case 3 (contained): shorter contained within longer, shorter should be removed starts_ends = [(0, 1000), (100, 200)] expected = [(0, 1000)] alignments = [ TruthAlignment(MockAlignment(start, end, end - start)) for start, end in starts_ends ] filtered = [ (f.start, f.end) for f in TruthAlignment._filter_alignments(alignments, full_region) ] if filtered != expected: raise AssertionError('got {}, expected {}'.format( filtered, expected))
def test_labels_trimmed_back(self): # we should have two alignments which partially overlap # (318288, 417741) # (417732, 422799) # in this case, the first is >2 x longer than the second, so we trim back the second # check resulting positions and labels are non-overlapping alignments = TruthAlignment.bam_to_alignments(__truth_bam__, __ref_name__, start=318288, end=422799) assert alignments[0].start == 318288 assert alignments[0].end == 417741 assert alignments[1].start == 417732 assert alignments[1].end == 422799 filtered_alignments = TruthAlignment.filter_alignments(alignments) assert filtered_alignments[0].start == 318288 assert filtered_alignments[0].end == 417741 assert filtered_alignments[1].start == 417741 assert filtered_alignments[1].end == 422799 p1_positions, p1_labels = filtered_alignments[ 0].get_positions_and_labels() p2_positions, p2_labels = filtered_alignments[ 1].get_positions_and_labels() assert p1_positions[0]['major'] == filtered_alignments[0].start assert p1_positions[-1]['major'] == filtered_alignments[0].end - 1 assert p2_positions[0]['major'] == filtered_alignments[1].start assert p2_positions[-1]['major'] == filtered_alignments[1].end - 1
def test_case4(self): # case 4: longer >= 2 x len shorter and < 50% shorter overlaps longer, trim shorter starts_ends = [(3000, 4000), (3800, 4299)] expected = [(3000, 4000), (4000, 4299)] alignments = [TruthAlignment(MockAlignment(start, end, end-start)) for start, end in starts_ends] filtered = [(f.start, f.end) for f in TruthAlignment.filter_alignments(alignments, min_length=298)] if filtered != expected: raise AssertionError('got {}, expected {}'.format(filtered, expected))
def test_case1(self): # case 1: longer < 2 x len shorter and >= 50% of shorter overlaps longer both should be removed starts_ends = [(2000, 2999), (2500, 3000)] expected = [] alignments = [TruthAlignment(MockAlignment(start, end, end-start)) for start, end in starts_ends] filtered = [(f.start, f.end) for f in TruthAlignment.filter_alignments(alignments)] if filtered != expected: raise AssertionError('got {}, expected {}'.format(filtered, expected))
def test_many(self): starts_ends = [(0, 1000), (100, 200), (3000, 4000), (3800, 4299), (7000, 8000), (7501, 8000), (2000, 2999), (2500, 3000), (5000, 5999), (5900, 6401),] alignments = [TruthAlignment(MockAlignment(start, end, end-start)) for start, end in starts_ends] filtered = [(f.start, f.end) for f in TruthAlignment.filter_alignments(alignments, min_length=1)] expected = [(0, 1000), (3000, 4000), (4000, 4299), (5000, 5900), (5999, 6401), (7000, 8000)] if filtered != expected: raise AssertionError('got {}, expected {}'.format(filtered, expected)) starts_ends_m = starts_ends + [(0, 5000)] alignments = [TruthAlignment(MockAlignment(start, end, end-start)) for start, end in starts_ends_m] filtered = [(f.start, f.end) for f in TruthAlignment.filter_alignments(alignments, min_length=1)] expected = [(0, 5000), (5000, 5900), (5999, 6401), (7000, 8000)] if filtered != expected: raise AssertionError('got {}, expected {}'.format(filtered, expected)) starts_ends_m = starts_ends + [(0, 5200)] alignments = [TruthAlignment(MockAlignment(start, end, end-start)) for start, end in starts_ends_m] filtered = [(f.start, f.end) for f in TruthAlignment.filter_alignments(alignments, min_length=1)] expected = [(0, 5200), (5200, 5900), (5999, 6401), (7000, 8000)] if filtered != expected: raise AssertionError('got {}, expected {}'.format(filtered, expected)) starts_ends_m = starts_ends + [(0, 10000)] alignments = [TruthAlignment(MockAlignment(start, end, end-start)) for start, end in starts_ends_m] filtered = [(f.start, f.end) for f in TruthAlignment.filter_alignments(alignments, min_length=1)] expected = [(0, 10000)] if filtered != expected: raise AssertionError('got {}, expected {}'.format(filtered, expected))
def test_labels_trimmed_back(self): # we should have two alignments which partially overlap # (318288, 417741) # (417732, 422799) # in this case, the first is >2 x longer than the second, so we trim back the second # check resulting positions and labels are non-overlapping alignments = TruthAlignment.bam_to_alignments( __truth_bam__, Region(__ref_name__, start=318288, end=422799)) self.assertEqual(alignments[0][0].start, 318288) self.assertEqual(alignments[0][0].end, 417741) self.assertEqual(alignments[1][0].start, 417741) self.assertEqual(alignments[1][0].end, 422799)
def bams_to_training_samples(self, truth_bam, bam, region, reference=None, read_fraction=None): """Prepare training data chunks. :param truth_bam: .bam file of truth aligned to ref to generate labels. :param bam: input .bam file. :param region: `Region` obj. the reference will be parsed. :param reference: reference `.fasta`, should correspond to `bam`. :returns: tuple of `Sample` objects. .. note:: Chunks might be missing if `truth_bam` is provided and regions with multiple mappings were encountered. """ ref_rle = self.process_ref_seq(region.ref_name, reference) # filter truth alignments to restrict ourselves to regions of the ref where the truth # in unambiguous alignments = TruthAlignment.bam_to_alignments(truth_bam, region.ref_name, start=region.start, end=region.end) filtered_alignments = TruthAlignment.filter_alignments( alignments, start=region.start, end=region.end) if len(filtered_alignments) == 0: self.logger.info( "Filtering removed all alignments of truth to ref from {}.". format(region)) samples = [] for aln in filtered_alignments: mock_compr = self.max_hp_len > 1 and not self.is_compressed truth_pos, truth_labels = aln.get_positions_and_labels( ref_compr_rle=ref_rle, mock_compr=mock_compr, is_compressed=self.is_compressed, rle_dtype=True) aln_samples = self.bam_to_sample(bam, Region(region.ref_name, aln.start, aln.end), ref_rle, read_fraction=read_fraction) for sample in aln_samples: # Create labels according to positions in pileup pad = (encoding[_gap_], 1) if len(truth_labels.dtype) > 0 else encoding[_gap_] padder = itertools.repeat(pad) position_to_label = defaultdict( padder.__next__, zip([tuple(p) for p in truth_pos], [a for a in truth_labels])) padded_labels = np.fromiter( (position_to_label[tuple(p)] for p in sample.positions), dtype=truth_labels.dtype, count=len(sample.positions)) sample = sample._asdict() sample['labels'] = padded_labels samples.append(Sample(**sample)) return tuple(samples)