Exemplo n.º 1
0
    def test_read_pair_inversion_overlapping_query_coverage(self):
        """Inversion call from two reads whose aligned query segments overlap.

        Layout of the 30-base query (upper case = aligned, lower case =
        soft-clipped):

            seq AAATTTCCCGGGAATTCCGGATCGATCGAT
            r1  AAATTTCCCGGGAATTCCGGAtcgatcgat +
            r2c aaatttcccgggaattccGGATCGATCGAT -
            i   ------------------GGA---------
            r2  ATCGATCGATCCggaattcccgggaaattt

        Both reads cover the three query bases ``GGA``. The test expects no
        untemplated sequence and break2 at 108, i.e. the last aligned position
        of r2 (111) less the 3 shared bases.
        """
        query = 'AAATTTCCCGGGAATTCCGGATCGATCGAT'  # 30 bases
        forward_cigar = [(CIGAR.M, 21), (CIGAR.S, 9)]
        forward_read = MockRead(
            reference_id=0,
            reference_name='1',
            reference_start=0,
            cigar=forward_cigar,
            query_sequence=query,
            is_reverse=False,
        )

        reverse_cigar = [(CIGAR.M, 12), (CIGAR.S, 18)]
        reverse_read = MockRead(
            reference_id=0,
            reference_name='1',
            reference_start=99,
            cigar=reverse_cigar,
            query_sequence=reverse_complement(query),
            is_reverse=True,
        )

        pair = align.call_paired_read_event(forward_read, reverse_read, is_stranded=True)

        # strand and orientation of each breakpoint
        assert pair.break1.strand == STRAND.POS
        assert pair.break2.strand == STRAND.NEG
        assert pair.break1.orient == ORIENT.LEFT
        assert pair.break2.orient == ORIENT.LEFT
        # overlapping coverage leaves nothing untemplated between the breaks
        assert pair.untemplated_seq == ''
        # positions and the sequence attributed to each side
        assert pair.break1.start == 21
        assert pair.break2.start == 108
        assert pair.break1.seq == 'AAATTTCCCGGGAATTCCGGA'
        assert pair.break2.seq == reverse_complement('TCGATCGAT')
Exemplo n.º 2
0
    def test_read_pair_inversion_gap_in_query_coverage(self):
        """Inversion call from two reads that leave a gap in the query coverage.

        Layout of the 30-base query (upper case = aligned, lower case =
        soft-clipped):

            seq AAATTTCCCGGGAATTCCGGATCGATCGAT
            r1  AAATTTCCCGGGAATTccggatcgatcgat +
            r2c aaatttcccgggaattccGGATCGATCGAT -
            i   ----------------CC------------
            r2  ATCGATCGATCCggaattcccgggaaattt

        Neither read aligns the two query bases ``CC``, so the test expects
        them to be reported as untemplated sequence. r2 aligns 12 bases from
        0-based reference position 99, and break2 is expected at 111.
        """
        query = 'AAATTTCCCGGGAATTCCGGATCGATCGAT'  # 30 bases
        forward_read = MockRead(
            reference_id=0,
            reference_name='1',
            reference_start=0,
            cigar=[(CIGAR.M, 16), (CIGAR.S, 14)],
            query_sequence=query,
            is_reverse=False,
        )
        reverse_read = MockRead(
            reference_id=0,
            reference_name='1',
            reference_start=99,
            cigar=[(CIGAR.M, 12), (CIGAR.S, 18)],
            query_sequence=reverse_complement(query),
            is_reverse=True,
        )

        pair = align.call_paired_read_event(forward_read, reverse_read)

        # strand and orientation of each breakpoint
        self.assertEqual(STRAND.POS, pair.break1.strand)
        self.assertEqual(STRAND.NEG, pair.break2.strand)
        self.assertEqual(ORIENT.LEFT, pair.break1.orient)
        self.assertEqual(ORIENT.LEFT, pair.break2.orient)
        # the uncovered query bases
        self.assertEqual('CC', pair.untemplated_seq)
        # positions and the sequence attributed to each side
        self.assertEqual(16, pair.break1.start)
        self.assertEqual(111, pair.break2.start)
        self.assertEqual('AAATTTCCCGGGAATT', pair.break1.seq)
        self.assertEqual(reverse_complement('GGATCGATCGAT'), pair.break2.seq)
Exemplo n.º 3
0
    def test_read_pair_large_inversion_overlapping_query_coverage(self):
        """Inversion call on a 245-base contig split across two reads.

        read1 aligns the last 120 query bases on the forward strand; read2 is
        the reverse complement of the same contig and aligns 8 + 120 bases
        around a 1-base deletion. The expected break2 position is written as
        ``2188 + 3``; the ``+ 3`` presumably reflects the 3 query bases that
        both alignments cover (NOTE(review): confirm against
        ``call_paired_read_event``).
        """
        s = 'CTGAGCATGAAAGCCCTGTAAACACAGAATTTGGATTCTTTCCTGTTTGGTTCCTGGTCGTGAGTGGCAGGTGCCATCATGTTTCATTCTGCCTGAGAGCAGTCTACCTAAATATATAGCTCTGCTCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACATATTGTTGGTTATGAAATTTCAGGGTTTTCATTTCTGTATGTTAAT'

        read1 = MockRead(reference_id=3,
                         reference_start=1114,
                         cigar=[(CIGAR.S, 125), (CIGAR.EQ, 120)],
                         query_sequence=s,
                         is_reverse=False)
        read2 = MockRead(reference_id=3,
                         reference_start=2187,
                         cigar=[(CIGAR.S, 117), (CIGAR.EQ, 8), (CIGAR.D, 1),
                                (CIGAR.M, 120)],
                         query_sequence=reverse_complement(s),
                         is_reverse=True)
        bpp = align.call_paired_read_event(read1, read2)
        self.assertEqual(STRAND.POS, bpp.break1.strand)
        self.assertEqual(STRAND.NEG, bpp.break2.strand)
        self.assertEqual(ORIENT.RIGHT, bpp.break1.orient)
        self.assertEqual(ORIENT.RIGHT, bpp.break2.orient)
        self.assertEqual('', bpp.untemplated_seq)
        self.assertEqual(1115, bpp.break1.start)
        self.assertEqual(2188 + 3, bpp.break2.start)
        # No debug printing here: assertEqual already reports both sequences
        # in its failure message.
        self.assertEqual(
            'TCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACATATTGTTGGTTATGAAATTTCAG'
            'GGTTTTCATTTCTGTATGTTAAT', bpp.break1.seq)
        self.assertEqual(
            'GCAGAGCTATATATTTAGGTAGACTGCTCTCAGGCAGAATGAAACATGATGGCACCTGCCACTCACGACCAGGAACCAAACAGGAAAGAATCCA'
            'AATTCTGTGTTTACAGGGCTTTCATGCTCAG', bpp.break2.seq)
Exemplo n.º 4
0
 def test_blat_contigs_deletion_revcomp(self):
     """Align a reverse-complemented contig with blat and check the selected alignment.

     The contig stored on the evidence is the reverse complement of ``seq``.
     The test expects a single-read alignment (``read2`` is None) on the
     reverse strand whose query sequence equals ``seq`` and whose CIGAR
     holds a 1253-base deletion between two exact-match segments.
     """
     ev = GenomeEvidence(Breakpoint('fake', 1714, orient=ORIENT.LEFT),
                         Breakpoint('fake', 2968, orient=ORIENT.RIGHT),
                         opposing_strands=False,
                         bam_cache=BAM_CACHE,
                         reference_genome=REFERENCE_GENOME,
                         read_length=40,
                         stdev_fragment_size=25,
                         median_fragment_size=100)
     seq = (
         'GGTATATATTTCTCAGATAAAAGATATTTTCCCTTTTATCTTTCCCTAAGCTCACACTACATATATTGCATTTATCTTATATCTGCTTTAAAACCTATTTAT'
         'TATGTCATTTAAATATCTAGAAAAGTTATGACTTCACCAGGTATGAAAAATATAAAAAGAACTCTGTCAAGAAT'
     )
     ev.contigs = [Contig(reverse_complement(seq), 0)]
     raw_alignments = align.align_sequences({'seq': ev.contigs[0].seq},
                                            BAM_CACHE,
                                            REFERENCE_GENOME,
                                            aligner_reference=REFERENCE_GENOME_FILE_2BIT,
                                            aligner='blat')
     align.select_contig_alignments(ev, raw_alignments)

     # Fail with a clear message, rather than an IndexError on [0], when
     # no alignment was selected.
     alignments = list(ev.contigs[0].alignments)
     self.assertTrue(alignments, 'no alignments were selected for the contig')
     alignment = alignments[0]

     self.assertIsNone(alignment.read2)
     self.assertEqual(0, alignment.read1.reference_id)
     self.assertTrue(alignment.read1.is_reverse)
     self.assertEqual(seq, alignment.read1.query_sequence)
     self.assertEqual(Interval(0, 175),
                      align.query_coverage_interval(alignment.read1))
     self.assertEqual(1612, alignment.read1.reference_start)
     self.assertEqual([(CIGAR.EQ, 102), (CIGAR.D, 1253), (CIGAR.EQ, 74)],
                      alignment.read1.cigar)
Exemplo n.º 5
0
 def test_inversion_and_deletion(self):
     """Select contig alignments from a forward read and a reverse read with a deletion.

     The evidence is a mock carrying only the attributes set below. The test
     expects ``select_contig_alignments`` to leave exactly two alignments on
     the contig.
     """
     contig_seq = 'CTGAGCATGAAAGCCCTGTAAACACAGAATTTGGATTCTTTCCTGTTTGGTTCCTGGTCGTGAGTGGCAGGTGCCATCATGTTTCATTCTGCCTGAGAGCAGTCTACCTAAATATATAGCTCTGCTCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACATATTGTTGGTTATGAAATTTCAGGGTTTTCATTTCTGTATGTTAAT'
     mock_evidence = MockObject(
         interchromosomal=False,
         opposing_strands=True,
         break1=MockObject(orient=ORIENT.RIGHT, chr='3'),
         break2=MockObject(orient=ORIENT.RIGHT, chr='3'),
         contigs=[MockObject(seq=contig_seq, alignments=set())],
         standardize_read=lambda read: read,
         contig_aln_max_event_size=DEFAULTS.contig_aln_max_event_size,
         contig_aln_merge_inner_anchor=5,
         contig_aln_merge_outer_anchor=DEFAULTS.contig_aln_merge_outer_anchor,
         contig_aln_min_query_consumption=0.9,
         contig_aln_min_extend_overlap=DEFAULTS.contig_aln_min_extend_overlap,
         contig_aln_min_anchor_size=DEFAULTS.contig_aln_min_anchor_size,
         contig_aln_min_score=DEFAULTS.contig_aln_min_score,
         outer_window1=Interval(1000, 1200),
         outer_window2=Interval(2000, 2200),
         reference_genome=None,
         bam_cache=mock.Mock(stranded=False),
     )
     # forward-strand alignment of the last 120 query bases
     forward_read = SamRead(
         reference_id=3,
         reference_start=1114,
         cigar=[(CIGAR.S, 125), (CIGAR.EQ, 120)],
         query_sequence=contig_seq,
         is_reverse=False,
         reference_name='3',
         alignment_rank=0,
     )
     # reverse-strand alignment containing a 1-base deletion
     reverse_read = SamRead(
         reference_id=3,
         reference_start=2187,
         cigar=[(CIGAR.S, 117), (CIGAR.EQ, 8), (CIGAR.D, 1), (CIGAR.EQ, 120)],
         query_sequence=reverse_complement(contig_seq),
         is_reverse=True,
         reference_name='3',
         alignment_rank=1,
     )
     align.select_contig_alignments(mock_evidence, {contig_seq: [forward_read, reverse_read]})
     selected = list(mock_evidence.contigs[0].alignments)
     self.assertEqual(2, len(selected))
Exemplo n.º 6
0
 def test_gimap4_reverse(self):
     """Predict splice sites on the reverse complement of the GIMAP4 example gene.

     Each predicted site must slice back to its own sequence when its
     ``start``/``end`` are treated as 1-based inclusive coordinates, and
     exactly five sites are expected.
     """
     revcomp_seq = reverse_complement(EXAMPLE_GENES['GIMAP4'].seq)
     sites = predict_splice_sites(revcomp_seq, True)
     for site in sites:
         sliced = revcomp_seq[site.start - 1:site.end]
         assert sliced == site.seq
     assert len(sites) == 5
Exemplo n.º 7
0
    def test_read_pair_large_inversion_overlapping_query_coverage(self):
        """Stranded inversion call on a 245-base contig split across two reads.

        read1 aligns the last 120 query bases on the forward strand; read2 is
        the reverse complement of the same contig and aligns 8 + 120 bases
        around a 1-base deletion. The expected break2 position is written as
        ``2188 + 3``; the ``+ 3`` presumably reflects the 3 query bases that
        both alignments cover (NOTE(review): confirm against
        ``call_paired_read_event``).
        """
        s = 'CTGAGCATGAAAGCCCTGTAAACACAGAATTTGGATTCTTTCCTGTTTGGTTCCTGGTCGTGAGTGGCAGGTGCCATCATGTTTCATTCTGCCTGAGAGCAGTCTACCTAAATATATAGCTCTGCTCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACATATTGTTGGTTATGAAATTTCAGGGTTTTCATTTCTGTATGTTAAT'

        read1 = MockRead(
            reference_id=3,
            reference_start=1114,
            cigar=[(CIGAR.S, 125), (CIGAR.EQ, 120)],
            query_sequence=s,
            is_reverse=False,
        )
        read2 = MockRead(
            reference_id=3,
            reference_start=2187,
            cigar=[(CIGAR.S, 117), (CIGAR.EQ, 8), (CIGAR.D, 1), (CIGAR.M, 120)],
            query_sequence=reverse_complement(s),
            is_reverse=True,
        )
        bpp = align.call_paired_read_event(read1, read2, is_stranded=True)
        assert bpp.break1.strand == STRAND.POS
        assert bpp.break2.strand == STRAND.NEG
        assert bpp.break1.orient == ORIENT.RIGHT
        assert bpp.break2.orient == ORIENT.RIGHT
        assert bpp.untemplated_seq == ''
        assert bpp.break1.start == 1115
        assert bpp.break2.start == 2188 + 3
        # No debug printing here: a failing comparison already shows both
        # sequences in the assertion report.
        assert (
            bpp.break1.seq
            == 'TCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACATATTGTTGGTTATGAAATTTCAGGGTTTTCATTTCTGTATGTTAAT'
        )
        assert (
            bpp.break2.seq
            == 'GCAGAGCTATATATTTAGGTAGACTGCTCTCAGGCAGAATGAAACATGATGGCACCTGCCACTCACGACCAGGAACCAAACAGGAAAGAATCCAAATTCTGTGTTTACAGGGCTTTCATGCTCAG'
        )
Exemplo n.º 8
0
 def test_gimap4_reverse(self):
     """Predict splice sites on the reverse complement of the GIMAP4 example gene.

     Each predicted site must slice back to its own sequence when its
     ``start``/``end`` are treated as 1-based inclusive coordinates, and
     exactly five sites are expected.
     """
     revcomp_seq = reverse_complement(EXAMPLE_GENES['GIMAP4'].seq)
     sites = predict_splice_sites(revcomp_seq, True)
     for site in sites:
         sliced = revcomp_seq[site.start - 1:site.end]
         self.assertEqual(site.seq, sliced)
     self.assertEqual(5, len(sites))
Exemplo n.º 9
0
 def test_bwa_contigs(self):
     """Align an inversion contig with ``bwa mem`` and check the selected read pair.

     The test expects the first selected alignment to hold two reads on
     'reference3' whose query sequences are reverse complements of each
     other, with the start positions, query coverage and CIGARs asserted
     below.
     """
     ev = GenomeEvidence(
         Breakpoint('reference3', 1114, orient=ORIENT.RIGHT),
         Breakpoint('reference3', 2187, orient=ORIENT.RIGHT),
         opposing_strands=True,
         bam_cache=BAM_CACHE,
         reference_genome=REFERENCE_GENOME,
         read_length=40,
         stdev_fragment_size=25,
         median_fragment_size=100,
         stdev_count_abnormal=2,
         min_splits_reads_resolution=1,
         min_flanking_pairs_resolution=1,
     )
     ev.contigs = [
         Contig(
             'CTGAGCATGAAAGCCCTGTAAACACAGAATTTGGATTCTTTCCTGTTTGGTTCCTGGTCGTGAGTGGCAGGTGCCATCATGTTTCATTCTGCCTGAGAGCAG'
             'TCTACCTAAATATATAGCTCTGCTCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACATATTG'
             'TTGGTTATGAAATTTCAGGGTTTTCATTTCTGTATGTTAAT',
             0,
         )
     ]
     seq = align.align_sequences(
         {'seq': ev.contigs[0].seq},
         BAM_CACHE,
         REFERENCE_GENOME,
         aligner_reference=get_data('mock_reference_genome.fa'),
         aligner='bwa mem',
         aligner_output_file='mem.out',
         aligner_fa_input_file='mem.in.fa',
     )
     align.select_contig_alignments(ev, seq)

     # Fail with a clear message, rather than an IndexError on [0], when
     # no alignment was selected.
     alignments = list(ev.contigs[0].alignments)
     self.assertTrue(alignments, 'no alignments were selected for the contig')
     alignment = alignments[0]

     self.assertEqual(reverse_complement(alignment.read1.query_sequence),
                      alignment.read2.query_sequence)
     self.assertEqual('reference3', alignment.read1.reference_name)
     self.assertEqual('reference3', alignment.read2.reference_name)
     self.assertEqual(1, alignment.read1.reference_id)
     self.assertEqual(1, alignment.read2.reference_id)
     self.assertEqual(Interval(125, 244),
                      align.query_coverage_interval(alignment.read1))
     self.assertEqual(Interval(117, 244),
                      align.query_coverage_interval(alignment.read2))
     self.assertEqual(1114, alignment.read1.reference_start)
     self.assertEqual(2187, alignment.read2.reference_start)
     self.assertEqual([(CIGAR.S, 125), (CIGAR.EQ, 120)],
                      alignment.read1.cigar)
     self.assertEqual([(CIGAR.S, 117), (CIGAR.EQ, 128)],
                      alignment.read2.cigar)
Exemplo n.º 10
0
 def test_simple(self):
     """Convert a single-block, negative-strand PSLX row into a read.

     The row describes one 142-base block of a 204-base query on target
     '17'. The converted read is expected to carry the reference id supplied
     by the mocked cache (16), a query sequence that is the reverse
     complement of ``qseq_full``, and a CIGAR of a 62-base soft clip followed
     by 142 exact matches.
     """
     full_query = (
         'ACATGTGCACAACGTGCAGGTTTGTTACATATGTATACATGTGCCATGTTGGTTTGCTGCACCCATTAACTCGTCCTAGTTTATTACTAGTCTTCAGACATC'
         'CAGAAAATAGAGTAAGATACTAGGTAGACATAACACCTAGATACATCCGTAAGGCATTTGTTTCCTATCACATGGCCCATTCTAGCTTAACACCCACCAACT'
     )
     row = {
         'match': 142,
         'mismatch': 0,
         'repmatch': 0,
         'ncount': 0,
         'qgap_count': 0,
         'qgap_bases': 0,
         'tgap_count': 0,
         'tgap_bases': 0,
         'strand': '-',
         'qname': 'seq1',
         'qsize': 204,
         'qstart': 0,
         'qend': 142,
         'tname': '17',
         'tsize': 81195210,
         'tstart': 32673408,
         'tend': 32673550,
         'block_count': 1,
         'block_sizes': [142],
         'qstarts': [62],
         'tstarts': [32673408],
         '_index': 880,
         'score': 142,
         'percent_ident': 100.0,
         'qseq_full': full_query,
     }
     # mocked reference: a long-string stand-in built with offset 32673407
     target_seq = MockLongString(
         'ACTAGGTGTTATGTCTACCTAGTATCTTACTCTATTTTCTGGATGTCTGAAGACTAGTAATAAACTAGGACGAGTTAATGGGTGCAGCAAACCAACATGGCACATG'
         'TATACATATGTAACAAACCTGCACGTTGTGCACATGTACCCTAAAACTTAAAGTATAAAAAAAAATTTCACTGAGCATAAGACTTCAGACACAAAAGAGTGCATGC'
         'CATATAATTCCATTTATGTGAATTTCAAGAACAATCAGTGATGACAGAAGTCAAAGTAGTGGTCACCTCTGGAAGGTGGGACATTGACC',
         32673407,
     )
     reference = {'17': Mock(seq=target_seq)}
     bam_cache = Mock(reference_id=MockFunction(16))

     converted = Blat.pslx_row_to_pysam(row, bam_cache, reference)

     self.assertEqual(16, converted.reference_id)
     self.assertEqual('17', converted.reference_name)
     self.assertEqual(row['qseq_full'], reverse_complement(converted.query_sequence))
     self.assertEqual([(CIGAR.S, 62), (CIGAR.EQ, 142)], converted.cigar)
Exemplo n.º 11
0
 def test_blat_contigs_deletion_revcomp(self):
     """Align a reverse-complemented contig with blat and check the selected alignment.

     The contig stored on the evidence is the reverse complement of ``seq``.
     The test expects a single-read alignment (``read2`` is None) on the
     reverse strand whose query sequence equals ``seq`` and whose CIGAR
     holds a 1253-base deletion between two exact-match segments.
     """
     ev = GenomeEvidence(
         Breakpoint('fake', 1714, orient=ORIENT.LEFT),
         Breakpoint('fake', 2968, orient=ORIENT.RIGHT),
         opposing_strands=False,
         bam_cache=BAM_CACHE,
         reference_genome=REFERENCE_GENOME,
         read_length=40,
         stdev_fragment_size=25,
         median_fragment_size=100,
     )
     seq = (
         'GGTATATATTTCTCAGATAAAAGATATTTTCCCTTTTATCTTTCCCTAAGCTCACACTACATATATTGCATTTATCTTATATCTGCTTTAAAACCTATTTAT'
         'TATGTCATTTAAATATCTAGAAAAGTTATGACTTCACCAGGTATGAAAAATATAAAAAGAACTCTGTCAAGAAT'
     )
     ev.contigs = [Contig(reverse_complement(seq), 0)]
     align.select_contig_alignments(
         ev,
         align.align_sequences(
             {'seq': ev.contigs[0].seq},
             BAM_CACHE,
             REFERENCE_GENOME,
             aligner_reference=get_data('mock_reference_genome.2bit'),
             aligner='blat',
         ),
     )

     # Fail with a clear message, rather than an IndexError on [0], when
     # no alignment was selected.
     alignments = list(ev.contigs[0].alignments)
     assert alignments, 'no alignments were selected for the contig'
     alignment = alignments[0]

     assert alignment.read2 is None
     assert alignment.read1.reference_id == 0
     assert alignment.read1.is_reverse
     assert alignment.read1.query_sequence == seq
     assert align.query_coverage_interval(alignment.read1) == Interval(0, 175)
     assert alignment.read1.reference_start == 1612
     assert alignment.read1.cigar == [(CIGAR.EQ, 102), (CIGAR.D, 1253), (CIGAR.EQ, 74)]
Exemplo n.º 12
0
    def test_pslx_row_to_pysam_inversion(self):
        """Convert the two PSLX rows of an inversion into reads.

        The first row is a '+' strand block of 120 bases starting at query
        offset 125; the second is a '-' strand block of 128 bases starting at
        query offset 117. Each converted read is checked for reference id,
        start, CIGAR and query coverage, and the two query sequences are
        expected to be reverse complements of one another.
        """
        contig_seq = 'CTGAGCATGAAAGCCCTGTAAACACAGAATTTGGATTCTTTCCTGTTTGGTTCCTGGTCGTGAGTGGCAGGTGCCATCATGTTTCATTCTGCCTGAGAGCAGTCTACCTAAATATATAGCTCTGCTCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACATATTGTTGGTTATGAAATTTCAGGGTTTTCATTTCTGTATGTTAAT'

        # first part of the inversion: query and target block text are identical
        forward_block = (
            'TCACAGTTTCCCTGCAATGCATAATTAAAATAGCACTATGCAGTTGCTTACACTTCAGATAATGGCTTCCTACATATTGTTGGTTATGAAATTTCAGGG'
            'TTTTCATTTCTGTATGTTAAT'
        )
        forward_row = {
            'block_count': 1,
            'tstarts': [1114],
            'block_sizes': [120],
            'qname': 'seq1',
            'tname': 'reference3',
            'qstarts': [125],
            'strand': '+',
            'qseq_full': contig_seq,
            'score': 1,
            'qseqs': [forward_block],
            'tseqs': [forward_block],
        }
        forward_read = Blat.pslx_row_to_pysam(forward_row, self.cache, REFERENCE_GENOME)
        self.assertEqual(3, forward_read.reference_id)
        self.assertEqual(Interval(125, 244), query_coverage_interval(forward_read))
        self.assertEqual(1114, forward_read.reference_start)
        self.assertEqual([(CIGAR.S, 125), (CIGAR.EQ, 120)], forward_read.cigar)

        # second part of the inversion: query and target block text are identical
        reverse_block = (
            'TGAGCAGAGCTATATATTTAGGTAGACTGCTCTCAGGCAGAATGAAACATGATGGCACCTGCCACTCACGACCAGGAACCAAACAGGAAAGAATCCAAAT'
            'TCTGTGTTTACAGGGCTTTCATGCTCAG'
        )
        reverse_row = {
            'block_count': 1,
            'tstarts': [2187],
            'block_sizes': [128],
            'qname': 'seq1',
            'tname': 'reference3',
            'qstarts': [117],
            'strand': '-',
            'qseq_full': contig_seq,
            'score': 1,
            'qseqs': [reverse_block],
            'tseqs': [reverse_block],
        }
        reverse_read = Blat.pslx_row_to_pysam(reverse_row, self.cache, REFERENCE_GENOME)
        self.assertEqual(3, reverse_read.reference_id)
        self.assertEqual(2187, reverse_read.reference_start)
        self.assertEqual([(CIGAR.S, 117), (CIGAR.EQ, 128)], reverse_read.cigar)
        self.assertEqual(Interval(117, 244), query_coverage_interval(reverse_read))

        # the two reads must present the same contig on opposite strands
        self.assertEqual(forward_read.query_sequence,
                         reverse_complement(reverse_read.query_sequence))
Exemplo n.º 13
0
 def test_reverse_complement(self):
     """Check ``reverse_complement`` on a short sequence and on the empty string."""
     cases = (
         ('CGAT', 'ATCG'),
         ('', ''),
     )
     for sequence, expected in cases:
         assert reverse_complement(sequence) == expected
Exemplo n.º 14
0
 def test_reverse_complement(self):
     """Check ``reverse_complement`` on a short sequence and on the empty string."""
     cases = (
         ('ATCG', 'CGAT'),
         ('', ''),
     )
     for expected, sequence in cases:
         self.assertEqual(expected, reverse_complement(sequence))