示例#1
0
    def test_homopolymer_even_odd(self):
        """Check 2 bp indels that sit next to homopolymer runs.

        Three alignments are standardized: a deletion beside a run of A's, an
        insertion beside a run of T's, and the same kind of insertion on a
        read with a 2 bp soft-clipped prefix. In every case the expected
        CIGAR has a longer leading match and a correspondingly shorter
        trailing match, i.e. the indel ends up further right.
        """
        # Case 1: 2 bp deletion beside the poly-A run.
        poly_a_reference = ''.join(['ATCGAGAT', 'A' * 15, 'TCGAGAT'])
        poly_a_query = ''.join(['ATCGAGATA', 'A' * 12, 'TCGAGAT'])
        deletion_read = MockRead(
            'name',
            1,
            1,
            query_sequence=poly_a_query,
            cigar=[(CIGAR.EQ, 8), (CIGAR.D, 2), (CIGAR.EQ, 20)],
        )
        expected_deletion = [(CIGAR.EQ, 9 + 12), (CIGAR.D, 2), (CIGAR.EQ, 7)]
        self.assertEqual(
            expected_deletion,
            hgvs_standardize_cigar(deletion_read, poly_a_reference),
        )

        # Case 2: 2 bp insertion beside the poly-T run, no clipping.
        poly_t_reference = (
            'CCCCGGCTCATGTCTGGTTTTGTTTTCCGGGGGCGGGGGGGCTCCCTGGGGATGATGGTGATTTTTTTTTTTTTTTAATCCTCAACTAGGAGAGAAAA'
            'TGAGGCAGAGACAATGTGGGGAGCGAGAGAGGGGAAAAGGACGGGGGAGG'
        )
        poly_t_query = (
            'CCCCGGCTCATGTCTGGTTTTGTTTTCCGGGGGCGGGGGGGCTCCCTGGGGATGATGGTGATTTTTTTTTTTTTTTTTAATCCTCAACTAGGAGAGAAAA'
            'TGAGGCAGAGACAATGTGGGGAGCGAGAGAGGGGAAAAGGACGGGGGAGG'
        )
        insertion_read = MockRead(
            'name',
            '1',
            0,
            149,
            query_sequence=poly_t_query,
            cigar=[(CIGAR.EQ, 61), (CIGAR.I, 2), (CIGAR.EQ, 87)],
        )
        expected_insertion = [
            (CIGAR.EQ, 61 + 15),
            (CIGAR.I, 2),
            (CIGAR.EQ, 87 - 15),
        ]
        self.assertEqual(
            expected_insertion,
            hgvs_standardize_cigar(insertion_read, poly_t_reference),
        )

        # Case 3: same kind of insertion, but the read starts with a soft clip.
        clipped_reference = (
            'CCTCCTCGGTCGGGCAGATCTTTCAGAAGCAGGAGCCCAGGATCATGTCTGGTTTTGTTTTCCGAGGGCGAGGGGGCTCCCTGAGGATGATGGTGATTT'
            'TTTTTTTTTTTTAATCCTCAACTAGGAGAGAAAATGAGGCAGAGACA'
        )
        clipped_query = (
            'CCCCTCCTCGGTCGGGCAGATCTTTCAGAAGCAGGAGCCCAGGATCATGTCTGGTTTTGTTTTCCGAGGGCGAGGGGGCTCCCTGAGGATGATGGTGATTTT'
            'TTTTTTTTTTTTTAATCCTCAACTAGGAGAGAAAATGAGGCAGAGACA'
        )
        clipped_read = MockRead(
            'name',
            '1',
            0,
            149,
            query_sequence=clipped_query,
            cigar=[(CIGAR.S, 2), (CIGAR.EQ, 96), (CIGAR.I, 2), (CIGAR.EQ, 50)],
        )
        expected_clipped = [
            (CIGAR.S, 2),
            (CIGAR.EQ, 96 + 15),
            (CIGAR.I, 2),
            (CIGAR.EQ, 50 - 15),
        ]
        self.assertEqual(
            expected_clipped,
            hgvs_standardize_cigar(clipped_read, clipped_reference),
        )
示例#2
0
 def test_bubble_sort_indel_sections_drop_mismatch_with_hardclipping(self):
     """Merge interleaved D/I operations that follow a mismatch on a hard-clipped read.

     The input CIGAR is 10H 4= 1X 2D 3I 2D 2I 6=. The expected result keeps
     the hard clip and the leading match, and replaces everything between
     them and the trailing match with a single 6 bp insertion followed by a
     single 5 bp deletion (the mismatch is folded into the indel).
     """
     ref = 'ATAGGC' 'ATCT' 'ACGA' 'ACGA' 'ACGA' 'GATCGCTACG'
     # original
     # ATAGGCATCTACG   AA  CGAACGAGATCGCTACG
     #       ATCTC  TTT  TTCGAACG
     # expected
     # ATAGGCATCT      ACGAACGAACGAGATCGCTACG
     #       ATCTCTTTTT     CGAACG
     read = MockRead(
         'name',
         1,
         6,
         reference_name='1',
         query_sequence='ATCTCTTTTTCGAACG',
         cigar=[
             (CIGAR.H, 10),
             (CIGAR.EQ, 4),
             (CIGAR.X, 1),
             (CIGAR.D, 2),
             (CIGAR.I, 3),
             (CIGAR.D, 2),
             (CIGAR.I, 2),
             (CIGAR.EQ, 6),
         ],
     )
     expected = [
         (CIGAR.H, 10),
         (CIGAR.EQ, 4),
         (CIGAR.I, 6),
         (CIGAR.D, 5),
         (CIGAR.EQ, 6),
     ]
     self.assertEqual(expected, hgvs_standardize_cigar(read, ref))
示例#3
0
 def test_complex(self):
     """Standardize a read carrying a 26 bp insertion plus several smaller events.

     Compared with the input CIGAR, the expected CIGAR lengthens the match
     before the 26 bp insertion by three bases (49 -> 52) and shortens the
     match after it by three (18 -> 15); every other operation is unchanged.
     """
     qseq = (
         'TATTTGGAAATATTTGTAAGATAGATGTCTCTG' 'C'
         'CTCCTTCTGTTTCTGTCTCTGTCTCTTGCACTCTCTCTCTCCCTCTCTT'
         'TCTCTCTCTCTCTCTCTCTCTCTCTC'
         'TCTATATATATATATATA'
         'T' 'A' 'T' 'C' 'T'
         'ACACACACACACACACAC')
     rseq = (
         'TATTTGGAAATATTTGTAAGATAGATGTCTCTG' 'T'
         'CTCCTTCTGTTTCTGTCTCTGTCTCTTGCACTCTCTCTCTCCCTCTCTT'
         'TCTATATATATATATATA'
         'C' 'A' 'C'
         'ACACACACACACACACAC')
     read = MockRead(
         'name', reference_name='mock', reference_start=0, query_sequence=qseq,
         cigar=[
             (CIGAR.EQ, 33), (CIGAR.X, 1), (CIGAR.EQ, 49), (CIGAR.I, 26),
             (CIGAR.EQ, 18), (CIGAR.X, 1), (CIGAR.EQ, 1), (CIGAR.I, 1),
             (CIGAR.EQ, 1), (CIGAR.I, 1), (CIGAR.EQ, 18)]
     )
     new_cigar = [
         (CIGAR.EQ, 33), (CIGAR.X, 1), (CIGAR.EQ, 52), (CIGAR.I, 26),
         (CIGAR.EQ, 15), (CIGAR.X, 1), (CIGAR.EQ, 1), (CIGAR.I, 1),
         (CIGAR.EQ, 1), (CIGAR.I, 1), (CIGAR.EQ, 18)]
     std_cigar = hgvs_standardize_cigar(read, rseq)
     self.assertEqual(new_cigar, std_cigar)
示例#4
0
 def test_deletion_repeat(self):
     """Standardize a 27 bp deletion against a sequence from REFERENCE_GENOME.

     The expected CIGAR differs from the input only around the deletion: the
     match before it grows by 30 bases and the match after it shrinks by 30.
     """
     qseq = (
         'GAGT'
         'GAGACTCTGT'
         'GAA'
         'AAAGAAAAAAAAAA'
         'A'
         'ATATATATATATATAAATATA'
         'C'
         'ATATTATGTATCAAATATATAT'
         'TATGTGTAATATACATCATGTATCAAATATATATTATGTATAATATACATCATATATCAAATATATATTATGTG'
     )
     # deleted reference: TATGTGTAATATACATCATGTATCAAA
     read = MockRead('name',
                     reference_name='11_86018001-86018500',
                     reference_start=28,
                     cigar=[(CIGAR.S, 4), (CIGAR.EQ, 10), (CIGAR.X, 3),
                            (CIGAR.EQ, 14), (CIGAR.X, 1), (CIGAR.EQ, 21),
                            (CIGAR.X, 1), (CIGAR.EQ, 22), (CIGAR.D, 27),
                            (CIGAR.EQ, 74)],
                     query_sequence=qseq)
     expected_cigar = [(CIGAR.S, 4), (CIGAR.EQ, 10), (CIGAR.X, 3),
                       (CIGAR.EQ, 14), (CIGAR.X, 1), (CIGAR.EQ, 21),
                       (CIGAR.X, 1), (CIGAR.EQ, 22 + 30), (CIGAR.D, 27),
                       (CIGAR.EQ, 74 - 30)]
     std_cigar = hgvs_standardize_cigar(
         read, REFERENCE_GENOME[read.reference_name].seq)
     self.assertEqual(expected_cigar, std_cigar)
示例#5
0
 def no_change_aligned(self):
     """Expect a plain 10M alignment to be returned unchanged.

     NOTE(review): the name has no ``test_`` prefix, so default
     unittest/pytest discovery will presumably not collect this method --
     confirm whether that is intentional.
     """
     reference = 'AAATTTGGGCCCAATT'
     aligned = MockRead(
         'name',
         '1',
         1,
         cigar=[(CIGAR.M, 10)],
         query_sequence='AAATTTGGGC',
     )
     untouched = [(CIGAR.M, 10)]
     self.assertEqual(untouched, hgvs_standardize_cigar(aligned, reference))
示例#6
0
 def test_deletion_partial_repeat(self):
     """Expect a 6 bp deletion to move two bases right (13=6D15= -> 15=6D13=)."""
     left_flank = 'ATCTTAGCCAGGT'
     right_flank = 'AGTTACATACATATC'
     # The query is the reference with the middle 'AGCTAT' removed.
     query = left_flank + right_flank
     reference = left_flank + 'AGCTAT' + right_flank
     aligned = MockRead(
         'name',
         reference_name='mock',
         reference_start=0,
         query_sequence=query,
         cigar=convert_string_to_cigar('13=6D15='),
     )
     wanted = convert_string_to_cigar('15=6D13=')
     self.assertEqual(wanted, hgvs_standardize_cigar(aligned, reference))
示例#7
0
 def test_deletion_in_repeat(self):
     """Expect a 4 bp deletion in an 'ACGA' repeat to move right (4=4D12= -> 12=4D4=)."""
     unit = 'ACGA'
     # The reference holds three copies of the unit, the read only two.
     reference = 'ATAGGC' + 'ATCT' + unit * 3 + 'GATCGCTACG'
     query = 'ATCT' + unit * 2 + 'GATC'
     aligned = MockRead(
         'name',
         1,
         6,
         query_sequence=query,
         cigar=[(CIGAR.EQ, 4), (CIGAR.D, 4), (CIGAR.EQ, 12)],
     )
     wanted = [(CIGAR.EQ, 12), (CIGAR.D, 4), (CIGAR.EQ, 4)]
     self.assertEqual(wanted, hgvs_standardize_cigar(aligned, reference))
示例#8
0
 def test_bubble_sort_indel_sections(self):
     """Expect alternating D/I runs (2D 3I 2D 2I) to collapse into one 5I followed by one 4D."""
     reference = ''.join(['ATAGGC', 'ATCT', 'GG', 'GA', 'GCGA', 'GATCGCTACG'])
     query = ''.join(['ATCT', 'TTT', 'TT', 'GCGA', 'GATC'])
     interleaved = [
         (CIGAR.EQ, 4),
         (CIGAR.D, 2),
         (CIGAR.I, 3),
         (CIGAR.D, 2),
         (CIGAR.I, 2),
         (CIGAR.EQ, 8),
     ]
     aligned = MockRead('name', 1, 6, query_sequence=query, cigar=interleaved)
     merged = [(CIGAR.EQ, 4), (CIGAR.I, 5), (CIGAR.D, 4), (CIGAR.EQ, 8)]
     self.assertEqual(merged, hgvs_standardize_cigar(aligned, reference))
示例#9
0
 def no_change_proper_indel(self):
     """Expect a 6= 3I 3D 4= alignment to be returned unchanged.

     NOTE(review): the name has no ``test_`` prefix, so default
     unittest/pytest discovery will presumably not collect this method --
     confirm whether that is intentional.
     """
     reference = ''.join(['ATAGGC', 'ATCTACGAG', 'ATCGCTACG'])
     query = ''.join(['ATCTAC', 'CCC', 'ATCG'])
     aligned = MockRead(
         'name',
         1,
         6,
         query_sequence=query,
         cigar=[(CIGAR.EQ, 6), (CIGAR.I, 3), (CIGAR.D, 3), (CIGAR.EQ, 4)],
     )
     wanted = [(CIGAR.EQ, 6), (CIGAR.I, 3), (CIGAR.D, 3), (CIGAR.EQ, 4)]
     self.assertEqual(wanted, hgvs_standardize_cigar(aligned, reference))
示例#10
0
 def ins_after_deletion(self):
     """Expect a deletion-then-insertion pair (3D 3I) to be reordered to 3I 3D.

     NOTE(review): the name has no ``test_`` prefix, so default
     unittest/pytest discovery will presumably not collect this method --
     confirm whether that is intentional.
     """
     reference = ''.join(['ATAGGC', 'ATCTACGAG', 'ATCGCTACG'])
     query = ''.join(['ATCTAC', 'CCC', 'ATCG'])
     aligned = MockRead(
         'name',
         1,
         6,
         query_sequence=query,
         cigar=[(CIGAR.EQ, 6), (CIGAR.D, 3), (CIGAR.I, 3), (CIGAR.EQ, 4)],
     )
     wanted = [(CIGAR.EQ, 6), (CIGAR.I, 3), (CIGAR.D, 3), (CIGAR.EQ, 4)]
     self.assertEqual(wanted, hgvs_standardize_cigar(aligned, reference))
示例#11
0
 def test_unecessary_indel_end_match2(self):
     """Expect one base to be trimmed from each side of an indel (20=5I33D19= -> 20=4I32D20=).

     The trimmed base shows up as one extra base of trailing match.
     """
     rseq = 'GGGTGCAGTGGCTTACACCT' 'GTAATCCAAACACCTTGGGAGCCGCCCCCTGAG' 'CCTCCAGGCCCGGGACAGA'
     qseq = 'GGGTGCAGTGGCTTACACCT' 'CCAGG' 'CCTCCAGGCCCGGGACAGA'
     read = MockRead('name',
                     reference_name='1',
                     reference_start=0,
                     cigar=convert_string_to_cigar('20=5I33D19='),
                     query_sequence=qseq)
     exp = convert_string_to_cigar('20=4I32D20=')
     new_cigar = hgvs_standardize_cigar(read, rseq)
     self.assertEqual(exp, new_cigar)
示例#12
0
 def test_unecessary_indel_end_match(self):
     """Expect 14=5I2D10= to reduce to 14=3I12=.

     The deletion disappears, the insertion shrinks from 5 to 3 bases, and
     the trailing match grows from 10 to 12.
     """
     rseq = 'qwertyuiopasdfghjklzxcvbnm'
     qseq = 'qwertyuiopasdfkmkghjklzxcvbnm'
     read = MockRead('name',
                     reference_name='1',
                     reference_start=0,
                     cigar=convert_string_to_cigar('14=5I2D10='),
                     query_sequence=qseq)
     exp = convert_string_to_cigar('14=3I12=')
     new_cigar = hgvs_standardize_cigar(read, rseq)
     self.assertEqual(exp, new_cigar)
示例#13
0
 def test_unecessary_indel_end_match(self):
     """Expect 14=5I2D10= to reduce to 14=3I12=.

     The deletion disappears, the insertion shrinks from 5 to 3 bases, and
     the trailing match grows from 10 to 12.
     """
     reference = 'qwertyuiopasdfghjklzxcvbnm'
     aligned = MockRead(
         'name',
         reference_name='1',
         reference_start=0,
         cigar=convert_string_to_cigar('14=5I2D10='),
         query_sequence='qwertyuiopasdfkmkghjklzxcvbnm',
     )
     wanted = convert_string_to_cigar('14=3I12=')
     standardized = hgvs_standardize_cigar(aligned, reference)
     assert standardized == wanted
示例#14
0
 def test_bubble_sort_indel_sections_drop_mismatch(self):
     """Expect 4= 1X 3D 2I 5D 3I 8= to become 4= 5I 8D 9=.

     The mismatch and the interleaved D/I operations are replaced by a
     single insertion followed by a single deletion.
     """
     reference = ''.join(
         ['ATAGGC', 'ATCT', 'A', 'CGA', 'AGCAT', 'ACGA', 'GATCGCTACG'])
     #                ATCT   CTTTT                 TACGA
     query = ''.join(['ATCT', 'C', 'TT', 'TTT', 'ACGA', 'GATC'])
     interleaved = [
         (CIGAR.EQ, 4),
         (CIGAR.X, 1),
         (CIGAR.D, 3),
         (CIGAR.I, 2),
         (CIGAR.D, 5),
         (CIGAR.I, 3),
         (CIGAR.EQ, 8),
     ]
     aligned = MockRead('name', 1, 6, query_sequence=query, cigar=interleaved)
     merged = [(CIGAR.EQ, 4), (CIGAR.I, 5), (CIGAR.D, 8), (CIGAR.EQ, 9)]
     self.assertEqual(merged, hgvs_standardize_cigar(aligned, reference))
示例#15
0
 def test_indel_repeat(self):
     """Expect a 1 bp insertion paired with a 6 bp deletion (13=1I6D15=) to be left unchanged."""
     qseq = 'ATCTTAGCCAGGT' 'C' 'AGTTACATACATATC'
     rseq = 'ATCTTAGCCAGGT' 'AGCTAT' 'AGTTACATACATATC'
     read = MockRead(
         'name',
         reference_name='mock',
         reference_start=0,
         query_sequence=qseq,
         cigar=convert_string_to_cigar('13=1I6D15='),
     )
     self.assertEqual(convert_string_to_cigar('13=1I6D15='),
                      hgvs_standardize_cigar(read, rseq))
示例#16
0
 def test_insertion_in_repeat(self):
     """Expect a 4 bp insertion duplicating 'ACGA' to move right (4=4I8= -> 8=4I4=)."""
     unit = 'ACGA'
     # The reference holds one copy of the unit, the read two.
     reference = 'ATAGGC' + 'ATCT' + unit + 'GATCGCTACG'
     query = 'ATCT' + unit * 2 + 'GATC'
     aligned = MockRead(
         'name',
         1,
         6,
         query_sequence=query,
         cigar=[(CIGAR.EQ, 4), (CIGAR.I, 4), (CIGAR.EQ, 8)],
     )
     wanted = [(CIGAR.EQ, 8), (CIGAR.I, 4), (CIGAR.EQ, 4)]
     assert wanted == hgvs_standardize_cigar(aligned, reference)
示例#17
0
 def test_even_insertion_in_repeat(self):
     """Expect a 2 bp insertion on a soft-clipped read to move 13 bases right (4S13=2I66= -> 4S26=2I53=)."""
     rseq = 'AAAGAAAAAAAAAAAAT' 'ATATATATATATA' 'AATATACATATTATGTATCAAATATATATTATGTGTAATATACATCATGTATC'
     qseq = 'TTTTAAAAAAAAAAAAT' 'ATATATATATATA' 'TAAATATACATATTATGTATCAAATATATATTATGTGTAATATACATCATGTATC'
     read = MockRead('name',
                     reference_name='1',
                     reference_start=4,
                     cigar=convert_string_to_cigar('4S13=2I66='),
                     query_sequence=qseq)
     exp = convert_string_to_cigar('4S26=2I53=')
     new_cigar = hgvs_standardize_cigar(read, rseq)
     self.assertEqual(exp, new_cigar)
示例#18
0
 def test_odd_deletion_in_repeat(self):
     """Expect a 3 bp deletion on a soft-clipped read to move 11 bases right (4S13=3D63= -> 4S24=3D52=)."""
     rseq = 'AAAGAAAAAAAAAAAAT' 'ATATATATATA' 'TAAATATACATATTATGTATCAAATATATATTATGTGTAATATACATCATGTATC'
     qseq = 'TTTTAAAAAAAAAAAAT' 'ATATATATATA' 'ATATACATATTATGTATCAAATATATATTATGTGTAATATACATCATGTATC'
     read = MockRead('name',
                     reference_name='1',
                     reference_start=4,
                     cigar=convert_string_to_cigar('4S13=3D63='),
                     query_sequence=qseq)
     exp = convert_string_to_cigar('4S24=3D52=')
     new_cigar = hgvs_standardize_cigar(read, rseq)
     self.assertEqual(exp, new_cigar)
示例#19
0
 def test_shift_complex_indel(self):
     """Recompute mismatches for a 44M18I88M read, then standardize the result.

     Step 1 expects ``recompute_cigar_mismatch`` to turn the M operations
     into =/X operations (44=18I63=1X17=1X6=). Step 2 expects
     ``hgvs_standardize_cigar`` to move the 18 bp insertion one base to the
     right (45=18I62=1X17=1X6=).
     """
     refseq = 'ATATATCTATTTTTTTCTTTCTTTTTTTTACTTTCATTAAGTGCCACTAAAAAATTAGGTTCAATTAAACTTTATTAATCTCTTCTGAGTTTTGATTGAGTATATATATATATATACCCAGTTTCAAGCAGGTATCTGCCTTTAAAGATAAGAGACCTCCTAAATGCTTTCTTTTATTAGTTGCCCTGTTTCAGATTCAGCTTTGTATCTATATCACCTGTTAATATGTGTGGACTCACAGAAATGATCATTGAGGGAATGCACCCTGTTTGGGTGTAAGTAGCTCAGGGAAAAAATCCTAG'
     read = MockRead(
         'name',
         reference_name='18',
         reference_start=40237946 - 40237890,
         query_sequence=
         'AGGTTCAATTAAACTTTATTAATCTCTTCTGAGTTTTGATTGAGTGTATATATATATATATATATATATATATATATACCCAGTTTCAAGCAGGTATCTGCCTTTAAAGATAAGAGACCTCCTAAGTGCTTTCTTTTATTAGTGGCCCTG',
         cigar=convert_string_to_cigar('44M18I88M'),
     )
     # Step 1: split the M operations into matches and mismatches.
     read.cigar = recompute_cigar_mismatch(read, refseq)
     assert read.cigar == convert_string_to_cigar('44=18I63=1X17=1X6=')
     # Step 2: standardize the position of the insertion.
     read.cigar = hgvs_standardize_cigar(read, refseq)
     assert read.cigar == convert_string_to_cigar('45=18I62=1X17=1X6=')
示例#20
0
 def test_even_insertion_in_repeat(self):
     """Expect a 2 bp insertion on a soft-clipped read to move 13 bases right (4S13=2I66= -> 4S26=2I53=)."""
     rseq = ('AAAGAAAAAAAAAAAAT'
             'ATATATATATATA'
             'AATATACATATTATGTATCAAATATATATTATGTGTAATATACATCATGTATC')
     qseq = ('TTTTAAAAAAAAAAAAT'
             'ATATATATATATA'
             'TAAATATACATATTATGTATCAAATATATATTATGTGTAATATACATCATGTATC')
     read = MockRead(
         'name',
         reference_name='1',
         reference_start=4,
         cigar=convert_string_to_cigar('4S13=2I66='),
         query_sequence=qseq,
     )
     exp = convert_string_to_cigar('4S26=2I53=')
     new_cigar = hgvs_standardize_cigar(read, rseq)
     assert new_cigar == exp
示例#21
0
 def test_even_deletion_in_repeat(self):
     """Expect a 2 bp deletion on a soft-clipped read to move 11 bases right (4S13=2D64= -> 4S24=2D53=)."""
     rseq = ('AAAGAAAAAAAAAAAAT'
             'ATATATATATA'
             'TAAATATACATATTATGTATCAAATATATATTATGTGTAATATACATCATGTATC')
     qseq = ('TTTTAAAAAAAAAAAAT'
             'ATATATATATA'
             'AATATACATATTATGTATCAAATATATATTATGTGTAATATACATCATGTATC')
     read = MockRead(
         'name',
         reference_name='1',
         reference_start=4,
         cigar=convert_string_to_cigar('4S13=2D64='),
         query_sequence=qseq,
     )
     exp = convert_string_to_cigar('4S24=2D53=')
     new_cigar = hgvs_standardize_cigar(read, rseq)
     assert new_cigar == exp