def test_shift_overaligned(self):
    """Standardize a read whose deletion is aligned past the first exon.

    The read is aligned as ``14=7D12=`` against a pre-transcript with exons
    (1, 12) and (20, 28); ``standardize_read`` is expected to return a read
    whose CIGAR is ``12=7N14=``.
    """
    # qwertyuiopas---kkkkk------dfghjklzxcvbnm
    parent_gene = Gene('1', 1, 1000, strand='+')
    pre_transcript = PreTranscript(
        exons=[(1, 12), (20, 28)], gene=parent_gene, strand='+'
    )
    # register one spliced transcript per splicing pattern
    for pattern in pre_transcript.generate_splicing_patterns():
        pre_transcript.transcripts.append(Transcript(pre_transcript, pattern))
    parent_gene.transcripts.append(pre_transcript)

    aligned = SamRead(
        reference_name='1',
        reference_start=0,
        cigar=_cigar.convert_string_to_cigar('14=7D12='),
        query_sequence='qwertyuiopasdfghjklzxcvbnm',
    )

    genome = {'1': MockObject(seq='qwertyuiopasdfkkkkkdfghjklzxcvbnm')}
    cache = MockObject(get_read_reference_name=lambda r: r.reference_name)
    evidence = TranscriptomeEvidence(
        annotations={},
        reference_genome=genome,
        bam_cache=cache,
        break1=Breakpoint('1', 1, orient='L', strand='+'),
        break2=Breakpoint('1', 10, orient='R', strand='+'),
        read_length=75,
        stdev_fragment_size=75,
        median_fragment_size=220,
    )
    evidence.overlapping_transcripts.add(pre_transcript)

    standardized = evidence.standardize_read(aligned)
    assert standardized.cigar == _cigar.convert_string_to_cigar('12=7N14=')
def test_deletion_partial_repeat(self):
    """A 6-base deletion given as ``13=6D15=`` should standardize to ``15=6D13=``."""
    flank_left = 'ATCTTAGCCAGGT'
    flank_right = 'AGTTACATACATATC'
    query = flank_left + flank_right
    # the reference carries six extra bases between the two flanks
    reference = flank_left + 'AGCTAT' + flank_right

    aln = MockRead(
        'name',
        reference_name='mock',
        reference_start=0,
        query_sequence=query,
        cigar=convert_string_to_cigar('13=6D15='),
    )

    expected = convert_string_to_cigar('15=6D13=')
    self.assertEqual(expected, hgvs_standardize_cigar(aln, reference))
def test_unecessary_indel_end_match2(self):
    """An indel whose last base matches the reference should be trimmed.

    The input CIGAR ``20=5I33D19=`` is expected to standardize to
    ``20=4I32D20=``: one base moves out of the insertion and the deletion
    and into the trailing match.
    """
    rseq = (
        'GGGTGCAGTGGCTTACACCT'
        'GTAATCCAAACACCTTGGGAGCCGCCCCCTGAG'
        'CCTCCAGGCCCGGGACAGA'
    )
    qseq = (
        'GGGTGCAGTGGCTTACACCT'
        'CCAGG'
        'CCTCCAGGCCCGGGACAGA'
    )
    read = MockRead(
        'name',
        reference_name='1',
        reference_start=0,
        cigar=convert_string_to_cigar('20=5I33D19='),
        query_sequence=qseq,
    )
    # hgvs_standardize_cigar takes the reference sequence directly, so no
    # reference-genome mapping is needed here.
    exp = convert_string_to_cigar('20=4I32D20=')
    new_cigar = hgvs_standardize_cigar(read, rseq)
    self.assertEqual(exp, new_cigar)
def test_unecessary_indel_end_match(self):
    """An indel whose trailing bases match the reference should be trimmed.

    The input CIGAR ``14=5I2D10=`` is expected to standardize to
    ``14=3I12=``.
    """
    rseq = 'qwertyuiopasdfghjklzxcvbnm'
    qseq = 'qwertyuiopasdfkmkghjklzxcvbnm'
    read = MockRead(
        'name',
        reference_name='1',
        reference_start=0,
        cigar=convert_string_to_cigar('14=5I2D10='),
        query_sequence=qseq,
    )
    # hgvs_standardize_cigar takes the reference sequence directly, so no
    # reference-genome mapping is needed here.
    exp = convert_string_to_cigar('14=3I12=')
    new_cigar = hgvs_standardize_cigar(read, rseq)
    self.assertEqual(exp, new_cigar)
def test_unecessary_indel_end_match(self):
    """``14=5I2D10=`` is expected to standardize to ``14=3I12=``."""
    reference = 'qwertyuiopasdfghjklzxcvbnm'
    query = 'qwertyuiopasdfkmkghjklzxcvbnm'

    aln = MockRead(
        'name',
        reference_name='1',
        reference_start=0,
        cigar=convert_string_to_cigar('14=5I2D10='),
        query_sequence=query,
    )

    expected = convert_string_to_cigar('14=3I12=')
    observed = hgvs_standardize_cigar(aln, reference)
    assert observed == expected
def test_even_insertion_in_repeat(self):
    """A 2-base insertion should be shifted along the reference.

    The input CIGAR ``4S13=2I66=`` is expected to standardize to
    ``4S26=2I53=``.
    """
    rseq = (
        'AAAGAAAAAAAAAAAAT'
        'ATATATATATATA'
        'AATATACATATTATGTATCAAATATATATTATGTGTAATATACATCATGTATC'
    )
    qseq = (
        'TTTTAAAAAAAAAAAAT'
        'ATATATATATATA'
        'TAAATATACATATTATGTATCAAATATATATTATGTGTAATATACATCATGTATC'
    )
    print(len(qseq) - 13 - 4)
    read = MockRead(
        'name',
        reference_name='1',
        reference_start=4,
        cigar=convert_string_to_cigar('4S13=2I66='),
        query_sequence=qseq,
    )
    # hgvs_standardize_cigar takes the reference sequence directly, so no
    # reference-genome mapping is needed here.
    exp = convert_string_to_cigar('4S26=2I53=')
    new_cigar = hgvs_standardize_cigar(read, rseq)
    read.cigar = new_cigar
    self.assertEqual(exp, new_cigar)
def test_indel_repeat(self):
    """The combined indel ``13=1I6D15=`` is expected to come back unchanged."""
    query = 'ATCTTAGCCAGGT' 'C' 'AGTTACATACATATC'
    reference = 'ATCTTAGCCAGGT' 'AGCTAT' 'AGTTACATACATATC'
    print(query)
    print(reference)

    aln = MockRead(
        'name',
        reference_name='mock',
        reference_start=0,
        query_sequence=query,
        cigar=convert_string_to_cigar('13=1I6D15='),
    )

    expected = convert_string_to_cigar('13=1I6D15=')
    self.assertEqual(expected, hgvs_standardize_cigar(aln, reference))
def test_odd_deletion_in_repeat(self):
    """A 3-base deletion ``4S13=3D63=`` is expected to become ``4S24=3D52=``."""
    reference = (
        'AAAGAAAAAAAAAAAAT'
        'ATATATATATA'
        'TAAATATACATATTATGTATCAAATATATATTATGTGTAATATACATCATGTATC'
    )
    query = (
        'TTTTAAAAAAAAAAAAT'
        'ATATATATATA'
        'ATATACATATTATGTATCAAATATATATTATGTGTAATATACATCATGTATC'
    )
    print(len(query) - 28)

    aln = MockRead(
        'name',
        reference_name='1',
        reference_start=4,
        cigar=convert_string_to_cigar('4S13=3D63='),
        query_sequence=query,
    )
    genome = {'1': MockObject(seq=reference)}
    expected = convert_string_to_cigar('4S24=3D52=')

    standardized = hgvs_standardize_cigar(aln, reference)
    # show the deleted sequence before and after applying the new cigar
    print(SamRead.deletion_sequences(aln, genome))
    aln.cigar = standardized
    print(SamRead.deletion_sequences(aln, genome))

    self.assertEqual(expected, standardized)
def test_insertions(self):
    """``insertion_sequences`` should return the bases covered by each ``I`` event."""
    expected = ['kkk', 'kkkk']
    aln = MockRead(
        reference_start=0,
        reference_name='1',
        query_sequence='abcdekkkfghijklmnopqkkkkrstuvwxyz',
        cigar=convert_string_to_cigar('5=3I12=4I9='),
    )
    observed = SamRead.insertion_sequences(aln)
    self.assertEqual(expected, observed)
def test_break_left_deletion(self):
    """Call ``breakpoint_contig_remapped_depth`` for a left-oriented breakpoint.

    No assertion is made on the result; the test only exercises the call with
    a read carrying a deletion and an insertion (``35M10D5I20M``).
    """
    breakpoint = Breakpoint('10', 1030, 1030, orient=ORIENT.LEFT)
    aln = MockRead(
        cigar=_cigar.convert_string_to_cigar('35M10D5I20M'),
        reference_start=999,
        reference_name='10',
    )
    align.SplitAlignment.breakpoint_contig_remapped_depth(breakpoint, self.contig, aln)
def test_deletions(self):
    """``deletion_sequences`` should return the reference bases of each ``D`` event."""
    expected = ['cde', 'nopq']
    aln = MockRead(
        reference_start=0,
        reference_name='1',
        query_sequence='',
        cigar=convert_string_to_cigar('2=3D8=4D9='),
    )
    observed = SamRead.deletion_sequences(aln, self.reference_genome)
    self.assertEqual(expected, observed)
def test_shift_complex_indel(self):
    """Recompute mismatches, then standardize an 18-base insertion.

    Starting from ``44M18I88M``, ``recompute_cigar_mismatch`` is expected to
    give ``44=18I63=1X17=1X6=`` and ``hgvs_standardize_cigar`` is then
    expected to give ``45=18I62=1X17=1X6=``.
    """
    reference = 'ATATATCTATTTTTTTCTTTCTTTTTTTTACTTTCATTAAGTGCCACTAAAAAATTAGGTTCAATTAAACTTTATTAATCTCTTCTGAGTTTTGATTGAGTATATATATATATATACCCAGTTTCAAGCAGGTATCTGCCTTTAAAGATAAGAGACCTCCTAAATGCTTTCTTTTATTAGTTGCCCTGTTTCAGATTCAGCTTTGTATCTATATCACCTGTTAATATGTGTGGACTCACAGAAATGATCATTGAGGGAATGCACCCTGTTTGGGTGTAAGTAGCTCAGGGAAAAAATCCTAG'
    aln = MockRead(
        'name',
        reference_name='18',
        reference_start=40237946 - 40237890,
        query_sequence='AGGTTCAATTAAACTTTATTAATCTCTTCTGAGTTTTGATTGAGTGTATATATATATATATATATATATATATATATACCCAGTTTCAAGCAGGTATCTGCCTTTAAAGATAAGAGACCTCCTAAGTGCTTTCTTTTATTAGTGGCCCTG',
        cigar=convert_string_to_cigar('44M18I88M'),
    )
    print(_read.convert_cigar_to_string(aln.cigar))

    # step 1: split M into =/X against the reference
    aln.cigar = recompute_cigar_mismatch(aln, reference)
    assert aln.cigar == convert_string_to_cigar('44=18I63=1X17=1X6=')
    print(_read.convert_cigar_to_string(aln.cigar))

    # step 2: standardize the position of the insertion
    aln.cigar = hgvs_standardize_cigar(aln, reference)
    print(_read.convert_cigar_to_string(aln.cigar))
    assert aln.cigar == convert_string_to_cigar('45=18I62=1X17=1X6=')
def test_bwa_mem(self):
    """Standardize a read with a 12-base deletion.

    The input CIGAR ``183=12D19=`` is expected to become ``186=12D16=`` while
    the reference start stays the same.
    """
    # input read: SamRead(1:224646710-224646924, 183=12D19=, TCAGCTCTCT...)
    # > BPP(Breakpoint(1:224646893L-), Breakpoint(1:224646906R-), opposing=False, seq='')
    aln = SamRead(reference_name='1')
    aln.query_sequence = 'TCAGCTCTCTTAGGGCACACCCTCCAAGGTGCCTAAATGCCATCCCAGGATTGGTTCCAGTGTCTATTATCTGTTTGACTCCAAATGGCCAAACACCTGACTTCCTCTCTGGTAGCCTGGCTTTTATCTTCTAGGACATCCAGGGCCCCTCTCTTTGCCTTCCCCTCTTTCTTCCTTCTACTGCTTCAGCAGACATCATGTG'
    aln.reference_start = 224646710
    aln.reference_id = 0
    print(_cigar.convert_string_to_cigar('183=12D19='))
    aln.cigar = _cigar.join(_cigar.convert_string_to_cigar('183=12D19='))
    aln.query_name = 'name'
    aln.mapping_quality = NA_MAPPING_QUALITY

    standardized = Evidence.standardize_read(self.mock_evidence, aln)
    print(SamRead.__repr__(aln))
    print(SamRead.__repr__(standardized))

    expected_cigar = _cigar.convert_string_to_cigar('186=12D16=')
    self.assertEqual(expected_cigar, standardized.cigar)
    self.assertEqual(aln.reference_start, standardized.reference_start)
def test_small_exact_match(self):
    """Short 5M anchors between two deletions should be merged away.

    Each ``17506D5M21275D`` run is expected to become a 5-base insertion
    followed by one deletion spanning both deletions plus the 5 matched bases.
    """
    # parsed input: [(0, 283), (2, 17506), (0, 5), (2, 21275), (0, 596),
    #                (2, 17506), (0, 5), (2, 21275), (0, 313)]
    original = convert_string_to_cigar('283M17506D5M21275D596M17506D5M21275D313M')
    merged = merge_internal_events(original, 20, 15)

    expected = [
        (CIGAR.M, 283),
        (CIGAR.I, 5),
        (CIGAR.D, 17506 + 21275 + 5),
        (CIGAR.M, 596),
        (CIGAR.I, 5),
        (CIGAR.D, 17506 + 21275 + 5),
        (CIGAR.M, 313),
    ]
    self.assertEqual(expected, merged)
def test_shift_no_transcripts(self):
    """With no transcripts registered, the CIGAR is expected to come back unchanged."""
    aln = SamRead(
        reference_name='1',
        reference_start=0,
        cigar=_cigar.convert_string_to_cigar('14=7D18='),
        query_sequence='qwertyuiopasdfdfghjklzxcvbnm',
    )
    genome = {'1': MockObject(seq='qwertyuiopasdfkkkkkdfghjklzxcvbnm')}
    evidence = TranscriptomeEvidence(
        annotations={},
        reference_genome=genome,
        bam_cache=None,
        break1=Breakpoint('1', 1, orient='L', strand='+'),
        break2=Breakpoint('1', 10, orient='R', strand='+'),
        read_length=75,
        stdev_fragment_size=75,
        median_fragment_size=220,
    )

    shifted = evidence.exon_boundary_shift_cigar(aln)
    self.assertEqual(_cigar.convert_string_to_cigar('14=7D18='), shifted)
def test_read_with_exons(self):
    """``call_read_events`` is expected to report six events for this contig."""
    sequence = 'CTTGAAGGAAACTGAATTCAAAAAGATCAAAGTGCTGGGCTCCGGTGCGTTCGGCACGGTGTATAAGGGACTCTGGATCCCAGAAGGTGAGAAAGTTAAAATTCCCGTCGCTATCAAGACATCTCCGAAAGCCAACAAGGAAATCCTCGATGAAGCCTACGTGATGGCCAGCGTGGACAACCCCCACGTGTGCCGCCTGCTGGGCATCTGCCTCACCTCCACCGTGCAGCTCATCATGCAGCTCATGCCCTTCGGCTGCCTCCTGGACTATGTCCGGGAACACAAAGACAATATTGGCTCCCAGTACCTGCTCAACTGGTGTGTGCAGATCGCAAAGGGCATGAACTACTTGGAGGACCGTCGCTTGGTGCACCGCGACCTGGCAGCCAGGAACGTACTGGTGAAAACACCGCAGCATGTCAAGATCACAGATTTTGGGCTGGCCAAACTGCTGGGTGCGGAAGAGAAAGAATACCATGCAGAAGGAGGCAAAGTGCCTATCAAGTGGATGGCATTGGAATCAATTTTACACAGAATCTATACCCACCAGAGTGATGTCTGGAGCTACGGGGTGACCGTTTGGGAGTTGATGACCTTTGGATCCAA'
    events_cigar = _cigar.convert_string_to_cigar(
        '68M678D50M15D34M6472D185M10240D158M891D74M8I5883D29M'
    )
    aln = MockRead(
        query_sequence=sequence,
        cigar=events_cigar,
        reference_name='7',
        reference_id=6,
        reference_start=55241669,
    )
    called = align.call_read_events(aln)
    self.assertEqual(6, len(called))
def test_deletions(self):
    """``deletion_sequences`` should return the reference bases of each ``D`` event."""
    expected = ['cde', 'nopq']
    aln = MockRead(
        reference_start=0,
        reference_name='1',
        query_sequence='',
        cigar=convert_string_to_cigar('2=3D8=4D9='),
    )
    genome = {'1': MockObject(seq='abcdefghijklmnopqrstuvwxyz')}
    observed = SamRead.deletion_sequences(aln, genome)
    assert observed == expected
def test_even_insertion_in_repeat(self):
    """A 2-base insertion ``4S13=2I66=`` is expected to become ``4S26=2I53=``."""
    reference = (
        'AAAGAAAAAAAAAAAAT'
        'ATATATATATATA'
        'AATATACATATTATGTATCAAATATATATTATGTGTAATATACATCATGTATC'
    )
    query = (
        'TTTTAAAAAAAAAAAAT'
        'ATATATATATATA'
        'TAAATATACATATTATGTATCAAATATATATTATGTGTAATATACATCATGTATC'
    )
    print(len(query) - 13 - 4)

    aln = MockRead(
        'name',
        reference_name='1',
        reference_start=4,
        cigar=convert_string_to_cigar('4S13=2I66='),
        query_sequence=query,
    )
    expected = convert_string_to_cigar('4S26=2I53=')

    standardized = hgvs_standardize_cigar(aln, reference)
    aln.cigar = standardized
    assert standardized == expected
def test_even_deletion_in_repeat(self):
    """A 2-base deletion ``4S13=2D64=`` is expected to become ``4S24=2D53=``."""
    reference = (
        'AAAGAAAAAAAAAAAAT'
        'ATATATATATA'
        'TAAATATACATATTATGTATCAAATATATATTATGTGTAATATACATCATGTATC'
    )
    query = (
        'TTTTAAAAAAAAAAAAT'
        'ATATATATATA'
        'AATATACATATTATGTATCAAATATATATTATGTGTAATATACATCATGTATC'
    )
    print(len(query) - 28)

    aln = MockRead(
        'name',
        reference_name='1',
        reference_start=4,
        cigar=convert_string_to_cigar('4S13=2D64='),
        query_sequence=query,
    )
    genome = {'1': MockObject(seq=reference)}
    expected = convert_string_to_cigar('4S24=2D53=')

    standardized = hgvs_standardize_cigar(aln, reference)
    # show the deleted sequence before and after applying the new cigar
    print(SamRead.deletion_sequences(aln, genome))
    aln.cigar = standardized
    print(SamRead.deletion_sequences(aln, genome))

    assert standardized == expected
def test_break_left_deletion(self):
    """Call ``breakpoint_contig_remapped_depth`` for a left-oriented breakpoint.

    A 60-character contig receives three mapped sequences; no assertion is
    made on the result, the test only exercises the call.
    """
    contig = Contig(' ' * 60, None)
    for start, end in ((0, 10), (0, 20), (50, 60)):
        contig.add_mapped_sequence(MockObject(reference_start=start, reference_end=end))

    breakpoint = Breakpoint('10', 1030, 1030, orient=ORIENT.LEFT)
    aln = MockRead(
        cigar=_cigar.convert_string_to_cigar('35M10D5I20M'),
        reference_start=999,
        reference_name='10',
    )
    align.SplitAlignment.breakpoint_contig_remapped_depth(breakpoint, contig, aln)
def test_bwa_mem(self):
    """Standardize a read with a 12-base deletion against a mocked evidence object.

    The input CIGAR ``183=12D19=`` is expected to become ``186=12D16=`` while
    the reference start stays the same.
    """
    # reference window for chromosome 1, starting at offset 224646450
    window = MockLongString(
        'TGGGTATCAGACACACTGGGTAGCTGAGTGCTCAGAGGAAGATGCGAGGTATTCAGGGAAAGTGTCAGTGGGGTCTCCCAGTGCCTGTTTGGTCCACAGTTAGGAGA'
        'GGCCCTGCTTGCACTTCTAATACAGTCCCGGAAAGACGGGGCCAGAACTTAGGAGGGGAGCGCTTTGCAGCAACTTTTCAAGAAAAGGGGAAAATTTAAGCACCATA'
        'CTGTTATGTGGTCCTTGTACCCAGAGGCCCTGTTCAGCTCCAGTGATCAGCTCTCTTAGGGCACACCCTCCAAGGTGCCTAAATGCCATCCCAGGATTGGTTCCAGT'
        'GTCTATTATCTGTTTGACTCCAAATGGCCAAACACCTGACTTCCTCTCTGGTAGCCTGGCTTTTATCTTCTAGGACATCCAGGGCCCCTCTCTTTGCCTTCCCCTCT'
        'TTCTTCCTTCTACTGCTTAGATCAAGTCTTCAGCAGACATCATGTGACCTTGAGGATGGATGTCACATGCTGGAGGAAACAGAAGGCCGAAACCCTGATGACTTCAC'
        'AGAGCTGCCAAAACAGTTCCTGACTGTTTATTCCGGGTCTTTAACAAAGTGATGAAAAGAAATCCTTGCAGTATGAAAACAACTTTTCTATTCCATGGAGCCAAACC'
        'TCATTATAACAGATAACGTGACCCTCAGCGATATCCCAAGTATTTTCCTGTTCTCATCTATACTATGGCAAAGGGGCAAATACCTCTCAGTAAAGAAAGAAATAACA'
        'ACTTCTATCTTGGGCGAGGCATTTCTTCTGTTAGAACTTTGTACACGGAATAAAATAGATCTGTTTGTGCTTATCTTTCTCCTTAGAATTATTGAATTTGAAGTCTT'
        'TCCCAGGGTGGGGGTGGAGTGAAGCTGGGGTTTCATAAGCACATAGATAGTAGTG',
        offset=224646450,
    )
    genome = {'1': MockObject(seq=window)}
    cache = MockObject(get_read_reference_name=lambda x: x.reference_name)
    settings = {
        'validate.contig_aln_merge_inner_anchor': 10,
        'validate.contig_aln_merge_outer_anchor': 20,
        **DEFAULTS,
    }
    mock_evidence = MockObject(
        reference_genome=genome,
        bam_cache=cache,
        config=settings,
    )

    # input read: SamRead(1:224646710-224646924, 183=12D19=, TCAGCTCTCT...)
    # > BPP(Breakpoint(1:224646893L-), Breakpoint(1:224646906R-), opposing=False, seq='')
    aln = SamRead(reference_name='1')
    aln.query_sequence = 'TCAGCTCTCTTAGGGCACACCCTCCAAGGTGCCTAAATGCCATCCCAGGATTGGTTCCAGTGTCTATTATCTGTTTGACTCCAAATGGCCAAACACCTGACTTCCTCTCTGGTAGCCTGGCTTTTATCTTCTAGGACATCCAGGGCCCCTCTCTTTGCCTTCCCCTCTTTCTTCCTTCTACTGCTTCAGCAGACATCATGTG'
    aln.reference_start = 224646710
    aln.reference_id = 0
    print(_cigar.convert_string_to_cigar('183=12D19='))
    aln.cigar = _cigar.join(_cigar.convert_string_to_cigar('183=12D19='))
    aln.query_name = 'name'
    aln.mapping_quality = NA_MAPPING_QUALITY

    standardized = Evidence.standardize_read(mock_evidence, aln)
    assert standardized.cigar == _cigar.convert_string_to_cigar('186=12D16=')
    assert standardized.reference_start == aln.reference_start
def test(self):
    """``convert_string_to_cigar`` should parse each length/operation pair in order."""
    cigar_string = (
        '283M'
        '17506D'
        '5M'
        '21275D'
        '596M'
        '17506D'
        '5M'
        '21275D'
        '313M'
    )
    expected = [
        (CIGAR.M, 283),
        (CIGAR.D, 17506),
        (CIGAR.M, 5),
        (CIGAR.D, 21275),
        (CIGAR.M, 596),
        (CIGAR.D, 17506),
        (CIGAR.M, 5),
        (CIGAR.D, 21275),
        (CIGAR.M, 313),
    ]
    parsed = convert_string_to_cigar(cigar_string)
    self.assertEqual(expected, parsed)
def test_hardclipping(self):
    """A hard-clipped reverse read with a 1-base deletion should yield one event.

    The event's breakpoints are expected at 15:71491956 (L, -) and
    15:71491958 (R, -).
    """
    aln = SamRead(reference_name='15')
    aln.reference_start = 71491944
    aln.cigar = _cigar.convert_string_to_cigar('12=1D25=113H')
    aln.query_sequence = 'GTGTGTGGTGTGGGGTGTGTGGTGTGTGTGGTGTGTG'
    aln.is_reverse = True

    expected = BreakpointPair(
        Breakpoint('15', 71491956, orient='L', strand='-'),
        Breakpoint('15', 71491958, orient='R', strand='-'),
        untemplated_seq='',
    )

    called = align.call_read_events(aln, is_stranded=True)
    self.assertEqual(1, len(called))
    self.assertEqual(expected.break1, called[0].break1)
    self.assertEqual(expected.break2, called[0].break2)
def setUp(self):
    """Create the shared contig read (``275M18I12041D278M`` on chromosome 10)."""
    contig_cigar = _cigar.convert_string_to_cigar('275M18I12041D278M')
    self.contig_read = MockRead(
        cigar=contig_cigar,
        reference_start=89700025,
        reference_name='10',
    )
def test_softclipped_right(self):
    """The mismatch-heavy tail should be folded into the trailing soft clip.

    ``extend_softclipping`` with a threshold of 6 is expected to return
    ``70=80S`` and a prefix of 0.
    """
    original = convert_string_to_cigar(
        '70=2X1=8X4=1X1=4X1=6X1=4X1=4X2=5X3=3X1=4X1=3X1=14X1=1X2=1S'
    )
    extended, shift = extend_softclipping(original, 6)
    assert shift == 0
    assert extended == convert_string_to_cigar('70=80S')
def test_softclipped_right(self):
    """The mismatch-heavy tail should be folded into the trailing soft clip.

    ``extend_softclipping`` with a threshold of 6 is expected to return
    ``70=80S`` and a prefix of 0.
    """
    original = convert_string_to_cigar(
        '70=2X1=8X4=1X1=4X1=6X1=4X1=4X2=5X3=3X1=4X1=3X1=14X1=1X2=1S'
    )
    extended, shift = extend_softclipping(original, 6)
    self.assertEqual(0, shift)
    self.assertEqual(convert_string_to_cigar('70=80S'), extended)
def contig_read():
    """Build the mock contig read (``275M18I12041D278M`` on chromosome 10)."""
    contig_cigar = _cigar.convert_string_to_cigar('275M18I12041D278M')
    mock = MockRead(
        cigar=contig_cigar,
        reference_start=89700025,
        reference_name='10',
    )
    return mock
def test_mismatch_only(self):
    """A CIGAR containing only matches, mismatches and a soft clip is expected to be left unchanged."""
    original = _cigar.convert_string_to_cigar('39=1X16=1X71=22S')
    merged = _cigar.merge_internal_events(original, 20, 15)
    assert merged == original