def test_nocall():
    """Check the record emitted for a deliberately mismatched pair.

    A deletion contig is paired with an insertion cutout and the mapping is
    built with an explicit score and CIGAR string.  The mapping must leave
    all of its alignment-derived attributes unset and produce exactly one
    record whose VCF line carries the ``InscrutableCigar`` filter.
    """
    # Intentionally mismatched
    qfile = data_file('phony-deletion-01.contig.fa')
    tfile = data_file('phony-insertion-01.gdna.fa')
    # Only the first record of each stream is used, so pull it with next()
    # (as the other tests in this file do) instead of building a full list.
    query = next(kevlar.parse_augmented_fastx(kevlar.open(qfile, 'r')))
    target = next(kevlar.reference.load_refr_cutouts(kevlar.open(tfile, 'r')))

    aln = VariantMapping(query, target, 1e6, '25D5M22I5M46D8M13D2M35I')
    assert aln.offset is None
    assert aln.targetshort is None
    assert aln.match is None
    assert aln.leftflank is None
    assert aln.indel is None
    assert aln.indeltype is None
    assert aln.rightflank is None

    variants = list(aln.call_variants(21))
    assert len(variants) == 1
    assert variants[0].vcf == (
        'yourchr\t801\t.\t.\t.\t.\tInscrutableCigar\t'
        'CIGAR=25D5M22I5M46D8M13D2M35I;KSW2=1000000.0;CONTIG=AACTGGTGGGCTCAAGA'
        'CTAAAAAGACTTTTTTGGTGACAAGCAGGGCGGCCTGCCCTTCCTGTAGTGCAAGAAAAT'
    )
def test_call_near_end(query, target, dist, n, trimcount):
    """Check the number of calls and the trim count for a given ``mindist``.

    ``query`` and ``target`` are data file names; ``dist`` is forwarded as
    ``mindist`` to ``call_variants``; ``n`` is the expected number of calls
    and ``trimcount`` the expected value of the mapping's ``trimmed``
    attribute.  (The arguments are presumably supplied by a parametrize
    decorator outside this block.)
    """
    contig_stream = kevlar.parse_augmented_fastx(
        kevlar.open(data_file(query), 'r')
    )
    contig = next(contig_stream)
    cutout_stream = kevlar.reference.load_refr_cutouts(
        kevlar.open(data_file(target), 'r')
    )
    cutout = next(cutout_stream)

    mapping = VariantMapping(contig, cutout)
    calls = list(mapping.call_variants(31, mindist=dist))
    assert len(calls) == n
    assert mapping.trimmed == trimcount
def test_call_num_interesting_kmers():
    """The single call from the ``iktest`` data reports ``IKMERS`` as '1'."""
    contig_fh = kevlar.open(data_file('iktest.contig.fa'), 'r')
    contig = next(kevlar.parse_augmented_fastx(contig_fh))
    cutout_fh = kevlar.open(data_file('iktest.gdna.fa'), 'r')
    cutout = next(kevlar.reference.load_refr_cutouts(cutout_fh))

    mapping = VariantMapping(contig, cutout)
    calls = list(mapping.call_variants(29))
    assert len(calls) == 1
    assert calls[0].attribute('IKMERS') == '1'
def test_no_margin(query, target, refr, alt):
    """Check that one passing call with the given alleles is produced.

    ``query`` and ``target`` are data file names; ``refr`` and ``alt`` are
    the expected reference and alternate alleles of the single call.  (The
    arguments are presumably supplied by a parametrize decorator outside
    this block.)
    """
    contig_stream = kevlar.parse_augmented_fastx(
        kevlar.open(data_file(query), 'r')
    )
    contig = next(contig_stream)
    cutout_stream = kevlar.reference.load_refr_cutouts(
        kevlar.open(data_file(target), 'r')
    )
    cutout = next(cutout_stream)

    mapping = VariantMapping(contig, cutout)
    calls = list(mapping.call_variants(31))
    assert len(calls) == 1
    assert calls[0].filterstr == 'PASS'
    assert calls[0]._refr == refr
    assert calls[0]._alt == alt
def test_passenger_screen():
    """Of the two calls from the ``wasp`` data, only the first passes.

    The second call is expected to carry the ``PassengerVariant`` filter.
    """
    contig_fh = kevlar.open(data_file('wasp-pass.contig.augfasta'), 'r')
    contig = next(kevlar.parse_augmented_fastx(contig_fh))
    cutout_fh = kevlar.open(data_file('wasp.gdna.fa'), 'r')
    cutout = next(kevlar.reference.load_refr_cutouts(cutout_fh))

    mapping = VariantMapping(contig, cutout)
    calls = list(mapping.call_variants(29))
    assert len(calls) == 2
    assert calls[0].filterstr == 'PASS'
    assert calls[1].filterstr == 'PassengerVariant'
def test_variant_window(prefix, cigar, refrwindow, altwindow):
    """Check the alt and reference windows of the single expected call.

    The contig and cutout are read from ``<prefix>.contig.fa`` and
    ``<prefix>.gdna.fa`` and the mapping is built with the supplied CIGAR
    string.  ``altwindow`` is compared against the call's ``window`` and
    ``refrwindow`` against its ``refrwindow``.  (The arguments are presumably
    supplied by a parametrize decorator outside this block.)
    """
    qfile = data_file(prefix + '.contig.fa')
    tfile = data_file(prefix + '.gdna.fa')
    # Only the first record of each stream is used, so pull it with next()
    # (as the other tests in this file do) instead of building a full list.
    query = next(kevlar.parse_augmented_fastx(kevlar.open(qfile, 'r')))
    target = next(kevlar.reference.load_refr_cutouts(kevlar.open(tfile, 'r')))

    aln = VariantMapping(query, target, 1e6, cigar)
    variants = list(aln.call_variants(21))
    assert len(variants) == 1
    assert variants[0].window == altwindow
    assert variants[0].refrwindow == refrwindow
def test_call_ssc_1bpdel():
    """Test 1bp deletion.

    The ``ssc218`` contig/cutout pair, mapped with an explicit CIGAR string,
    must produce exactly one variant whose string form is
    ``6:23230160:1D``.
    """
    qfile = data_file('ssc218.contig.augfasta')
    tfile = data_file('ssc218.gdna.fa')
    # Only the first record of each stream is used, so pull it with next()
    # (as the other tests in this file do) instead of building a full list.
    query = next(kevlar.parse_augmented_fastx(kevlar.open(qfile, 'r')))
    target = next(kevlar.reference.load_refr_cutouts(kevlar.open(tfile, 'r')))

    aln = VariantMapping(query, target, 1e6, '50D132M1D125M50D')
    variants = list(aln.call_variants(31))
    assert len(variants) == 1
    assert str(variants[0]) == '6:23230160:1D'
def test_drop_numerous_mismatches():
    """The ``drop-polysnp`` data gives one ``NumerousMismatches`` record.

    The record's reference and alternate alleles are both expected to be
    the placeholder ``'.'``.
    """
    contig_fh = kevlar.open(data_file('drop-polysnp-contig.augfasta'), 'r')
    contig = next(kevlar.parse_augmented_fastx(contig_fh))
    cutout_fh = kevlar.open(data_file('drop-polysnp-gdna.fa'), 'r')
    cutout = next(kevlar.reference.load_refr_cutouts(cutout_fh))

    mapping = VariantMapping(contig, cutout)
    calls = list(mapping.call_variants(21))
    # Show every record so a failing run reveals what was actually called.
    for call in calls:
        print(call.vcf)
    assert len(calls) == 1
    assert calls[0].filterstr == 'NumerousMismatches'
    assert calls[0]._refr == '.'
    assert calls[0]._alt == '.'
def prelim_call(targetlist, querylist, match=1, mismatch=2, gapopen=5,
                gapextend=0, ksize=31, refrfile=None, debug=False, mindist=5,
                logstream=sys.stderr):
    """Implement the `kevlar call` procedure as a generator function.

    Each query (longest first) is mapped against every target (ordered by
    defline).  The mappings chosen by `alignments_to_report` are then asked
    for variant calls, which are yielded one at a time.

    Parameters:
        targetlist: iterable of targets; each must expose `defline`.
        querylist: iterable of queries; each must support `len()`.
        match, mismatch, gapopen, gapextend: forwarded to `VariantMapping`.
        ksize, mindist: forwarded to `call_variants`.
        refrfile: if truthy, and more than one mapping is reported for a
            query that has mates, the mates are aligned against it and the
            mappings are ordered by mate distance.
        debug: if true, print each reported mapping to `logstream`.
        logstream: stream for debug output; also forwarded to
            `call_variants`.

    Yields:
        Variant calls, annotated with `MATEDIST` when the mapping has a
        truthy `matedist`.
    """
    # The target order does not depend on the query, so sort once up front
    # rather than once per query.  Materializing the targets here also keeps
    # a one-shot iterable from being exhausted after the first query.
    targets = sorted(targetlist, key=lambda cutout: cutout.defline)

    for query in sorted(querylist, reverse=True, key=len):
        alignments = [
            VariantMapping(
                query, target, match=match, mismatch=mismatch,
                gapopen=gapopen, gapextend=gapextend
            )
            for target in targets
        ]
        aligns2report = alignments_to_report(alignments)

        # Mate positions are only consulted to rank competing mappings.
        if len(aligns2report) > 1:
            if refrfile and len(query.mates) > 0:
                mate_pos = list(align_mates(query, refrfile))
                if mate_pos:
                    for aln in aligns2report:
                        aln.matedist = mate_distance(mate_pos, aln.interval)
                    aligns2report.sort(key=lambda aln: aln.matedist)

        for alignment in aligns2report:
            if debug:
                print('DEBUG ', alignment.cutout.defline, ' vs ',
                      alignment.contig.name, '\n', str(alignment), sep='',
                      end='\n\n', file=logstream)
            for varcall in alignment.call_variants(ksize, mindist, logstream):
                # NOTE(review): a mate distance of 0 is falsy and is therefore
                # not annotated; confirm whether that is intended.
                if alignment.matedist:
                    varcall.annotate('MATEDIST', alignment.matedist)
                yield varcall
def test_call_ssc_two_proximal_snvs():
    """Test two proximal SNVs.

    Currently this serves as a negative control for calling isolated SNVs,
    but distinguishing which (if any) of a set of proximal SNVs is novel will
    be supported soon, and this test will need to be updated.
    """
    qfile = data_file('ssc107.contig.augfasta.gz')
    tfile = data_file('ssc107.gdna.fa.gz')
    # Only the first record of each stream is used, so pull it with next()
    # (as the other tests in this file do) instead of building a full list.
    query = next(kevlar.parse_augmented_fastx(kevlar.open(qfile, 'r')))
    target = next(kevlar.reference.load_refr_cutouts(kevlar.open(tfile, 'r')))

    aln = VariantMapping(query, target, 1e6, '25D263M25D')
    variants = list(aln.call_variants(31))
    assert len(variants) == 2
def test_call_truncated_windows(query, target, vw, rw):
    """Check the variant and reference windows of the single expected call.

    ``query`` and ``target`` are data file names; ``vw`` is compared against
    the call's ``window`` and ``rw`` against its ``refrwindow``.  (The
    arguments are presumably supplied by a parametrize decorator outside
    this block.)
    """
    contig_stream = kevlar.parse_augmented_fastx(
        kevlar.open(data_file(query), 'r')
    )
    contig = next(contig_stream)
    cutout_stream = kevlar.reference.load_refr_cutouts(
        kevlar.open(data_file(target), 'r')
    )
    cutout = next(cutout_stream)

    mapping = VariantMapping(contig, cutout)
    # NOTE(review): the guard is taken to cover exactly the four flank/indel
    # checks below; confirm against the original layout.
    if mapping.vartype == 'snv':
        assert mapping.leftflank is None
        assert mapping.indeltype is None
        assert mapping.indel is None
        assert mapping.rightflank is None

    calls = list(mapping.call_variants(31))
    assert len(calls) == 1
    # Echo both windows to stderr to ease debugging of a mismatch.
    print('VW:', calls[0].window, file=sys.stderr)
    print('RW:', calls[0].refrwindow, file=sys.stderr)
    assert calls[0].window == vw
    assert calls[0].refrwindow == rw
def test_call_indel_snv():
    """The ``indel-snv`` data yields two calls: ``CA -> C`` and ``C -> A``.

    The same number of calls is expected when ``mindist`` is disabled.
    """
    contig_fh = kevlar.open(data_file('indel-snv.contig.augfasta'), 'r')
    contig = next(kevlar.parse_augmented_fastx(contig_fh))
    cutout_fh = kevlar.open(data_file('indel-snv.gdna.fa'), 'r')
    cutout = next(kevlar.reference.load_refr_cutouts(cutout_fh))
    mapping = VariantMapping(contig, cutout)

    calls = list(mapping.call_variants(31))
    assert len(calls) == 2

    assert calls[0]._refr == 'CA'
    assert calls[0]._alt == 'C'
    assert calls[0]._pos == 501 - 1

    assert calls[1]._refr == 'C'
    assert calls[1]._alt == 'A'
    assert calls[1]._pos == 474 - 1

    # Second pass with the minimum-distance setting turned off.
    calls = list(mapping.call_variants(31, mindist=None))
    assert len(calls) == 2
def test_variant_mapping():
    """A mapping exposes the sequence ID and interval of its cutout.

    The cutout defline ``chr1_10000-10060`` must be reported as sequence ID
    ``chr1`` and interval ``('chr1', 10000, 10060)``.
    """
    query = screed.Record(
        name='contig1',
        sequence='CCTGAGCCCTCTCAAGTCGGGTCCTGGCCCGGTCTGCCCATGAGGCTGGGCCTGAGCCCC',
    )
    target = kevlar.reference.ReferenceCutout(
        defline='chr1_10000-10060',
        sequence='CCTGAGCCCTCTCAAGTCGGGTCCTGGCCCAGTCTGCCCATGAGGCTGGGCCTGAGCCCC',
    )
    varmap = VariantMapping(query, target, score=1e6, cigar='60M')
    assert varmap.seqid == 'chr1'
    assert varmap.interval == ('chr1', 10000, 10060)
def test_varmap_str():
    """The string form of a mapping matches the stored alignment text.

    The expected text is read from ``wasp-align.txt`` with surrounding
    whitespace stripped.
    """
    contig = next(
        kevlar.parse_augmented_fastx(
            kevlar.open(data_file('wasp-pass.contig.augfasta'), 'r')))
    cutout = next(
        kevlar.reference.load_refr_cutouts(
            kevlar.open(data_file('wasp.gdna.fa'), 'r')))
    aln = VariantMapping(contig, cutout)

    # The expected text is read in full here, so close the handle right away
    # instead of leaving it open until garbage collection.
    with kevlar.open(data_file('wasp-align.txt'), 'r') as fh:
        alignstr = fh.read().strip()

    # Echo both renderings to stderr to ease debugging of a mismatch.
    print(str(aln), file=sys.stderr)
    print(alignstr, file=sys.stderr)
    assert str(aln) == alignstr
def prelim_call(targetlist, querylist, partid=None, match=1, mismatch=2,
                gapopen=5, gapextend=0, ksize=31, refrfile=None, debug=False,
                mindist=5, homopolyfilt=True, maxtargetlen=10000):
    """Implement the `kevlar call` procedure as a generator function.

    Each query (longest first) is mapped against every target (ordered by
    defline).  The mappings chosen by `alignments_to_report` are then asked
    for variant calls, which are yielded one at a time.

    Parameters:
        targetlist: iterable of targets; each must expose `defline` and
            support `len()` when `maxtargetlen` is truthy.
        querylist: iterable of queries; each must support `len()`.
        partid: if not None, every call is annotated with `PART=partid`.
        match, mismatch, gapopen, gapextend, homopolyfilt: forwarded to
            `VariantMapping`.
        ksize, mindist: forwarded to `call_variants`.
        refrfile: accepted but not used in this function.
        debug: if true, log each reported mapping with `kevlar.plog`.
        maxtargetlen: targets longer than this are mapped with
            `nocall=True`; a falsy value disables the check.

    Yields:
        Variant calls from each reported mapping.
    """
    # The target order does not depend on the query, so sort once up front
    # rather than once per query.  Materializing the targets here also keeps
    # a one-shot iterable from being exhausted after the first query.
    targets = sorted(targetlist, key=lambda cutout: cutout.defline)

    for query in sorted(querylist, reverse=True, key=len):
        alignments = list()
        for target in targets:
            nocall = bool(maxtargetlen and len(target) > maxtargetlen)
            mapping = VariantMapping(
                query, target, match=match, mismatch=mismatch,
                gapopen=gapopen, gapextend=gapextend,
                homopolyfilt=homopolyfilt, nocall=nocall,
            )
            alignments.append(mapping)
        aligns2report = alignments_to_report(alignments)

        for alignment in aligns2report:
            if debug:
                kevlar.plog(
                    'DEBUG ', alignment.cutout.defline, ' vs ',
                    alignment.contig.name, '\n', str(alignment), sep='',
                    end='\n\n',
                )
            for varcall in alignment.call_variants(ksize, mindist):
                if partid is not None:
                    varcall.annotate('PART', partid)
                yield varcall