def test_call_homopolymer_filter_disabled():
    """Check that no call is labeled 'Homopolymer' when the filter is off.

    Variants are called partition by partition with ``homopolyfilt=False``,
    scored against the proband/mother/father/reference sketches, and all six
    resulting calls must be free of the homopolymer filter label.
    """
    queryfile = data_file('homopolymer/12175-3parts.contigs.augfasta')
    querystream = kevlar.parse_augmented_fastx(kevlar.open(queryfile, 'r'))
    contigs = kevlar.call.load_contigs(
        kevlar.parse_partitioned_reads(querystream)
    )

    targetfile = data_file('homopolymer/12175-3parts.targets.fasta')
    cutouts = kevlar.reference.load_refr_cutouts(
        kevlar.open(targetfile, 'r')
    )
    targets = kevlar.call.load_contigs(
        kevlar.parse_partitioned_reads(cutouts)
    )

    prelimcalls = []
    for partid in contigs:
        contiglist = contigs[partid]
        gdnalist = targets[partid]
        prelimcalls.extend(
            kevlar.call.call(
                gdnalist, contiglist, partid=partid, homopolyfilt=False
            )
        )

    kid = kevlar.sketch.load(data_file('homopolymer/12175-kid.sct'))
    mom = kevlar.sketch.load(data_file('homopolymer/12175-mom.sct'))
    dad = kevlar.sketch.load(data_file('homopolymer/12175-dad.sct'))
    refr = kevlar.sketch.load(data_file('homopolymer/12175-refr.sct'))
    calls = list(
        kevlar.simlike.simlike(
            prelimcalls, kid, [mom, dad], refr,
            samplelabels=['Proband', 'Mother', 'Father'],
        )
    )

    assert len(calls) == 6
    assert all('Homopolymer' not in c.filterstr for c in calls)
def test_call_max_target_length(contigs, gdnas, maxtargetlen, numpassing):
    """Check how many calls survive a given maximum target length.

    Calls whose sequence ID is '.' are treated as no-calls: they must also
    have '.' as their position and carry exactly the CONTIG, IKMERS, and PART
    info keys. All other calls are counted against ``numpassing``.
    """
    querystream = kevlar.parse_augmented_fastx(
        kevlar.open(data_file(contigs), 'r')
    )
    contigs_by_part = kevlar.call.load_contigs(
        kevlar.parse_partitioned_reads(querystream)
    )

    cutouts = kevlar.reference.load_refr_cutouts(
        kevlar.open(data_file(gdnas), 'r')
    )
    targets_by_part = kevlar.call.load_contigs(
        kevlar.parse_partitioned_reads(cutouts)
    )

    calls = []
    for partid in contigs_by_part:
        contiglist = contigs_by_part[partid]
        gdnalist = targets_by_part[partid]
        calls.extend(
            kevlar.call.call(
                gdnalist, contiglist, partid=partid,
                maxtargetlen=maxtargetlen,
            )
        )

    # Separate the placeholder no-calls from the real calls
    nocalls, passcalls = [], []
    for c in calls:
        bucket = nocalls if c.seqid == '.' else passcalls
        bucket.append(c)

    assert len(passcalls) == numpassing
    for c in nocalls:
        assert c.seqid == c.position == '.'
        assert sorted(c.info.keys()) == ['CONTIG', 'IKMERS', 'PART']
def test_split_cli():
    """Run ``kevlar split`` via the CLI and check each output file.

    The five-partition input is split across three output files; each file
    is read back and the number of reads per partition is compared to the
    expected sizes. The temporary directory is removed even when the split
    or an assertion fails.
    """
    infile = data_file('fiveparts.augfastq.gz')
    tempdir = mkdtemp()
    try:
        arglist = ['split', infile, '3', tempdir + '/out']
        args = kevlar.cli.parser().parse_args(arglist)
        kevlar.split.main(args)

        # Expected reads per partition, for output files 0, 1, and 2
        expected_sizes = [[67, 12], [23, 11], [15]]
        for index, sizes in enumerate(expected_sizes):
            outfile = tempdir + '/out.{:d}.augfastx.gz'.format(index)
            readstream = kevlar.parse_augmented_fastx(
                kevlar.open(outfile, 'r')
            )
            partstream = kevlar.parse_partitioned_reads(readstream)
            partitions = [part for partid, part in partstream]
            assert len(partitions) == len(sizes)
            assert [len(part) for part in partitions] == sizes
    finally:
        # Always clean up, otherwise a failing assertion leaks the directory
        rmtree(tempdir)
def test_partition_reader_simple():
    """Read a simple partitioned file: two partitions of 4 and 2 reads."""
    infile = kevlar.tests.data_file('part-reads-simple.fa')
    filehandle = kevlar.open(infile, 'r')
    readstream = kevlar.parse_augmented_fastx(filehandle)
    partitions = list(kevlar.parse_partitioned_reads(readstream))
    assert len(partitions) == 2
    assert [len(part) for part in partitions] == [4, 2]
def test_alac_bigpart():
    """Run alac with ``maxreads=20`` on the five-partition data set.

    Exactly three calls are expected.
    """
    refrfile = data_file('fiveparts-refr.fa.gz')
    readfile = data_file('fiveparts.augfastq.gz')
    filehandle = kevlar.open(readfile, 'r')
    partstream = kevlar.parse_partitioned_reads(
        kevlar.parse_augmented_fastx(filehandle)
    )
    caller = kevlar.alac.alac(partstream, refrfile, maxreads=20)
    calls = list(caller)
    assert len(calls) == 3
def main(args):
    """Run the alac workflow from parsed command-line arguments.

    Reads augmented sequences from ``args.infile``, restricts them to a
    single partition when ``args.part_id`` is set, runs ``alac`` against
    ``args.refr``, and writes every resulting call as VCF to ``args.out``.
    """
    infile = kevlar.open(args.infile, 'r')
    readstream = kevlar.parse_augmented_fastx(infile)
    pstream = (
        kevlar.parse_single_partition(readstream, args.part_id)
        if args.part_id
        else kevlar.parse_partitioned_reads(readstream)
    )
    outstream = kevlar.open(args.out, 'w')

    options = dict(
        threads=args.threads,
        ksize=args.ksize,
        bigpart=args.bigpart,
        delta=args.delta,
        seedsize=args.seed_size,
        maxdiff=args.max_diff,
        match=args.match,
        mismatch=args.mismatch,
        gapopen=args.open,
        gapextend=args.extend,
        min_ikmers=args.min_ikmers,
        logstream=args.logfile,
    )
    workflow = alac(pstream, args.refr, **options)

    writer = kevlar.vcf.VCFWriter(
        outstream, source='kevlar::alac', refr=args.refr,
    )
    writer.write_header()
    for call in workflow:
        writer.write(call)
def test_call_homopolymers_mixed_results():
    """Check the homopolymer filter on a mix of affected and unaffected calls.

    Of six scored calls, three carry either 'PASS' or 'Homopolymer' as their
    entire filter string: a negative control, a borderline case, and a
    positive control.
    """
    queryfile = data_file('homopolymer/12175-3parts.contigs.augfasta')
    querystream = kevlar.parse_augmented_fastx(kevlar.open(queryfile, 'r'))
    contigs = kevlar.call.load_contigs(
        kevlar.parse_partitioned_reads(querystream)
    )

    targetfile = data_file('homopolymer/12175-3parts.targets.fasta')
    cutouts = kevlar.reference.load_refr_cutouts(
        kevlar.open(targetfile, 'r')
    )
    targets = kevlar.call.load_contigs(
        kevlar.parse_partitioned_reads(cutouts)
    )

    prelimcalls = []
    for partid in contigs:
        contiglist = contigs[partid]
        gdnalist = targets[partid]
        prelimcalls.extend(
            kevlar.call.call(gdnalist, contiglist, partid=partid)
        )

    kid = kevlar.sketch.load(data_file('homopolymer/12175-kid.sct'))
    mom = kevlar.sketch.load(data_file('homopolymer/12175-mom.sct'))
    dad = kevlar.sketch.load(data_file('homopolymer/12175-dad.sct'))
    refr = kevlar.sketch.load(data_file('homopolymer/12175-refr.sct'))
    calls = list(
        kevlar.simlike.simlike(
            prelimcalls, kid, [mom, dad], refr,
            samplelabels=['Proband', 'Mother', 'Father'],
        )
    )

    assert len(calls) == 6
    for c in calls:
        print(c.vcf)

    labels = ('PASS', 'Homopolymer')
    unintrstng = [c for c in calls if c.filterstr in labels]
    assert len(unintrstng) == 3
    negctl, borderline, posctl = unintrstng

    # negative control
    assert negctl.position == 123651924
    assert negctl.filterstr == 'PASS'
    assert negctl._refr == 'TAA'
    assert negctl._alt == 'T'

    # borderline
    assert borderline.position == 124641259
    assert borderline.filterstr == 'PASS'
    assert borderline._refr == 'TAAA'
    assert borderline._alt == 'T'

    # positive control
    assert posctl.position == 128660727
    assert posctl.filterstr == 'Homopolymer'
def test_no_reference_match(capsys):
    """Check that alac warns on stderr when nothing matches the reference."""
    refr = data_file('localize-refr.fa')
    readfile = data_file('pico-4.augfastq.gz')
    filehandle = kevlar.open(readfile, 'r')
    partitions = kevlar.parse_partitioned_reads(
        kevlar.parse_augmented_fastx(filehandle)
    )
    # The generator must be drained for the workflow to actually run
    list(kevlar.alac.alac(partitions, refr))
    _, stderr = capsys.readouterr()
    assert 'WARNING: no reference matches' in stderr
def test_localize_no_match(capsys):
    """Check that localize yields nothing and warns when no seed matches."""
    refr_file = data_file('fiveparts-refr.fa.gz')
    contig_file = data_file('wasp-pass.contig.augfasta')
    filehandle = kevlar.open(contig_file, 'r')
    pstream = kevlar.parse_partitioned_reads(
        kevlar.parse_augmented_fastx(filehandle)
    )
    cutoutdata = list(localize(pstream, refr_file, seedsize=41, debug=True))
    assert cutoutdata == []
    _, stderr = capsys.readouterr()
    assert 'WARNING: no reference matches' in stderr
def main(args):
    """Distribute the partitions of ``args.infile`` over several files.

    Opens ``args.numfiles`` output files named ``<args.base>.1``,
    ``<args.base>.2``, and so on, and hands them to ``split`` together with
    the partition stream.
    """
    readstream = kevlar.parse_augmented_fastx(kevlar.open(args.infile, 'r'))
    partstream = kevlar.parse_partitioned_reads(readstream)
    # Output file numbering is 1-based
    outstreams = [
        kevlar.open('{:s}.{:d}'.format(args.base, number), 'w')
        for number in range(1, args.numfiles + 1)
    ]
    split(partstream, outstreams)
def main(args):
    """Assemble reads partition by partition and write the contigs.

    Reads come from ``args.augfastq``, restricted to one partition when
    ``args.part_id`` is set; each assembled contig is written to ``args.out``
    in augmented Fastx format.
    """
    infile = kevlar.open(args.augfastq, 'r')
    readstream = kevlar.parse_augmented_fastx(infile)
    pstream = (
        kevlar.parse_single_partition(readstream, args.part_id)
        if args.part_id
        else kevlar.parse_partitioned_reads(readstream)
    )
    outstream = kevlar.open(args.out, 'w')
    for _partid, contig in assemble(pstream, maxreads=args.max_reads):
        kevlar.print_augmented_fastx(contig, outstream)
def test_pico_calls(cc, pos, ref, alt):
    """Check that alac makes the single expected call for component ``cc``."""
    refrfile = data_file('human-random-pico.fa.gz')
    reads = data_file('pico-var/cc{:d}.afq.gz'.format(cc))
    filehandle = kevlar.open(reads, 'r')
    pstream = kevlar.parse_partitioned_reads(
        kevlar.parse_augmented_fastx(filehandle)
    )
    calls = list(kevlar.alac.alac(pstream, refrfile, ksize=25, delta=50))
    assert len(calls) == 1
    call = calls[0]
    assert call._pos == pos
    assert call._refr == ref
    assert call._alt == alt
def test_split():
    """Split five partitions over three streams and check where each lands.

    Partition labels 1-5 are expected in streams 0, 1, 2, 0, 1 respectively.
    """
    infile = data_file('fiveparts.augfastq.gz')
    readstream = kevlar.parse_augmented_fastx(kevlar.open(infile, 'r'))
    partstream = kevlar.parse_partitioned_reads(readstream)
    outstreams = [StringIO() for _ in range(3)]
    kevlar.split.split(partstream, outstreams)

    expected = [
        ('kvcc=1', 0),
        ('kvcc=2', 1),
        ('kvcc=3', 2),
        ('kvcc=4', 0),
        ('kvcc=5', 1),
    ]
    for label, index in expected:
        assert label in outstreams[index].getvalue()
def test_maxdiff(X, numtargets):
    """Check the number of localized targets for a given ``maxdiff`` value."""
    contigfile = data_file('maxdiff-contig.augfasta')
    readstream = kevlar.parse_augmented_fastx(kevlar.open(contigfile, 'r'))
    contigstream = kevlar.parse_partitioned_reads(readstream)
    refrfile = data_file('maxdiff-refr.fa.gz')
    targeter = kevlar.localize.localize(
        contigstream, refrfile, seedsize=51, delta=50, maxdiff=X
    )
    targets = []
    for _partid, cutout in targeter:
        targets.append(cutout)
    print([target.defline for target in targets])
    assert len(targets) == numtargets
def main(args):
    """Run the alac workflow and print each call's VCF text to the output.

    Reads partitioned sequences from ``args.infile``, runs ``alac`` against
    ``args.refr``, and prints the ``vcf`` attribute of every call to
    ``args.out``.
    """
    infile = kevlar.open(args.infile, 'r')
    pstream = kevlar.parse_partitioned_reads(
        kevlar.parse_augmented_fastx(infile)
    )
    outstream = kevlar.open(args.out, 'w')
    options = dict(
        ksize=args.ksize,
        delta=args.delta,
        maxdiff=args.max_diff,
        match=args.match,
        mismatch=args.mismatch,
        gapopen=args.open,
        gapextend=args.extend,
        greedy=args.greedy,
        logstream=args.logfile,
    )
    for call in alac(pstream, args.refr, **options):
        print(call.vcf, file=outstream)
def test_ikmer_filter_python():
    """
    Smoke test for filtering based on the number of supporting ikmers.

    Every partition in the data set is supported by only 2 interesting
    k-mers. The supplied reference file does not actually correspond to the
    reads, so if this test passes it is because the filter worked correctly
    and the `localize` code was never invoked.
    """
    refr = data_file('localize-refr.fa')
    readfile = data_file('min_ikmers_filt.augfastq.gz')
    filehandle = kevlar.open(readfile, 'r')
    parts = kevlar.parse_partitioned_reads(
        kevlar.parse_augmented_fastx(filehandle)
    )
    caller = kevlar.alac.alac(parts, refr, ksize=31, min_ikmers=3)
    # Drain the generator so the workflow actually executes
    list(caller)
def test_localize_new():
    """Localize the five-partition contigs and check the reference cutouts.

    Partition 1 is expected to yield two cutouts and partitions 2-5 one each.
    """
    refr_file = data_file('fiveparts-refr.fa.gz')
    contig_file = data_file('fiveparts.contigs.augfasta.gz')
    filehandle = kevlar.open(contig_file, 'r')
    pstream = kevlar.parse_partitioned_reads(
        kevlar.parse_augmented_fastx(filehandle)
    )
    cutoutdata = list(localize(pstream, refr_file, seedsize=51, debug=True))

    partids = [partid for partid, _gdna in cutoutdata]
    deflines = [gdna.defline for _partid, gdna in cutoutdata]
    expected_deflines = [
        'seq1_284663-284950',
        'seq1_1924681-1925049',
        'seq1_1660589-1660884',
        'seq1_2315741-2316037',
        'seq1_2321099-2321322',
        'seq1_593102-593389',
    ]
    assert partids == ['1', '1', '2', '3', '4', '5']
    assert sorted(deflines) == sorted(expected_deflines)
def test_alac_generate_mask_lowmem(capsys):
    """Generate a mask with very little memory and expect an FPR warning.

    The five calls must still be produced, and the warning may appear on
    either stdout or stderr.
    """
    refrfile = data_file('fiveparts-refr.fa.gz')
    readfile = data_file('fiveparts.augfastq.gz')
    partstream = kevlar.parse_partitioned_reads(
        kevlar.parse_augmented_fastx(kevlar.open(readfile, 'r'))
    )
    with NamedTemporaryFile(suffix='.nt') as maskfile:
        caller = kevlar.alac.alac(
            partstream, refrfile, maskfile=maskfile.name, maskmem=100
        )
        calls = list(caller)
        assert len(calls) == 5
        captured = tuple(capsys.readouterr())
        message = 'WARNING: mask FPR is 0.8065; exceeds user-specified limit'
        assert any(message in text for text in captured)
def test_alac_matedist():
    """Run alac on the mate-distance data set and check the three calls.

    All three calls must pass the filters, at the expected positions.
    """
    refrfile = data_file('mate-dist/cc130.refr.fa.gz')
    readfile = data_file('mate-dist/cc130.augfastq.gz')
    partstream = kevlar.parse_partitioned_reads(
        kevlar.parse_augmented_fastx(kevlar.open(readfile, 'r'))
    )
    calls = list(
        kevlar.alac.alac(
            partstream, refrfile, ksize=31, delta=50, seedsize=51
        )
    )
    assert len(calls) == 3
    passed = [call for call in calls if call.filterstr == 'PASS']
    assert len(passed) == 3
    positions = sorted(call.position for call in passed)
    assert positions == [1475, 115377, 127540]
def test_alac_maxdiff(vcfposition, X, cigar):
    """Check the CIGAR and position of the single call for ``maxdiff=X``.

    The call's ``position`` is compared to ``vcfposition - 1``.
    """
    readfile = data_file('maxdiff-reads.augfastq.gz')
    readstream = kevlar.parse_augmented_fastx(kevlar.open(readfile, 'r'))
    pstream = kevlar.parse_partitioned_reads(readstream)
    refrfile = data_file('maxdiff-refr.fa.gz')
    calls = list(
        kevlar.alac.alac(
            pstream, refrfile, ksize=31, delta=50, seedsize=51, maxdiff=X
        )
    )
    assert len(calls) == 1
    call = calls[0]
    assert call.cigar == cigar
    assert call.position == vcfposition - 1
def test_alac_generate_mask():
    """Generate a mask while calling and compare it to the stored mask file.

    The comparison happens inside the ``with`` block, while the temporary
    mask file still exists.
    """
    refrfile = data_file('fiveparts-refr.fa.gz')
    readfile = data_file('fiveparts.augfastq.gz')
    partstream = kevlar.parse_partitioned_reads(
        kevlar.parse_augmented_fastx(kevlar.open(readfile, 'r'))
    )
    with NamedTemporaryFile(suffix='.nt') as maskfile:
        caller = kevlar.alac.alac(
            partstream, refrfile, maskfile=maskfile.name, maskmem=1e6
        )
        calls = list(caller)
        assert len(calls) == 5
        for call in calls:
            print(call.vcf)
        testfilename = data_file('fiveparts-genmask.nodetable')
        identical = filecmp.cmp(testfilename, maskfile.name)
        assert identical is True
def main(args):
    """Localize contigs against the reference and write the cutouts.

    Contigs come from ``args.contigs``, restricted to one partition when
    ``args.part_id`` is set. Each cutout is written to ``args.out`` as a
    record named after its defline, with a ``kvcc=<partition>`` suffix when a
    partition ID is available.
    """
    contigstream = kevlar.seqio.afxstream(args.contigs)
    pstream = (
        kevlar.parse_single_partition(contigstream, args.part_id)
        if args.part_id
        else kevlar.parse_partitioned_reads(contigstream)
    )
    outstream = kevlar.open(args.out, 'w')
    options = dict(
        seedsize=args.seed_size,
        delta=args.delta,
        maxdiff=args.max_diff,
        inclpattern=args.include,
        exclpattern=args.exclude,
    )
    for part, gdna in localize(pstream, args.refr, **options):
        if part is None:
            seqname = gdna.defline
        else:
            seqname = gdna.defline + ' kvcc={}'.format(part)
        record = kevlar.sequence.Record(name=seqname, sequence=gdna.sequence)
        kevlar.sequence.write_record(record, outstream)
def main(args):
    """Run the alac workflow from parsed command-line arguments.

    Reads augmented sequences from ``args.infile``, restricts them to a
    single partition when ``args.part_id`` is set, runs ``alac`` against
    ``args.refr`` with the configured options, and writes every resulting
    call as VCF to ``args.out``.
    """
    infile = kevlar.open(args.infile, 'r')
    readstream = kevlar.parse_augmented_fastx(infile)
    pstream = (
        kevlar.parse_single_partition(readstream, args.part_id)
        if args.part_id
        else kevlar.parse_partitioned_reads(readstream)
    )
    outstream = kevlar.open(args.out, 'w')

    options = dict(
        threads=args.threads,
        ksize=args.ksize,
        maxreads=args.max_reads,
        delta=args.delta,
        seedsize=args.seed_size,
        maxdiff=args.max_diff,
        inclpattern=args.include,
        exclpattern=args.exclude,
        match=args.match,
        mismatch=args.mismatch,
        gapopen=args.open,
        gapextend=args.extend,
        min_ikmers=args.min_ikmers,
        maskfile=args.gen_mask,
        maskmem=args.mask_mem,
        maskmaxfpr=args.mask_max_fpr,
        maxtargetlen=args.max_target_length,
    )
    workflow = alac(pstream, args.refr, **options)

    writer = kevlar.vcf.VCFWriter(
        outstream, source='kevlar::alac', refr=args.refr,
    )
    writer.write_header()
    for call in workflow:
        writer.write(call)
def test_partition_reader_mixed():
    """Mixing labeled and unlabeled reads must raise a partition label error."""
    infile = data_file('part-reads-mixed.fa')
    filehandle = kevlar.open(infile, 'r')
    readstream = kevlar.parse_augmented_fastx(filehandle)
    errormsg = r'with and without partition labels'
    with pytest.raises(KevlarPartitionLabelError, match=errormsg):
        # The error surfaces only once the stream is consumed
        list(kevlar.parse_partitioned_reads(readstream))
'--out-pattern', metavar='REGEX', help='out file name pattern with a {} placeholder for partition ID') parser.add_argument('augfastx') parser.add_argument('partition', nargs='+') args = parser.parse_args() if args.out and args.out_pattern: raise Exception('cannot give outfile and outpattern together') elif args.out and not args.out_pattern: args.out = kevlar.open(args.out, 'w') elif not args.out and not args.out_pattern: args.out = sys.stdout partids = set(args.partition) fh = kevlar.open(args.augfastx, 'r') reader = kevlar.parse_augmented_fastx(fh) preader = kevlar.parse_partitioned_reads(reader) for partid, partition in preader: if partid not in partids: continue if args.out_pattern: pattern = str(args.out_pattern) outfile = pattern.format(partid) with kevlar.open(outfile, 'w') as out: for read in partition: kevlar.print_augmented_fastx(read, out) else: for read in partition: kevlar.print_augmented_fastx(read, args.out)
def test_partition_reader_mixed():
    """Mixing labeled and unlabeled reads must raise a partition label error.

    The message is checked on the raised exception itself (``ple.value``),
    not on the ``ExceptionInfo`` wrapper, whose string form depends on the
    pytest version.
    """
    infile = kevlar.tests.data_file('part-reads-mixed.fa')
    readstream = kevlar.parse_augmented_fastx(kevlar.open(infile, 'r'))
    with pytest.raises(KevlarPartitionLabelError) as ple:
        # The error surfaces only once the stream is consumed
        list(kevlar.parse_partitioned_reads(readstream))
    assert 'with and without partition labels' in str(ple.value)
def main(args):
    """Call variants from contigs and reference cutouts, writing VCF output.

    Contigs from ``args.queryseq`` are loaded into memory by partition, while
    reference cutouts from ``args.targetseq`` are streamed partition by
    partition. Every call is written to ``args.out``. When ``args.gen_mask``
    is set, the ALTWINDOW attribute of each call is also added to a k-mer
    mask, which is saved to that path once all partitions are processed.
    """
    # Input and output files
    outstream = kevlar.open(args.out, 'w')
    writer = kevlar.vcf.VCFWriter(
        outstream, source='kevlar::call', refr=args.refr,
    )
    writer.write_header()

    # Contigs = query sequences
    queryfile = kevlar.open(args.queryseq, 'r')
    contigstream = kevlar.parse_partitioned_reads(
        kevlar.parse_augmented_fastx(queryfile)
    )
    contigs_by_partition = load_contigs(contigstream)

    targetfile = kevlar.open(args.targetseq, 'r')
    gdnastream = kevlar.parse_partitioned_reads(
        kevlar.reference.load_refr_cutouts(targetfile)
    )

    mask = None
    if args.gen_mask:
        kevlar.plog(
            '[kevlar::call]', 'generating mask of variant-spanning k-mers'
        )
        ntables = 4
        buckets = args.mask_mem * _buckets_per_byte['nodegraph'] / ntables
        mask = khmer.Nodetable(args.ksize, buckets, ntables)

    progress = kevlar.ProgressIndicator(
        '[kevlar::call] processed contigs/gDNAs for {counter} partitions',
        interval=10, breaks=[100, 1000, 10000],
    )
    for partid, gdnas in gdnastream:
        progress.update()
        if partid not in contigs_by_partition:
            continue
        varcalls = call(
            gdnas, contigs_by_partition[partid], partid,
            match=args.match, mismatch=args.mismatch,
            gapopen=args.open, gapextend=args.extend,
            ksize=args.ksize, refrfile=args.refr, debug=args.debug,
            mindist=5, homopolyfilt=not args.no_homopoly_filter,
            maxtargetlen=args.max_target_length,
        )
        for varcall in varcalls:
            if args.gen_mask:
                window = varcall.attribute('ALTWINDOW')
                # Only windows at least one k-mer long are added to the mask
                if window is not None and len(window) >= args.ksize:
                    mask.consume(window)
            writer.write(varcall)

    if not args.gen_mask:
        return

    fpr = khmer.calc_expected_collisions(mask, max_false_pos=1.0)
    if fpr > args.mask_max_fpr:
        parts = [
            'WARNING: mask FPR is {:.4f}'.format(fpr),
            '; exceeds user-specified limit',
            ' of {:.4f}'.format(args.mask_max_fpr),
        ]
        kevlar.plog('[kevlar::call]', ''.join(parts))
    mask.save(args.gen_mask)