def filter(readstream, mask=None, minabund=5, ksize=31, memory=1e6,
           maxfpr=0.001, logstream=sys.stderr):
    """Reload reads, validate their annotated k-mers, and yield the reads.

    Every record from `readstream` is added to an `AnnotatedReadSet` built
    from `ksize` and `memory`. If the value returned by `summarize_readset`
    exceeds `maxfpr` the run is aborted; otherwise the read set is validated
    with `mask` and `minabund` and the records it iterates over are yielded.

    This is a generator: no work happens until the first item is requested,
    and the closing summary and timing messages are only written once the
    caller has consumed every record.

    :param readstream: iterable of records to load
    :param mask: forwarded to `readset.validate`
    :param minabund: forwarded to `readset.validate`
    :param ksize: first argument to `AnnotatedReadSet`
    :param memory: second argument to `AnnotatedReadSet`
    :param maxfpr: largest acceptable value reported by `summarize_readset`
    :param logstream: file-like object that receives progress messages
    :raises KevlarUnsuitableFPRError: if the reported FPR exceeds `maxfpr`
    """
    timer = kevlar.Timer()
    timer.start('recalc')
    print('[kevlar::filter] Loading input; recalculate k-mer abundances,',
          'de-duplicate reads and merge k-mers', file=logstream)
    readset = kevlar.seqio.AnnotatedReadSet(ksize, memory)
    for record in readstream:
        readset.add(record)
    fpr = summarize_readset(readset, logstream)
    if fpr > maxfpr:
        raise KevlarUnsuitableFPRError('FPR too high, bailing out!!!')
    elapsed = timer.stop('recalc')
    print('[kevlar::filter] Input loaded in {:.2f} sec'.format(elapsed),
          file=logstream)

    timer.start('validate')
    print('[kevlar::filter] Validate k-mers and print reads', file=logstream)
    readset.validate(mask=mask, minabund=minabund)
    # Pre-bind the counter: if the read set yields nothing, the loop target
    # would otherwise never be assigned and the summary call below would fail
    # with UnboundLocalError.
    n = 0
    for n, record in enumerate(readset, 1):
        yield record
    summarize_validate(readset, n, logstream)
    elapsed = timer.stop('validate')
    print('[kevlar::filter] k-mers validated and reads printed',
          'in {:.2f} sec'.format(elapsed), file=logstream)
def main(args):
    """Count k-mers for one sample and log the total run time.

    `--num-bands` and `--band` must be supplied together. When a band is
    given, `args.band - 1` is what gets forwarded to the loader. If
    `args.mask` is set it is replaced in place by the sketch loaded from it.

    :param args: parsed command-line arguments
    :raises ValueError: if only one of `num_bands` / `band` is provided
    """
    have_numbands = args.num_bands is not None
    have_band = args.band is not None
    if have_numbands != have_band:
        raise ValueError('Must specify --num-bands and --band together')

    if args.band:
        myband = args.band - 1
    else:
        myband = None

    if args.mask:
        args.mask = kevlar.sketch.load(args.mask)

    timer = kevlar.Timer()
    timer.start()

    # The counter size selects the counting mode handed to the loader.
    counter_size = args.counter_size
    docount = counter_size > 1
    dosmallcount = counter_size == 4
    sketch = load_sample_seqfile(
        args.seqfile,
        args.ksize,
        args.memory,
        args.max_fpr,
        count=docount,
        smallcount=dosmallcount,
        mask=args.mask,
        consume_masked=args.count_masked,
        numbands=args.num_bands,
        band=myband,
        numthreads=args.threads,
        outfile=args.counttable,
        logfile=args.logfile,
    )

    total = timer.stop()
    print('[kevlar::count]', 'Total time: {:.2f} seconds'.format(total),
          file=args.logfile)
def main(args):
    """Run the filter workflow for parsed command-line arguments.

    Loads the mask, opens the augmented Fastx input and the output stream,
    pipes the reads through `filter`, writes every surviving record in
    augmented Fastx format, and finally logs the total run time.

    :param args: parsed command-line arguments
    """
    timer = kevlar.Timer()
    timer.start()

    mask = load_mask(
        args.mask,
        args.ksize,
        args.mask_memory,
        maxfpr=args.mask_max_fpr,
        savefile=args.save_mask,
        logstream=args.logfile,
    )
    reads = kevlar.seqio.afxstream(args.augfastq)
    outstream = kevlar.open(args.out, 'w')

    validated = filter(
        reads,
        mask,
        minabund=args.min_abund,
        ksize=args.ksize,
        memory=args.abund_memory,
        maxfpr=args.abund_max_fpr,
        logstream=args.logfile,
    )
    for record in validated:
        kevlar.print_augmented_fastx(record, outstream)

    total = timer.stop()
    print('[kevlar::filter]', 'Total time: {:.2f} seconds'.format(total),
          file=args.logfile)
def main(args):
    """Run the novel k-mer workflow for parsed command-line arguments.

    Loads the control samples and then the case samples (timing each step),
    streams every case read through `novel`, writes the resulting augmented
    reads to `args.out`, and logs elapsed times along the way.

    :param args: parsed command-line arguments
    :raises ValueError: if only one of `num_bands` / `band` is provided
    """
    def log(*parts):
        # All progress messages go to the configured log file.
        print(*parts, file=args.logfile)

    timer = kevlar.Timer()
    timer.start()

    # Banding options only make sense as a pair.
    if bool(args.num_bands) != bool(args.band):
        raise ValueError('Must specify --num-bands and --band together')
    if args.band:
        myband = args.band - 1
    else:
        myband = None

    timer.start('loadall')

    log('[kevlar::novel] Loading control samples')
    timer.start('loadctrl')
    controls = load_samples(
        args.control_counts, args.control, args.ksize, args.memory,
        args.max_fpr, args.num_bands, myband, args.threads, args.logfile,
    )
    elapsed = timer.stop('loadctrl')
    log('[kevlar::novel]',
        'Control samples loaded in {:.2f} sec'.format(elapsed))

    log('[kevlar::novel] Loading case samples')
    timer.start('loadcases')
    cases = load_samples(
        args.case_counts, args.case, args.ksize, args.memory,
        args.max_fpr, args.num_bands, myband, args.threads, args.logfile,
    )
    elapsed = timer.stop('loadcases')
    log('[kevlar::novel] Case samples loaded in {:.2f} sec'.format(elapsed))

    elapsed = timer.stop('loadall')
    log('[kevlar::novel] All samples loaded in {:.2f} sec'.format(elapsed))

    timer.start('iter')
    ncases = len(args.case)
    log('[kevlar::novel]',
        'Iterating over reads from {:d} case sample(s)'.format(ncases))
    outstream = kevlar.open(args.out, 'w')

    # Each case sample contributes a list of files; flatten them in order.
    infiles = []
    for filelist in args.case:
        infiles.extend(filelist)
    caserecords = kevlar.multi_file_iter_screed(infiles)

    readstream = novel(
        caserecords,
        cases,
        controls,
        ksize=args.ksize,
        abundscreen=args.abund_screen,
        casemin=args.case_min,
        ctrlmax=args.ctrl_max,
        numbands=args.num_bands,
        band=myband,
        skipuntil=args.skip_until,
        updateint=args.upint,
        logstream=args.logfile,
    )
    for augmented_read in readstream:
        kevlar.print_augmented_fastx(augmented_read, outstream)
    elapsed = timer.stop('iter')
    log('[kevlar::novel]',
        'Iterated over all case reads in {:.2f} seconds'.format(elapsed))

    total = timer.stop()
    log('[kevlar::novel]', 'Total time: {:.2f} seconds'.format(total))
def filter(readfile, mask=None, memory=1e6, maxfpr=0.01, casemin=6,
           ctrlmax=1):
    """Filter an augmented Fastx file in two passes and yield kept reads.

    The file is parsed twice: once by `first_pass` to build the counts
    (which are then checked against `maxfpr` by `check_fpr`), and a second
    time by `second_pass`, whose output is yielded read by read.

    This is a generator; the total-time message is only logged after the
    caller has consumed every read.

    :param readfile: path of the augmented Fastx file to read
    :param mask: forwarded to `first_pass`
    :param memory: forwarded to `first_pass`
    :param maxfpr: forwarded to `check_fpr`
    :param casemin: forwarded to `second_pass`
    :param ctrlmax: forwarded to `second_pass`
    """
    def fresh_reader():
        # The input is re-opened for each pass over the file.
        return kevlar.parse_augmented_fastx(kevlar.open(readfile, 'r'))

    timer = kevlar.Timer()
    timer.start()

    counts = first_pass(fresh_reader(), mask, memory, timer)
    check_fpr(counts, maxfpr)

    survivors = second_pass(fresh_reader(), counts, casemin, ctrlmax, timer)
    for read in survivors:
        yield read

    total = timer.stop()
    kevlar.plog('[kevlar::filter]',
                'Total time: {:.2f} seconds'.format(total))
def main(args):
    """Count k-mers for control samples, then for case samples.

    Both groups are loaded through `kevlar.counting.load_samples` with the
    same memory, FPR, abundance, banding, and threading settings. Controls
    are loaded without a mask; cases use the first case output file as the
    mask when `args.mem_frac` is set. Timing for each group and for the
    whole run is written to `args.logfile`.

    :param args: parsed command-line arguments
    :raises ValueError: if only one of `num_bands` / `band` is provided
    """
    have_numbands = args.num_bands is not None
    have_band = args.band is not None
    if have_numbands != have_band:
        raise ValueError('Must specify --num-bands and --band together')
    if args.band:
        myband = args.band - 1
    else:
        myband = None

    timer = kevlar.Timer()
    timer.start()

    # Settings that are identical for the control and the case load.
    shared = dict(
        memfraction=args.mem_frac,
        maxfpr=args.max_fpr,
        maxabund=args.ctrl_max,
        numbands=args.num_bands,
        band=myband,
        numthreads=args.threads,
        logfile=args.logfile,
    )

    timer.start('loadctrl')
    print('[kevlar::count] Loading control samples', file=args.logfile)
    outfiles, infilelists = split_infiles_outfiles(args.control)
    controls = kevlar.counting.load_samples(
        infilelists, args.ksize, args.memory,
        outfiles=outfiles, mask=None, **shared
    )
    elapsed = timer.stop('loadctrl')
    message = '{:d} samples loaded in {:.2f} sec'.format(
        len(controls), elapsed
    )
    print('[kevlar::count]', message, file=args.logfile)

    print('[kevlar::count] Loading case samples', file=args.logfile)
    timer.start('loadcase')
    outfiles, infilelists = split_infiles_outfiles(args.case)
    if args.mem_frac:
        casemask = outfiles[0]
    else:
        casemask = None
    cases = kevlar.counting.load_samples(
        infilelists, args.ksize, args.memory,
        outfiles=outfiles, mask=casemask, **shared
    )
    elapsed = timer.stop('loadcase')
    message = '{:d} sample(s) loaded in {:.2f} sec'.format(
        len(cases), elapsed
    )
    print('[kevlar::count]', message, file=args.logfile)

    total = timer.stop()
    print('[kevlar::count]', 'Total time: {:.2f} seconds'.format(total),
          file=args.logfile)
def __init__(self, message, interval=10, breaks=None, usetimer=False):
    """Initialise the counter, update interval, break points, and timer.

    :param message: stored as `self.message`
    :param interval: initial update interval; also the first update
                     threshold (`self.nextupdate`)
    :param breaks: sequence of break points; defaults to
                   `[100, 1000, 10000]` when omitted or None
    :param usetimer: when true, create and start a `kevlar.Timer`
    """
    self.message = message
    self.counter = 0
    self.interval = interval
    self.nextupdate = interval
    # Give each instance its own list. A mutable default (or a caller's
    # list stored by reference) would be shared, so a change made through
    # one instance would leak into every other one.
    if breaks is None:
        self.breaks = [100, 1000, 10000]
    else:
        self.breaks = list(breaks)
    self.timer = None
    if usetimer:
        self.timer = kevlar.Timer()
        self.timer.start()
def partition(readstream, strict=False, minabund=None, maxabund=None,
              dedup=True, gmlfile=None, logstream=sys.stderr):
    """Build a read graph from the reads and yield one read list per partition.

    The reads are loaded into a `kevlar.ReadGraph`, edges are populated in
    strict or relaxed mode, the graph is optionally written to `gmlfile`,
    and then each partition reported by the graph is yielded as a list of
    records.

    This is a generator; the final timing messages are only printed after
    the caller has consumed every partition.

    :param readstream: reads to load into the graph
    :param strict: forwarded to `graph.populate_edges`
    :param minabund: forwarded to `graph.load` and `graph.partitions`
    :param maxabund: forwarded to `graph.load` and `graph.partitions`
    :param dedup: forwarded to `graph.partitions`
    :param gmlfile: if set, the graph is written there via `kevlar.to_gml`
    :param logstream: file-like object that receives progress messages
    """
    def log(*parts):
        print(*parts, file=logstream)

    timer = kevlar.Timer()
    timer.start()

    timer.start('loadreads')
    log('[kevlar::partition] Loading reads')
    graph = kevlar.ReadGraph()
    graph.load(readstream, minabund=minabund, maxabund=maxabund)
    elapsed = timer.stop('loadreads')
    log('[kevlar::partition]', 'Reads loaded in {:.2f} sec'.format(elapsed))

    timer.start('buildgraph')
    if strict:
        mode = 'strict'
    else:
        mode = 'relaxed'
    log('[kevlar::partition]',
        'Building read graph in {:s} mode'.format(mode))
    graph.populate_edges(strict=strict)
    elapsed = timer.stop('buildgraph')
    log('[kevlar::partition]', 'Graph built in {:.2f} sec'.format(elapsed))

    if gmlfile:  # pragma: no cover
        kevlar.to_gml(graph, gmlfile, logstream)

    timer.start('partition')
    log('[kevlar::partition] Partition readgraph')
    components = graph.partitions(dedup, minabund, maxabund, abundfilt=True)
    for component in components:
        # Snapshot the read names before looking the records up.
        reads = []
        for readname in list(component):
            reads.append(graph.get_record(readname))
        yield reads
    elapsed = timer.stop('partition')
    log('[kevlar::partition]',
        'Partitioning done in {:.2f} sec'.format(elapsed))

    total = timer.stop()
    log('[kevlar::partition]', 'Total time: {:.2f} seconds'.format(total))
def main(args):
    """Run the filter workflow: load mask, reload input, validate, print.

    The mask is only loaded when `args.mask` is set. The input is loaded
    with `load_input`, then `validate_and_print` writes the results to
    `args.out` and, when `args.aug_out` is set, to that stream as well.
    Timing for each stage and for the whole run goes to `args.logfile`.

    :param args: parsed command-line arguments
    """
    def log(*parts):
        print(*parts, file=args.logfile)

    timer = kevlar.Timer()
    timer.start()

    mask = None
    if args.mask:
        timer.start('loadmask')
        log('[kevlar::filter] Loading mask from', args.mask)
        mask = load_mask(
            args.mask,
            args.ksize,
            args.mask_memory,
            maxfpr=args.mask_max_fpr,
            savefile=args.save_mask,
            logfile=args.logfile,
        )
        elapsed = timer.stop('loadmask')
        log('[kevlar::filter]', 'Mask loaded in {:.2f} sec'.format(elapsed))

    timer.start('recalc')
    log('[kevlar::filter] Loading input; recalculate k-mer abundances,',
        'de-duplicate reads and merge k-mers')
    readset, countgraph = load_input(
        args.augfastq, args.ksize, args.abund_memory, args.abund_max_fpr,
        args.logfile,
    )
    elapsed = timer.stop('recalc')
    log('[kevlar::filter] Input loaded in {:.2f} sec'.format(elapsed))

    timer.start('validate')
    log('[kevlar::filter] Validate k-mers and print reads')
    outstream = kevlar.open(args.out, 'w')
    # The augmented output is optional.
    if args.aug_out:
        augstream = kevlar.open(args.aug_out, 'w')
    else:
        augstream = None
    validate_and_print(readset, countgraph, mask, args.min_abund, outstream,
                       augstream, args.logfile)
    elapsed = timer.stop('validate')
    log('[kevlar::filter] k-mers validated and reads printed',
        'in {:.2f} sec'.format(elapsed))

    total = timer.stop()
    log('[kevlar::filter]', 'Total time: {:.2f} seconds'.format(total))
def main(args):
    """Simulate a trio: load the genome, apply mutations, write the outputs.

    One Fasta output stream is opened per sample (proband, mother, father),
    named `<prefix>-<sample>.fasta`. When `args.vcf` is set, a VCF header is
    written there and every variant produced by `gentrio` is printed to it.
    Progress messages are written to standard error.

    :param args: parsed command-line arguments
    """
    timer = kevlar.Timer()
    timer.start()

    timer.start('loadgenome')
    print('[kevlar::gentrio] Loading genome...', end='', file=sys.stderr)
    seqfile = kevlar.open(args.genome, 'r')
    genomeseqs = kevlar.seqio.parse_seq_dict(seqfile)
    elapsed = timer.stop('loadgenome')
    print('done! ({:.3f} seconds elapsed)'.format(elapsed), file=sys.stderr)

    outstreams = []
    for sample in ('proband', 'mother', 'father'):
        outfile = '{:s}-{:s}.fasta'.format(args.prefix, sample)
        outstreams.append(kevlar.open(outfile, 'w'))

    vcfout = None
    if args.vcf:
        vcfout = kevlar.open(args.vcf, 'w')
        kevlar.vcf_header(vcfout, source='kevlar::gentrio', infoheader=True)

    weights = weights_str_to_dict(args.weights)
    mutator = gentrio(
        genomeseqs,
        outstreams,
        ninh=args.inherited,
        ndenovo=args.de_novo,
        weights=weights,
        seed=args.seed,
        logstream=args.logfile,
    )

    timer.start('mutate')
    print('[kevlar::gentrio] Begin generating and applying mutations:',
          file=sys.stderr)
    # The mutator is always driven to exhaustion; variants are only
    # recorded when a VCF output was requested.
    for variant in mutator:
        if vcfout:
            print(variant.vcf, file=vcfout)
    elapsed = timer.stop('mutate')
    print('[kevlar::gentrio] Done applying mutations! ', end='',
          file=sys.stderr)
    print('({:.3f} seconds elapsed)'.format(elapsed), file=sys.stderr)

    for stream in outstreams:
        stream.close()

    elapsed = timer.stop()
    print('[kevlar::gentrio] Trio simulation complete; ', file=sys.stderr)
    print(' total runtime: {:.3f} seconds'.format(elapsed), file=sys.stderr)
def load_mask(maskfiles, ksize, memory, maxfpr=0.001, savefile=None,
              logstream=sys.stderr):
    """Load reference genome and/or contaminant database from a file.

    A single file ending in `.nt` or `.nodetable` is loaded directly as a
    sketch. Anything else is consumed file by file into a new
    `khmer.Nodetable`. The estimated false positive rate of the result is
    reported and checked, and the mask is optionally saved.

    :param maskfiles: list of mask file names, or None for "no mask"
    :param ksize: k-mer size for a newly built nodetable
    :param memory: memory budget used to size a newly built nodetable
    :param maxfpr: largest acceptable estimated false positive rate
    :param savefile: if set, the mask is saved to this path
    :param logstream: file-like object that receives progress messages
    :returns: the mask, or None when `maskfiles` is None
    :raises KevlarUnsuitableFPRError: if the estimated FPR exceeds `maxfpr`
    """
    if maskfiles is None:
        return None

    timer = kevlar.Timer()
    timer.start('loadmask')
    print('[kevlar::filter] Loading mask from', maskfiles, file=logstream)

    is_saved_table = (
        len(maskfiles) == 1
        and maskfiles[0].endswith(('.nt', '.nodetable'))
    )
    if not is_saved_table:
        # Build a fresh nodetable with four tables sharing the budget.
        buckets = memory * khmer._buckets_per_byte['nodegraph'] / 4
        mask = khmer.Nodetable(ksize, buckets, 4)
        total_reads = 0
        total_kmers = 0
        for maskfile in maskfiles:
            numreads, numkmers = mask.consume_seqfile(maskfile)
            total_reads += numreads
            total_kmers += numkmers
        message = ' {:d} sequences and {:d} k-mers consumed'.format(
            total_reads, total_kmers
        )
    else:
        mask = kevlar.sketch.load(maskfiles[0])
        message = ' nodetable loaded'

    fpr = kevlar.sketch.estimate_fpr(mask)
    message += '; estimated false positive rate is {:1.3f}'.format(fpr)
    print(message, file=logstream)
    if fpr > maxfpr:
        raise KevlarUnsuitableFPRError('FPR too high, bailing out!!!')

    if savefile:
        mask.save(savefile)
        print(' nodetable saved to "{:s}"'.format(savefile), file=logstream)

    elapsed = timer.stop('loadmask')
    print('[kevlar::filter]', 'Mask loaded in {:.2f} sec'.format(elapsed),
          file=logstream)
    return mask
def partition(readstream, strict=False, minabund=None, maxabund=None,
              dedup=True, gmlfile=None):
    """Build a read graph and yield `(index, reads)` for every partition.

    The reads are loaded into a `kevlar.ReadGraph`, edges are populated in
    strict or relaxed mode, the graph is optionally written to `gmlfile`,
    and then each partition reported by the graph is yielded as a 1-based
    index plus the list of its records. Each record's name is tagged with
    ` kvcc=<index>` before it is yielded.

    This is a generator; the final timing messages are only logged after
    the caller has consumed every partition.

    :param readstream: reads to load into the graph
    :param strict: forwarded to `graph.populate_edges`
    :param minabund: forwarded to `graph.load` and `graph.partitions`
    :param maxabund: forwarded to `graph.load` and `graph.partitions`
    :param dedup: forwarded to `graph.partitions`
    :param gmlfile: if set, the graph is written there via `kevlar.to_gml`
    """
    timer = kevlar.Timer()
    timer.start()

    timer.start('loadreads')
    kevlar.plog('[kevlar::partition] Loading reads')
    graph = kevlar.ReadGraph()
    graph.load(readstream, minabund=minabund, maxabund=maxabund)
    elapsed = timer.stop('loadreads')
    message = 'Reads loaded in {:.2f} sec'.format(elapsed)
    kevlar.plog('[kevlar::partition]', message)

    timer.start('buildgraph')
    mode = 'strict' if strict else 'relaxed'
    message = 'Building read graph in {:s} mode'.format(mode)
    kevlar.plog('[kevlar::partition]', message)
    graph.populate_edges(strict=strict)
    elapsed = timer.stop('buildgraph')
    message = 'Graph built in {:.2f} sec'.format(elapsed)
    kevlar.plog('[kevlar::partition]', message)

    if gmlfile:  # pragma: no cover
        # This function has no `logstream` parameter, so the name used here
        # previously was undefined and any GML export raised NameError.
        # Standard error is passed to keep the three-argument call shape.
        # NOTE(review): assumes `kevlar.to_gml` still accepts a log stream
        # as its third positional argument - confirm against its signature.
        import sys
        kevlar.to_gml(graph, gmlfile, sys.stderr)

    timer.start('partition')
    kevlar.plog('[kevlar::partition] Partition readgraph')
    part_iter = graph.partitions(dedup, minabund, maxabund, abundfilt=True)
    for n, part in enumerate(part_iter, 1):
        reads = [graph.get_record(readname) for readname in list(part)]
        for read in reads:
            read.name += ' kvcc={:d}'.format(n)
        yield n, reads
    elapsed = timer.stop('partition')
    message = 'Partitioning done in {:.2f} sec'.format(elapsed)
    kevlar.plog('[kevlar::partition]', message)

    total = timer.stop()
    message = 'Total time: {:.2f} seconds'.format(total)
    kevlar.plog('[kevlar::partition]', message)
def main(args):
    """Count k-mers for several samples and save one sketch per sample.

    `--num-bands` and `--band` must be supplied together, and exactly one
    output file is required per declared sample. Each sketch produced by
    `load_samples` is saved to the output file at the same position.

    :param args: parsed command-line arguments
    :raises ValueError: if only one of `num_bands` / `band` is provided, or
                        if the number of output files does not match the
                        number of samples
    """
    if (args.num_bands is None) is not (args.band is None):
        raise ValueError('Must specify --num-bands and --band together')
    myband = args.band - 1 if args.band else None
    if len(args.outfiles) != len(args.sample):
        message = 'number of outfiles must match number of declared samples'
        raise ValueError(message)

    timer = kevlar.Timer()
    timer.start()

    # Forward the adjusted band index. It was computed above but the raw
    # `args.band` was being passed instead, unlike the other count entry
    # points, which hand `myband` to the loader.
    loader = load_samples(
        args.sample, args.ksize, args.memory, memfraction=args.memfrac,
        maxfpr=args.max_fpr, maxabund=args.max_abund,
        numbands=args.num_bands, band=myband, numthreads=args.threads,
        logfile=args.logfile
    )
    for sketch, outfile in zip(loader, args.outfiles):
        sketch.save(outfile)

    total = timer.stop()
    message = 'Total time: {:.2f} seconds'.format(total)
    print('[kevlar::effcount]', message, file=args.logfile)
def novel(casestream, casecounts, controlcounts, ksize=31, abundscreen=None,
          casemin=5, ctrlmax=0, numbands=None, band=None, skipuntil=None,
          updateint=10000, logstream=sys.stderr):
    """Yield annotated copies of case reads that carry interesting k-mers.

    Reads are taken from `kevlar.paired_reader(casestream)`. Reads shorter
    than `ksize` or containing anything other than A, C, G, T are skipped.
    Every remaining k-mer is classified by `kmer_is_interesting`; a single
    "discard" verdict drops the whole read, and each interesting k-mer is
    annotated on a copy of the record. Reads with at least one annotation
    are yielded, with the mate sequence attached when a mate is present.

    This is a generator: argument validation runs on the first `next()`,
    and the closing summary is only printed once the stream is exhausted.

    :param casestream: input reads from the case sample(s)
    :param casecounts: case sketches; the first one enumerates and hashes
                       the k-mers
    :param controlcounts: control sketches
    :param ksize: minimum read length considered
    :param abundscreen: forwarded as `screen_thresh`
    :param casemin: forwarded as `case_min`
    :param ctrlmax: forwarded as `ctrl_max`
    :param numbands: number of bands; must accompany `band`
    :param band: band to process; must accompany `numbands`
    :param skipuntil: if set, skip reads up to and including this read name
    :param updateint: number of reads between progress messages
    :param logstream: file-like object that receives progress messages
    :raises ValueError: if `numbands` and `band` are not given together, or
                        if `band` is negative
    """
    have_numbands = bool(numbands)
    have_band = bool(band) or band == 0  # zero counts as "set"
    if have_numbands != have_band:
        raise ValueError('Must specify `numbands` and `band` together')
    if band is not None and band < 0:
        raise ValueError(
            '`band` must be a value between 0 and {:d}'.format(numbands - 1)
            + ' (`numbands` - 1), inclusive'
        )

    timer = kevlar.Timer()
    timer.start()
    kmer_instances = 0
    reads_kept = 0
    report_at = updateint
    distinct_kmers = set()

    for n, record, mate in kevlar.paired_reader(casestream):
        if skipuntil:  # pragma: no cover
            if record.name == skipuntil:
                message = 'Found read {:s}'.format(skipuntil)
                message += ' (skipped {:d} reads)'.format(n)
                print('[kevlar::novel]', message, file=logstream)
                skipuntil = False
            # The matching read itself is skipped as well.
            continue

        if n >= report_at:
            report_at += updateint
            elapsed = timer.probe()
            msg = ' processed {} reads'.format(n)
            msg += ' in {:.2f} seconds...'.format(elapsed)
            print(msg, file=logstream)

        sequence = record.sequence
        if len(sequence) < ksize:
            continue
        # This check should be temporary; hopefully khmer will handle
        # this soon.
        if re.search('[^ACGT]', sequence):
            continue

        sketch = casecounts[0]
        annotated = None
        rejected = False
        for offset, kmer in enumerate(sketch.get_kmers(sequence)):
            # NOTE(review): validation above treats `band` as 0-based, yet
            # the comparison uses `band - 1` - confirm the convention.
            if numbands and sketch.hash(kmer) & (numbands - 1) != band - 1:
                continue
            interesting, discard, caseabund, ctrlabund = kmer_is_interesting(
                kmer, casecounts, controlcounts, case_min=casemin,
                ctrl_max=ctrlmax, screen_thresh=abundscreen,
            )
            if discard:
                rejected = True
                break
            if interesting:
                if annotated is None:
                    annotated = kevlar.sequence.copy_record(record)
                annotated.annotate(kmer, offset,
                                   tuple(caseabund + ctrlabund))
                distinct_kmers.add(kevlar.revcommin(kmer))

        if rejected or annotated is None:
            continue

        reads_kept += 1
        kmer_instances += len(annotated.annotations)
        if mate:
            annotated.add_mate(mate.sequence)
        yield annotated

    elapsed = timer.stop()
    message = ''.join([
        'Found {:d} instances'.format(kmer_instances),
        ' of {:d} unique novel kmers'.format(len(distinct_kmers)),
        ' in {:d} reads'.format(reads_kept),
        ' in {:.2f} seconds'.format(elapsed),
    ])
    print('[kevlar::novel]', message, file=logstream)
def novel(casestream, casecounts, controlcounts, ksize=31, abundscreen=None,
          casemin=5, ctrlmax=0, numbands=None, band=None, skipuntil=None):
    """Yield annotated copies of case reads that carry interesting k-mers.

    Reads shorter than `ksize` or containing anything other than A, C, G, T
    are skipped. Every remaining k-mer is classified by
    `kmer_is_interesting`; a single "discard" verdict drops the whole read,
    and each interesting k-mer is annotated on a copy of the record. Reads
    with at least one annotation are yielded. Progress is reported through
    a `kevlar.ProgressIndicator`.

    This is a generator: argument validation runs on the first `next()`,
    and the closing summary is only logged once the stream is exhausted.

    :param casestream: input reads from the case sample(s)
    :param casecounts: case sketches; the first one enumerates and hashes
                       the k-mers
    :param controlcounts: control sketches
    :param ksize: minimum read length considered
    :param abundscreen: forwarded as `screen_thresh`
    :param casemin: forwarded as `case_min`
    :param ctrlmax: forwarded as `ctrl_max`
    :param numbands: number of bands; must accompany `band`
    :param band: band to process; must accompany `numbands`
    :param skipuntil: if set, skip reads up to and including this read name
    :raises ValueError: if `numbands` and `band` are not given together, or
                        if `band` is negative
    """
    have_numbands = bool(numbands)
    have_band = bool(band) or band == 0  # zero counts as "set"
    if have_numbands != have_band:
        raise ValueError('Must specify `numbands` and `band` together')
    if band is not None and band < 0:
        raise ValueError(
            '`band` must be a value between 0 and {:d}'.format(numbands - 1)
            + ' (`numbands` - 1), inclusive'
        )

    timer = kevlar.Timer()
    timer.start()
    kmer_instances = 0
    reads_kept = 0

    update_message = '[kevlar::novel] processed {counter} reads'
    # While searching for the `skipuntil` read the indicator says so.
    first_message = update_message
    if skipuntil:
        suffix = '; skipping reads in search of {read}'.format(read=skipuntil)
        first_message = update_message + suffix
    progress_indicator = kevlar.ProgressIndicator(
        first_message, interval=1e6, breaks=[1e7, 1e8, 1e9], usetimer=True,
    )

    distinct_kmers = set()
    for n, record in enumerate(casestream, 1):
        progress_indicator.update()

        if skipuntil:  # pragma: no cover
            if record.name == skipuntil:
                message = 'Found read {:s}'.format(skipuntil)
                message += ' (skipped {:d} reads)'.format(n)
                kevlar.plog('[kevlar::novel]', message)
                skipuntil = False
                progress_indicator.message = update_message
            # The matching read itself is skipped as well.
            continue

        sequence = record.sequence
        if len(sequence) < ksize:
            continue
        # This check should be temporary; hopefully khmer will handle
        # this soon.
        if re.search('[^ACGT]', sequence):
            continue

        sketch = casecounts[0]
        annotated = None
        rejected = False
        for offset, kmer in enumerate(sketch.get_kmers(sequence)):
            # NOTE(review): validation above treats `band` as 0-based, yet
            # the comparison uses `band - 1` - confirm the convention.
            if numbands and sketch.hash(kmer) & (numbands - 1) != band - 1:
                continue
            interesting, discard, caseabund, ctrlabund = kmer_is_interesting(
                kmer, casecounts, controlcounts, case_min=casemin,
                ctrl_max=ctrlmax, screen_thresh=abundscreen,
            )
            if discard:
                rejected = True
                break
            if interesting:
                if annotated is None:
                    annotated = kevlar.sequence.copy_record(record)
                annotated.annotate(kmer, offset,
                                   tuple(caseabund + ctrlabund))
                distinct_kmers.add(kevlar.revcommin(kmer))

        if rejected or annotated is None:
            continue

        reads_kept += 1
        kmer_instances += len(annotated.annotations)
        yield annotated

    elapsed = timer.stop()
    message = ''.join([
        'Found {:d} instances'.format(kmer_instances),
        ' of {:d} unique novel kmers'.format(len(distinct_kmers)),
        ' in {:d} reads'.format(reads_kept),
        ' in {:.2f} seconds'.format(elapsed),
    ])
    kevlar.plog('[kevlar::novel]', message)