def cutout(contigs, refrseqs, seed_matches, seedsize=51, delta=50,
           maxdiff=None, inclpattern=None, exclpattern=None, debug=False):
    """Compute reference target sequences for a set of partitioned contigs.

    Partition by partition, decompose contigs into seeds, determine the
    genomic location of each seed, calculate the span of all seeds (plus
    some extension delta), and cut out that interval of the genome.
    """
    localizer = kevlar.localize.Localizer(
        seedsize, incl=inclpattern, excl=exclpattern
    )
    for contig in contigs:
        for seed in decompose_seeds(contig.sequence, seedsize):
            seed = kevlar.revcommin(seed)
            if seed not in seed_matches:
                if debug:  # pragma: no cover
                    message = 'WARNING: no position for seed {}'.format(seed)
                    kevlar.plog('[kevlar::localize]', message)
                continue
            for seqid, position in seed_matches[seed]:
                localizer.add_seed_match(seqid, position)
    if maxdiff is None:
        maxcontiglen = max([len(c.sequence) for c in contigs])
        maxdiff = maxcontiglen * 3
    cutter = localizer.get_cutouts(refrseqs=refrseqs, delta=delta,
                                   clusterdist=maxdiff)
    for gdna in cutter:
        yield gdna

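# Usage sketch: one way to wire cutout() together with contigs_2_seeds() and
# get_seed_matches() from this listing; assumes `partitions` is a list of
# contig lists, `refrseqs` maps sequence IDs to reference sequences, and
# 'refr.fa' is a BWA-indexed FASTA (names illustrative, not from the source).
def localize_all(partitions, refrseqs):
    with open('seeds.fa', 'w') as seedstream:
        contigs_2_seeds(partitions, seedstream, seedsize=51)
    seed_matches = get_seed_matches('seeds.fa', 'refr.fa', seedsize=51)
    for contigs in partitions:
        yield from cutout(contigs, refrseqs, seed_matches)
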
def test_ctrl3(mask, nkmers, nkmerinstances):
    readfile = data_file('trio1/novel_3_1,2.txt')
    ikmers = defaultdict(int)
    for read in kevlar.filter.filter(readfile, memory=1e7, mask=mask):
        for ikmer in read.annotations:
            kmerseq = kevlar.revcommin(read.ikmerseq(ikmer))
            ikmers[kmerseq] += 1
    assert len(ikmers) == nkmers
    assert sum(ikmers.values()) == nkmerinstances

def get_unique_seeds(recordstream, seedsize):
    """Grab all unique seeds from the specified sequence file."""
    ct = Counttable(seedsize, 1, 1)
    kmers = set()
    for record in recordstream:
        for kmer in ct.get_kmers(record.sequence):
            minkmer = kevlar.revcommin(kmer)
            if minkmer not in kmers:
                kmers.add(minkmer)
                yield kmer

def load(self, readstream, minabund=None, maxabund=None, dedup=False):
    """
    Load reads and interesting k-mers into a graph structure.

    A graph node is created for each read, and a set of reads containing
    each interesting k-mer is stored. If abundance thresholds are enforced,
    do a second in-memory pass over the k-mers to discard any that don't
    satisfy the threshold criteria.

    Set `dedup=True` to deduplicate read sequences for handling PCR
    duplicates. This doesn't do a proper check (i.e. comparing both reads
    of a pair against the genome), but simply makes sure that only one copy
    of each read sequence is loaded. This is implemented with a very naive
    and resource-intensive approach, so this mode should only be used on
    small (e.g. already partitioned) graphs.
    """
    temp_ikmers = defaultdict(set)
    unique_reads = set()
    for record in readstream:
        if dedup:
            minread = kevlar.revcommin(record.sequence)
            if minread in unique_reads:
                continue
            unique_reads.add(minread)
        self.add_node(record.name, record=record)
        self.readnames.add(record.name)
        for kmer in record.ikmers:
            kmerseq = kevlar.revcommin(kmer.sequence)
            temp_ikmers[kmerseq].add(record.name)
    if minabund is None and maxabund is None:
        self.ikmers = temp_ikmers
    else:
        for kmer in temp_ikmers:
            readset = temp_ikmers[kmer]
            abund = len(readset)
            minfail = minabund and abund < minabund
            maxfail = maxabund and abund > maxabund
            if not minfail and not maxfail:
                self.ikmers[kmer] = readset

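# Usage sketch, assuming this `load` method belongs to kevlar's ReadGraph (or
# a similar graph class) and 'reads.augfastq.gz' is an augmented Fastx file:
graph = ReadGraph()
with kevlar.open('reads.augfastq.gz', 'r') as instream:
    graph.load(kevlar.parse_augmented_fastx(instream), minabund=5, dedup=True)
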
def add(self, newrecord):
    if newrecord.name in self._reads:
        record = self._reads[newrecord.name]
        assert record.sequence == newrecord.sequence
        record.ikmers.extend(newrecord.ikmers)
    else:
        self._reads[newrecord.name] = newrecord
        self._counts.consume(newrecord.sequence)
    self._readcounts[newrecord.name] += 1
    for kmer in newrecord.ikmers:
        minkmer = kevlar.revcommin(kmer.sequence)
        self._ikmercounts[minkmer] += 1

def test_partition_dedup(capsys):
    infile = kevlar.tests.data_file('dup.augfastq')
    tempdir = tempfile.mkdtemp()
    arglist = ['partition', '--split', tempdir + '/dedup', infile]
    args = kevlar.cli.parser().parse_args(arglist)
    kevlar.logstream, logstream = sys.stderr, kevlar.logstream
    kevlar.partition.main(args)
    kevlar.logstream = logstream
    out, err = capsys.readouterr()
    assert 'grouped 16 reads into 1 connected components' in err
    outfile = tempdir + '/dedup.cc1.augfastq.gz'
    stream = kevlar.open(outfile, 'r')
    parser = kevlar.parse_augmented_fastx(stream)
    readseqs = [r.sequence for r in parser]
    uniquereadseqs = set([kevlar.revcommin(s) for s in readseqs])
    testreads = [
        'AACGAACCACCTCAATGATGACCTTTATGCTTCCACGGCAAATGGTGCGG',
        'ACGAACCACCTCAATGATGACCTTTATGCTTCCACGGCAAATGGTGCGGT',
        'AGGGCACACCTAACCGCACCATTTGCCGTGGAAGCATAAAGGTCATCATT',
        'ATCGGAACGAACCACCTCAATGATGACCTTTATGCTTCCACGGCAAATGG',
        'CCACCTCAATGATGACCTTTATGCTTCCACGGCAAATGGTGCGGTTAGGT',
        'CCTCAATGATGACCTTTATGCTTCCACGGCAAATGGTGCGGTTAGGTGTG',
        'CGCACCATTTGCCGTGGAAGCATAAAGGTCATCATTGAGGTGGTTCGTTC',
        'CGGAAGGGCACACCTAACCGCACCATTTGCCGTGGAAGCATAAAGGTCAT',
        'CGGCTATGGCGGAAGGGCACACCTAACCGCACCATTTGCCGTGGAAGCAT',
        'CTATGGCGGAAGGGCACACCTAACCGCACCATTTGCCGTGGAAGCATAAA',
        'GCTTCCACGGCAAATGGTGCGGTTAGGTGTGCCCTTCCGCCATAGCCGGA',
        'GGAACGAACCACCTCAATGATGACCTTTATGCTTCCACGGCAAATGGTGC',
        'GGCAAATGGTGCGGTTAGGTGTGCCCTTCCGCCATAGCCGGATCGTGGCA',
        'TATGCTTCCACGGCAAATGGTGCGGTTAGGTGTGCCCTTCCGCCATAGCC',
        'TTATGCTTCCACGGCAAATGGTGCGGTTAGGTGTGCCCTTCCGCCATAGC',
        'TTGGTGCCACGATCCGGCTATGGCGGAAGGGCACACCTAACCGCACCATT',
    ]
    testreadseqs = set([kevlar.revcommin(s) for s in testreads])
    assert uniquereadseqs == testreadseqs
    shutil.rmtree(tempdir)

def test_filter_abundfilt():
    readfile = data_file('worm.augfasta')
    ikmers = defaultdict(int)
    filt = kevlar.filter.filter(readfile, memory=1000, casemin=5, ctrlmax=0)
    validated = list(filt)
    assert len(validated) == 5
    for read in validated:
        for ikmer in read.annotations:
            kmerseq = kevlar.revcommin(read.ikmerseq(ikmer))
            ikmers[kmerseq] += 1
    assert len(ikmers) == 1
    assert sum(ikmers.values()) == 5

def get_seed_matches(seedfile, refrfile, seedsize=51):
    """Determine the position of all seeds with a single system call to BWA."""
    kevlar.plog('[kevlar::localize] computing seed matches')
    bwa_cmd = 'bwa mem -k {k} -T {k} -a -c 5000 {idx} {seeds}'.format(
        k=seedsize, idx=refrfile, seeds=seedfile
    )
    bwa_args = bwa_cmd.split()
    seed_index = defaultdict(set)
    for seqid, start, end, seq in bwa_align(bwa_args, seqfilename=seedfile):
        minseq = kevlar.revcommin(seq)
        seed_index[minseq].add((seqid, start))
    message = 'found positions for {} seeds'.format(len(seed_index))
    kevlar.plog('[kevlar::localize]', message)
    return seed_index

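# For reference, with the default seedsize=51 the command constructed above is
#
#     bwa mem -k 51 -T 51 -a -c 5000 refr.fa seeds.fa
#
# (file names illustrative). Setting both the minimum seed length (-k) and the
# output score threshold (-T) to the seed size restricts reported hits to
# exact, full-length seed matches; -a reports all alignments rather than only
# the primary one, and -c 5000 skips seeds occurring more than 5000 times.
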
def assemble_with_greed(graph, ccindex, debugout=None):
    """Find shortest common superstring using a greedy assembly algorithm."""
    count = 0
    while len(graph.edges()) > 0:
        count += 1
        pair = fetch_largest_overlapping_pair(graph)
        newname = 'contig{:d}:cc={:d}'.format(count, ccindex)
        newrecord = merge_and_reannotate(pair, newname)
        if debugout:
            print('### DEBUG', pair.tail.name, pair.head.name, pair.offset,
                  pair.overlap, pair.sameorient, file=debugout)
            kevlar.print_augmented_fastx(newrecord, debugout)
        for kmer in newrecord.ikmers:
            kmerseq = kevlar.revcommin(kmer.sequence)
            for readname in graph.ikmers[kmerseq]:
                already_merged = readname not in graph
                current_contig = readname in [
                    pair.tail.name, pair.head.name, newname
                ]
                if already_merged or current_contig:
                    continue
                otherrecord = graph.get_record(readname)
                newpair = kevlar.overlap.calc_offset(newrecord, otherrecord,
                                                     kmerseq, debugout)
                if newpair == kevlar.overlap.INCOMPATIBLE_PAIR:
                    continue
                tn, hn = newpair.tail.name, newpair.head.name
                if tn in graph and hn in graph[tn]:
                    assert graph[tn][hn]['overlap'] == newpair.overlap
                    if graph[tn][hn]['tail'] == newpair.tail:
                        assert graph[tn][hn]['offset'] == newpair.offset
                else:
                    graph.add_edge(tn, hn, offset=newpair.offset,
                                   overlap=newpair.overlap, ikmer=kmerseq,
                                   orient=newpair.sameorient, tail=tn,
                                   swapped=newpair.swapped)
            graph.ikmers[kmerseq].add(newrecord.name)
        graph.add_node(newrecord.name, record=newrecord)
        graph.remove_node(pair.tail.name)
        graph.remove_node(pair.head.name)

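# The loop above is the classic greedy heuristic for the shortest common
# superstring problem: repeatedly merge the pair of records with the largest
# overlap, then recompute offsets between the merged contig and every read
# sharing one of its interesting k-mers, until no overlap edges remain.
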
def get_unique_kmers(recordstream, ksize=31):
    """
    Grab all unique k-mers from the specified sequence file.

    Input is expected to be an iterable containing screed or khmer sequence
    records.
    """
    ct = khmer.Counttable(ksize, 1, 1)
    kmers = set()
    for record in recordstream:
        for kmer in ct.get_kmers(record.sequence):
            minkmer = kevlar.revcommin(kmer)
            if minkmer not in kmers:
                kmers.add(minkmer)
                yield kmer

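# Minimal usage sketch, assuming screed is installed and 'reads.fq' exists;
# each yielded k-mer is reported in its as-read orientation, deduplicated by
# minimal reverse complement:
import screed
for kmer in get_unique_kmers(screed.open('reads.fq'), ksize=31):
    print(kmer)
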
def contigs_2_seeds(partstream, seedstream, seedsize=51):
    """Convert a stream of partitioned contigs to seeds and write to a file."""
    message = 'decomposing contigs into seeds of length {}'.format(seedsize)
    kevlar.plog('[kevlar::localize]', message)
    seeds = set()
    for partition in partstream:
        contigs = list(partition)
        for contig in contigs:
            for seed in decompose_seeds(contig.sequence, seedsize):
                seeds.add(kevlar.revcommin(seed))
    for n, seed in enumerate(sorted(seeds)):
        print('>seed{}\n{}'.format(n, seed), file=seedstream)
    seedstream.flush()
    message = 'contigs decomposed into {} seeds'.format(len(seeds))
    kevlar.plog('[kevlar::localize]', message)

def validate(self, counts, mask=None, minabund=5):
    for readid in self._reads:
        record = self._reads[readid]
        validated_kmers = list()
        for kmer in record.ikmers:
            kmerseq = kevlar.revcommin(kmer.sequence)
            if mask and mask.get(kmerseq) > 0:
                self._masked[kmerseq] += 1
            elif counts.get(kmerseq) < minabund:
                self._lowabund[kmerseq] += 1
            else:
                kmer.abund[0] = counts.get(kmerseq)
                validated_kmers.append(kmer)
                self._valid[kmerseq] += 1
        record.ikmers = validated_kmers
        if len(validated_kmers) == 0:
            self._novalidkmers_count += 1

def load_reads_and_kmers(instream, logstream=None):
    """
    Load reads into lookup tables for convenient access.

    The first table is a dictionary of reads indexed by read name, and the
    second table is a dictionary of read sets indexed by an interesting
    k-mer.
    """
    reads = dict()
    kmers = defaultdict(set)
    for n, record in enumerate(kevlar.parse_augmented_fastx(instream), 1):
        if logstream and n % 10000 == 0:  # pragma: no cover
            print('[kevlar::seqio] loaded {:d} reads'.format(n),
                  file=logstream)
        reads[record.name] = record
        for kmer in record.ikmers:
            kmerseq = kevlar.revcommin(kmer.sequence)
            kmers[kmerseq].add(record.name)
    return reads, kmers

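# Usage sketch (file name illustrative): the two returned tables support read
# retrieval by name and read-set retrieval by interesting k-mer.
with kevlar.open('reads.augfastq.gz', 'r') as instream:
    reads, kmers = load_reads_and_kmers(instream)
for kmerseq, readnames in kmers.items():
    print(kmerseq, 'is annotated in', len(readnames), 'reads')
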
def load_input(filelist, ksize, memory, maxfpr=0.001, logfile=sys.stderr):
    """
    Load input data.

    The input data is loaded into two data structures. First, the read
    sequences are loaded into a countgraph to recompute k-mer abundances
    with (effectively) exact precision. Second, the reads and their
    corresponding "interesting" k-mers are loaded into an AnnotatedReadSet
    to de-duplicate reads and group k-mers by read.
    """
    countgraph = khmer.Countgraph(ksize, memory / 4, 4)
    read_inst_consumed = 0
    int_kmer_instances = 0
    int_kmers_parsed = set()
    readset = kevlar.seqio.AnnotatedReadSet()
    for filename in filelist:
        print(' -', filename, file=logfile)
        with kevlar.open(filename, 'r') as infile:
            for record in kevlar.parse_augmented_fastx(infile):
                if record.name not in readset._reads:
                    countgraph.consume(record.sequence)
                readset.add(record)
                read_inst_consumed += 1
                for kmer in record.ikmers:
                    int_kmer_instances += 1
                    minkmer = kevlar.revcommin(kmer.sequence)
                    int_kmers_parsed.add(minkmer)
    n_kmers_distinct = len(int_kmers_parsed)
    fpr = kevlar.sketch.estimate_fpr(countgraph)
    message = ' {:d} instances'.format(read_inst_consumed)
    message += ' of {:d} reads consumed'.format(len(readset))
    message += ', annotated with {:d} instances '.format(int_kmer_instances)
    message += 'of {:d} distinct "interesting" k-mers'.format(n_kmers_distinct)
    message += '; estimated false positive rate is {:1.3f}'.format(fpr)
    print(message, file=logfile)
    if fpr > maxfpr:
        print('[kevlar::filter] FPR too high, bailing out', file=logfile)
        sys.exit(1)
    return readset, countgraph

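# Note on the allocation above: khmer.Countgraph(ksize, tablesize, ntables)
# takes a per-table size, so memory / 4 with 4 tables keeps the total
# allocation at roughly `memory` bytes.
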
def add_kmer(self, kmer, read_id):
    min_kmer = kevlar.revcommin(kmer)
    self.kmers[min_kmer].add(read_id)
    self._kmer_instances += 1

def novel(casestream, casecounts, controlcounts, ksize=31, abundscreen=None,
          casemin=5, ctrlmax=0, numbands=None, band=None, skipuntil=None):
    numbands_unset = not numbands
    band_unset = not band and band != 0
    if numbands_unset is not band_unset:
        raise ValueError('Must specify `numbands` and `band` together')
    if band is not None and band < 0:
        maxband = numbands - 1
        message = '`band` must be a value between 0 and {:d}'.format(maxband)
        message += ' (`numbands` - 1), inclusive'
        raise ValueError(message)
    timer = kevlar.Timer()
    timer.start()
    nkmers = 0
    nreads = 0
    update_message = '[kevlar::novel] processed {counter} reads'
    skip_message = None
    if skipuntil:
        msg = '; skipping reads in search of {read}'.format(read=skipuntil)
        skip_message = update_message + msg
    first_message = skip_message if skipuntil else update_message
    progress_indicator = kevlar.ProgressIndicator(
        first_message, interval=1e6, breaks=[1e7, 1e8, 1e9], usetimer=True,
    )
    unique_kmers = set()
    for n, record in enumerate(casestream, 1):
        progress_indicator.update()
        if skipuntil:  # pragma: no cover
            if record.name == skipuntil:
                message = 'Found read {:s}'.format(skipuntil)
                message += ' (skipped {:d} reads)'.format(n)
                kevlar.plog('[kevlar::novel]', message)
                skipuntil = False
                progress_indicator.message = update_message
            continue
        if len(record.sequence) < ksize:
            continue
        if re.search('[^ACGT]', record.sequence):
            # This check should be temporary; hopefully khmer will handle
            # this soon.
            continue
        discard_read = False
        irecord = None
        for i, kmer in enumerate(casecounts[0].get_kmers(record.sequence)):
            if numbands:
                # Note: masking with `numbands - 1` is equivalent to modulo
                # only when `numbands` is a power of two.
                khash = casecounts[0].hash(kmer)
                if khash & (numbands - 1) != band - 1:
                    continue
            interesting, discard, caseabund, ctrlabund = kmer_is_interesting(
                kmer, casecounts, controlcounts, case_min=casemin,
                ctrl_max=ctrlmax, screen_thresh=abundscreen,
            )
            if discard:
                discard_read = True
                break
            if not interesting:
                continue
            if irecord is None:
                irecord = kevlar.sequence.copy_record(record)
            abund = tuple(caseabund + ctrlabund)
            irecord.annotate(kmer, i, abund)
            minkmer = kevlar.revcommin(kmer)
            unique_kmers.add(minkmer)
        if discard_read or irecord is None:
            continue
        nreads += 1
        nkmers += len(irecord.annotations)
        yield irecord
    elapsed = timer.stop()
    message = 'Found {:d} instances'.format(nkmers)
    message += ' of {:d} unique novel kmers'.format(len(unique_kmers))
    message += ' in {:d} reads'.format(nreads)
    message += ' in {:.2f} seconds'.format(elapsed)
    kevlar.plog('[kevlar::novel]', message)

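# Usage sketch with assumed inputs: `casestream` yields screed/khmer records,
# and `casecounts`/`controlcounts` are lists of khmer count tables (the first
# case table is used to iterate and hash k-mers).
import sys
for record in novel(casestream, casecounts, controlcounts, ksize=31,
                    casemin=5, ctrlmax=0):
    kevlar.print_augmented_fastx(record, sys.stdout)
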
def novel(casestream, casecounts, controlcounts, ksize=31, abundscreen=None,
          casemin=5, ctrlmax=0, numbands=None, band=None, skipuntil=None,
          updateint=10000, logstream=sys.stderr):
    numbands_unset = not numbands
    band_unset = not band and band != 0
    if numbands_unset is not band_unset:
        raise ValueError('Must specify `numbands` and `band` together')
    if band is not None and band < 0:
        maxband = numbands - 1
        message = '`band` must be a value between 0 and {:d}'.format(maxband)
        message += ' (`numbands` - 1), inclusive'
        raise ValueError(message)
    timer = kevlar.Timer()
    timer.start()
    nkmers = 0
    nreads = 0
    nextupdate = updateint
    unique_kmers = set()
    for n, (record, mate) in enumerate(kevlar.paired_reader(casestream), 1):
        if skipuntil:  # pragma: no cover
            if record.name == skipuntil:
                message = 'Found read {:s}'.format(skipuntil)
                message += ' (skipped {:d} reads)'.format(n)
                print('[kevlar::novel]', message, file=logstream)
                skipuntil = False
            continue
        if n >= nextupdate:
            nextupdate += updateint
            elapsed = timer.probe()
            msg = ' processed {} reads'.format(n)
            msg += ' in {:.2f} seconds...'.format(elapsed)
            print(msg, file=logstream)
        if len(record.sequence) < ksize:
            continue
        if re.search('[^ACGT]', record.sequence):
            # This check should be temporary; hopefully khmer will handle
            # this soon.
            continue
        discard_read = False
        irecord = None
        for i, kmer in enumerate(casecounts[0].get_kmers(record.sequence)):
            if numbands:
                # As above, the bitmask assumes `numbands` is a power of two.
                khash = casecounts[0].hash(kmer)
                if khash & (numbands - 1) != band - 1:
                    continue
            interesting, discard, caseabund, ctrlabund = kmer_is_interesting(
                kmer, casecounts, controlcounts, case_min=casemin,
                ctrl_max=ctrlmax, screen_thresh=abundscreen,
            )
            if discard:
                discard_read = True
                break
            if not interesting:
                continue
            if irecord is None:
                irecord = kevlar.sequence.copy_record(record)
            abund = tuple(caseabund + ctrlabund)
            irecord.annotate(kmer, i, abund)
            minkmer = kevlar.revcommin(kmer)
            unique_kmers.add(minkmer)
        if discard_read or irecord is None:
            continue
        nreads += 1
        nkmers += len(irecord.annotations)
        if mate:
            irecord.add_mate(mate.sequence)
        yield irecord
    elapsed = timer.stop()
    message = 'Found {:d} instances'.format(nkmers)
    message += ' of {:d} unique novel kmers'.format(len(unique_kmers))
    message += ' in {:d} reads'.format(nreads)
    message += ' in {:.2f} seconds'.format(elapsed)
    print('[kevlar::novel]', message, file=logstream)

def add_contig(self, contig, kmer):
    min_contig = kevlar.revcommin(contig)
    min_kmer = kevlar.revcommin(kmer)
    self.contigs[min_contig].add(min_kmer)

import argparse

import khmer
import kevlar

# `allocators` is referenced below but was not included in this excerpt; this
# is a plausible stand-in definition, assuming counttable/nodetable sketches.
allocators = {
    'counttable': khmer.Counttable,
    'nodetable': khmer.Nodetable,
}

cli = argparse.ArgumentParser()
cli.add_argument('--sketch-type', metavar='T', choices=allocators.keys(),
                 default='counttable', help='Sketch type to use for output')
cli.add_argument('--num-tables', type=int, default=4, metavar='N')
cli.add_argument('--table-size', type=int, default=1000, metavar='X')
cli.add_argument('sketch', help='original sketch')
cli.add_argument('subsketch', help='new sketch to create')
cli.add_argument('sequence', nargs='+', help='sequences to sample from sketch')
args = cli.parse_args()

sketch = kevlar.sketch.load(args.sketch)
allocfunc = allocators[args.sketch_type]
subsketch = allocfunc(sketch.ksize(), args.table_size, args.num_tables)
kmers = set()
for seq in args.sequence:
    for kmer in sketch.get_kmers(seq):
        minkmer = kevlar.revcommin(kmer)
        kmers.add(minkmer)
for kmer in kmers:
    count = sketch.get(kmer)
    for _ in range(count):
        subsketch.add(kmer)
subsketch.save(args.subsketch)
fpr = khmer.calc_expected_collisions(subsketch, max_false_pos=100.0)
print('Estimated FPR: {:.4f}'.format(fpr))
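
# Hypothetical invocation (script and file names illustrative), sampling the
# counts of all k-mers from one sequence into a fresh, smaller sketch:
#
#     python subsketch.py --sketch-type counttable --table-size 10000 \
#         case.counttable sub.counttable \
#         TTGGTGCCACGATCCGGCTATGGCGGAAGGGCACACCTAACCGCACCATT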