def revcom(self):
    """
    Build the reverse-complement counterpart of this read/k-mer pairing.

    Both the read sequence and the k-mer sequence are reverse complemented,
    and the k-mer offset is recomputed relative to the start of the
    reverse-complemented read. The new record's k-mer lookup holds both
    orientations of the k-mer sequence, each mapped to the same annotation.
    """
    rcseq = kevlar.revcom(self.read.sequence)
    rckmerseq = kevlar.revcom(self.kmerseq)
    ksize = self.kmer.ksize
    # A k-mer that starts `offset` bases from one end of the read starts
    # `len - offset - ksize` bases from the opposite end.
    rcoffset = len(rcseq) - self.kmer.offset - ksize
    rckmer = KmerOfInterest(ksize, rcoffset, self.kmer.abund)
    rcread = Record(
        self.read.name,
        rcseq,
        annotations=[rckmer],
        ikmers={self.kmerseq: rckmer, rckmerseq: rckmer},
    )
    return ReadWithKmer(rcread, self.kmerseq)
def test_alpha():
    """Filter the alpha data set and check which k-mers survive."""
    badkmers = ['CAGGCCAGGGATCGCCGTG']
    goodkmers = [
        'AGGGGCGTGACTTAATAAG',
        'GGGCGTGACTTAATAAGGT',
        'TAGGGGCGTGACTTAATAA',
        'GGGGCGTGACTTAATAAGG',
    ]

    def on_either_strand(kmerseq, kmerlist):
        # True if the k-mer or its reverse complement is listed.
        return kmerseq in kmerlist or kevlar.revcom(kmerseq) in kmerlist

    infile = data_file('collect.alpha.txt')
    validated = list(kevlar.filter.filter(infile, memory=500))
    assert len(validated) == 8

    for record in validated:
        for kmer in record.annotations:
            kmerseq = record.ikmerseq(kmer)
            assert not on_either_strand(kmerseq, badkmers)
            assert on_either_strand(kmerseq, goodkmers)
def merge_and_reannotate(pair, newname):
    """
    Assemble a pair of overlapping reads and resolve their interesting k-mers.

    When a pair of compatible reads is merged, the offset of the interesting
    k-mers must be computed for one of the reads. The tail read's k-mers are
    carried over as they are; k-mers from the head read that pass the
    orientation-specific offset filter are re-created with their offsets
    shifted by the pair offset.

    - `pair`: read pair providing `tail`, `head`, `offset`, and `sameorient`
    - `newname`: name assigned to the merged record

    Returns a new `screed.Record` holding the merged sequence and k-mers.
    """
    contig = merge_pair(pair)
    # Give the merged record its own list so that the additions below extend
    # only the new record and not the list object owned by the tail read.
    newrecord = screed.Record(name=newname, sequence=contig,
                              ikmers=list(pair.tail.ikmers))
    ksize = len(pair.tail.ikmers[0].sequence)
    if pair.sameorient:
        # Head k-mers at or below this offset lie entirely within the tail.
        minoffset2keep = len(pair.tail.sequence) - pair.offset - ksize
        keepers = [ik for ik in pair.head.ikmers if ik.offset > minoffset2keep]
        newrecord.ikmers.extend(
            kevlar.KmerOfInterest(k.sequence, k.offset + pair.offset, k.abund)
            for k in keepers
        )
    else:
        # NOTE(review): unlike the same-orientation branch, this threshold
        # does not involve the read lengths -- confirm that is intended.
        maxoffset2keep = pair.offset - ksize
        keepers = [ik for ik in pair.head.ikmers if ik.offset < maxoffset2keep]
        headlen = len(pair.head.sequence)
        newrecord.ikmers.extend(
            kevlar.KmerOfInterest(
                kevlar.revcom(k.sequence),
                # Offset measured from the other end of the head read, then
                # shifted by the pair offset.
                headlen - k.offset - ksize + pair.offset,
                k.abund,
            )
            for k in keepers
        )
    return newrecord
def augment(augseqstream, nakedseqstream, upint=10000):
    """
    Augment an unannotated stream of sequences.

    - `augseqstream`: a stream of sequences annotated with k-mers of interest
    - `nakedseqstream`: a stream of unannotated sequences, to be augmented with
      k-mers of interest from `augseqstream`
    - `upint`: a progress message is logged every `upint` annotated records

    Yields one new `kevlar.sequence.Record` per record of `nakedseqstream`,
    annotated at every offset whose k-mer (in either orientation) was seen in
    `augseqstream`. If `augseqstream` provides no k-mers of interest, the
    records are yielded without annotations.
    """
    ksize = None
    ikmers = dict()
    for n, record in enumerate(augseqstream):
        if n > 0 and n % upint == 0:
            kevlar.plog('[kevlar::augment] processed', n, 'input reads')
        for ikmer in record.annotations:
            seq = record.ikmerseq(ikmer)
            # Store both orientations so lookups below are strand-agnostic.
            ikmers[seq] = ikmer.abund
            ikmers[kevlar.revcom(seq)] = ikmer.abund
            ksize = ikmer.ksize

    for record in nakedseqstream:
        newrecord = kevlar.sequence.Record(
            name=record.name,
            sequence=record.sequence,
            quality=getattr(record, 'quality', None),
        )
        # Without any k-mers of interest the k-mer size is unknown and there
        # is nothing to look up, so the record is passed through as-is.
        if ksize is not None:
            numkmers = len(record.sequence) - ksize + 1
            for offset in range(numkmers):
                kmer = record.sequence[offset:offset + ksize]
                if kmer in ikmers:
                    newrecord.annotate(kmer, offset, ikmers[kmer])
        yield newrecord
def print_read_pair(pair, position, outstream=sys.stderr):
    """
    Convenience print function for debugging.

    Writes a bordered block to `outstream`: a header naming both reads and
    the pair's overlap/offset/orientation, the tail sequence, a row of `|`
    characters starting at column `position`, and the head sequence
    (reverse complemented if the reads differ in orientation) indented by
    the pair offset.
    """
    if pair.sameorient:
        headseq = pair.head.sequence
    else:
        headseq = kevlar.revcom(pair.head.sequence)
    ksize = len(pair.head.ikmers[0].sequence)

    details = '--(overlap={:d}, offset={:d}, sameorient={})-->'.format(
        pair.overlap, pair.offset, pair.sameorient)
    header = '[kevlar::overlap] DEBUG: shared interesting k-mer '
    header += '{:s} {:s} {:s}'.format(pair.tail.name, details, pair.head.name)

    border = '≠' * 80
    marker = ' ' * position + '|' * ksize
    shifted_head = '{}{}'.format(' ' * pair.offset, headseq)
    # The trailing empty field reproduces the blank line after the border.
    print(border, header, '-' * 80, pair.tail.sequence, marker, shifted_head,
          border, '', sep='\n', file=outstream)
def check_kmer_freq_in_read_pair(read1, read2, minkmer, debugstream=None):
    """
    Check interesting k-mer frequency in each read.

    When calculating offset between a pair of reads, do not use any interesting
    k-mers that occur multiple times in either read.

    - `read1`, `read2`: reads whose `ikmers` are searched
    - `minkmer`: sequence of the shared k-mer; it and its reverse complement
      are passed to `kevlar.same_seq` for matching
    - `debugstream`: if given, a message is printed there when the pair is
      rejected

    Returns `(kmer1, kmer2)`, the single matching k-mer from each read, or
    `(None, None)` if the k-mer matches more than once in either read.
    Raises `AssertionError` if either read has no matching k-mer.
    """
    maxkmer = kevlar.revcom(minkmer)

    def matching_ikmers(read):
        # All of the read's interesting k-mers that match the shared k-mer.
        return [
            k for k in read.ikmers
            if kevlar.same_seq(k.sequence, minkmer, maxkmer)
        ]

    matches1 = matching_ikmers(read1)
    matches2 = matching_ikmers(read2)
    nmatches1 = len(matches1)
    nmatches2 = len(matches2)
    # Both reads are expected to contain the shared k-mer.
    assert nmatches1 > 0 and nmatches2 > 0, (nmatches1, nmatches2)
    if nmatches1 > 1 or nmatches2 > 1:
        if debugstream:
            message = (
                'stubbornly refusing to calculate offset bewteen {:s} and '
                '{:s}; interesting k-mer {:s} occurs multiple times'.format(
                    read1.name, read2.name, minkmer))
            print('[kevlar::overlap] INFO', message, file=debugstream)
        return None, None
    return matches1[0], matches2[0]
def determine_relative_orientation(read1, read2, kmer1, kmer2):
    """
    Determine the relative orientation of a pair of overlapping reads.

    The sequences and offsets of the shared interesting k-mer in each read
    decide whether the reads have the same orientation and which one is the
    tail. Returns the tuple `(tail, head, offset, sameorient, tailpos)`,
    where `offset` is the tail k-mer position minus the head k-mer position.
    """
    ksize = len(kmer1.sequence)
    pos1 = kmer1.offset
    sameorient = kmer1.sequence == kmer2.sequence
    if sameorient:
        pos2 = kmer2.offset
    else:
        assert kmer1.sequence == kevlar.revcom(kmer2.sequence)
        # Position of the k-mer measured from the other end of read2.
        pos2 = len(read2.sequence) - (kmer2.offset + ksize)

    # With equal k-mer positions, the longer read is treated as the tail.
    read1contained = pos1 == pos2 and len(read2.sequence) > len(read1.sequence)
    if pos2 > pos1 or read1contained:
        tail, head, tailpos, headpos = read2, read1, pos2, pos1
    else:
        tail, head, tailpos, headpos = read1, read2, pos1, pos2
    return tail, head, tailpos - headpos, sameorient, tailpos
def merge_pair(pair):
    """
    Assemble a pair of overlapping reads.

    Given a pair of compatible overlapping reads, collapse and merge them into
    a single sequence. If the (orientation-corrected) head is contained in
    the tail, the tail sequence is returned unchanged. Raises
    `AssertionError` if the overlapping portions disagree.
    """
    left = pair.tail.sequence
    right = pair.head.sequence
    if pair.sameorient is False:
        right = kevlar.revcom(right)
    if right in left:
        return left

    shift = pair.offset
    if pair.swapped:
        left, right = right, left
        # Adjust the offset by the length difference of the swapped reads.
        shift += len(left) - len(right)

    # Index in `right` where the bases beyond the end of `left` begin.
    split = len(left) - shift
    overlapping = left[shift:shift + pair.overlap]
    assert overlapping == right[:split], \
        'error: attempted to assemble incompatible reads'
    return left + right[split:]
def __iter__(self):
    """
    Iterate over the stored contigs in sorted order.

    Yields one tuple per contig: the contig, its reverse complement, the
    k-mers stored for the contig, and the union of the reads stored for
    those k-mers.
    """
    for mincontig in sorted(self.contigs):
        maxcontig = kevlar.revcom(mincontig)
        kmers = self.contigs[mincontig]
        # Pool the reads of every k-mer belonging to this contig.
        reads = set().union(*(self.kmers[kmer] for kmer in kmers))
        yield mincontig, maxcontig, kmers, reads
def __init__(self, read, kmerseq):
    """
    Pair a read with one of its interesting k-mers.

    - `read`: record providing `ikmers`, `ikmerseq()`, and `sequence`
    - `kmerseq`: key used to look the k-mer up in `read.ikmers`

    `num_occurrences` is the sum of the counts, as reported by the read
    sequence's `count` method, of `kmerseq` and of its reverse complement.
    """
    self.read = read
    self.kmer = read.ikmers[kmerseq]
    if self.kmer:
        self.kmerseq = self.read.ikmerseq(self.kmer)
    else:
        self.kmerseq = None
    forward = self.read.sequence.count(kmerseq)
    reverse = self.read.sequence.count(kevlar.revcom(kmerseq))
    self.num_occurrences = forward + reverse
def n_ikmers_present(record, window):
    """
    Count the record's annotated k-mers that occur in `window`.

    A k-mer is counted once if either its sequence or the reverse complement
    of its sequence is found in `window`.
    """
    kmerseqs = (record.ikmerseq(ikmer) for ikmer in record.annotations)
    return sum(
        1 for seq in kmerseqs
        if seq in window or kevlar.revcom(seq) in window
    )
def validate_read_overlap(tail, head, offset, sameorient, minkmer, swapped):
    """
    Verify that the overlap between two reads is identical.

    - `tail`, `head`: the two reads; each provides `name` and `sequence`
    - `offset`: position in the tail at which the head is placed
    - `sameorient`: if false, the head is reverse complemented first
    - `minkmer`: shared k-mer sequence, used only in the diagnostic message
    - `swapped`: selects which ends of the two reads are compared when the
      head is not contained in the tail

    Returns the length of the overlap if the overlapping segments are
    identical, or `None` if they differ. Raises `AssertionError` (after
    printing diagnostics to stderr) if the two segments differ in length.
    """
    headseq = head.sequence if sameorient else kevlar.revcom(head.sequence)
    seg2offset = len(head.sequence) - len(tail.sequence) + offset
    if offset + len(headseq) <= len(tail.sequence):
        # The head fits entirely within the tail.
        segment1 = tail.sequence[offset:offset + len(headseq)]
        segment2 = headseq
        seg2offset = None
    elif swapped:
        segment1 = tail.sequence[:-offset]
        segment2 = headseq[seg2offset:]
    else:
        segment1 = tail.sequence[offset:]
        segment2 = headseq[:-seg2offset]

    overlap1 = len(segment1)
    overlap2 = len(segment2)
    if overlap1 != overlap2:  # pragma: no cover
        maxkmer = kevlar.revcom(minkmer)
        print(
            '[kevlar::overlap] ERROR '
            'tail="{tail}" head="{head}" offset={offset} altoffset={altoffset}'
            ' tailoverlap={overlap} headoverlap={headover} tailolvp={tailseq}'
            ' headolvp={headseq} kmer={minkmer},{maxkmer} tailseq={tailread}'
            ' headseq={headread}'.format(
                tail=tail.name, head=head.name, offset=offset,
                altoffset=seg2offset, overlap=overlap1, headover=overlap2,
                tailseq=segment1, headseq=segment2, minkmer=minkmer,
                maxkmer=maxkmer, tailread=tail.sequence,
                headread=head.sequence,
            ), file=sys.stderr)
    assert overlap1 == overlap2
    if segment1 != segment2:
        return None
    return overlap1
def test_assemble_main(capsys):
    """Run `kevlar assemble` and look for the expected contig on stdout."""
    expected = ('GTCCTTGAGTCCATTAGAGACGGCTTCCGCCGTAGGCCCACTTCCTTAAAGTCGAGACTTCTA'
                'AAAACCGGGGTGTAACTCTTTTATTACAAAGCGACTATCCACCTGTAAGGACAGTGATA')
    infile = data_file('var1.reads.augfastq')
    args = kevlar.cli.parser().parse_args(['assemble', infile])
    kevlar.assemble.main(args)

    out, _ = capsys.readouterr()
    print('DEBUG', expected)
    print('DEBUG', out)
    # The contig is accepted in either orientation.
    assert expected in out or kevlar.revcom(expected) in out
def test_validate():
    """Load the alpha data into a ReadSet, validate, and check the k-mers."""
    badkmers = ['CAGGCCAGGGATCGCCGTG']
    goodkmers = [
        'AGGGGCGTGACTTAATAAG',
        'GGGCGTGACTTAATAAGGT',
        'TAGGGGCGTGACTTAATAA',
        'GGGGCGTGACTTAATAAGG',
    ]

    def on_either_strand(kmerseq, kmerlist):
        # True if the k-mer or its reverse complement is listed.
        return kmerseq in kmerlist or kevlar.revcom(kmerseq) in kmerlist

    readset = ReadSet(19, 5e3)
    infiles = kevlar.tests.data_glob('collect.alpha.txt')
    for record in kevlar.seqio.afxstream(infiles):
        readset.add(record)
    readset.validate()

    assert readset.valid == (4, 32)
    assert len(readset) == 9
    assert readset.discarded == 1

    for record in readset:
        for kmer in record.ikmers:
            assert not on_either_strand(kmer.sequence, badkmers)
            assert on_either_strand(kmer.sequence, goodkmers)
def test_validate_with_mask():
    """A k-mer added to the mask must not survive validation."""
    maskedkmer = 'AGGGGCGTGACTTAATAAG'
    mask = khmer.Nodetable(19, 1e3, 2)
    mask.add(maskedkmer)

    infiles = kevlar.tests.data_glob('collect.beta.?.txt')
    readset, countgraph = kevlar.filter.load_input(infiles, 19, 5e3)
    kevlar.filter.validate_and_print(readset, countgraph, mask)
    assert readset.valid == (3, 24)

    remaining = (ikmer for record in readset for ikmer in record.ikmers)
    for ikmer in remaining:
        # The masked k-mer must be absent in both orientations.
        assert ikmer.sequence != maskedkmer
        assert kevlar.revcom(ikmer.sequence) != maskedkmer
def test_validate():
    """Load the alpha data, validate and print, then check the k-mers."""
    badkmers = ['CAGGCCAGGGATCGCCGTG']
    goodkmers = [
        'AGGGGCGTGACTTAATAAG',
        'GGGCGTGACTTAATAAGGT',
        'TAGGGGCGTGACTTAATAA',
        'GGGGCGTGACTTAATAAGG',
    ]

    def on_either_strand(kmerseq, kmerlist):
        # True if the k-mer or its reverse complement is listed.
        return kmerseq in kmerlist or kevlar.revcom(kmerseq) in kmerlist

    infiles = kevlar.tests.data_glob('collect.alpha.txt')
    readset, countgraph = kevlar.filter.load_input(infiles, 19, 5e3)
    kevlar.filter.validate_and_print(readset, countgraph)

    assert readset.valid == (4, 32)
    assert len(readset) == 9
    assert readset.discarded == 1

    for record in readset:
        for kmer in record.ikmers:
            assert not on_either_strand(kmer.sequence, badkmers)
            assert on_either_strand(kmer.sequence, goodkmers)
def augment(augseqstream, nakedseqstream, collapsemates=False, upint=10000):
    """
    Augment an unannotated stream of sequences.

    - `augseqstream`: a stream of sequences annotated with k-mers of interest
    - `nakedseqstream`: a stream of unannotated sequences, to be augmented with
      k-mers of interest from `augseqstream`
    - `collapsemates`: if true, every output record receives the sorted list
      of all mate sequences seen in `augseqstream`; otherwise a record
      receives only the mate stored under its own name, if any
    - `upint`: a progress message is printed every `upint` annotated records

    Yields one new `kevlar.sequence.Record` per record of `nakedseqstream`,
    annotated at every offset whose k-mer (in either orientation) was seen in
    `augseqstream`. If `augseqstream` provides no k-mers of interest, the
    records are yielded without annotations.
    """
    ksize = None
    ikmers = dict()
    mateseqs = dict()
    for n, record in enumerate(augseqstream):
        if n > 0 and n % upint == 0:
            print('[kevlar::augment] processed', n, 'input reads',
                  file=sys.stderr)
        for ikmer in record.annotations:
            seq = record.ikmerseq(ikmer)
            # Store both orientations so lookups below are strand-agnostic.
            ikmers[seq] = ikmer.abund
            ikmers[kevlar.revcom(seq)] = ikmer.abund
            ksize = ikmer.ksize
        assert len(record.mates) in (0, 1)
        if len(record.mates) == 1:
            mateseqs[record.name] = record.mates[0]
    print('[kevlar::augment] done loading input', file=sys.stderr)

    # The collapsed mate list is the same for every record: sort it once and
    # hand each record its own copy.
    allmates = sorted(mateseqs.values()) if collapsemates else None
    for record in nakedseqstream:
        if collapsemates:
            mates = list(allmates)
        elif record.name in mateseqs:
            mates = [mateseqs[record.name]]
        else:
            mates = list()
        newrecord = kevlar.sequence.Record(
            name=record.name,
            sequence=record.sequence,
            quality=getattr(record, 'quality', None),
            mates=mates,
        )
        # Without any k-mers of interest the k-mer size is unknown and there
        # is nothing to look up, so the record is passed through as-is.
        if ksize is not None:
            numkmers = len(record.sequence) - ksize + 1
            for offset in range(numkmers):
                kmer = record.sequence[offset:offset + ksize]
                if kmer in ikmers:
                    newrecord.annotate(kmer, offset, ikmers[kmer])
        yield newrecord
def test_validate_with_mask():
    """A k-mer added to the mask must not survive ReadSet validation."""
    maskedkmer = 'AGGGGCGTGACTTAATAAG'
    mask = khmer.Nodetable(19, 1e3, 2)
    mask.add(maskedkmer)

    readset = ReadSet(19, 5e3)
    infiles = kevlar.tests.data_glob('collect.beta.?.txt')
    for record in kevlar.seqio.afxstream(infiles):
        readset.add(record)
    readset.validate(mask=mask)
    assert readset.valid == (3, 24)

    remaining = (ikmer for record in readset for ikmer in record.ikmers)
    for ikmer in remaining:
        # The masked k-mer must be absent in both orientations.
        assert ikmer.sequence != maskedkmer
        assert kevlar.revcom(ikmer.sequence) != maskedkmer
def collapse(self):
    """
    Fold contigs that are contained in a longer contig into that contig.

    Contigs are visited from longest to shortest. A contig that is a
    substring of an already-kept contig, directly or as its reverse
    complement, has its k-mers merged into the kept contig's k-mer set and
    is then removed from `self.contigs`. Modifies `self.contigs` in place
    and returns nothing.
    """
    # A list rather than a set keeps the longest-first visiting order, so
    # when several kept contigs contain the same shorter contig, the one it
    # is merged into does not depend on string hash ordering.
    unique_contigs = list()
    for contig in sorted(self.contigs, key=len, reverse=True):
        contigrc = kevlar.revcom(contig)
        for ucontig in unique_contigs:
            if contig in ucontig or contigrc in ucontig:
                self.contigs[ucontig] = self.contigs[ucontig].union(
                    self.contigs[contig]
                )
                del self.contigs[contig]
                break
        else:
            # No kept contig contains this one: keep it as well.
            unique_contigs.append(contig)
def call(targetlist, querylist, match=1, mismatch=2, gapopen=5, gapextend=0,
         ksize=31):
    """
    Wrap the `kevlar call` procedure as a generator function.

    Input is the following.
    - an iterable containing one or more target sequences from the reference
      genome, stored as khmer or screed sequence records
    - an iterable containing one or more contigs assembled by kevlar, stored as
      khmer or screed sequence records
    - alignment match score (integer)
    - alignment mismatch penalty (integer)
    - alignment gap open penalty (integer)
    - alignment gap extension penalty (integer)
    - k-mer size, passed through to `make_call` (integer)

    Queries are processed from longest to shortest. Each query is aligned to
    every target in both orientations, and the function yields whatever
    `make_call` produces for the query and its best-scoring target. A query
    whose best alignment is on the reverse strand has its `sequence`
    attribute replaced by the reverse complement before the call is made.
    """
    # Sort the targets once up front: the order is the same for every query,
    # and a one-shot iterable would be exhausted after the first query.
    targets = sorted(targetlist, key=lambda record: record.name)
    for query in sorted(querylist, reverse=True, key=len):
        bestcigar = None
        bestscore = None
        besttarget = None
        bestorientation = None
        for target in targets:
            cigar, score, strand = align_both_strands(
                target.sequence, query.sequence, match, mismatch, gapopen,
                gapextend
            )
            # Strict comparison: on ties the earlier target is retained.
            if bestscore is None or score > bestscore:
                bestscore = score
                bestcigar = cigar
                besttarget = target
                bestorientation = strand

        if bestorientation == -1:
            query.sequence = kevlar.revcom(query.sequence)
        for varcall in make_call(besttarget, query, bestcigar, ksize):
            yield varcall
def align_both_strands(targetseq, queryseq, match=1, mismatch=2, gapopen=5,
                       gapextend=0):
    """
    Align a query to a target in both orientations and keep the better one.

    Returns `(cigar, score, strand)`, where `strand` is -1 if the reverse
    complement of the query scored strictly higher and 1 otherwise.
    """
    scoring = (match, mismatch, gapopen, gapextend)
    fwdcigar, fwdscore = kevlar.align(targetseq, queryseq, *scoring)
    revcigar, revscore = kevlar.align(targetseq, kevlar.revcom(queryseq),
                                      *scoring)
    # Ties go to the forward strand.
    if revscore > fwdscore:
        return revcigar, revscore, -1
    return fwdcigar, fwdscore, 1
def main(args):
    """
    Re-print augmented records for the reads found in `args.fastq`.

    Records from `args.augfastq` are indexed by name. For each read in
    `args.fastq`, the augmented record of the same name is written to
    `args.out`. If the read is shorter than the augmented record's sequence,
    only the interesting k-mers still found in the read (in either
    orientation) are retained, and the record is skipped if none remain.
    """
    instream = kevlar.open(args.augfastq, 'r')
    reads = {
        record.name: record
        for record in kevlar.parse_augmented_fastx(instream)
    }

    reader = khmer.ReadParser(args.fastq)
    outstream = kevlar.open(args.out, 'w')
    for read in reader:
        augrecord = reads[read.name]
        if len(read.sequence) < len(augrecord.sequence):
            survivors = [
                kmer for kmer in augrecord.ikmers
                if kmer.sequence in read.sequence
                or kevlar.revcom(kmer.sequence) in read.sequence
            ]
            if not survivors:
                continue
            augrecord.ikmers = survivors
        kevlar.print_augmented_fastx(augrecord, outstream)
def varseq(self):
    """
    Return the contig sequence in the orientation given by `self.strand`.

    The sequence is returned unchanged for strand 1 and reverse complemented
    otherwise. Raises `AssertionError` if the strand is neither 1 nor -1.
    """
    assert self.strand in (-1, 1)
    is_forward = self.strand == 1
    if is_forward:
        return self.contig.sequence
    return kevlar.revcom(self.contig.sequence)
def ikmers(self):
    """
    Iterate over the contig's annotated k-mer sequences.

    For each annotation on the contig, yields the k-mer sequence followed by
    its reverse complement.
    """
    contig = self.contig
    for annotation in contig.annotations:
        kmerseq = contig.ikmerseq(annotation)
        for oriented in (kmerseq, kevlar.revcom(kmerseq)):
            yield oriented
def test_assumptions(kmer):
    """A k-mer and its reverse complement must hash identically."""
    table = Counttable(27, 1e5, 2)
    rckmer = kevlar.revcom(kmer)

    assert table.hash(kmer) == table.hash(rckmer)

    # Compare the first hash reported for each orientation.
    fwdhash = table.get_kmer_hashes(kmer)[0]
    revhash = table.get_kmer_hashes(rckmer)[0]
    assert fwdhash == revhash