def do_write(self, outfp):
    """Drain the output queue and write sequence records to *outfp*.

    Loops until every worker thread has exited AND the queue is empty,
    then (if verbose) prints read/bp accounting to stderr.
    """
    outq = self.outqueue
    # Keep polling while any worker is alive OR results remain queued;
    # checking worker_count alone would drop records still in the queue.
    while self.worker_count > 0 or not outq.empty():
        try:
            # Block at most 1s so the loop can re-check worker_count.
            g = outq.get(True, 1)
        except queue.Empty:
            continue
        for name, seq, qual in g.seqlist:
            # Records with quality become FASTQ-style; others FASTA-style.
            if qual:
                record = screed.Record(name=name, sequence=seq, quality=qual)
            else:
                record = screed.Record(name=name, sequence=seq)
            write_record(record, outfp)
    if self.verbose:
        print("DONE writing.\nprocessed %d / wrote %d / removed %d" %
              (self.n_processed, self.n_written,
               self.n_processed - self.n_written), file=sys.stderr)
        print("processed %d bp / wrote %d bp / removed %d bp" %
              (self.bp_processed, self.bp_written,
               self.bp_processed - self.bp_written), file=sys.stderr)
        discarded = self.bp_processed - self.bp_written
        # Percentage of basepairs removed by processing.
        f = float(discarded) / float(self.bp_processed) * 100
        print("discarded %.1f%%" % f, file=sys.stderr)
def trim_record(countgraph, record, cutoff, variable_coverage=False,
                normalize_to=None):
    """Trim *record* at the first k-mer whose abundance drops below *cutoff*.

    Returns a ``(record, did_trim)`` tuple. The record is ``None`` when the
    trimmed read would be shorter than k (read eliminated); it is the
    original object, untouched, when no trimming is needed or (with
    ``variable_coverage``) the read's median coverage is below
    ``normalize_to``.
    """
    sequence = record.sequence
    cleaned = record.cleaned_seq

    # Under variable-coverage mode, only high-coverage reads are trimmed.
    if variable_coverage and not countgraph.median_at_least(cleaned,
                                                           normalize_to):
        return record, False

    _, cut = countgraph.trim_on_abundance(cleaned, cutoff)

    # Trim point inside the first k-mer: nothing useful remains.
    if cut < countgraph.ksize():
        return None, True

    # Trim point at the very end: read survives intact.
    if cut == len(sequence):
        return record, False

    # Build the trimmed replacement, carrying quality along when present.
    kwargs = {'name': record.name, 'sequence': sequence[:cut]}
    if hasattr(record, 'quality'):
        kwargs['quality'] = record.quality[:cut]
    return screed.Record(**kwargs), True
class Test_BrokenPairedReader(object):
    """Exercise gather() over a fixed five-read stream with one orphan."""

    stream = [screed.Record(name='seq1/1', sequence='A' * 5),
              screed.Record(name='seq1/2', sequence='A' * 4),
              screed.Record(name='seq2/1', sequence='A' * 5),
              screed.Record(name='seq3/1', sequence='A' * 3),
              screed.Record(name='seq3/2', sequence='A' * 5)]

    def _gather_and_check(self, expected, exp_m, exp_n, **kwargs):
        # Run gather() over the shared stream and verify all three outputs.
        pairs, n, m = gather(self.stream, **kwargs)
        assert pairs == expected, pairs
        assert m == exp_m, m
        assert n == exp_n, n

    def testDefault(self):
        self._gather_and_check([('seq1/1', 'seq1/2'),
                                ('seq2/1', None),
                                ('seq3/1', 'seq3/2')], 3, 3, min_length=1)

    def testMinLength(self):
        self._gather_and_check([('seq1/1', 'seq1/2'),
                                ('seq2/1', None),
                                ('seq3/1', 'seq3/2')], 3, 3, min_length=3)

    def testMinLength_2(self):
        # seq3/1 (3 bp) is dropped; its partner survives as a single.
        self._gather_and_check([('seq1/1', 'seq1/2'),
                                ('seq2/1', None),
                                ('seq3/2', None)], 3, 3, min_length=4)

    def testForceSingle(self):
        self._gather_and_check([('seq1/1', None), ('seq1/2', None),
                                ('seq2/1', None), ('seq3/1', None),
                                ('seq3/2', None)], 5, 4, force_single=True)

    def testForceSingleAndMinLength(self):
        self._gather_and_check([('seq1/1', None), ('seq2/1', None),
                                ('seq3/2', None)], 3, 2,
                               min_length=5, force_single=True)
def test_check_is_pair_4b():
    """check_is_pair must reject a pair mixing FASTA and FASTQ reads."""
    read1 = screed.Record(name='seq/1', sequence='AAA')
    read2 = screed.Record(name='seq/2', quality='###', sequence='AAA')
    raised = False
    try:
        check_is_pair(read1, read2)
    except ValueError:
        raised = True
    assert raised  # check_is_pair should fail here.
def test_BrokenPairedReader_OnPairs_4():
    """With require_paired, a too-short first mate drops its whole pair."""
    reads = [screed.Record(name='seq1/1', sequence='A' * 3),  # too short
             screed.Record(name='seq1/2', sequence='A' * 4),
             screed.Record(name='seq3/1', sequence='A' * 4),
             screed.Record(name='seq3/2', sequence='A' * 5)]
    pairs, n, m = gather(reads, min_length=4, require_paired=True)
    assert pairs == [('seq3/1', 'seq3/2')], pairs
    assert m == 1
    assert n == 0, n
def test_paired_2thread_more_seq():
    """Paired-end reads must travel through the processor together.

    Subclasses ThreadedSequenceProcessor with a do_process() that asserts
    any two-record group is a properly named /1,/2 pair, then checks every
    input record (pairs and singletons alike) reaches the output.
    """
    class TSP_TestPairedProcess(ThreadedSequenceProcessor):
        # write a new do_process function that ensures paired ends are kept.
        def do_process(self):
            inq = self.inqueue
            outq = self.outqueue
            # Drain until the feeder is done and nothing remains queued.
            while not self.done or not inq.empty():
                try:
                    g = inq.get(True, 1)
                except queue.Empty:
                    continue
                if len(g.seqlist) == 2:
                    # Groups of two must be a matched /1,/2 pair.
                    first_rec = g.seqlist[0]
                    second_rec = g.seqlist[1]
                    assert first_rec['name'][:-1] == second_rec['name'][:-1]
                    assert first_rec['name'][-1] == '1'
                    assert second_rec['name'][-1] == '2'
                keep = []
                for record in g.seqlist:
                    name, sequence = self.process_fn(record)
                    if name:
                        keep.append((name, sequence, None))
                self.outqueue.put(SequenceGroup(0, keep))
            # end of thread; exit, decrement worker count.
            self.worker_count -= 1

    tsp = TSP_TestPairedProcess(idem, 1, 1, verbose=False)
    # Mix of a pair (a/1, a/2) and two unpaired reads (b/1, c/2).
    input = [
        screed.Record(name='b/1', sequence='AAA'),
        screed.Record(name='a/1', sequence='AAA'),
        screed.Record(name='a/2', sequence='TTT'),
        screed.Record(name='c/2', sequence='AAA'),
    ]
    outfp = StringIO()
    tsp.start(input, outfp)
    x = load_records_d(outfp)
    assert len(x) == 4, x
    assert x['a/1'] == 'AAA'
    assert x['a/2'] == 'TTT'
    assert x['b/1'] == 'AAA'
    assert x['c/2'] == 'AAA'
def test_BrokenPairedReader_OnPairs_2():
    """With require_paired, a too-short second mate drops its whole pair."""
    reads = [screed.Record(name='seq1/1', sequence='A' * 5),
             screed.Record(name='seq1/2', sequence='A' * 4),
             screed.Record(name='seq3/1', sequence='A' * 5),  # switched
             screed.Record(name='seq3/2', sequence='A' * 3)]  # wrt previous
    pairs, n, m = gather(reads, min_length=4, require_paired=True)
    assert pairs == [('seq1/1', 'seq1/2')], pairs
    assert m == 1
    assert n == 0, n
def test_odd():
    """The every_other process function keeps only the second record."""
    processor = ThreadedSequenceProcessor(every_other, 1, 1, verbose=False)
    reads = [screed.Record(name='a', sequence='AAA'),
             screed.Record(name='b', sequence='TTT')]
    out = StringIO()
    processor.start(reads, out)
    written = load_records_d(out)
    assert len(written) == 1, written
    assert written['b'] == 'TTT'
def test_basic_fastq_like():
    """Quality strings survive a pass through the processor."""
    processor = ThreadedSequenceProcessor(idem, 1, 1, verbose=False)
    reads = [screed.Record(name='a', sequence='AAA', quality='###'),
             screed.Record(name='b', sequence='TTT', quality='###')]
    out = StringIO()
    processor.start(reads, out)
    for rec in load_records_fastq(out):
        assert rec['quality'] == '###'
def test_basic_2thread():
    """Two worker threads must still emit every input record exactly once."""
    processor = ThreadedSequenceProcessor(idem, 2, 1, verbose=False)
    reads = [screed.Record(name='a', sequence='AAA'),
             screed.Record(name='b', sequence='TTT')]
    out = StringIO()
    processor.start(reads, out)
    written = load_records_d(out)
    assert len(written) == 2, written
    assert written['a'] == 'AAA'
    assert written['b'] == 'TTT'
def merge_and_reannotate(pair, newname):
    """
    Assemble a pair of overlapping reads and resolve their interesting k-mers.

    When a pair of compatible reads is merged, the offset of the interesting
    k-mers must be computed for one of the reads. The tail read's k-mers are
    kept verbatim; the head read's k-mers are shifted (and, for
    opposite-orientation pairs, reverse-complemented) into contig coordinates,
    keeping only those that don't duplicate a tail k-mer.
    """
    contig = merge_pair(pair)
    # Start with the tail's k-mers; their offsets are already contig offsets.
    newrecord = screed.Record(name=newname, sequence=contig,
                              ikmers=pair.tail.ikmers)
    # k is inferred from the first annotated k-mer's length.
    ksize = len(pair.tail.ikmers[0].sequence)
    if pair.sameorient:
        # Head extends the tail; keep only head k-mers past the overlap,
        # shifting each by the pair offset.
        minoffset2keep = len(pair.tail.sequence) - pair.offset - ksize
        keepers = [ik for ik in pair.head.ikmers if ik.offset > minoffset2keep]
        for k in keepers:
            ikmer = kevlar.KmerOfInterest(k.sequence, k.offset + pair.offset,
                                          k.abund)
            newrecord.ikmers.append(ikmer)
    else:
        # Opposite orientation: head k-mers must be reverse-complemented and
        # their offsets mirrored before shifting into contig coordinates.
        maxoffset2keep = pair.offset - ksize
        keepers = [ik for ik in pair.head.ikmers if ik.offset < maxoffset2keep]
        for k in keepers:
            ikmer = kevlar.KmerOfInterest(
                kevlar.revcom(k.sequence),
                len(pair.head.sequence) - k.offset - ksize + pair.offset,
                k.abund,
            )
            newrecord.ikmers.append(ikmer)
    return newrecord
def main():
    """Trim reads below a coverage cutoff, writing '<input>.below' files.

    Usage: <script> <countgraph-file> <input> [<input> ...]
    """
    counting_ht = sys.argv[1]
    infiles = sys.argv[2:]
    print('file with ht: %s' % counting_ht)
    print('making hashtable')
    ht = Countgraph.load(counting_ht)
    K = ht.ksize()
    for infile in infiles:
        print('filtering', infile)
        outfile = os.path.basename(infile) + '.below'
        # Context manager guarantees the output file is flushed and closed.
        with open(outfile, 'w') as outfp:
            paired_iter = broken_paired_reader(ReadParser(infile),
                                               min_length=K,
                                               force_single=True)
            for n, is_pair, read1, read2 in paired_iter:
                name = read1.name
                seq = read1.sequence
                if 'N' in seq:
                    # BUG FIX: was `return None, None`, which aborted main()
                    # on the first ambiguous read; skip the read instead.
                    continue
                trim_seq, trim_at = ht.trim_below_abundance(seq, CUTOFF)
                # Keep only reads still at least k long after trimming.
                if trim_at >= K:
                    write_record(screed.Record(name=name, sequence=trim_seq),
                                 outfp)
def test_ikmer_abund_after_recalc():
    """
    Ensure interesting k-mer abundances are correct after recalculation.

    The interesting k-mer has an advertised abundance of 28, but a true
    abundance (in `counts`) of 10. The readset "validate" function should
    check and correct this.
    """
    ikmer = KmerOfInterest('CATTGTCCTCGGGACTC', 13, [28, 0, 0])
    read = screed.Record(
        name='read1',
        sequence='AAGCAGGGGTCTACATTGTCCTCGGGACTCGAGATTTCTTCGCTGT',
        ikmers=[ikmer],
    )
    counts = khmer.Counttable(17, 1e5, 4)
    seq = ('TTCGTTCCCGAAGCAGGGGTCTACATTGTCCTCGGGACTCGAGATTTCTTCGCTGT'
           'TCCGTCCTTCA')
    for _ in range(10):
        counts.consume(seq)
    readset = ReadSet()
    readset.add(read)
    assert read.ikmers[0].abund[0] == 28
    readset.validate(counts, minabund=8)
    assert readset.valid == (1, 1)
    assert read.ikmers[0].abund[0] == 10
def assemble_fml_asm(readstream, logstream=sys.stderr):
    """Assemble reads with fermi-lite, yielding one Record per contig."""
    collected = list(readstream)
    assembler = kevlar.assembly.fml_asm(collected)
    for idx, contigseq in enumerate(assembler, 1):
        yield screed.Record(name='contig{:d}'.format(idx),
                            sequence=contigseq)
def record2a():
    """Fixture: read with the 'CGCAA' k-mer annotated at both ends."""
    interesting = [
        KmerOfInterest('CGCAA', 1, [15, 0, 0]),
        KmerOfInterest('CGCAA', 15, [15, 0, 0]),
    ]
    return screed.Record(name='read2', sequence='ACGCAAAGCTATTTACGCAA',
                         ikmers=interesting)
def record6():
    """Fixture: read with two overlapping interesting 7-mers near the start."""
    interesting = [
        KmerOfInterest('CTGTCAA', 3, [12, 0, 0]),
        KmerOfInterest('TGTCAAG', 4, [13, 0, 0]),
    ]
    return screed.Record(name='read6',
                         sequence='TCACTGTCAAGAGAGGCCTACGGATTCGGTTACTG',
                         ikmers=interesting)
def record5():
    """Fixture: read sharing record6's 7-mers, but at interior offsets."""
    interesting = [
        KmerOfInterest('CTGTCAA', 15, [12, 0, 0]),
        KmerOfInterest('TGTCAAG', 16, [13, 0, 0]),
    ]
    return screed.Record(name='read5',
                         sequence='CTCTTCCGGCAGTCACTGTCAAGAGAGGGTGAACT',
                         ikmers=interesting)
def record4():
    """Fixture: like record2 but with a single nucleotide mismatch."""
    interesting = [
        KmerOfInterest('CGCAA', 1, [15, 0, 0]),
        KmerOfInterest('AAAAC', 14, [19, 1, 0]),
    ]
    return screed.Record(name='read4', sequence='ACGCAATGCTATTTAAAACC',
                         ikmers=interesting)
def record3():
    """Fixture: reverse complement of record2."""
    interesting = [
        KmerOfInterest('GTTTT', 1, [19, 1, 0]),
        KmerOfInterest('TTGCG', 14, [15, 0, 0]),
    ]
    return screed.Record(name='read3', sequence='GGTTTTAAATAGCTTTGCGT',
                         ikmers=interesting)
def record10():
    """Fixture: read annotated with three interesting 17-mers."""
    interesting = [
        KmerOfInterest('TCCCCACCCGGATACTT', 4, [28, 0, 0]),
        KmerOfInterest('CCCCACCCGGATACTTG', 5, [26, 0, 0]),
        KmerOfInterest('CCCGGATACTTGAAGCA', 10, [21, 0, 0]),
    ]
    return screed.Record(name='read10',
                         sequence='CAGGTCCCCACCCGGATACTTGAAGCAGGCAGCCT',
                         ikmers=interesting)
def test_BrokenPairedReader_lowercase():
    """cleaned_seq upper-cases and maps N->A; .sequence stays untouched."""
    reads = [screed.Record(name='seq1/1', sequence='acgtn'),
             screed.Record(name='seq1/2', sequence='AcGtN'),
             screed.Record(name='seq1/2', sequence='aCgTn')]
    results = [(r1, r2)
               for _, _, r1, r2 in broken_paired_reader(reads)]

    first, second = results[0]
    assert first.sequence == 'acgtn'
    assert first.cleaned_seq == 'ACGTA'
    assert second.sequence == 'AcGtN'
    assert second.cleaned_seq == 'ACGTA'

    third, partner = results[1]
    assert third.sequence == 'aCgTn'
    assert third.cleaned_seq == 'ACGTA'
    assert partner is None
def assemble_jca(readstream, memory, maxfpr=0.01, collapse=True,
                 kmers_to_ignore=frozenset(), logstream=sys.stderr):
    """Junction-count assembly of interesting k-mers into contigs.

    Loads reads into a countgraph sized from *memory*, assembles a contig
    from each interesting k-mer (skipping *kmers_to_ignore*), optionally
    collapses redundant contigs, and yields each contig as a screed Record.

    Raises kevlar.sketch.KevlarUnsuitableFPRError when the countgraph's
    estimated false positive rate exceeds *maxfpr*.
    """
    # NOTE: default changed from the mutable `set()` to `frozenset()`;
    # it is only used for membership tests, so behavior is unchanged.
    print('[kevlar::assemble::jca] loading reads', file=logstream)
    countgraph = None
    variants = kevlar.VariantSet()
    for record in readstream:
        for kmer in record.ikmers:
            variants.add_kmer(kmer.sequence, record.name)
            if countgraph is None:
                # Size k from the first interesting k-mer observed.
                ksize = len(kmer.sequence)
                countgraph = khmer.Countgraph(ksize, memory / 4, 4)
        countgraph.consume(record.sequence)

    fpr = kevlar.sketch.estimate_fpr(countgraph)
    msg = '[kevlar::assemble::jca] done loading reads'
    msg += ', {:d} distinct k-mers stored'.format(countgraph.n_unique_kmers())
    msg += '; estimated false positive rate is {:1.3f}'.format(fpr)
    if fpr > maxfpr:
        msg += ' (FPR too high, bailing out!!!)'
        raise kevlar.sketch.KevlarUnsuitableFPRError(msg)
    print(msg, file=logstream)

    asm = khmer.JunctionCountAssembler(countgraph)
    for kmer in variants.kmers:
        if kmer in kmers_to_ignore:
            continue
        contigs = asm.assemble(kmer)
        for contig in contigs:
            if hasattr(contig, 'decode'):
                contig = contig.decode()
            if contig == '':
                # BUG FIX: was `file=args.logfile`; `args` is undefined in
                # this function's scope — use the logstream parameter.
                print(' WARNING: no assembly found for k-mer', kmer,
                      file=logstream)
                continue
            variants.add_contig(contig, kmer)
    print(' {:d} linear paths'.format(variants.ncontigs), file=logstream)

    if collapse:
        print('[kevlar::assemble::jca] Collapsing contigs', file=logstream)
        variants.collapse()
        print(' {:d} collapsed contigs'.format(variants.ncontigs),
              file=logstream)

    for n, contigdata in enumerate(variants, 1):
        contig, contigrc, kmers, reads = contigdata
        contigname = 'contig{:d}:length={:d}:nkmers={:d}:nreads={:d}'.format(
            n, len(contig), len(kmers), len(reads))
        contig = screed.Record(name=contigname, sequence=contig)
        yield contig
def record9():
    """Fixture: 60 bp read with interesting 17-mers at both extremes."""
    interesting = [
        KmerOfInterest('AGCAAGGCGCTCGCGTC', 0, [25, 0, 0]),
        KmerOfInterest('GCAAGGCGCTCGCGTCA', 1, [39, 0, 0]),
        KmerOfInterest('GTTATCGCCTCACATAC', 42, [15, 1, 1]),
        KmerOfInterest('AGTTATCGCCTCACATA', 43, [15, 1, 0]),
    ]
    seq = ('AGCAAGGCGCTCGCGTCAACGAAGTGAGCTCCCGTGGTCTTGAGTTATCG'
           'CCTCACATAC')
    return screed.Record(name='read9', sequence=seq, ikmers=interesting)
def record8():
    """Fixture: 60 bp read with interesting 17-mers at both extremes."""
    interesting = [
        KmerOfInterest('GTATGTGAGGCGATAAC', 0, [15, 1, 0]),
        KmerOfInterest('TATGTGAGGCGATAACT', 1, [15, 1, 1]),
        KmerOfInterest('TGACGCGAGCGCCTTGC', 42, [39, 0, 0]),
        KmerOfInterest('GACGCGAGCGCCTTGCT', 43, [25, 0, 0]),
    ]
    seq = ('GTATGTGAGGCGATAACTCAAGACCACGGGAGCTCACTTCGTTGACGCGA'
           'GCGCCTTGCT')
    return screed.Record(name='read8', sequence=seq, ikmers=interesting)
def picorecord2():
    """Fixture: simulated read with five overlapping interesting 25-mers."""
    interesting = [
        KmerOfInterest('TTTTTTGTTTCCCAAAGTAAGGCTG', 64, [19, 0, 0]),
        KmerOfInterest('TTTTTGTTTCCCAAAGTAAGGCTGA', 65, [18, 1, 0]),
        KmerOfInterest('TTTTGTTTCCCAAAGTAAGGCTGAG', 66, [18, 1, 0]),
        KmerOfInterest('TTTGTTTCCCAAAGTAAGGCTGAGT', 67, [18, 0, 0]),
        KmerOfInterest('TTGTTTCCCAAAGTAAGGCTGAGTG', 68, [17, 0, 0]),
    ]
    seq = ('TTACATTTATTCGTTTGTGCAGGCTGAGACCTCACTTCCAACTGTAATCCAAAAGCTTA'
           'GTTTTTTTTTTGTTTCCCAAAGTAAGGCTGAGTGAACAATA')
    return screed.Record(name='seq1_901428_901847_3:0:0_0:0:0_87d/1',
                         sequence=seq, ikmers=interesting)
def test_variant_mapping():
    """Cutout defline 'chr1_10000-10060' must parse into seqid + interval."""
    contigseq = ('CCTGAGCCCTCTCAAGTCGGGTCCTGGCCCGGTCTGCCCATGAGGCTGGGCCTGAG'
                 'CCCC')
    cutoutseq = ('CCTGAGCCCTCTCAAGTCGGGTCCTGGCCCAGTCTGCCCATGAGGCTGGGCCTGAG'
                 'CCCC')
    contig = screed.Record(name='contig1', sequence=contigseq)
    cutout = kevlar.reference.ReferenceCutout(defline='chr1_10000-10060',
                                              sequence=cutoutseq)
    mapping = VariantMapping(contig, cutout, score=1e6, cigar='60M')
    assert mapping.seqid == 'chr1'
    assert mapping.interval == ('chr1', 10000, 10060)
def picorecord1():
    """Fixture: mate read sharing picorecord2's five interesting 25-mers."""
    interesting = [
        KmerOfInterest('TTTTTTGTTTCCCAAAGTAAGGCTG', 5, [19, 0, 0]),
        KmerOfInterest('TTTTTGTTTCCCAAAGTAAGGCTGA', 6, [18, 1, 0]),
        KmerOfInterest('TTTTGTTTCCCAAAGTAAGGCTGAG', 7, [18, 1, 0]),
        KmerOfInterest('TTTGTTTCCCAAAGTAAGGCTGAGT', 8, [18, 0, 0]),
        KmerOfInterest('TTGTTTCCCAAAGTAAGGCTGAGTG', 9, [17, 0, 0]),
    ]
    seq = ('GTTTTTTTTTTGTTTCCCAAAGTAAGGCTGAGTGAACAATATTTTCTCATAGTTTTGAC'
           'AAAAACAAAGGAATCCTTAGTTATTAAACTCGGGAGTTTGA')
    return screed.Record(name='seq1_901350_901788_1:0:0_0:0:0_21ca1/2',
                         sequence=seq, ikmers=interesting)
def parse_augmented_fastx(instream):
    """
    Read augmented Fast[q|a] records into memory.

    The parsed records will have .name, .sequence, and .quality defined
    (unless it's augmented Fasta), as well as a list of interesting k-mers.
    See http://kevlar.readthedocs.io/en/latest/formats.html#augmented-sequences
    for more information.
    """
    record = None
    for line in instream:
        if line.startswith(('@', '>')):
            # New record header; emit the previous record first.
            if record is not None:
                yield record
            readid = line[1:].strip()
            seq = next(instream).strip()
            if line.startswith('@'):
                _ = next(instream)  # FASTQ '+' separator line
                qual = next(instream).strip()
                record = screed.Record(name=readid, sequence=seq,
                                       quality=qual, ikmers=list())
            else:
                record = screed.Record(name=readid, sequence=seq,
                                       ikmers=list())
        elif line.endswith('#\n'):
            # Interesting-k-mer annotation line: leading spaces encode the
            # k-mer's offset within the read; capture before stripping.
            offset = len(line) - len(line.lstrip())
            line = line.strip()[:-1]
            # BUG FIX: was re.split('\s+', line), which used a non-raw
            # regex and produced an empty field (int('') -> ValueError)
            # if whitespace preceded the trailing '#'. str.split() splits
            # on whitespace runs and discards empty fields.
            abundances = line.split()
            kmer = abundances.pop(0)
            abundances = [int(a) for a in abundances]
            ikmer = kevlar.KmerOfInterest(sequence=kmer, offset=offset,
                                          abund=abundances)
            record.ikmers.append(ikmer)
    if record is not None:
        yield record
def picorecord3():
    """Fixture: reverse-strand read with five interesting 25-mers."""
    interesting = [
        KmerOfInterest('CAGCCTTACTTTGGGAAACAAAAAA', 11, [17, 0, 0]),
        KmerOfInterest('TCAGCCTTACTTTGGGAAACAAAAA', 10, [18, 0, 0]),
        KmerOfInterest('CTCAGCCTTACTTTGGGAAACAAAA', 9, [18, 1, 0]),
        KmerOfInterest('ACTCAGCCTTACTTTGGGAAACAAA', 8, [18, 1, 0]),
        KmerOfInterest('CACTCAGCCTTACTTTGGGAAACAA', 7, [19, 0, 0]),
    ]
    seq = ('TATTGTTCACTCAGCCTTACTTTGGGAAACAAAAAAAAAACTAAGCTTTTGGATTACAG'
           'TTGGAAGTGAGGTCTCAGCCTGCACAAACGAATAAATGTAA')
    return screed.Record(name='seq1_901428_901847_3:0:0_0:0:0_87d/1',
                         sequence=seq, ikmers=interesting)
def record7():
    """Fixture: 60 bp read with six interesting 17-mers."""
    interesting = [
        KmerOfInterest('TCCCCACCCGGATACTT', 4, [28, 0, 0]),
        KmerOfInterest('CCCCACCCGGATACTTG', 5, [26, 0, 0]),
        KmerOfInterest('CCCGGATACTTGAAGCA', 10, [21, 0, 0]),
        KmerOfInterest('GGTATGTGAGGCGATAA', 38, [14, 0, 0]),
        KmerOfInterest('GTATGTGAGGCGATAAC', 39, [15, 1, 0]),
        KmerOfInterest('TATGTGAGGCGATAACT', 40, [15, 1, 1]),
    ]
    seq = ('CAGGTCCCCACCCGGATACTTGAAGCAGGCAGCCTCAAGGTATGTGAGGC'
           'GATAACTCAA')
    return screed.Record(name='read7', sequence=seq, ikmers=interesting)