def assemble_jca(readstream, memory, maxfpr=0.01, collapse=True,
                 kmers_to_ignore=set(), logstream=sys.stderr):
    """Assemble contigs from a stream of reads with khmer's JCA.

    Every k-mer listed in each record's ``ikmers`` is registered in a
    ``kevlar.VariantSet`` and every read sequence is loaded into a
    ``khmer.Countgraph``. Each registered k-mer (except those in
    `kmers_to_ignore`) is then used as a seed for
    ``khmer.JunctionCountAssembler.assemble``, and the resulting contigs
    are added back to the variant set.

    This is a generator: nothing runs until the first item is requested.

    Args:
        readstream: iterable of records exposing ``name``, ``sequence`` and
            ``ikmers`` (each k-mer exposing ``sequence``).
        memory: passed to the Countgraph as ``memory / 4`` with 4 as the
            third argument (presumably total memory split over 4 tables;
            confirm against the khmer API).
        maxfpr: largest acceptable value of ``kevlar.sketch.estimate_fpr``.
        collapse: if true, call ``variants.collapse()`` before yielding.
        kmers_to_ignore: container of k-mers that must not be used as
            assembly seeds. Only membership is tested; the default set is
            never mutated here.
        logstream: file-like object receiving all progress messages.

    Yields:
        ``screed.Record`` objects, one per contig in the variant set, named
        ``contigN:length=..:nkmers=..:nreads=..`` with N starting at 1.

    Raises:
        kevlar.sketch.KevlarUnsuitableFPRError: if the estimated false
            positive rate exceeds `maxfpr`.
    """
    print('[kevlar::assemble::jca] loading reads', file=logstream)
    countgraph = None
    variants = kevlar.VariantSet()
    for record in readstream:
        for kmer in record.ikmers:
            variants.add_kmer(kmer.sequence, record.name)
            if countgraph is None:
                # The k-mer size is not a parameter; it is taken from the
                # first k-mer seen, and the graph is created lazily.
                ksize = len(kmer.sequence)
                countgraph = khmer.Countgraph(ksize, memory / 4, 4)
        # NOTE(review): countgraph is still None here if the first record
        # carries no k-mers (and below if readstream is empty) -- confirm
        # callers never pass such input.
        countgraph.consume(record.sequence)

    fpr = kevlar.sketch.estimate_fpr(countgraph)
    msg = '[kevlar::assemble::jca] done loading reads'
    msg += ', {:d} distinct k-mers stored'.format(countgraph.n_unique_kmers())
    msg += '; estimated false positive rate is {:1.3f}'.format(fpr)
    if fpr > maxfpr:
        msg += ' (FPR too high, bailing out!!!)'
        raise kevlar.sketch.KevlarUnsuitableFPRError(msg)
    print(msg, file=logstream)

    asm = khmer.JunctionCountAssembler(countgraph)
    for kmer in variants.kmers:
        if kmer in kmers_to_ignore:
            continue
        for contig in asm.assemble(kmer):
            # The assembler may hand back bytes; normalise to str.
            if hasattr(contig, 'decode'):
                contig = contig.decode()
            if contig == '':
                # Fixed: this used to print to `args.logfile`, but no `args`
                # exists in this function; use the logstream parameter.
                print(' WARNING: no assembly found for k-mer', kmer,
                      file=logstream)
                continue
            variants.add_contig(contig, kmer)
    print(' {:d} linear paths'.format(variants.ncontigs), file=logstream)

    if collapse:
        print('[kevlar::assemble::jca] Collapsing contigs', file=logstream)
        variants.collapse()
        print(' {:d} collapsed contigs'.format(variants.ncontigs),
              file=logstream)

    for n, contigdata in enumerate(variants, 1):
        # The second element of each tuple is not needed for the output.
        contig, _, kmers, reads = contigdata
        contigname = 'contig{:d}:length={:d}:nkmers={:d}:nreads={:d}'.format(
            n, len(contig), len(kmers), len(reads))
        yield screed.Record(name=contigname, sequence=contig)
def test_beginning_to_end_across_tip(self, right_tip_structure):
    """Assembling from the contig's first k-mer yields the whole contig.

    The contig is consumed three times by the assembler; the assembly is
    then expected to run across the tip's branch point (ignored because of
    the labels) and return exactly one path equal to the contig or its
    reverse complement.
    """
    graph, contig, _, _, _, tip = right_tip_structure

    assembler = khmer.JunctionCountAssembler(graph)
    for _ in range(3):
        assembler.consume(contig)

    seed = contig[:K]
    paths = assembler.assemble(seed)

    # Diagnostic output, shown by pytest when the test fails.
    print('P:', paths[0])
    print('T:', tip)
    print('C:', contig)

    assert len(paths) == 1, "there should only be one path"
    assembled = paths[0]  # @CTB

    assert len(assembled) == len(contig)
    assert utils._equals_rc(assembled, contig)
def main():
    """Stream reads, assemble contigs on the fly, and log what was done.

    Command line: the options supplied by ``build_counting_args`` plus one
    or more ``fastq_files`` and an optional ``--prefix`` (default
    ``transcriptome``).

    Three files are written, all named from the prefix:

    * ``<prefix>.transcripts.fa`` -- one ``>contigN`` entry per contig;
    * ``<prefix>.orfs.fa`` -- one ``>orfN`` entry per distinct ORF returned
      by ``extract_orfs`` over ``translate(contig)``;
    * ``<prefix>.stats.fa`` -- headerless CSV rows with the columns
      ``read_n, action, cov, n_junctions, contig_n``.

    Reads shorter than ``args.ksize`` are skipped. Every other read is
    handled according to its median k-mer count ``cov`` in the countgraph:

    * ``cov < 20``: the read is consumed by the countgraph (action ``c``);
    * ``20 <= cov < 30``: the read is trimmed with
      ``trim_on_abundance(seq, 3)`` and, if still at least ``ksize`` long,
      consumed by the assembler (action ``t``);
    * ``cov == 30``: contigs are assembled from the read's first k-mer
      (action ``a``, one row per contig);
    * otherwise only a stats row is written (action ``s``).
    """
    p = build_counting_args(descr='Streaming assembly with tracking info')
    p.add_argument('fastq_files', nargs='+')
    p.add_argument('--prefix', default='transcriptome')
    args = p.parse_args()

    cg = create_countgraph(args)
    asm = khmer.JunctionCountAssembler(cg)

    tr_fn = '{0}.transcripts.fa'.format(args.prefix)
    orf_fn = '{0}.orfs.fa'.format(args.prefix)
    stats_fn = '{0}.stats.fa'.format(args.prefix)

    with open(tr_fn, 'w') as tr_fp,\
            open(orf_fn, 'w') as orf_fp,\
            open(stats_fn, 'w') as stats_fp:

        next_contig = 1
        next_orf = 1
        # Hashes of ORFs already written; hashes rather than the sequences
        # themselves are stored, as in the original implementation.
        seen_orfs = set()
        statswriter = csv.DictWriter(
            stats_fp, delimiter=',',
            fieldnames=['read_n', 'action', 'cov', 'n_junctions',
                        'contig_n'])

        def write_stats(read_n, action, cov, n_junctions=None,
                        contig_n=None):
            # Single place where a stats row is built and written.
            statswriter.writerow({
                'read_n': read_n,
                'action': action,
                'cov': cov,
                'n_junctions': n_junctions,
                'contig_n': contig_n,
            })

        for filename in args.fastq_files:
            # n restarts at 0 for every input file.
            for n, record in enumerate(screed.open(filename)):
                if n and n % 10000 == 0:
                    print('...', n, file=sys.stderr)

                if len(record.sequence) < args.ksize:
                    continue

                cov, _, _ = cg.get_median_count(record.sequence)

                if cov < 20:
                    cg.consume(record.sequence)
                    write_stats(n, 'c', cov)
                elif cov < 30:
                    seq, _ = cg.trim_on_abundance(record.sequence, 3)
                    if len(seq) < args.ksize:
                        # Too short after trimming: no stats row is written.
                        continue
                    n_junctions = asm.consume(seq)
                    write_stats(n, 't', cov, n_junctions=n_junctions)
                elif cov == 30:
                    seed = record.sequence[:args.ksize]
                    for contig_n, contig in enumerate(asm.assemble(seed)):
                        write_stats(n, 'a', cov,
                                    contig_n=(next_contig, contig_n))
                        tr_fp.write('>contig%d\n%s\n' % (next_contig, contig))
                        next_contig += 1

                        for translation in translate(contig):
                            for orf in extract_orfs(translation):
                                key = hash(orf)
                                if key in seen_orfs:
                                    continue
                                seen_orfs.add(key)
                                orf_fp.write('>orf%d\n%s\n' % (next_orf, orf))
                                next_orf += 1
                else:
                    write_stats(n, 's', cov)