def main(args):
    """Entry point for the filter step.

    Loads the mask with `load_mask`, streams the augmented records read from
    `args.augfastq` through `filter`, and writes every record that `filter`
    yields to `args.out`. The total elapsed time is reported on
    `args.logfile`.
    """
    stopwatch = kevlar.Timer()
    stopwatch.start()

    mask = load_mask(
        args.mask,
        args.ksize,
        args.mask_memory,
        maxfpr=args.mask_max_fpr,
        savefile=args.save_mask,
        logstream=args.logfile,
    )
    reads_in = kevlar.seqio.afxstream(args.augfastq)
    sink = kevlar.open(args.out, 'w')

    filter_options = {
        'minabund': args.min_abund,
        'ksize': args.ksize,
        'memory': args.abund_memory,
        'maxfpr': args.abund_max_fpr,
        'logstream': args.logfile,
    }
    for kept in filter(reads_in, mask, **filter_options):
        kevlar.print_augmented_fastx(kept, sink)

    elapsed = stopwatch.stop()
    print(
        '[kevlar::filter]',
        'Total time: {:.2f} seconds'.format(elapsed),
        file=args.logfile,
    )
def main(args):
    """Entry point for the novel step.

    Loads the control and case samples with `load_samples`, then streams the
    case reads through `novel` and writes every record it yields to
    `args.out`. Progress and timing messages go to `args.logfile`.

    Raises:
        ValueError: if exactly one of `args.num_bands` / `args.band` is set.
    """
    def report(*parts):
        # All progress output for this command goes to the log stream.
        print(*parts, file=args.logfile)

    timer = kevlar.Timer()
    timer.start()

    # The two banding options only make sense as a pair.
    bands_given = bool(args.num_bands)
    band_given = bool(args.band)
    if bands_given != band_given:
        raise ValueError('Must specify --num-bands and --band together')
    if args.band:
        myband = args.band - 1
    else:
        myband = None

    timer.start('loadall')

    report('[kevlar::novel] Loading control samples')
    timer.start('loadctrl')
    controls = load_samples(
        args.control_counts, args.control, args.ksize, args.memory,
        args.max_fpr, args.num_bands, myband, args.threads, args.logfile,
    )
    ctrl_secs = timer.stop('loadctrl')
    report(
        '[kevlar::novel]',
        'Control samples loaded in {:.2f} sec'.format(ctrl_secs),
    )

    report('[kevlar::novel] Loading case samples')
    timer.start('loadcases')
    cases = load_samples(
        args.case_counts, args.case, args.ksize, args.memory,
        args.max_fpr, args.num_bands, myband, args.threads, args.logfile,
    )
    case_secs = timer.stop('loadcases')
    report(
        '[kevlar::novel] Case samples loaded in {:.2f} sec'.format(case_secs)
    )

    load_secs = timer.stop('loadall')
    report(
        '[kevlar::novel] All samples loaded in {:.2f} sec'.format(load_secs)
    )

    timer.start('iter')
    ncases = len(args.case)
    report(
        '[kevlar::novel]',
        'Iterating over reads from {:d} case sample(s)'.format(ncases),
    )

    outstream = kevlar.open(args.out, 'w')

    # args.case is a list of file lists; flatten it into one list of files.
    infiles = []
    for filelist in args.case:
        infiles.extend(filelist)
    caserecords = kevlar.multi_file_iter_screed(infiles)

    novel_reads = novel(
        caserecords,
        cases,
        controls,
        ksize=args.ksize,
        abundscreen=args.abund_screen,
        casemin=args.case_min,
        ctrlmax=args.ctrl_max,
        numbands=args.num_bands,
        band=myband,
        skipuntil=args.skip_until,
        updateint=args.upint,
        logstream=args.logfile,
    )
    for augmented_read in novel_reads:
        kevlar.print_augmented_fastx(augmented_read, outstream)

    iter_secs = timer.stop('iter')
    report(
        '[kevlar::novel]',
        'Iterated over all case reads in {:.2f} seconds'.format(iter_secs),
    )

    total = timer.stop()
    report('[kevlar::novel]', 'Total time: {:.2f} seconds'.format(total))
def main(args):
    """Entry point for the assemble step.

    Parses augmented records from `args.augfastq`, groups them into
    partitions (only the partition `args.part_id` when that is set), runs
    `assemble` over the partitions, and writes every contig it yields to
    `args.out`.
    """
    infile = kevlar.open(args.augfastq, 'r')
    readstream = kevlar.parse_augmented_fastx(infile)

    if not args.part_id:
        pstream = kevlar.parse_partitioned_reads(readstream)
    else:
        pstream = kevlar.parse_single_partition(readstream, args.part_id)

    outstream = kevlar.open(args.out, 'w')
    # The partition ID yielded alongside each contig is not needed here.
    for _partid, contig in assemble(pstream, maxreads=args.max_reads):
        kevlar.print_augmented_fastx(contig, outstream)
def assemble_with_greed(graph, ccindex, debugout=None):
    """Find shortest common superstring using a greedy assembly algorithm.

    While `graph` still has edges, the pair returned by
    `fetch_largest_overlapping_pair` is merged into a new contig record, the
    contig is connected to the remaining reads it shares a k-mer with, and
    the two merged reads are removed. `graph` is modified in place and
    nothing is returned.

    Arguments:
    graph -- object providing `edges()`, `add_edge`, `add_node`,
             `remove_node`, `get_record`, membership/subscript access, and
             an `ikmers` mapping from k-mer sequence to a collection of
             read names (presumably a networkx-style read graph; confirm
             against the caller)
    ccindex -- integer embedded in every new contig name as `cc=<ccindex>`
    debugout -- optional stream; when truthy, each merge and the resulting
                contig are printed to it
    """
    count = 0
    while len(graph.edges()) > 0:
        count += 1
        pair = fetch_largest_overlapping_pair(graph)
        newname = 'contig{:d}:cc={:d}'.format(count, ccindex)
        newrecord = merge_and_reannotate(pair, newname)
        if debugout:
            print('### DEBUG', pair.tail.name, pair.head.name, pair.offset, pair.overlap, pair.sameorient, file=debugout)
            kevlar.print_augmented_fastx(newrecord, debugout)
        # Connect the new contig to every other read sharing one of its
        # k-mers.
        for kmer in newrecord.ikmers:
            kmerseq = kevlar.revcommin(kmer.sequence)
            for readname in graph.ikmers[kmerseq]:
                # Skip reads no longer in the graph, and the reads/contig
                # taking part in the current merge.
                already_merged = readname not in graph
                current_contig = readname in [
                    pair.tail.name, pair.head.name, newname
                ]
                if already_merged or current_contig:
                    continue
                otherrecord = graph.get_record(readname)
                newpair = kevlar.overlap.calc_offset(newrecord, otherrecord, kmerseq, debugout)
                if newpair == kevlar.overlap.INCOMPATIBLE_PAIR:
                    continue
                tn, hn = newpair.tail.name, newpair.head.name
                if tn in graph and hn in graph[tn]:
                    # The edge already exists: check it agrees with the
                    # newly computed overlap rather than adding it again.
                    assert graph[tn][hn]['overlap'] == newpair.overlap
                    if graph[tn][hn]['tail'] == newpair.tail:
                        assert graph[tn][hn]['offset'] == newpair.offset
                else:
                    graph.add_edge(tn, hn, offset=newpair.offset, overlap=newpair.overlap, ikmer=kmerseq, orient=newpair.sameorient, tail=tn, swapped=newpair.swapped)
            # Registered after the loop above, which iterates this same
            # collection.
            graph.ikmers[kmerseq].add(newrecord.name)
        # Replace the two merged reads with the contig.
        graph.add_node(newrecord.name, record=newrecord)
        graph.remove_node(pair.tail.name)
        graph.remove_node(pair.head.name)
def test_kmer_rep_in_read(capsys):
    """A read annotated twice with the same k-mer sequence still prints.

    Two k-mers of interest with identical sequence but different offsets are
    attached to one record; the read sequence must appear in the printed
    output.
    """
    import sys

    read = ('AGGATGAGGATGAGGATGAGGATGAGGATGAGGATGAGGATGAGGATGAGGATGAGGATGAGGAT'
            'GAGGATGAGGATGAGGAT')
    record = screed.Record(name='reqseq', sequence=read, ikmers=[])

    repeated_kmer = 'GATGAGGATGAGGATGAGGATGAGG'
    for position in (2, 8):
        record.ikmers.append(
            kevlar.KmerOfInterest(
                sequence=repeated_kmer, offset=position, abund=[11, 1, 0]
            )
        )

    kevlar.print_augmented_fastx(record, sys.stdout)

    out, _err = capsys.readouterr()
    assert read in out
def test_augfastx_writer(): output = StringIO() record = Record( name='BasiliscusVulgarisRead84467/1', sequence='TTAACTCTAGATTAGGGGCGTGACTTAATAAGGTGTGGGCCTAAGCGTCT', quality='BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB', annotations=[ KmerOfInterest(ksize=19, offset=13, abund=(12, 1, 1)), KmerOfInterest(ksize=19, offset=15, abund=(20, 0, 1)), ], ) kevlar.print_augmented_fastx(record, output) record = Record( name='BasiliscusVulgarisRead90577/2', sequence='CTGTAATCCCAGCACTTTGGGAGGCCGAGGCAAGCAGATGATGCGGTCAG', quality='BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB', annotations=[ KmerOfInterest(ksize=19, offset=1, abund=(5, 7, 9)), KmerOfInterest(ksize=19, offset=2, abund=(7, 10, 9)), ], mates=['CAGATGTGTCTTGTGGGCAGTGCAGCGGAGAGGTGCAAATATGGGTTTGG'] ) kevlar.print_augmented_fastx(record, output) record = Record( name='BasiliscusVulgarisRead99037/1', sequence='AGCACTTTGGGAGGCCGAGGCAAGCAGATGATGCGGTCAGGATTACAGAT', quality='BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB' ) kevlar.print_augmented_fastx(record, output) assert output.getvalue() == """@BasiliscusVulgarisRead84467/1
def validate_and_print(readset, countgraph, mask=None, minabund=5,
                       outfile=sys.stdout, augout=None, logfile=sys.stderr):
    """Validate a read set, write out its reads, and log a summary report.

    Arguments:
    readset -- object providing `validate(countgraph, mask=, minabund=)`,
               iteration over records, `len()`, the two-item sequences
               `masked`, `lowabund` and `valid` (index 0 is reported as
               distinct k-mers, index 1 as instances), and the integer
               `discarded`
    countgraph, mask, minabund -- forwarded unchanged to `readset.validate`
    outfile -- stream each record is written to via
               `khmer.utils.write_record`
    augout -- optional stream; when truthy, each record is also written
              there in augmented form
    logfile -- stream that receives the summary report
    """
    readset.validate(countgraph, mask=mask, minabund=minabund)

    # Count from 1 with a default of 0 so that an empty read set is reported
    # as zero reads written instead of one.
    nwritten = 0
    for nwritten, record in enumerate(readset, 1):
        khmer.utils.write_record(record, outfile)
        if augout:
            kevlar.print_augmented_fastx(record, augout)

    int_distinct = readset.masked[0] + readset.lowabund[0] + readset.valid[0]
    int_instances = readset.masked[1] + readset.lowabund[1] + readset.valid[1]

    def tally(counts, outcome):
        # One report line for a (distinct, instances) pair.
        return '{:d} instances of {:d} distinct k-mers {}'.format(
            counts[1], counts[0], outcome
        )

    report_lines = [
        ' processed {:d} instances'.format(int_instances)
        + ' of {:d} distinct "interesting" k-mers'.format(int_distinct)
        + ' in {:d} reads'.format(len(readset)),
        tally(readset.masked, 'masked by the reference genome'),
        tally(readset.lowabund, 'discarded due to low abundance'),
        tally(readset.valid, 'validated as novel'),
        '{:d} reads'.format(readset.discarded)
        + ' with no surviving valid k-mers ignored',
        '{:d} reads written to output'.format(nwritten),
    ]
    print('\n '.join(report_lines), file=logfile)
def main(args):
    """Entry point for the augment step.

    Parses the records in `args.augseqs` and `args.seqs`, passes both
    streams to `augment`, and writes every record it yields to `args.out`.
    """
    augmented_in = kevlar.open(args.augseqs, 'r')
    augseqs = kevlar.parse_augmented_fastx(augmented_in)

    naked_in = kevlar.open(args.seqs, 'r')
    nakedseqs = kevlar.parse_augmented_fastx(naked_in)

    outstream = kevlar.open(args.out, 'w')
    annotated = augment(augseqs, nakedseqs)
    for record in annotated:
        kevlar.print_augmented_fastx(record, outstream)
'--out-pattern', metavar='REGEX', help='out file name pattern with a {} placeholder for partition ID') parser.add_argument('augfastx') parser.add_argument('partition', nargs='+') args = parser.parse_args() if args.out and args.out_pattern: raise Exception('cannot give outfile and outpattern together') elif args.out and not args.out_pattern: args.out = kevlar.open(args.out, 'w') elif not args.out and not args.out_pattern: args.out = sys.stdout partids = set(args.partition) fh = kevlar.open(args.augfastx, 'r') reader = kevlar.parse_augmented_fastx(fh) preader = kevlar.parse_partitioned_reads(reader) for partid, partition in preader: if partid not in partids: continue if args.out_pattern: pattern = str(args.out_pattern) outfile = pattern.format(partid) with kevlar.open(outfile, 'w') as out: for read in partition: kevlar.print_augmented_fastx(read, out) else: for read in partition: kevlar.print_augmented_fastx(read, args.out)
def main(args):
    """Entry point for fermi-lite based assembly.

    Parses augmented records from `args.augfastq`, hands them to
    `assemble_fml_asm`, and writes every contig it yields to `args.out`.
    """
    infile = kevlar.open(args.augfastq, 'r')
    reads = kevlar.parse_augmented_fastx(infile)
    outstream = kevlar.open(args.out, 'w')

    contigs = assemble_fml_asm(reads)
    for contig in contigs:
        kevlar.print_augmented_fastx(contig, outstream)
def split(pstream, outstreams):
    """Distribute partitions over `outstreams` in round-robin order.

    The first partition goes to the first stream, the second to the second,
    and so on, wrapping around to the first stream again once every stream
    has been used. All reads of a partition are written to the same stream.
    """
    rotation = cycle(outstreams)
    for reads, sink in zip(pstream, rotation):
        for entry in reads:
            kevlar.print_augmented_fastx(entry, sink)