def main():
    """Trim reads at low-abundance k-mers using a previously saved countgraph.

    Command-line driven. The countgraph named by ``args.input_graph`` is
    loaded and every file in ``args.input_filename`` is filtered through it.
    Output goes either to the single ``-o`` file or, per input, to
    ``<basename>.abundfilt`` in the current directory. Progress messages are
    written to stderr.
    """
    info('filter-abund.py', ['counting'])
    args = sanitize_help(get_parser()).parse_args()

    check_input_files(args.input_graph, args.force)
    infiles = args.input_filename

    # Per-input output names are derived from the input name, which is not
    # possible for stdin -- an explicit output file is required in that case.
    reads_stdin = '-' in infiles or '/dev/stdin' in infiles
    if reads_stdin and not args.single_output_file:
        print("Accepting input from stdin; output filename must "
              "be provided with -o.", file=sys.stderr)
        sys.exit(1)

    for path in infiles:
        check_input_files(path, args.force)
    check_space(infiles, args.force)

    print('loading countgraph:', args.input_graph, file=sys.stderr)
    countgraph = khmer.load_countgraph(args.input_graph)
    ksize = countgraph.ksize()
    print("K:", ksize, file=sys.stderr)

    def process_fn(record):
        """Return (name, sequence) to write, or (None, None) to drop."""
        read_name = record.name
        original = record.sequence
        # The graph is queried with every 'N' replaced by 'A'.
        masked = original.replace('N', 'A')

        if args.variable_coverage:
            # Reads whose median count is under the normalization target
            # are passed through without trimming.
            median, _, _ = countgraph.get_median_count(masked)
            if median < args.normalize_to:
                return read_name, original

        _, cut = countgraph.trim_on_abundance(masked, args.cutoff)
        if cut < ksize:
            return None, None
        # Slice the *original* sequence so its 'N's are preserved.
        return read_name, original[:cut]

    single_output = args.single_output_file
    if single_output:
        outfile = single_output.name
        outfp = get_file_writer(single_output, args.gzip, args.bzip)

    # Filter each input in turn.
    for infile in infiles:
        print('filtering', infile, file=sys.stderr)
        if not single_output:
            outfile = os.path.basename(infile) + '.abundfilt'
            raw_fp = open(outfile, 'wb')
            outfp = get_file_writer(raw_fp, args.gzip, args.bzip)

        worker_pool = ThreadedSequenceProcessor(process_fn,
                                                n_workers=args.threads)
        worker_pool.start(verbose_loader(infile), outfp)

        print('output in', outfile, file=sys.stderr)
def main():
    """Filter reads with ``trim_below_abundance`` against a saved countgraph.

    Usage: ``<script> <countgraph> <reads> [<reads> ...]``. For every reads
    file a ``<basename>.below`` file is written to the current directory.
    Reads containing 'N', or whose trim point is below K, are dropped.
    """
    counting_ht = sys.argv[1]
    infiles = sys.argv[2:]

    print('file with ht: %s' % counting_ht)
    print('-- settings:')
    print('N THREADS', WORKER_THREADS)
    print('--')

    print('making hashtable')
    ht = khmer.load_countgraph(counting_ht)
    K = ht.ksize()

    # The filter does not depend on the input file, so define it once.
    def process_fn(record, ht=ht):
        """Return (name, trimmed sequence) to keep, or (None, None) to drop."""
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = ht.trim_below_abundance(seq, CUTOFF)
        if trim_at >= K:
            return name, trim_seq
        return None, None

    for infile in infiles:
        print('filtering', infile)
        outfile = os.path.basename(infile) + '.below'

        # Scope the handle to this run so it is flushed and closed instead of
        # leaking one descriptor per input. Relies on tsp.start() returning
        # only after all output is written (NOTE(review): confirm).
        with open(outfile, 'w') as outfp:
            tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS,
                                            GROUPSIZE)
            tsp.start(verbose_fasta_iter(infile), outfp)
def main(): info("filter-abund-single.py", ["counting"]) args = get_parser().parse_args() check_file_status(args.datafile) check_space([args.datafile]) if args.savetable: check_space_for_hashtable(args.n_tables * args.min_tablesize) report_on_config(args) config = khmer.get_config() config.set_reads_input_buffer_size(args.threads * 64 * 1024) print "making k-mer counting table" htable = khmer.new_counting_hash(args.ksize, args.min_tablesize, args.n_tables, args.threads) # first, load reads into hash table rparser = khmer.ReadParser(args.datafile, args.threads) threads = [] print "consuming input, round 1 --", args.datafile for _ in xrange(args.threads): cur_thread = threading.Thread(target=htable.consume_fasta_with_reads_parser, args=(rparser,)) threads.append(cur_thread) cur_thread.start() for _ in threads: _.join() fp_rate = khmer.calc_expected_collisions(htable) print "fp rate estimated to be %1.3f" % fp_rate # now, trim. # the filtering function. def process_fn(record): name = record["name"] seq = record["sequence"] if "N" in seq: return None, None trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff) if trim_at >= args.ksize: return name, trim_seq return None, None # the filtering loop print "filtering", args.datafile outfile = os.path.basename(args.datafile) + ".abundfilt" outfp = open(outfile, "w") tsp = ThreadedSequenceProcessor(process_fn) tsp.start(verbose_loader(args.datafile), outfp) print "output in", outfile if args.savetable: print "Saving k-mer counting table filename", args.savetable print "...saving to", args.savetable htable.save(args.savetable)
def main(): counting_ht = sys.argv[1] infiles = sys.argv[2:] print 'file with ht: %s' % counting_ht print '-- settings:' print 'N THREADS', WORKER_THREADS print '--' print 'making hashtable' ht = khmer.new_counting_hash(K, 1, 1) ht.load(counting_ht) for infile in infiles: print 'filtering', infile outfile = infile + '.abundfilt' outfp = open(outfile, 'w') def process_fn(record, ht=ht): name = record['name'] seq = record['sequence'] if 'N' in seq: return None, None trim_seq, trim_at = ht.trim_on_abundance(seq, 2) if trim_at >= K: return name, trim_seq return None, None tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS, GROUPSIZE) tsp.start(verbose_fasta_iter(infile), outfp)
def main(): htfile = sys.argv[1] outfiles = sys.argv[2:] print 'loading hashbits' ht = khmer.load_hashbits(htfile) def process_fn(record, ht=ht): name = record['name'] seq = record['sequence'] if 'N' in seq: return None, None trim_seq, trim_at = ht.trim_on_sodd(seq, MAX_SODD) if trim_at >= ht.ksize(): return name, trim_seq return None, None for filename in outfiles: outpath = os.path.basename(filename) + '.sodd' outfp = open(outpath, 'w') tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS, GROUPSIZE) tsp.start(verbose_fasta_iter(filename), outfp)
def main(): parser = build_counting_multifile_args() parser.add_argument( "--cutoff", "-C", dest="coverage", default=DEFAULT_COVERAGE, type=int, help="Diginorm coverage." ) parser.add_argument( "--max-error-region", "-M", dest="max_error_region", default=DEFAULT_MAX_ERROR_REGION, type=int, help="Max length of error region allowed", ) args = parser.parse_args() counting_ht = args.input_table infiles = args.input_filenames print "file with ht: %s" % counting_ht print "loading hashtable" ht = khmer.load_counting_hash(counting_ht) K = ht.ksize() C = args.coverage max_error_region = args.max_error_region print "K:", K print "C:", C print "max error region:", max_error_region # the filtering function. def process_fn(record): # read_aligner is probably not threadsafe? aligner = khmer.new_readaligner(ht, 1, C, max_error_region) name = record["name"] seq = record["sequence"] seq = seq.replace("N", "A") grXreAlign, reXgrAlign = aligner.align(seq) if len(reXgrAlign) > 0: graph_seq = grXreAlign.replace("-", "") seq = graph_seq return name, seq # the filtering loop for infile in infiles: print "filtering", infile outfile = os.path.basename(infile) + ".corr" outfp = open(outfile, "w") tsp = ThreadedSequenceProcessor(process_fn) tsp.start(verbose_loader(infile), outfp) print "output in", outfile
def main(): parser = build_counting_multifile_args() parser.add_argument( "--cutoff", "-C", dest="cutoff", default=DEFAULT_CUTOFF, type=int, help="Trim at k-mers below this abundance." ) parser.add_argument("-V", "--variable-coverage", action="store_true", dest="variable_coverage", default=False) parser.add_argument( "--normalize-to", "-Z", type=int, dest="normalize_to", help="base variable-coverage cutoff on this median k-mer abundance", default=DEFAULT_NORMALIZE_LIMIT, ) args = parser.parse_args() counting_ht = args.input_table infiles = args.input_filenames print "file with ht: %s" % counting_ht print "loading hashtable" ht = khmer.load_counting_hash(counting_ht) K = ht.ksize() print "K:", K ### the filtering function. def process_fn(record): name = record["name"] seq = record["sequence"] if "N" in seq: return None, None if args.variable_coverage: # only trim when sequence has high enough C med, _, _ = ht.get_median_count(seq) if med < args.normalize_to: return name, seq trim_seq, trim_at = ht.trim_on_abundance(seq, args.cutoff) if trim_at >= K: return name, trim_seq return None, None ### the filtering loop for infile in infiles: print "filtering", infile outfile = os.path.basename(infile) + ".abundfilt" outfp = open(outfile, "w") tsp = ThreadedSequenceProcessor(process_fn) tsp.start(verbose_loader(infile), outfp) print "output in", outfile
def test_basic_fastq_like():
    """Every record read back from the output keeps its '###' quality."""
    reads = [
        screed.Record(name=label, sequence=bases, quality='###')
        for label, bases in (('a', 'AAA'), ('b', 'TTT'))
    ]
    sink = StringIO()

    processor = ThreadedSequenceProcessor(idem, 1, 1, verbose=False)
    processor.start(reads, sink)

    for loaded in load_records_fastq(sink):
        assert loaded['quality'] == '###'
def main(): parser = build_counting_multifile_args() parser.add_argument('--cutoff', '-C', dest='coverage', default=DEFAULT_COVERAGE, type=int, help="Diginorm coverage.") parser.add_argument('--max-error-region', '-M', dest='max_error_region', default=DEFAULT_MAX_ERROR_REGION, type=int, help="Max length of error region allowed") args = parser.parse_args() counting_ht = args.input_table infiles = args.input_filenames print 'file with ht: %s' % counting_ht print 'loading hashtable' ht = khmer.load_counting_hash(counting_ht) K = ht.ksize() C = args.coverage max_error_region = args.max_error_region print "K:", K print "C:", C print "max error region:", max_error_region # the filtering function. def process_fn(record): # read_aligner is probably not threadsafe? aligner = khmer.new_readaligner(ht, 1, C, max_error_region) name = record['name'] seq = record['sequence'] seq = seq.replace('N', 'A') grXreAlign, reXgrAlign = aligner.align(seq) if len(reXgrAlign) > 0: graph_seq = grXreAlign.replace('-', '') seq = graph_seq return name, seq # the filtering loop for infile in infiles: print 'filtering', infile outfile = os.path.basename(infile) + '.corr' outfp = open(outfile, 'w') tsp = ThreadedSequenceProcessor(process_fn) tsp.start(verbose_loader(infile), outfp) print 'output in', outfile
def test_odd():
    """With the ``every_other`` processor only record 'b' reaches the output."""
    records = [
        dict(name='a', sequence='AAA'),
        dict(name='b', sequence='TTT'),
    ]
    sink = StringIO()

    processor = ThreadedSequenceProcessor(every_other, 1, 1, verbose=False)
    processor.start(records, sink)

    loaded = load_records_d(sink)
    assert len(loaded) == 1, loaded
    assert loaded['b'] == 'TTT'
def test_basic_fastq_like():
    """Every record read back from the output keeps its '###' accuracy."""
    records = [
        dict(name=label, sequence=bases, accuracy='###')
        for label, bases in (('a', 'AAA'), ('b', 'TTT'))
    ]
    sink = StringIO()

    processor = ThreadedSequenceProcessor(idem, 1, 1, verbose=False)
    processor.start(records, sink)

    for loaded in load_records_fastq(sink):
        assert loaded['accuracy'] == '###'
def main():
    """Trim reads at low-abundance k-mers using a saved counting table.

    Each file in ``args.input_filename`` is filtered. Output is appended to
    ``args.single_output_filename`` when that is non-empty, otherwise written
    to ``<basename>.abundfilt`` per input. Progress goes to stderr.
    """
    info('filter-abund.py', ['counting'])
    args = get_parser().parse_args()

    check_input_files(args.input_table, args.force)
    infiles = args.input_filename
    for filename in infiles:
        check_input_files(filename, args.force)
    check_space(infiles, args.force)

    print('loading counting table:', args.input_table, file=sys.stderr)
    htable = khmer.load_counting_hash(args.input_table)
    ksize = htable.ksize()
    print("K:", ksize, file=sys.stderr)

    # the filtering function.
    def process_fn(record):
        """Return (name, sequence) to keep, or (None, None) to drop."""
        name = record.name
        seq = record.sequence
        # The table is queried with every 'N' replaced by 'A'.
        seqN = seq.replace('N', 'A')

        if args.variable_coverage:
            # only trim when sequence has high enough C
            med, _, _ = htable.get_median_count(seqN)
            if med < args.normalize_to:
                return name, seq

        _, trim_at = htable.trim_on_abundance(seqN, args.cutoff)
        if trim_at >= ksize:
            # be sure to not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]
        return None, None

    # the filtering loop
    for infile in infiles:
        print('filtering', infile, file=sys.stderr)
        if args.single_output_filename != '':
            # Shared output: append so earlier inputs are not overwritten.
            outfile = args.single_output_filename
            mode = 'a'
        else:
            outfile = os.path.basename(infile) + '.abundfilt'
            mode = 'w'

        # Close the handle after each input; the original reopened the output
        # on every iteration without ever closing it. Relies on tsp.start()
        # returning only after all output is written (NOTE(review): confirm).
        with open(outfile, mode) as outfp:
            tsp = ThreadedSequenceProcessor(process_fn,
                                            n_workers=args.threads)
            tsp.start(verbose_loader(infile), outfp)

        print('output in', outfile, file=sys.stderr)
def main(): repfile = sys.argv[1] infile = sys.argv[1] if len(sys.argv) >= 3: infile = sys.argv[2] outfile = os.path.basename(infile) + '.loess' if len(sys.argv) >= 4: outfile = sys.argv[3] print 'file with representative artifacts: %s' % repfile print 'input file to degree filter: %s' % infile print 'filtering to output:', outfile print '-- settings:' print 'K', K print 'HASHTABLE SIZE %g' % HASHTABLE_SIZE print 'N HASHTABLES %d' % N_HT print 'N THREADS', WORKER_THREADS print 'RADIUS', RADIUS print 'MAX DENSITY', MAX_VOLUME / RADIUS print '--' print 'making hashtable' ht = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT) outfp = open(outfile, 'w') print 'eating', repfile ht.consume_fasta(repfile) def process_fn(record, ht=ht): name = record['name'] seq = record['sequence'] if 'N' in seq: return None, None trim_seq, trim_at = ht.trim_on_density_explosion(seq, RADIUS, MAX_VOLUME) # if trim_at >= K: # return name, trim_seq if trim_at == len(seq): return name, seq return None, None tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS, GROUPSIZE) ### tsp.start(verbose_fasta_iter(infile), outfp)
def test_basic_2thread():
    """With two workers and the ``idem`` processor both records come back."""
    expected = (('a', 'AAA'), ('b', 'TTT'))
    records = [dict(name=label, sequence=bases) for label, bases in expected]
    sink = StringIO()

    processor = ThreadedSequenceProcessor(idem, 2, 1, verbose=False)
    processor.start(records, sink)

    loaded = load_records_d(sink)
    assert len(loaded) == 2, loaded
    assert loaded['a'] == 'AAA'
    assert loaded['b'] == 'TTT'
def test_basic():
    """With one worker and the ``idem`` processor both records come back."""
    expected = (('a', 'AAA'), ('b', 'TTT'))
    reads = [screed.Record(name=label, sequence=bases)
             for label, bases in expected]
    sink = StringIO()

    processor = ThreadedSequenceProcessor(idem, 1, 1, verbose=False)
    processor.start(reads, sink)

    loaded = load_records_d(sink)
    assert len(loaded) == 2, loaded
    assert loaded['a'] == 'AAA'
    assert loaded['b'] == 'TTT'
def main(): repfile = sys.argv[1] infile = sys.argv[1] if len(sys.argv) >= 3: infile = sys.argv[2] outfile = os.path.basename(infile) + ".loess" if len(sys.argv) >= 4: outfile = sys.argv[3] print "file with representative artifacts: %s" % repfile print "input file to degree filter: %s" % infile print "filtering to output:", outfile print "-- settings:" print "K", K print "HASHTABLE SIZE %g" % HASHTABLE_SIZE print "N HASHTABLES %d" % N_HT print "N THREADS", WORKER_THREADS print "RADIUS", RADIUS print "MAX DENSITY", MAX_VOLUME / RADIUS print "--" print "making hashtable" ht = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT) outfp = open(outfile, "w") print "eating", repfile ht.consume_fasta(repfile) def process_fn(record, ht=ht): name = record["name"] seq = record["sequence"] if "N" in seq: return None, None trim_seq, trim_at = ht.trim_on_density_explosion(seq, RADIUS, MAX_VOLUME) # if trim_at >= K: # return name, trim_seq if trim_at == len(seq): return name, seq return None, None tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS, GROUPSIZE) ### tsp.start(verbose_fasta_iter(infile), outfp)
def main():
    """Trim reads at low-abundance k-mers using a saved counting table.

    Each file in ``args.input_filename`` is filtered. Output is appended to
    ``args.single_output_filename`` when that is non-empty, otherwise written
    to ``<basename>.abundfilt`` per input. Progress goes to stderr.
    (Python 2 syntax.)
    """
    info('filter-abund.py', ['counting'])
    args = get_parser().parse_args()

    counting_ht = args.input_table
    infiles = args.input_filename

    for filename in infiles:
        check_file_status(filename, args.force)
    check_space(infiles, args.force)

    print >>sys.stderr, 'loading hashtable'
    htable = khmer.load_counting_hash(counting_ht)
    ksize = htable.ksize()
    print >>sys.stderr, "K:", ksize

    # the filtering function.
    def process_fn(record):
        """Return (name, sequence) to keep, or (None, None) to drop."""
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        if args.variable_coverage:
            # only trim when sequence has high enough C
            med, _, _ = htable.get_median_count(seq)
            if med < args.normalize_to:
                return name, seq

        trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff)
        if trim_at >= ksize:
            return name, trim_seq
        return None, None

    # the filtering loop
    for infile in infiles:
        print >>sys.stderr, 'filtering', infile
        if args.single_output_filename != '':
            # Shared output: append so earlier inputs are not overwritten.
            outfile = args.single_output_filename
            mode = 'a'
        else:
            outfile = os.path.basename(infile) + '.abundfilt'
            mode = 'w'

        # Close the handle after each input; the original reopened the output
        # on every iteration without ever closing it. Relies on tsp.start()
        # returning only after all output is written (NOTE(review): confirm).
        with open(outfile, mode) as outfp:
            tsp = ThreadedSequenceProcessor(process_fn,
                                            n_workers=args.threads)
            tsp.start(verbose_loader(infile), outfp)

        print >>sys.stderr, 'output in', outfile
def main(): parser = build_counting_multifile_args() parser.add_argument('--cutoff', '-C', dest='cutoff', default=DEFAULT_CUTOFF, type=int, help="Trim at k-mers below this abundance.") parser.add_argument('-o', '--outputpath', dest='outputpath', default='.') args = parser.parse_args() counting_ht = args.input_table infiles = args.input_filenames outpath = args.outputpath print 'file with ht: %s' % counting_ht print 'loading hashtable' ht = khmer.load_counting_hash(counting_ht) K = ht.ksize() print "K:", K ### the filtering function. def process_fn(record): name = record['name'] seq = record['sequence'] if 'N' in seq: return None, None trim_seq, trim_at = ht.trim_on_abundance(seq, args.cutoff) if trim_at >= K: return name, trim_seq return None, None ### the filtering loop for infile in infiles: print 'filtering', infile outfile = outpath + '/' + os.path.basename(infile) + '.abundfilt' outfp = open(outfile, 'w') tsp = ThreadedSequenceProcessor(process_fn) tsp.start(verbose_loader(infile), outfp) print 'output in', outfile
def main(): stoptags = sys.argv[1] infile = sys.argv[2] outfile = os.path.basename(infile) + '.stopkeep' if len(sys.argv) >= 4: outfile = sys.argv[3] print 'file with stop tags: %s' % stoptags print 'input file to filter: %s' % infile print 'filtering to output:', outfile print '-- settings:' print 'K', K print 'N THREADS', WORKER_THREADS print '--' print 'making hashtable' ht = khmer.new_hashbits(K, 1, 1) ht.load_stop_tags(stoptags) outfp = open(outfile, 'w') def process_fn(record, ht=ht): name = record['name'] seq = record['sequence'] if 'N' in seq: return None, None trim_seq, trim_at = ht.trim_on_stoptags(seq) if trim_at < K: return name, seq seq = seq[trim_at:] if seq: return name, seq return None, None tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS, GROUPSIZE) ### tsp.start(verbose_fasta_iter(infile), outfp)
def main(): repfile = sys.argv[1] infile = sys.argv[2] outfile = os.path.basename(infile) + '.fno255' if len(sys.argv) >= 4: outfile = sys.argv[3] print 'file to count from: %s' % repfile print 'input file to filter: %s' % infile print 'filtering to output:', outfile print '-- settings:' print 'K', K print 'N THREADS', WORKER_THREADS print '--' print 'making hashtable' ht = khmer.new_counting_hash(K, HT_SIZE, N_HT) print 'consuming input', repfile ht.consume_fasta(repfile) outfp = open(outfile, 'w') def process_fn(record, ht=ht): name = record['name'] seq = record['sequence'] if 'N' in seq: return None, None if len(seq) < K: return None, None if ht.get_max_count(seq) >= 255: return None, None return name, seq tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS, GROUPSIZE) ### tsp.start(verbose_fastq_iter(infile), outfp)
def main(): parser = argparse.ArgumentParser() parser.add_argument('-k', default=DEFAULT_K, type=int, help='k-mer size', dest='ksize') parser.add_argument('stoptags_file') parser.add_argument('input_filenames', nargs='+') args = parser.parse_args() K = args.ksize stoptags = args.stoptags_file infiles = args.input_filenames print 'loading stop tags, with K', K ht = khmer.new_hashbits(K, 1, 1) ht.load_stop_tags(stoptags) def process_fn(record): name = record['name'] seq = record['sequence'] if 'N' in seq: return None, None trim_seq, trim_at = ht.trim_on_stoptags(seq) if trim_at >= K: return name, trim_seq return None, None ### the filtering loop for infile in infiles: print 'filtering', infile outfile = os.path.dirname(infile) + '/' + os.path.basename(infile) + '.stopfilt' outfp = open(outfile, 'w') tsp = ThreadedSequenceProcessor(process_fn) tsp.start(verbose_loader(infile), outfp) print 'output in', outfile
def main(): parser = build_counting_multifile_args() parser.add_argument('--cutoff', '-C', dest='cutoff', default=DEFAULT_CUTOFF, type=int, help="Trim at reads above this median abundance.") args = parser.parse_args() counting_ht = args.input_table infiles = args.input_filenames print 'file with ht: %s' % counting_ht print 'loading hashtable' ht = khmer.load_counting_hash(counting_ht) K = ht.ksize() print "K:", K ### the filtering function. def process_fn(record): name = record['name'] seq = record['sequence'] med, _, _ = ht.get_median_count(seq) if med >= args.cutoff: return name, seq return None, None ### the filtering loop for infile in infiles: print 'filtering', infile outfile = os.path.basename(infile) + '.himed' outfp = open(outfile, 'w') tsp = ThreadedSequenceProcessor(process_fn) tsp.start(verbose_loader(infile), outfp) print 'output in', outfile
def main(): counting_ht = sys.argv[1] infiles = sys.argv[2:] print 'file with ht: %s' % counting_ht print '-- settings:' print 'N THREADS', WORKER_THREADS print '--' print 'making hashtable' ht = khmer.new_counting_hash(K, 1, 1) ht.load(counting_ht) for infile in infiles: print 'filtering', infile outfile = os.path.basename(infile) + '.ham1filt' outfp = open(outfile, 'w') def process_fn(record, ht=ht): name = record['name'] seq = record['sequence'] if 'N' in seq: return None, None for pos in range(len(seq) - K): kmer = seq[pos:pos + K] if ht.max_hamming1_count(kmer) > 2000: trim_at = pos + K - 1 seq = seq[:trim_at] break if len(seq) >= K: return name, seq return None, None tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS, GROUPSIZE) tsp.start(verbose_fasta_iter(infile), outfp)
def main(): info("filter-stoptags.py", ["graph"]) args = get_parser().parse_args() stoptags = args.stoptags_file infiles = args.input_filenames for _ in infiles: check_file_status(_) check_space(infiles) print "loading stop tags, with K", args.ksize htable = khmer.new_hashbits(args.ksize, 1, 1) htable.load_stop_tags(stoptags) def process_fn(record): name = record["name"] seq = record["sequence"] if "N" in seq: return None, None trim_seq, trim_at = htable.trim_on_stoptags(seq) if trim_at >= args.ksize: return name, trim_seq return None, None # the filtering loop for infile in infiles: print "filtering", infile outfile = os.path.basename(infile) + ".stopfilt" outfp = open(outfile, "w") tsp = ThreadedSequenceProcessor(process_fn) tsp.start(verbose_loader(infile), outfp) print "output in", outfile
def main():
    """Trim reads at stop tags, writing ``<basename>.stopfilt`` per input.

    A hashbits table of size ``args.ksize`` is created, the stop tags are
    loaded into it, and each input file is filtered. Reads containing 'N',
    or whose trim point is below ``args.ksize``, are dropped. Progress goes
    to stderr. (Python 2 syntax.)
    """
    info('filter-stoptags.py', ['graph'])
    args = get_parser().parse_args()
    stoptags = args.stoptags_file
    infiles = args.input_filenames

    for filename in infiles:
        check_input_files(filename, args.force)
    check_space(infiles, args.force)

    print >>sys.stderr, 'loading stop tags, with K', args.ksize
    htable = khmer.new_hashbits(args.ksize, 1, 1)
    htable.load_stop_tags(stoptags)

    def process_fn(record):
        """Return (name, trimmed sequence) to keep, or (None, None) to drop."""
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = htable.trim_on_stoptags(seq)
        if trim_at >= args.ksize:
            return name, trim_seq
        return None, None

    # the filtering loop
    for infile in infiles:
        print >>sys.stderr, 'filtering', infile
        outfile = os.path.basename(infile) + '.stopfilt'

        # Close each output before reporting it; the original leaked one
        # handle per input. Relies on tsp.start() returning only after all
        # output is written (NOTE(review): confirm).
        with open(outfile, 'w') as outfp:
            tsp = ThreadedSequenceProcessor(process_fn)
            tsp.start(verbose_loader(infile), outfp)

        print >>sys.stderr, 'output in', outfile
def main(): parser = build_counting_args() parser.add_argument('--coverage', '-C', dest='coverage', default=DEFAULT_COVERAGE, type=int) args = parser.parse_args() counting_ht = args.input_table infiles = args.input_filenames print 'file with ht: %s' % counting_ht print 'loading hashtable' ht = khmer.load_counting_hash(counting_ht) K = ht.ksize() print "K:", K # the filtering function. def process_fn(record): name = record['name'] seq = record['sequence'] med, avg, dev = ht.get_median_count(seq) if random.randint(1, med) > args.coverage: return None, None return name, seq # the filtering loop for infile in infiles: print 'filtering', infile outfile = os.path.basename(infile) + '.medfilt' outfp = open(outfile, 'w') tsp = ThreadedSequenceProcessor(process_fn) tsp.start(verbose_loader(infile), outfp) print 'output in', outfile
def main(): parser = build_counting_multifile_args() parser.add_argument("--coverage", "-C", dest="coverage", default=DEFAULT_COVERAGE, type=int) args = parser.parse_args() counting_ht = args.input_table infiles = args.input_filenames print "file with ht: %s" % counting_ht print "loading hashtable" ht = khmer.load_counting_hash(counting_ht) K = ht.ksize() print "K:", K ### the filtering function. def process_fn(record): name = record["name"] seq = record["sequence"] med, avg, dev = ht.get_median_count(seq) if random.randint(1, med) > args.coverage: return None, None return name, seq ### the filtering loop for infile in infiles: print "filtering", infile outfile = os.path.basename(infile) + ".medfilt" outfp = open(outfile, "w") tsp = ThreadedSequenceProcessor(process_fn) tsp.start(verbose_loader(infile), outfp) print "output in", outfile
def main():
    """Trim reads at stop tags, writing ``<basename>.stopfilt`` per input.

    A ``Nodegraph`` of size ``args.ksize`` is created, the stop tags are
    loaded into it, and each input file is filtered. Reads containing 'N',
    or whose trim point is below ``args.ksize``, are dropped. Progress goes
    to stderr.
    """
    args = sanitize_help(get_parser()).parse_args()
    stoptags = args.stoptags_file
    infiles = args.input_filenames

    for filename in infiles:
        check_input_files(filename, args.force)
    check_space(infiles, args.force)

    print('loading stop tags, with K', args.ksize, file=sys.stderr)
    nodegraph = Nodegraph(args.ksize, 1, 1)
    nodegraph.load_stop_tags(stoptags)

    def process_fn(record):
        """Return (name, trimmed sequence) to keep, or (None, None) to drop."""
        name = record.name
        seq = record.sequence
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = nodegraph.trim_on_stoptags(seq)
        if trim_at >= args.ksize:
            return name, trim_seq
        return None, None

    # the filtering loop
    for infile in infiles:
        print('filtering', infile, file=sys.stderr)
        outfile = os.path.basename(infile) + '.stopfilt'

        # Close each output before reporting it; the original leaked one
        # handle per input. Relies on tsp.start() returning only after all
        # output is written (NOTE(review): confirm).
        with open(outfile, 'w') as outfp:
            tsp = ThreadedSequenceProcessor(process_fn)
            tsp.start(verbose_loader(infile), outfp)

        print('output in', outfile, file=sys.stderr)
def main(): print '-- settings:' print 'K', K print 'N THREADS', WORKER_THREADS print '--' print 'making hashtable' ht = khmer.new_counting_hash(K, HT_SIZE, N_HT) for filename in sys.argv[1:]: print 'consuming input', filename ht.consume_fasta(filename) def process_fn(record, ht=ht): name = record['name'] seq = record['sequence'] if 'N' in seq: return None, None if len(seq) < K: return None, None if ht.get_min_count(seq) < 2: return None, None return name, seq for filename in sys.argv[1:]: print '***', filename outfile = os.path.basename(filename) + '.f2' if os.path.exists(outfile): print 'SKIPPING', outfile, ' -- already exists' continue outfp = open(outfile, 'w') tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS, GROUPSIZE) tsp.start(verbose_fasta_iter(filename), outfp)
def main(): print "-- settings:" print "K", K print "N THREADS", WORKER_THREADS print "--" print "making hashtable" ht = khmer.new_counting_hash(K, HT_SIZE, N_HT) for filename in sys.argv[1:]: print "consuming input", filename ht.consume_fasta(filename) def process_fn(record, ht=ht): name = record["name"] seq = record["sequence"] if "N" in seq: return None, None if len(seq) < K: return None, None if ht.get_min_count(seq) < 2: return None, None return name, seq for filename in sys.argv[1:]: print "***", filename outfile = os.path.basename(filename) + ".f2" if os.path.exists(outfile): print "SKIPPING", outfile, " -- already exists" continue outfp = open(outfile, "w") tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS, GROUPSIZE) tsp.start(verbose_fasta_iter(filename), outfp)
def main(): parser = build_counting_multifile_args() parser.add_argument('--cutoff', '-C', dest='cutoff', default=DEFAULT_CUTOFF, type=int, help="Trim at k-mers below this abundance.") parser.add_argument('-V', '--variable-coverage', action='store_true', dest='variable_coverage', default=False) parser.add_argument('--normalize-to', '-Z', type=int, dest='normalize_to', help='base variable-coverage cutoff on this median' ' k-mer abundance', default=DEFAULT_NORMALIZE_LIMIT) args = parser.parse_args() counting_ht = args.input_table infiles = args.input_filenames print 'file with ht: %s' % counting_ht print 'loading hashtable' ht = khmer.load_counting_hash(counting_ht) K = ht.ksize() print "K:", K # the filtering function. def process_fn(record): name = record['name'] seq = record['sequence'] if 'N' in seq: return None, None if args.variable_coverage: # only trim when sequence has high enough C med, _, _ = ht.get_median_count(seq) if med < args.normalize_to: return name, seq trim_seq, trim_at = ht.trim_on_abundance(seq, args.cutoff) if trim_at >= K: return name, trim_seq return None, None # the filtering loop for infile in infiles: print 'filtering', infile outfile = os.path.basename(infile) + '.abundfilt' outfp = open(outfile, 'w') tsp = ThreadedSequenceProcessor(process_fn) tsp.start(verbose_loader(infile), outfp) print 'output in', outfile
def main():
    """Build a countgraph from one file, then abundance-trim that file.

    The reads in ``args.datafile`` are loaded into a new countgraph using
    ``args.threads`` threads, then the same file is filtered and written to
    ``<basename>.abundfilt``. The graph is saved afterwards when
    ``args.savetable`` is set. Progress goes to stderr.
    """
    info('filter-abund-single.py', ['counting', 'SeqAn'])
    args = get_parser().parse_args()
    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)
    if args.savetable:
        check_space_for_hashtable(args, 'countgraph', args.force)
    report_on_config(args)

    print('making countgraph', file=sys.stderr)
    htable = khmer_args.create_countgraph(args)

    # first, load reads into hash table
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    print('consuming input, round 1 --', args.datafile, file=sys.stderr)
    for _ in range(args.threads):
        cur_thread = threading.Thread(
            target=htable.consume_fasta_with_reads_parser,
            args=(rparser, )
        )
        threads.append(cur_thread)
        cur_thread.start()

    # Wait for every loader thread before the graph is queried.
    for cur_thread in threads:
        cur_thread.join()

    if args.report_total_kmers:
        print('Total number of unique k-mers: {0}'.format(
            htable.n_unique_kmers()), file=sys.stderr)

    fp_rate = khmer.calc_expected_collisions(htable, args.force)
    print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)

    # now, trim.

    def process_fn(record):
        """Return (name, trimmed sequence) to keep, or (None, None) to drop."""
        name = record.name
        seq = record.sequence
        # The graph is queried with every 'N' replaced by 'A'.
        seqN = seq.replace('N', 'A')

        _, trim_at = htable.trim_on_abundance(seqN, args.cutoff)
        if trim_at >= args.ksize:
            # be sure to not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]
        return None, None

    # the filtering loop
    print('filtering', args.datafile, file=sys.stderr)
    outfile = os.path.basename(args.datafile) + '.abundfilt'

    # Close the output before reporting it; the original leaked the handle.
    # Relies on tsp.start() returning only after all output is written
    # (NOTE(review): confirm).
    with open(outfile, 'w') as outfp:
        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(args.datafile), outfp)

    print('output in', outfile, file=sys.stderr)

    if args.savetable:
        print('Saving k-mer counting table filename',
              args.savetable, file=sys.stderr)
        print('...saving to', args.savetable, file=sys.stderr)
        htable.save(args.savetable)
    print('wrote to: ', outfile, file=sys.stderr)
def main():
    """Trim reads at low-abundance k-mers using a previously saved countgraph.

    Command-line driven. The countgraph named by ``args.input_graph`` is
    loaded and every file in ``args.input_filename`` is filtered through it.
    Output goes either to the single ``-o`` file or, per input, to
    ``<basename>.abundfilt`` in the current directory. Messages go through
    the ``log_info`` / ``log_error`` helpers; ``args.quiet`` is passed to
    ``configure_logging`` and turns off the banner and the processor's
    verbose mode.
    """
    args = sanitize_help(get_parser()).parse_args()
    be_verbose = not args.quiet
    if be_verbose:
        info('filter-abund.py', ['counting'])
    configure_logging(args.quiet)

    infiles = args.input_filename

    # Per-input output names are derived from the input name, which is not
    # possible for stdin -- an explicit output file is required in that case.
    reads_stdin = '-' in infiles or '/dev/stdin' in infiles
    if reads_stdin and not args.single_output_file:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    for path in infiles:
        check_input_files(path, args.force)
    check_space(infiles, args.force)

    log_info('loading countgraph: {graph}', graph=args.input_graph)
    countgraph = khmer.load_countgraph(args.input_graph)
    ksize = countgraph.ksize()
    log_info("K: {ksize}", ksize=ksize)

    def process_fn(record):
        """Return (name, sequence) to write, or (None, None) to drop."""
        read_name = record.name
        original = record.sequence
        # The graph is queried with every 'N' replaced by 'A'.
        masked = original.replace('N', 'A')

        if args.variable_coverage:
            # Reads whose median count is under the normalization target
            # are passed through without trimming.
            median, _, _ = countgraph.get_median_count(masked)
            if median < args.normalize_to:
                return read_name, original

        _, cut = countgraph.trim_on_abundance(masked, args.cutoff)
        if cut < ksize:
            return None, None
        # Slice the *original* sequence so its 'N's are preserved.
        return read_name, original[:cut]

    single_output = args.single_output_file
    if single_output:
        outfile = single_output.name
        outfp = get_file_writer(single_output, args.gzip, args.bzip)

    # Filter each input in turn.
    for infile in infiles:
        log_info('filtering {infile}', infile=infile)
        if not single_output:
            outfile = os.path.basename(infile) + '.abundfilt'
            raw_fp = open(outfile, 'wb')
            outfp = get_file_writer(raw_fp, args.gzip, args.bzip)

        worker_pool = ThreadedSequenceProcessor(process_fn,
                                                n_workers=args.threads,
                                                verbose=not args.quiet)
        worker_pool.start(verbose_loader(infile), outfp)

        log_info('output in {outfile}', outfile=outfile)
def main(): parser = build_construct_args( "Filter k-mers at the given abundance (inmem version).") add_threading_args(parser) parser.add_argument('--cutoff', '-C', dest='cutoff', default=DEFAULT_CUTOFF, type=int, help="Trim at k-mers below this abundance.") parser.add_argument('--savehash', dest='savehash', default='') parser.add_argument('datafile') args = parser.parse_args() report_on_config(args) K = args.ksize HT_SIZE = args.min_hashsize N_HT = args.n_hashes n_threads = int(args.n_threads) config = khmer.get_config() bufsz = config.get_reads_input_buffer_size() config.set_reads_input_buffer_size(n_threads * 64 * 1024) print 'making hashtable' ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads) filename = args.datafile # first, load reads into hash table rparser = khmer.ReadParser(filename, n_threads) threads = [] print 'consuming input, round 1 --', filename for tnum in xrange(n_threads): t = \ threading.Thread( target=ht.consume_fasta_with_reads_parser, args=(rparser, ) ) threads.append(t) t.start() for t in threads: t.join() fp_rate = khmer.calc_expected_collisions(ht) print 'fp rate estimated to be %1.3f' % fp_rate # now, trim. # the filtering function. def process_fn(record): name = record['name'] seq = record['sequence'] if 'N' in seq: return None, None trim_seq, trim_at = ht.trim_on_abundance(seq, args.cutoff) if trim_at >= K: return name, trim_seq return None, None # the filtering loop print 'filtering', filename outfile = os.path.basename(filename) + '.abundfilt' outfp = open(outfile, 'w') tsp = ThreadedSequenceProcessor(process_fn) tsp.start(verbose_loader(filename), outfp) print 'output in', outfile if args.savehash: print 'Saving hashfile', args.savehash print '...saving to', args.savehash ht.save(args.savehash)
def main():
    """Count k-mers of a read file in memory, then trim the same reads.

    Checks the input file and available space, builds a counting table
    from the command-line sizing arguments, loads ``args.datafile`` into
    it with ``args.threads`` consumer threads, and writes the
    abundance-trimmed reads to ``<basename of datafile>.abundfilt`` in the
    current directory.  When ``args.savetable`` is set the counting table
    is saved to that path afterwards.

    This block is Python 2 code (print statements, ``xrange``).  All
    progress messages go to stderr.
    """
    info('filter-abund-single.py', ['counting', 'SeqAn'])
    args = get_parser().parse_args()
    check_file_status(args.datafile, args.force)
    check_space([args.datafile], args.force)
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize,
                                  args.force)
    report_on_config(args)

    print >> sys.stderr, 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables)

    # first, load reads into hash table
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    print >> sys.stderr, 'consuming input, round 1 --', args.datafile
    for _ in xrange(args.threads):
        cur_thread = threading.Thread(
            target=htable.consume_fasta_with_reads_parser,
            args=(rparser, ))
        threads.append(cur_thread)
        cur_thread.start()

    for cur_thread in threads:
        cur_thread.join()

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
            htable.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(htable)
    print >> sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        # Reads containing an N are not trimmed; no record is returned.
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff)

        if trim_at >= args.ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    print >> sys.stderr, 'filtering', args.datafile
    outfile = os.path.basename(args.datafile) + '.abundfilt'

    # Close the output before the (possibly slow) table save below.
    # NOTE(review): assumes start() returns only after all output has been
    # written (the original prints 'output in' right after it) -- confirm.
    with open(outfile, 'w') as outfp:
        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(args.datafile), outfp)

    print >> sys.stderr, 'output in', outfile

    if args.savetable:
        print >>sys.stderr, 'Saving k-mer counting table filename', \
            args.savetable
        print >> sys.stderr, '...saving to', args.savetable
        htable.save(args.savetable)
    print >> sys.stderr, 'wrote to: ', outfile
def main():
    """Count k-mers of a read file in memory, then trim the same reads.

    Checks the input file and available space, creates a countgraph from
    the command-line arguments, loads ``args.datafile`` into it with
    ``args.threads`` consumer threads, and writes the abundance-trimmed
    reads to ``args.outfile`` (default: ``<basename of datafile>.abundfilt``
    in the current directory), passed through ``get_file_writer`` with the
    gzip and bzip options.  When ``args.savegraph`` is set the countgraph
    is saved to that path afterwards.
    """
    args = sanitize_help(get_parser()).parse_args()
    if not args.quiet:
        info('filter-abund-single.py', ['counting', 'SeqAn'])

    configure_logging(args.quiet)
    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)

    if args.savegraph:
        tablesize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, tablesize, args.force)

    report_on_config(args)

    log_info('making countgraph')
    graph = khmer_args.create_countgraph(args)

    # first, load reads into graph
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    log_info('consuming input, round 1 -- {datafile}',
             datafile=args.datafile)
    for _ in range(args.threads):
        cur_thread = threading.Thread(
            target=graph.consume_fasta_with_reads_parser,
            args=(rparser, ))
        threads.append(cur_thread)
        cur_thread.start()

    for cur_thread in threads:
        cur_thread.join()

    log_info('Total number of unique k-mers: {nk}',
             nk=graph.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(graph, args.force)
    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
        # Ns are replaced by As only for the graph queries.
        seqN = seq.replace('N', 'A')

        if args.variable_coverage:  # only trim when sequence has high enough C
            med, _, _ = graph.get_median_count(seqN)
            if med < args.normalize_to:
                return name, seq

        _, trim_at = graph.trim_on_abundance(seqN, args.cutoff)

        if trim_at >= args.ksize:
            # be sure to not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]

        # Less than one k-mer survives the trim: hand back no record.
        return None, None

    # the filtering loop
    log_info('filtering {datafile}', datafile=args.datafile)
    if args.outfile is None:
        outfile = os.path.basename(args.datafile) + '.abundfilt'
    else:
        outfile = args.outfile

    with open(outfile, 'wb') as raw_fp:
        outfp = get_file_writer(raw_fp, args.gzip, args.bzip)
        try:
            tsp = ThreadedSequenceProcessor(process_fn,
                                            verbose=not args.quiet)
            # NOTE(review): assumes start() returns only once all output has
            # been written (the original already logs 'output in' right
            # after it) -- confirm against ThreadedSequenceProcessor.
            tsp.start(verbose_loader(args.datafile), outfp)
        finally:
            # Close a compression wrapper before the raw file so its
            # trailing data is flushed while the file is still open.
            if outfp is not raw_fp:
                outfp.close()

    log_info('output in {outfile}', outfile=outfile)

    if args.savegraph:
        log_info('Saving k-mer countgraph filename {graph}',
                 graph=args.savegraph)
        graph.save(args.savegraph)