def main():
    """Trim reads on k-mer abundance using a saved counting hash.

    Command line: ``sys.argv[1]`` is the saved counting hash and
    ``sys.argv[2:]`` are the sequence files to filter.  For every input
    file the kept records are written to ``<infile>.abundfilt``.
    """
    counting_ht = sys.argv[1]
    infiles = sys.argv[2:]

    # Single-argument print(...) calls emit the same text as the former
    # print statements on Python 2 and also parse on Python 3.
    print('file with ht: %s' % counting_ht)
    print('-- settings:')
    print('N THREADS %s' % WORKER_THREADS)
    print('--')

    print('making hashtable')
    ht = khmer.new_counting_hash(K, 1, 1)
    ht.load(counting_ht)

    # Defined once: the filter does not depend on the file being processed.
    def process_fn(record, ht=ht):
        """Return ``(name, trimmed_seq)`` to keep a record, else
        ``(None, None)``."""
        name = record['name']
        seq = record['sequence']
        # Records whose sequence contains 'N' are rejected outright.
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = ht.trim_on_abundance(seq, 2)
        # Keep the read only if at least K bases remain after trimming.
        if trim_at >= K:
            return name, trim_seq
        return None, None

    for infile in infiles:
        print('filtering %s' % infile)
        outfile = infile + '.abundfilt'
        # Close (and flush) the output deterministically instead of
        # leaking the handle.
        # NOTE(review): relies on tsp.start() returning only after all
        # output has been written -- confirm against thread_utils.
        with open(outfile, 'w') as outfp:
            tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS,
                                            GROUPSIZE)
            tsp.start(verbose_fasta_iter(infile), outfp)
def main():
    """Trim reads with ``trim_on_sodd`` using a saved hashbits table.

    Command line: ``sys.argv[1]`` is the saved hashbits file and
    ``sys.argv[2:]`` are the sequence files to filter.  Output for each
    input goes to ``<basename of input>.sodd`` in the current directory.
    """
    htfile = sys.argv[1]
    # These are the files that are read; each gets its own output file.
    infiles = sys.argv[2:]

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print('loading hashbits')
    ht = khmer.load_hashbits(htfile)

    def process_fn(record, ht=ht):
        """Return ``(name, trimmed_seq)`` to keep a record, else
        ``(None, None)``."""
        name = record['name']
        seq = record['sequence']
        # Records whose sequence contains 'N' are rejected outright.
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = ht.trim_on_sodd(seq, MAX_SODD)
        # Keep the read only if at least one full k-mer length remains.
        if trim_at >= ht.ksize():
            return name, trim_seq
        return None, None

    for filename in infiles:
        outpath = os.path.basename(filename) + '.sodd'
        # Close (and flush) the output deterministically instead of
        # leaking the handle.
        # NOTE(review): relies on tsp.start() returning only after all
        # output has been written -- confirm against thread_utils.
        with open(outpath, 'w') as outfp:
            tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS,
                                            GROUPSIZE)
            tsp.start(verbose_fasta_iter(filename), outfp)
def main():
    """Trim reads with ``trim_below_abundance`` using a saved counting hash.

    Command line: ``sys.argv[1]`` is the saved counting hash and
    ``sys.argv[2:]`` are the sequence files to filter.  Output for each
    input goes to ``<basename of input>.below`` in the current directory.
    """
    counting_ht = sys.argv[1]
    infiles = sys.argv[2:]

    # Single-argument print(...) calls emit the same text as the former
    # print statements on Python 2 and also parse on Python 3.
    print('file with ht: %s' % counting_ht)
    print('-- settings:')
    print('N THREADS %s' % WORKER_THREADS)
    print('--')

    print('making hashtable')
    ht = khmer.load_counting_hash(counting_ht)
    # Use the k-mer size stored in the loaded table.
    K = ht.ksize()

    # Defined once: the filter does not depend on the file being processed.
    def process_fn(record, ht=ht):
        """Return ``(name, trimmed_seq)`` to keep a record, else
        ``(None, None)``."""
        name = record['name']
        seq = record['sequence']
        # Records whose sequence contains 'N' are rejected outright.
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = ht.trim_below_abundance(seq, CUTOFF)
        # Keep the read only if at least K bases remain after trimming.
        if trim_at >= K:
            return name, trim_seq
        return None, None

    for infile in infiles:
        print('filtering %s' % infile)
        outfile = os.path.basename(infile) + '.below'
        # Close (and flush) the output deterministically instead of
        # leaking the handle.
        # NOTE(review): relies on tsp.start() returning only after all
        # output has been written -- confirm against thread_utils.
        with open(outfile, 'w') as outfp:
            tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS,
                                            GROUPSIZE)
            tsp.start(verbose_fasta_iter(infile), outfp)
def main():
    """Trim reads with ``trim_below_abundance`` using a saved countgraph.

    Command line: ``sys.argv[1]`` is the saved countgraph and
    ``sys.argv[2:]`` are the sequence files to filter.  Output for each
    input goes to ``<basename of input>.below`` in the current directory.
    """
    counting_ht = sys.argv[1]
    infiles = sys.argv[2:]

    print('file with ht: %s' % counting_ht)
    print('-- settings:')
    print('N THREADS', WORKER_THREADS)
    print('--')

    print('making hashtable')
    ht = khmer.load_countgraph(counting_ht)
    # Use the k-mer size stored in the loaded table.
    K = ht.ksize()

    # Defined once: the filter does not depend on the file being processed.
    def process_fn(record, ht=ht):
        """Return ``(name, trimmed_seq)`` to keep a record, else
        ``(None, None)``."""
        name = record['name']
        seq = record['sequence']
        # Records whose sequence contains 'N' are rejected outright.
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = ht.trim_below_abundance(seq, CUTOFF)
        # Keep the read only if at least K bases remain after trimming.
        if trim_at >= K:
            return name, trim_seq
        return None, None

    for infile in infiles:
        print('filtering', infile)
        outfile = os.path.basename(infile) + '.below'
        # Close (and flush) the output deterministically instead of
        # leaking the handle.
        # NOTE(review): relies on tsp.start() returning only after all
        # output has been written -- confirm against thread_utils.
        with open(outfile, 'w') as outfp:
            tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS,
                                            GROUPSIZE)
            tsp.start(verbose_fasta_iter(infile), outfp)
def main():
    """Trim reads with ``trim_on_sodd`` using a saved hashbits table.

    Command line: ``sys.argv[1]`` is the saved hashbits file and
    ``sys.argv[2:]`` are the sequence files to filter.  Output for each
    input goes to ``<basename of input>.sodd`` in the current directory.
    """
    htfile = sys.argv[1]
    # These are the files that are read; each gets its own output file.
    infiles = sys.argv[2:]

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print('loading hashbits')
    ht = khmer.load_hashbits(htfile)

    def process_fn(record, ht=ht):
        """Return ``(name, trimmed_seq)`` to keep a record, else
        ``(None, None)``."""
        name = record['name']
        seq = record['sequence']
        # Records whose sequence contains 'N' are rejected outright.
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = ht.trim_on_sodd(seq, MAX_SODD)
        # Keep the read only if at least one full k-mer length remains.
        if trim_at >= ht.ksize():
            return name, trim_seq
        return None, None

    for filename in infiles:
        outpath = os.path.basename(filename) + '.sodd'
        # Close (and flush) the output deterministically instead of
        # leaking the handle.
        # NOTE(review): relies on tsp.start() returning only after all
        # output has been written -- confirm against thread_utils.
        with open(outpath, 'w') as outfp:
            tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS,
                                            GROUPSIZE)
            tsp.start(verbose_fasta_iter(filename), outfp)
def main():
    """Drop reads that ``trim_on_density_explosion`` would trim.

    Command line:
        ``sys.argv[1]``  file of representative artifacts, loaded into the
                         hashbits table.
        ``sys.argv[2]``  optional file to filter (defaults to argv[1]).
        ``sys.argv[3]``  optional output path (defaults to
                         ``<basename of input>.loess``).
    """
    repfile = sys.argv[1]
    infile = sys.argv[1]
    if len(sys.argv) >= 3:
        infile = sys.argv[2]

    outfile = os.path.basename(infile) + '.loess'
    if len(sys.argv) >= 4:
        outfile = sys.argv[3]

    # Single-argument print(...) calls emit the same text as the former
    # print statements on Python 2 and also parse on Python 3.
    print('file with representative artifacts: %s' % repfile)
    print('input file to degree filter: %s' % infile)
    print('filtering to output: %s' % outfile)
    print('-- settings:')
    print('K %s' % K)
    print('HASHTABLE SIZE %g' % HASHTABLE_SIZE)
    print('N HASHTABLES %d' % N_HT)
    print('N THREADS %s' % WORKER_THREADS)
    print('RADIUS %s' % RADIUS)
    print('MAX DENSITY %s' % (MAX_VOLUME / RADIUS))
    print('--')

    print('making hashtable')
    ht = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT)

    def process_fn(record, ht=ht):
        """Return ``(name, seq)`` for untrimmed reads, else
        ``(None, None)``."""
        name = record['name']
        seq = record['sequence']
        # Records whose sequence contains 'N' are rejected outright.
        if 'N' in seq:
            return None, None

        _, trim_at = ht.trim_on_density_explosion(seq, RADIUS, MAX_VOLUME)

        # Only reads that were not trimmed at all are kept, and they are
        # emitted in full; partially trimmed reads are discarded.
        if trim_at == len(seq):
            return name, seq
        return None, None

    # The output is opened before the table is filled, as before, but is
    # now closed (and flushed) deterministically instead of being leaked.
    # NOTE(review): relies on tsp.start() returning only after all output
    # has been written -- confirm against thread_utils.
    with open(outfile, 'w') as outfp:
        print('eating %s' % repfile)
        ht.consume_fasta(repfile)

        tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS,
                                        GROUPSIZE)
        tsp.start(verbose_fasta_iter(infile), outfp)
def main():
    """Drop reads that ``trim_on_density_explosion`` would trim.

    Command line:
        ``sys.argv[1]``  file of representative artifacts, loaded into the
                         hashbits table.
        ``sys.argv[2]``  optional file to filter (defaults to argv[1]).
        ``sys.argv[3]``  optional output path (defaults to
                         ``<basename of input>.loess``).
    """
    repfile = sys.argv[1]
    infile = sys.argv[1]
    if len(sys.argv) >= 3:
        infile = sys.argv[2]

    outfile = os.path.basename(infile) + '.loess'
    if len(sys.argv) >= 4:
        outfile = sys.argv[3]

    # Single-argument print(...) calls emit the same text as the former
    # print statements on Python 2 and also parse on Python 3.
    print('file with representative artifacts: %s' % repfile)
    print('input file to degree filter: %s' % infile)
    print('filtering to output: %s' % outfile)
    print('-- settings:')
    print('K %s' % K)
    print('HASHTABLE SIZE %g' % HASHTABLE_SIZE)
    print('N HASHTABLES %d' % N_HT)
    print('N THREADS %s' % WORKER_THREADS)
    print('RADIUS %s' % RADIUS)
    print('MAX DENSITY %s' % (MAX_VOLUME / RADIUS))
    print('--')

    print('making hashtable')
    ht = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT)

    def process_fn(record, ht=ht):
        """Return ``(name, seq)`` for untrimmed reads, else
        ``(None, None)``."""
        name = record['name']
        seq = record['sequence']
        # Records whose sequence contains 'N' are rejected outright.
        if 'N' in seq:
            return None, None

        _, trim_at = ht.trim_on_density_explosion(seq, RADIUS, MAX_VOLUME)

        # Only reads that were not trimmed at all are kept, and they are
        # emitted in full; partially trimmed reads are discarded.
        if trim_at == len(seq):
            return name, seq
        return None, None

    # The output is opened before the table is filled, as before, but is
    # now closed (and flushed) deterministically instead of being leaked.
    # NOTE(review): relies on tsp.start() returning only after all output
    # has been written -- confirm against thread_utils.
    with open(outfile, 'w') as outfp:
        print('eating %s' % repfile)
        ht.consume_fasta(repfile)

        tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS,
                                        GROUPSIZE)
        tsp.start(verbose_fasta_iter(infile), outfp)
def main():
    """Drop reads that ``trim_on_density_explosion`` would trim.

    Command line:
        ``sys.argv[1]``  file of representative artifacts, loaded into the
                         hashbits table.
        ``sys.argv[2]``  optional file to filter (defaults to argv[1]).
        ``sys.argv[3]``  optional output path (defaults to
                         ``<basename of input>.loess``).
    """
    repfile = sys.argv[1]
    infile = sys.argv[1]
    if len(sys.argv) >= 3:
        infile = sys.argv[2]

    outfile = os.path.basename(infile) + ".loess"
    if len(sys.argv) >= 4:
        outfile = sys.argv[3]

    # Single-argument print(...) calls emit the same text as the former
    # print statements on Python 2 and also parse on Python 3.
    print("file with representative artifacts: %s" % repfile)
    print("input file to degree filter: %s" % infile)
    print("filtering to output: %s" % outfile)
    print("-- settings:")
    print("K %s" % K)
    print("HASHTABLE SIZE %g" % HASHTABLE_SIZE)
    print("N HASHTABLES %d" % N_HT)
    print("N THREADS %s" % WORKER_THREADS)
    print("RADIUS %s" % RADIUS)
    print("MAX DENSITY %s" % (MAX_VOLUME / RADIUS))
    print("--")

    print("making hashtable")
    ht = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT)

    def process_fn(record, ht=ht):
        """Return ``(name, seq)`` for untrimmed reads, else
        ``(None, None)``."""
        name = record["name"]
        seq = record["sequence"]
        # Records whose sequence contains "N" are rejected outright.
        if "N" in seq:
            return None, None

        _, trim_at = ht.trim_on_density_explosion(seq, RADIUS, MAX_VOLUME)

        # Only reads that were not trimmed at all are kept, and they are
        # emitted in full; partially trimmed reads are discarded.
        if trim_at == len(seq):
            return name, seq
        return None, None

    # The output is opened before the table is filled, as before, but is
    # now closed (and flushed) deterministically instead of being leaked.
    # NOTE(review): relies on tsp.start() returning only after all output
    # has been written -- confirm against thread_utils.
    with open(outfile, "w") as outfp:
        print("eating %s" % repfile)
        ht.consume_fasta(repfile)

        tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS,
                                        GROUPSIZE)
        tsp.start(verbose_fasta_iter(infile), outfp)
def main():
    """Filter reads against a set of stop tags, keeping the remainder.

    Command line:
        ``sys.argv[1]``  stop-tags file, loaded into the hashbits table.
        ``sys.argv[2]``  sequence file to filter.
        ``sys.argv[3]``  optional output path (defaults to
                         ``<basename of input>.stopkeep``).
    """
    stoptags = sys.argv[1]
    infile = sys.argv[2]
    outfile = os.path.basename(infile) + '.stopkeep'
    if len(sys.argv) >= 4:
        outfile = sys.argv[3]

    # Single-argument print(...) calls emit the same text as the former
    # print statements on Python 2 and also parse on Python 3.
    print('file with stop tags: %s' % stoptags)
    print('input file to filter: %s' % infile)
    print('filtering to output: %s' % outfile)
    print('-- settings:')
    print('K %s' % K)
    print('N THREADS %s' % WORKER_THREADS)
    print('--')

    print('making hashtable')
    ht = khmer.new_hashbits(K, 1, 1)
    ht.load_stop_tags(stoptags)

    def process_fn(record, ht=ht):
        """Return ``(name, seq)`` to emit for a record, else
        ``(None, None)``."""
        name = record['name']
        seq = record['sequence']
        # Records whose sequence contains 'N' are rejected outright.
        if 'N' in seq:
            return None, None

        _, trim_at = ht.trim_on_stoptags(seq)

        # A trim position before K passes the read through unchanged.
        if trim_at < K:
            return name, seq

        # Otherwise keep only what follows the trim position, if anything.
        seq = seq[trim_at:]
        if seq:
            return name, seq
        return None, None

    # Close (and flush) the output deterministically instead of leaking
    # the handle.
    # NOTE(review): relies on tsp.start() returning only after all output
    # has been written -- confirm against thread_utils.
    with open(outfile, 'w') as outfp:
        tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS,
                                        GROUPSIZE)
        tsp.start(verbose_fasta_iter(infile), outfp)
def main():
    """Filter reads against a set of stop tags, keeping the remainder.

    Command line:
        ``sys.argv[1]``  stop-tags file, loaded into the hashbits table.
        ``sys.argv[2]``  sequence file to filter.
        ``sys.argv[3]``  optional output path (defaults to
                         ``<basename of input>.stopkeep``).
    """
    stoptags = sys.argv[1]
    infile = sys.argv[2]
    outfile = os.path.basename(infile) + '.stopkeep'
    if len(sys.argv) >= 4:
        outfile = sys.argv[3]

    # Single-argument print(...) calls emit the same text as the former
    # print statements on Python 2 and also parse on Python 3.
    print('file with stop tags: %s' % stoptags)
    print('input file to filter: %s' % infile)
    print('filtering to output: %s' % outfile)
    print('-- settings:')
    print('K %s' % K)
    print('N THREADS %s' % WORKER_THREADS)
    print('--')

    print('making hashtable')
    ht = khmer.new_hashbits(K, 1, 1)
    ht.load_stop_tags(stoptags)

    def process_fn(record, ht=ht):
        """Return ``(name, seq)`` to emit for a record, else
        ``(None, None)``."""
        name = record['name']
        seq = record['sequence']
        # Records whose sequence contains 'N' are rejected outright.
        if 'N' in seq:
            return None, None

        _, trim_at = ht.trim_on_stoptags(seq)

        # A trim position before K passes the read through unchanged.
        if trim_at < K:
            return name, seq

        # Otherwise keep only what follows the trim position, if anything.
        seq = seq[trim_at:]
        if seq:
            return name, seq
        return None, None

    # Close (and flush) the output deterministically instead of leaking
    # the handle.
    # NOTE(review): relies on tsp.start() returning only after all output
    # has been written -- confirm against thread_utils.
    with open(outfile, 'w') as outfp:
        tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS,
                                        GROUPSIZE)
        tsp.start(verbose_fasta_iter(infile), outfp)
def main():
    """Trim reads at the first k-mer with a high ``max_hamming1_count``.

    Command line: ``sys.argv[1]`` is the saved counting hash and
    ``sys.argv[2:]`` are the sequence files to filter.  Output for each
    input goes to ``<basename of input>.ham1filt`` in the current
    directory.
    """
    counting_ht = sys.argv[1]
    infiles = sys.argv[2:]

    # Single-argument print(...) calls emit the same text as the former
    # print statements on Python 2 and also parse on Python 3.
    print('file with ht: %s' % counting_ht)
    print('-- settings:')
    print('N THREADS %s' % WORKER_THREADS)
    print('--')

    print('making hashtable')
    ht = khmer.new_counting_hash(K, 1, 1)
    ht.load(counting_ht)

    # Defined once: the filter does not depend on the file being processed.
    def process_fn(record, ht=ht):
        """Return ``(name, seq)`` (possibly trimmed) to keep a record,
        else ``(None, None)``."""
        name = record['name']
        seq = record['sequence']
        # Records whose sequence contains 'N' are rejected outright.
        if 'N' in seq:
            return None, None

        # A sequence of length n has n - K + 1 k-mers; the "+ 1" makes
        # sure the final k-mer is inspected too (it used to be skipped).
        for pos in range(len(seq) - K + 1):
            kmer = seq[pos:pos + K]
            if ht.max_hamming1_count(kmer) > 2000:
                # Cut so the last base of the offending k-mer is removed.
                trim_at = pos + K - 1
                seq = seq[:trim_at]
                break

        # Keep the read only if at least one full k-mer remains.
        if len(seq) >= K:
            return name, seq
        return None, None

    for infile in infiles:
        print('filtering %s' % infile)
        outfile = os.path.basename(infile) + '.ham1filt'
        # Close (and flush) the output deterministically instead of
        # leaking the handle.
        # NOTE(review): relies on tsp.start() returning only after all
        # output has been written -- confirm against thread_utils.
        with open(outfile, 'w') as outfp:
            tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS,
                                            GROUPSIZE)
            tsp.start(verbose_fasta_iter(infile), outfp)
def main():
    """Trim reads at the first k-mer with a high ``max_hamming1_count``.

    Command line: ``sys.argv[1]`` is the saved counting hash and
    ``sys.argv[2:]`` are the sequence files to filter.  Output for each
    input goes to ``<basename of input>.ham1filt`` in the current
    directory.
    """
    counting_ht = sys.argv[1]
    infiles = sys.argv[2:]

    # Single-argument print(...) calls emit the same text as the former
    # print statements on Python 2 and also parse on Python 3.
    print('file with ht: %s' % counting_ht)
    print('-- settings:')
    print('N THREADS %s' % WORKER_THREADS)
    print('--')

    print('making hashtable')
    ht = khmer.new_counting_hash(K, 1, 1)
    ht.load(counting_ht)

    # Defined once: the filter does not depend on the file being processed.
    def process_fn(record, ht=ht):
        """Return ``(name, seq)`` (possibly trimmed) to keep a record,
        else ``(None, None)``."""
        name = record['name']
        seq = record['sequence']
        # Records whose sequence contains 'N' are rejected outright.
        if 'N' in seq:
            return None, None

        # A sequence of length n has n - K + 1 k-mers; the "+ 1" makes
        # sure the final k-mer is inspected too (it used to be skipped).
        for pos in range(len(seq) - K + 1):
            kmer = seq[pos:pos + K]
            if ht.max_hamming1_count(kmer) > 2000:
                # Cut so the last base of the offending k-mer is removed.
                trim_at = pos + K - 1
                seq = seq[:trim_at]
                break

        # Keep the read only if at least one full k-mer remains.
        if len(seq) >= K:
            return name, seq
        return None, None

    for infile in infiles:
        print('filtering %s' % infile)
        outfile = os.path.basename(infile) + '.ham1filt'
        # Close (and flush) the output deterministically instead of
        # leaking the handle.
        # NOTE(review): relies on tsp.start() returning only after all
        # output has been written -- confirm against thread_utils.
        with open(outfile, 'w') as outfp:
            tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS,
                                            GROUPSIZE)
            tsp.start(verbose_fasta_iter(infile), outfp)
def main():
    """Keep only reads whose minimum k-mer count is at least 2.

    Every file named in ``sys.argv[1:]`` is first loaded into one shared
    counting hash; each file is then filtered against that table and the
    result written to ``<basename of input>.f2``.  Inputs whose output
    file already exists are skipped.
    """
    filenames = sys.argv[1:]

    # Single-argument print(...) calls emit the same text as the former
    # print statements on Python 2 and also parse on Python 3.
    print('-- settings:')
    print('K %s' % K)
    print('N THREADS %s' % WORKER_THREADS)
    print('--')

    print('making hashtable')
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    # Pass 1: count k-mers across all inputs.
    for filename in filenames:
        print('consuming input %s' % filename)
        ht.consume_fasta(filename)

    def process_fn(record, ht=ht):
        """Return ``(name, seq)`` to keep a record, else
        ``(None, None)``."""
        name = record['name']
        seq = record['sequence']
        # Records whose sequence contains 'N' are rejected outright.
        if 'N' in seq:
            return None, None

        # Reads shorter than K are rejected before querying the table.
        if len(seq) < K:
            return None, None

        if ht.get_min_count(seq) < 2:
            return None, None

        return name, seq

    # Pass 2: filter every input against the combined counts.
    for filename in filenames:
        print('*** %s' % filename)
        outfile = os.path.basename(filename) + '.f2'
        if os.path.exists(outfile):
            print('SKIPPING %s  -- already exists' % outfile)
            continue

        # Close (and flush) the output deterministically instead of
        # leaking the handle.
        # NOTE(review): relies on tsp.start() returning only after all
        # output has been written -- confirm against thread_utils.
        with open(outfile, 'w') as outfp:
            tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS,
                                            GROUPSIZE)
            tsp.start(verbose_fasta_iter(filename), outfp)
def main():
    """Keep only reads whose minimum k-mer count is at least 2.

    Every file named in ``sys.argv[1:]`` is first loaded into one shared
    counting hash; each file is then filtered against that table and the
    result written to ``<basename of input>.f2``.  Inputs whose output
    file already exists are skipped.
    """
    filenames = sys.argv[1:]

    # Single-argument print(...) calls emit the same text as the former
    # print statements on Python 2 and also parse on Python 3.
    print('-- settings:')
    print('K %s' % K)
    print('N THREADS %s' % WORKER_THREADS)
    print('--')

    print('making hashtable')
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    # Pass 1: count k-mers across all inputs.
    for filename in filenames:
        print('consuming input %s' % filename)
        ht.consume_fasta(filename)

    def process_fn(record, ht=ht):
        """Return ``(name, seq)`` to keep a record, else
        ``(None, None)``."""
        name = record['name']
        seq = record['sequence']
        # Records whose sequence contains 'N' are rejected outright.
        if 'N' in seq:
            return None, None

        # Reads shorter than K are rejected before querying the table.
        if len(seq) < K:
            return None, None

        if ht.get_min_count(seq) < 2:
            return None, None

        return name, seq

    # Pass 2: filter every input against the combined counts.
    for filename in filenames:
        print('*** %s' % filename)
        outfile = os.path.basename(filename) + '.f2'
        if os.path.exists(outfile):
            print('SKIPPING %s  -- already exists' % outfile)
            continue

        # Close (and flush) the output deterministically instead of
        # leaking the handle.
        # NOTE(review): relies on tsp.start() returning only after all
        # output has been written -- confirm against thread_utils.
        with open(outfile, 'w') as outfp:
            tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS,
                                            GROUPSIZE)
            tsp.start(verbose_fasta_iter(filename), outfp)
def main():
    """Keep only reads whose minimum k-mer count is at least 2.

    Every file named in ``sys.argv[1:]`` is first loaded into one shared
    counting hash; each file is then filtered against that table and the
    result written to ``<basename of input>.f2``.  Inputs whose output
    file already exists are skipped.
    """
    filenames = sys.argv[1:]

    # Single-argument print(...) calls emit the same text as the former
    # print statements on Python 2 and also parse on Python 3.
    print("-- settings:")
    print("K %s" % K)
    print("N THREADS %s" % WORKER_THREADS)
    print("--")

    print("making hashtable")
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    # Pass 1: count k-mers across all inputs.
    for filename in filenames:
        print("consuming input %s" % filename)
        ht.consume_fasta(filename)

    def process_fn(record, ht=ht):
        """Return ``(name, seq)`` to keep a record, else
        ``(None, None)``."""
        name = record["name"]
        seq = record["sequence"]
        # Records whose sequence contains "N" are rejected outright.
        if "N" in seq:
            return None, None

        # Reads shorter than K are rejected before querying the table.
        if len(seq) < K:
            return None, None

        if ht.get_min_count(seq) < 2:
            return None, None

        return name, seq

    # Pass 2: filter every input against the combined counts.
    for filename in filenames:
        print("*** %s" % filename)
        outfile = os.path.basename(filename) + ".f2"
        if os.path.exists(outfile):
            print("SKIPPING %s  -- already exists" % outfile)
            continue

        # Close (and flush) the output deterministically instead of
        # leaking the handle.
        # NOTE(review): relies on tsp.start() returning only after all
        # output has been written -- confirm against thread_utils.
        with open(outfile, "w") as outfp:
            tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS,
                                            GROUPSIZE)
            tsp.start(verbose_fasta_iter(filename), outfp)
def main():
    """Keep reads whose connected-graph size reaches ``THRESHOLD``.

    Command line:
        ``sys.argv[1]``  sequence file; loaded into the graph and then
                         filtered against it.
        ``sys.argv[2]``  optional output path, honoured only when exactly
                         two arguments are given (defaults to
                         ``<basename of input>.graphsize``).
    """
    infile = sys.argv[1]
    outfile = os.path.basename(infile) + '.graphsize'
    if len(sys.argv) == 3:
        outfile = sys.argv[2]

    print('input file to graphsize filter: %s' % infile)
    print('filtering to output:', outfile)
    print('-- settings:')
    print('K', K)
    print('HASHTABLE SIZE %g' % HASHTABLE_SIZE)
    print('N HASHTABLES %d' % N_HT)
    print('THRESHOLD', THRESHOLD)
    print('N THREADS', WORKER_THREADS)
    print('--')

    print('creating ht')
    ht = khmer.Nodegraph(K, HASHTABLE_SIZE, N_HT)
    print('eating fa', infile)
    # The return value was only bound to unused locals, so it is dropped.
    ht.consume_fasta(infile)

    def process_fn(record, ht=ht):
        """Return ``(name, sequence)`` to keep a record, else
        ``(None, None)``."""
        # The graph is probed from the first K bases of the read.
        kmer = record['sequence'][:K]
        size = ht.calc_connected_graph_size(kmer, THRESHOLD)
        if size >= THRESHOLD:
            return record['name'], record['sequence']
        return None, None

    # Close (and flush) the output deterministically instead of leaking
    # the handle.
    # NOTE(review): relies on tsp.start() returning only after all output
    # has been written -- confirm against thread_utils.
    with open(outfile, 'w') as outfp:
        tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS,
                                        GROUPSIZE)
        tsp.start(verbose_fasta_iter(infile), outfp)
def main():
    """Keep reads whose connected-graph size reaches ``THRESHOLD``.

    Command line:
        ``sys.argv[1]``  sequence file; loaded into the hashbits table and
                         then filtered against it.
        ``sys.argv[2]``  optional output path, honoured only when exactly
                         two arguments are given (defaults to
                         ``<basename of input>.graphsize``).
    """
    infile = sys.argv[1]
    outfile = os.path.basename(infile) + '.graphsize'
    if len(sys.argv) == 3:
        outfile = sys.argv[2]

    print('input file to graphsize filter: %s' % infile)
    print('filtering to output:', outfile)
    print('-- settings:')
    print('K', K)
    print('HASHTABLE SIZE %g' % HASHTABLE_SIZE)
    print('N HASHTABLES %d' % N_HT)
    print('THRESHOLD', THRESHOLD)
    print('N THREADS', WORKER_THREADS)
    print('--')

    print('creating ht')
    ht = khmer.new_hashbits(K, HASHTABLE_SIZE, N_HT)
    print('eating fa', infile)
    # The return value was only bound to unused locals, so it is dropped.
    ht.consume_fasta(infile)

    def process_fn(record, ht=ht):
        """Return ``(name, sequence)`` to keep a record, else
        ``(None, None)``."""
        # The graph is probed from the first K bases of the read.
        kmer = record['sequence'][:K]
        size = ht.calc_connected_graph_size(kmer, THRESHOLD)
        if size >= THRESHOLD:
            return record['name'], record['sequence']
        return None, None

    # Close (and flush) the output deterministically instead of leaking
    # the handle.
    # NOTE(review): relies on tsp.start() returning only after all output
    # has been written -- confirm against thread_utils.
    with open(outfile, 'w') as outfp:
        tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS,
                                        GROUPSIZE)
        tsp.start(verbose_fasta_iter(infile), outfp)