#! /usr/bin/env python
# Error-correct reads by aligning them to the graph implied by a k-mer
# counting table. Import locations and default values below are assumptions
# based on khmer sandbox scripts of this vintage.
import os
import khmer
from khmer.thread_utils import ThreadedSequenceProcessor, verbose_loader
from khmer.counting_args import build_counting_multifile_args

DEFAULT_COVERAGE = 20            # assumed default
DEFAULT_MAX_ERROR_REGION = 40    # assumed default


def main():
    parser = build_counting_multifile_args()
    parser.add_argument('--cutoff', '-C', dest='coverage',
                        default=DEFAULT_COVERAGE, type=int,
                        help="Diginorm coverage.")
    parser.add_argument('--max-error-region', '-M', dest='max_error_region',
                        default=DEFAULT_MAX_ERROR_REGION, type=int,
                        help="Max length of error region allowed")
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print 'file with ht: %s' % counting_ht
    print 'loading hashtable'
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()
    C = args.coverage
    max_error_region = args.max_error_region

    print "K:", K
    print "C:", C
    print "max error region:", max_error_region

    # the filtering function.
    def process_fn(record):
        # build a fresh aligner per record; read_aligner is probably not
        # threadsafe, and process_fn runs in multiple worker threads.
        aligner = khmer.new_readaligner(ht, 1, C, max_error_region)
        name = record['name']
        seq = record['sequence']
        seq = seq.replace('N', 'A')

        grXreAlign, reXgrAlign = aligner.align(seq)

        if len(reXgrAlign) > 0:
            # alignment succeeded: take the graph-side sequence, gaps removed
            graph_seq = grXreAlign.replace('-', '')
            seq = graph_seq

        return name, seq

    # the filtering loop
    for infile in infiles:
        print 'filtering', infile
        outfile = os.path.basename(infile) + '.corr'
        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print 'output in', outfile


if __name__ == '__main__':
    main()
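# Usage sketch (script name hypothetical; build_counting_multifile_args is
# assumed to register the counting table and read files as positional
# arguments, per args.input_table / args.input_filenames above):
#
#   python correct-errors.py counts.kh reads.fa
#
# Corrected reads are written to ./reads.fa.corr. Note that Ns are replaced
# with As before alignment, so even uncorrected reads may differ from input.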
#! /usr/bin/env python
# Downsample reads to a target median k-mer coverage, and discard reads whose
# k-mer count standard deviation exceeds 100% of the mean. Per-read stats are
# written alongside the filtered output. Import locations and the default
# coverage value are assumptions.
import os
import random
import screed
import khmer
from khmer.counting_args import build_counting_multifile_args

DEFAULT_COVERAGE = 20    # assumed default


def main():
    parser = build_counting_multifile_args()
    parser.add_argument('--coverage', '-C', dest='coverage',
                        default=DEFAULT_COVERAGE, type=int)
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print 'file with ht: %s' % counting_ht
    print 'loading hashtable'
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()
    xxxfp = None    # per-file stats stream; (re)opened in the loop below

    print "K:", K

    ### the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']

        med, avg, dev = ht.get_median_count(seq)
        if avg == 0:
            # no k-mer coverage at all; discard (and avoid dividing by zero)
            return None, None
        pct = dev / avg * 100

        xxxfp.write('%s %s %s %s %s\n' % (med, avg, dev, pct, name))

        # keep reads with probability ~C/med; reads at or below the target
        # coverage always pass (the guard also keeps randint's range valid)
        if (med > args.coverage and
                random.randint(1, med) > args.coverage) or pct > 100:
            return None, None

        return name, seq

    ### the filtering loop
    for infile in infiles:
        print 'filtering', infile
        xxxfp = open(os.path.basename(infile) + '.medpctfilt.stats', 'w')
        outfile = os.path.basename(infile) + '.medpctfilt'
        outfp = open(outfile, 'w')

        for n, record in enumerate(screed.open(infile)):
            if n % 100000 == 0:
                print '...', n

            name, seq = process_fn(record)
            if name and seq:
                print >>outfp, '>%s\n%s' % (name, seq)

        print 'output in', outfile


if __name__ == '__main__':
    main()
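# Design note: the stats file records one line per scored read -- median,
# average, and standard deviation of its k-mer counts, the stddev as a
# percentage of the mean, and the read name. A read is dropped either
# stochastically (with probability 1 - C/med when its median coverage
# exceeds -C) or deterministically when that percentage exceeds 100.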
#! /usr/bin/env python
# Trim reads at low-abundance k-mers, writing results into a chosen output
# directory. Import locations and the default cutoff are assumptions.
import os
import khmer
from khmer.thread_utils import ThreadedSequenceProcessor, verbose_loader
from khmer.counting_args import build_counting_multifile_args

DEFAULT_CUTOFF = 2    # assumed default


def main():
    parser = build_counting_multifile_args()
    parser.add_argument('--cutoff', '-C', dest='cutoff',
                        default=DEFAULT_CUTOFF, type=int,
                        help="Trim at k-mers below this abundance.")
    parser.add_argument('-o', '--outputpath', dest='outputpath', default='.')
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames
    outpath = args.outputpath

    print 'file with ht: %s' % counting_ht
    print 'loading hashtable'
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()

    print "K:", K

    ### the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = ht.trim_on_abundance(seq, args.cutoff)

        # keep the read only if at least one full k-mer survives trimming
        if trim_at >= K:
            return name, trim_seq

        return None, None

    ### the filtering loop
    for infile in infiles:
        print 'filtering', infile
        outfile = os.path.join(outpath,
                               os.path.basename(infile) + '.abundfilt')
        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print 'output in', outfile


if __name__ == '__main__':
    main()
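# Usage sketch (script name hypothetical):
#
#   python filter-abund-output.py -C 2 -o outdir counts.kh reads.fa
#
# writes trimmed reads to outdir/reads.fa.abundfilt. Reads containing N are
# dropped outright rather than trimmed.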
#! /usr/bin/env python
# Keep only reads whose median k-mer abundance meets a cutoff. Import
# locations and the default cutoff are assumptions.
import os
import khmer
from khmer.thread_utils import ThreadedSequenceProcessor, verbose_loader
from khmer.counting_args import build_counting_multifile_args

DEFAULT_CUTOFF = 2    # assumed default


def main():
    parser = build_counting_multifile_args()
    parser.add_argument('--cutoff', '-C', dest='cutoff',
                        default=DEFAULT_CUTOFF, type=int,
                        help="Keep reads with median k-mer abundance at or "
                             "above this cutoff.")
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print 'file with ht: %s' % counting_ht
    print 'loading hashtable'
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()

    print "K:", K

    ### the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']

        med, _, _ = ht.get_median_count(seq)
        if med >= args.cutoff:
            return name, seq

        return None, None

    ### the filtering loop
    for infile in infiles:
        print 'filtering', infile
        outfile = os.path.basename(infile) + '.himed'
        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print 'output in', outfile


if __name__ == '__main__':
    main()
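# Usage sketch (script name hypothetical):
#
#   python filter-high-median.py -C 5 counts.kh reads.fa
#
# keeps reads whose median k-mer count is >= 5 in reads.fa.himed; the rest
# are discarded unmodified (no trimming happens here).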
#! /usr/bin/env python
# Stochastically downsample reads to a target median k-mer coverage, in the
# spirit of digital normalization. Import locations and the default coverage
# value are assumptions.
import os
import random
import khmer
from khmer.thread_utils import ThreadedSequenceProcessor, verbose_loader
from khmer.counting_args import build_counting_multifile_args

DEFAULT_COVERAGE = 20    # assumed default


def main():
    parser = build_counting_multifile_args()
    parser.add_argument('--coverage', '-C', dest='coverage',
                        default=DEFAULT_COVERAGE, type=int)
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print 'file with ht: %s' % counting_ht
    print 'loading hashtable'
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()

    print "K:", K

    ### the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']

        med, _, _ = ht.get_median_count(seq)
        # reads at or below the target coverage always pass; above it, keep
        # with probability ~C/med (the guard also keeps randint's range valid)
        if med > args.coverage and random.randint(1, med) > args.coverage:
            return None, None

        return name, seq

    ### the filtering loop
    for infile in infiles:
        print 'filtering', infile
        outfile = os.path.basename(infile) + '.medfilt'
        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print 'output in', outfile


if __name__ == '__main__':
    main()
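# Design note: P(randint(1, med) > C) = (med - C) / med = 1 - C/med, so a
# read with median coverage med > C survives with probability C/med, giving
# an expected post-filter coverage of roughly C. Unlike true digital
# normalization, the pre-built counting table is never updated as reads pass.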
#! /usr/bin/env python
# Trim reads at low-abundance k-mers; with -V, leave low-coverage reads
# untrimmed so variable-coverage data is not truncated. Import locations and
# the default values are assumptions.
import os
import khmer
from khmer.thread_utils import ThreadedSequenceProcessor, verbose_loader
from khmer.counting_args import build_counting_multifile_args

DEFAULT_CUTOFF = 2              # assumed default
DEFAULT_NORMALIZE_LIMIT = 20    # assumed default


def main():
    parser = build_counting_multifile_args()
    parser.add_argument('--cutoff', '-C', dest='cutoff',
                        default=DEFAULT_CUTOFF, type=int,
                        help="Trim at k-mers below this abundance.")
    parser.add_argument('-V', '--variable-coverage', action='store_true',
                        dest='variable_coverage', default=False)
    parser.add_argument('--normalize-to', '-Z', type=int, dest='normalize_to',
                        help='base variable-coverage cutoff on this median'
                             ' k-mer abundance',
                        default=DEFAULT_NORMALIZE_LIMIT)
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print 'file with ht: %s' % counting_ht
    print 'loading hashtable'
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()

    print "K:", K

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        if args.variable_coverage:
            # only trim sequences that have high enough coverage
            med, _, _ = ht.get_median_count(seq)
            if med < args.normalize_to:
                return name, seq

        trim_seq, trim_at = ht.trim_on_abundance(seq, args.cutoff)

        # keep the read only if at least one full k-mer survives trimming
        if trim_at >= K:
            return name, trim_seq

        return None, None

    # the filtering loop
    for infile in infiles:
        print 'filtering', infile
        outfile = os.path.basename(infile) + '.abundfilt'
        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print 'output in', outfile


if __name__ == '__main__':
    main()
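# Usage sketch (script name hypothetical):
#
#   python filter-abund.py -V -Z 20 -C 2 counts.kh reads.fa
#
# With -V, reads whose median k-mer abundance is below -Z are passed through
# untrimmed, on the assumption that their apparent low abundance reflects
# genuinely low coverage rather than sequencing error.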