def main():
    args = sanitize_help(get_parser()).parse_args()

    output_file = args.graphbase + '.pmap.merged'
    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')

    print('loading %d pmap files (first one: %s)' %
          (len(pmap_files), pmap_files[0]), file=sys.stderr)

    ksize = args.ksize
    nodegraph = khmer.Nodegraph(ksize, 1, 1)

    for _ in pmap_files:
        check_input_files(_, args.force)

    check_space(pmap_files, args.force)

    for pmap_file in pmap_files:
        print('merging', pmap_file, file=sys.stderr)
        nodegraph.merge_subset_from_disk(pmap_file)

    print('saving merged to', output_file, file=sys.stderr)
    nodegraph.save_partitionmap(output_file)

    if args.remove_subsets:
        print('removing pmap files', file=sys.stderr)
        for pmap_file in pmap_files:
            os.unlink(pmap_file)
def main():
    info('filter-abund.py', ['counting'])
    args = sanitize_help(get_parser()).parse_args()

    check_input_files(args.input_graph, args.force)
    infiles = args.input_filename
    if ('-' in infiles or '/dev/stdin' in infiles) and not \
            args.single_output_file:
        print("Accepting input from stdin; output filename must "
              "be provided with -o.", file=sys.stderr)
        sys.exit(1)

    for filename in infiles:
        check_input_files(filename, args.force)

    check_space(infiles, args.force)

    print('loading countgraph:', args.input_graph, file=sys.stderr)
    countgraph = khmer.load_countgraph(args.input_graph)
    ksize = countgraph.ksize()

    print("K:", ksize, file=sys.stderr)

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
        seqN = seq.replace('N', 'A')

        if args.variable_coverage:  # only trim when sequence has high enough C
            med, _, _ = countgraph.get_median_count(seqN)
            if med < args.normalize_to:
                return name, seq

        _, trim_at = countgraph.trim_on_abundance(seqN, args.cutoff)

        if trim_at >= ksize:
            # be sure to not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]

        return None, None

    if args.single_output_file:
        outfile = args.single_output_file.name
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)

    # the filtering loop
    for infile in infiles:
        print('filtering', infile, file=sys.stderr)
        if not args.single_output_file:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = open(outfile, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        tsp = ThreadedSequenceProcessor(process_fn, n_workers=args.threads)
        tsp.start(verbose_loader(infile), outfp)

        print('output in', outfile, file=sys.stderr)
def main():
    info('annotate-partitions.py', ['graph'])
    args = get_parser().parse_args()

    ksize = args.ksize
    filenames = args.input_filenames
    htable = khmer.new_hashbits(ksize, 1, 1)

    partitionmap_file = args.graphbase + '.pmap.merged'

    check_input_files(partitionmap_file, args.force)
    for _ in filenames:
        check_input_files(_, args.force)

    check_space(filenames, args.force)

    print >>sys.stderr, 'loading partition map from:', partitionmap_file
    htable.load_partitionmap(partitionmap_file)

    for infile in filenames:
        print >>sys.stderr, 'outputting partitions for', infile
        outfile = os.path.basename(infile) + '.part'
        part_count = htable.output_partitions(infile, outfile)
        print >>sys.stderr, 'output %d partitions for %s' % (
            part_count, infile)
        print >>sys.stderr, 'partitions are in', outfile
def main(): info("annotate-partitions.py", ["graph"]) args = get_parser().parse_args() ksize = args.ksize filenames = args.input_filenames htable = khmer.Hashbits(ksize, 1, 1) partitionmap_file = args.graphbase + ".pmap.merged" check_input_files(partitionmap_file, args.force) for _ in filenames: check_input_files(_, args.force) check_space(filenames, args.force) print("loading partition map from:", partitionmap_file, file=sys.stderr) htable.load_partitionmap(partitionmap_file) for infile in filenames: print("outputting partitions for", infile, file=sys.stderr) outfile = os.path.basename(infile) + ".part" part_count = htable.output_partitions(infile, outfile) print("output %d partitions for %s" % (part_count, infile), file=sys.stderr) print("partitions are in", outfile, file=sys.stderr)
def main():
    args = sanitize_help(get_parser()).parse_args()

    htfile = args.countgraph
    input_filename = args.input
    output = args.output

    infiles = [htfile, input_filename]
    for infile in infiles:
        check_input_files(infile, args.force)

    check_space(infiles, args.force)

    print('loading k-mer countgraph from', htfile, file=sys.stderr)
    countgraph = load_countgraph(htfile)
    ksize = countgraph.ksize()

    print('writing to', output.name, file=sys.stderr)
    output = csv.writer(output)
    # write headers:
    output.writerow(['name', 'median', 'average', 'stddev', 'seqlen'])

    for record in screed.open(input_filename):
        seq = record.sequence.upper()
        if 'N' in seq:
            seq = seq.replace('N', 'A')

        if ksize <= len(seq):
            medn, ave, stdev = countgraph.get_median_count(seq)
            ave, stdev = [round(x, 9) for x in (ave, stdev)]
            output.writerow([record.name, medn, ave, stdev, len(seq)])
def main():
    info('annotate-partitions.py', ['graph'])
    args = get_parser().parse_args()

    ksize = args.ksize
    filenames = args.input_filenames
    nodegraph = khmer.Nodegraph(ksize, 1, 1)

    partitionmap_file = args.graphbase + '.pmap.merged'

    check_input_files(partitionmap_file, args.force)
    for _ in filenames:
        check_input_files(_, args.force)

    check_space(filenames, args.force)

    print('loading partition map from:', partitionmap_file, file=sys.stderr)
    nodegraph.load_partitionmap(partitionmap_file)

    for infile in filenames:
        print('outputting partitions for', infile, file=sys.stderr)
        outfile = os.path.basename(infile) + '.part'
        part_count = nodegraph.output_partitions(infile, outfile)
        print('output %d partitions for %s' % (
            part_count, infile), file=sys.stderr)
        print('partitions are in', outfile, file=sys.stderr)
def main():
    info('merge-partitions.py', ['graph'])
    args = get_parser().parse_args()

    output_file = args.graphbase + '.pmap.merged'
    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')

    print('loading %d pmap files (first one: %s)' %
          (len(pmap_files), pmap_files[0]), file=sys.stderr)

    ksize = args.ksize
    htable = khmer.new_hashbits(ksize, 1, 1)

    for _ in pmap_files:
        check_input_files(_, args.force)

    check_space(pmap_files, args.force)

    for pmap_file in pmap_files:
        print('merging', pmap_file, file=sys.stderr)
        htable.merge_subset_from_disk(pmap_file)

    print('saving merged to', output_file, file=sys.stderr)
    htable.save_partitionmap(output_file)

    if args.remove_subsets:
        print('removing pmap files', file=sys.stderr)
        for pmap_file in pmap_files:
            os.unlink(pmap_file)
def main():
    info('count-median.py', ['diginorm'])
    args = get_parser().parse_args()

    htfile = args.ctfile
    input_filename = args.input
    output_filename = args.output

    infiles = [htfile, input_filename]
    for infile in infiles:
        check_file_status(infile, args.force)

    check_space(infiles, args.force)

    print >>sys.stderr, 'loading k-mer counting table from', htfile
    htable = khmer.load_counting_hash(htfile)
    ksize = htable.ksize()

    print >>sys.stderr, 'writing to', output_filename
    output = open(output_filename, 'w')

    for record in screed.open(input_filename):
        seq = record.sequence.upper()
        if 'N' in seq:
            seq = seq.replace('N', 'G')

        if ksize <= len(seq):
            medn, ave, stdev = htable.get_median_count(seq)
            print >> output, record.name, medn, ave, stdev, len(seq)
def main(): info("count-median.py", ["diginorm"]) args = sanitize_help(get_parser()).parse_args() htfile = args.countgraph input_filename = args.input output = args.output infiles = [htfile, input_filename] for infile in infiles: check_input_files(infile, args.force) check_space(infiles, args.force) print("loading k-mer countgraph from", htfile, file=sys.stderr) countgraph = load_countgraph(htfile) ksize = countgraph.ksize() print("writing to", output.name, file=sys.stderr) output = csv.writer(output) # write headers: output.writerow(["name", "median", "average", "stddev", "seqlen"]) for record in screed.open(input_filename): seq = record.sequence.upper() if "N" in seq: seq = seq.replace("N", "A") if ksize <= len(seq): medn, ave, stdev = countgraph.get_median_count(seq) ave, stdev = [round(x, 9) for x in (ave, stdev)] output.writerow([record.name, medn, ave, stdev, len(seq)])
def main(args):
    info('build-graph.py', ['graph', 'SeqAn'])

    report_on_config(args, hashtype='nodegraph')
    base = args.output_filename
    filenames = args.input_filenames

    for fname in args.input_filenames:
        check_input_files(fname, args.force)

    # if optimization args are given, do optimization
    args = functions.do_sanity_checking(args, 0.01)

    check_space(args.input_filenames, args.force)
    check_space_for_hashtable(args, 'nodegraph', args.force)

    print('Saving k-mer presence table to %s' % base, file=sys.stderr)
    print('Loading kmers from sequences in %s' % repr(filenames),
          file=sys.stderr)
    if args.no_build_tagset:
        print('We WILL NOT build the tagset.', file=sys.stderr)
    else:
        print('We WILL build the tagset (for partitioning/traversal).',
              file=sys.stderr)

    print('making nodegraph', file=sys.stderr)
    htable = khmer_args.create_nodegraph(args)

    functions.build_graph(filenames, htable, args.threads,
                          not args.no_build_tagset)

    print('Total number of unique k-mers: {0}'.format(
        htable.n_unique_kmers()), file=sys.stderr)

    print('saving k-mer presence table in', base + '.pt', file=sys.stderr)
    htable.save(base + '.pt')

    if not args.no_build_tagset:
        print('saving tagset in', base + '.tagset', file=sys.stderr)
        htable.save_tagset(base + '.tagset')

    info_fp = open(base + '.info', 'w')
    info_fp.write('%d unique k-mers' % htable.n_unique_kmers())

    fp_rate = \
        khmer.calc_expected_collisions(htable, args.force, max_false_pos=.15)
    # 0.18 is ACTUAL MAX. Do not change.

    print('false positive rate estimated to be %1.3f' % fp_rate,
          file=sys.stderr)
    print('\nfalse positive rate estimated to be %1.3f' % fp_rate,
          file=info_fp)

    print('wrote to', base + '.info and', base + '.pt', file=sys.stderr)
    if not args.no_build_tagset:
        print('and ' + base + '.tagset', file=sys.stderr)

    sys.exit(0)
def main():
    args = sanitize_help(get_parser()).parse_args()
    configure_logging(args.quiet)

    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)

    if args.savegraph:
        tablesize = calculate_graphsize(args, "countgraph")
        check_space_for_graph(args.savegraph, tablesize, args.force)

    report_on_config(args)

    log_info("making countgraph")
    graph = khmer_args.create_countgraph(args)

    # first, load reads into graph
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    log_info("consuming input, round 1 -- {datafile}", datafile=args.datafile)
    for _ in range(args.threads):
        cur_thread = threading.Thread(
            target=graph.consume_fasta_with_reads_parser, args=(rparser,))
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    log_info("Total number of unique k-mers: {nk}", nk=graph.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(graph, args.force)
    log_info("fp rate estimated to be {fpr:1.3f}", fpr=fp_rate)

    # the filtering loop
    log_info("filtering {datafile}", datafile=args.datafile)
    if args.outfile is None:
        outfile = os.path.basename(args.datafile) + ".abundfilt"
    else:
        outfile = args.outfile
    outfp = open(outfile, "wb")
    outfp = get_file_writer(outfp, args.gzip, args.bzip)

    paired_iter = broken_paired_reader(ReadParser(args.datafile),
                                       min_length=graph.ksize(),
                                       force_single=True)

    for n, is_pair, read1, read2 in paired_iter:
        assert not is_pair
        assert read2 is None

        trimmed_record, _ = trim_record(graph, read1, args.cutoff,
                                        args.variable_coverage,
                                        args.normalize_to)
        if trimmed_record:
            write_record(trimmed_record, outfp)

    log_info("output in {outfile}", outfile=outfile)

    if args.savegraph:
        log_info("Saving k-mer countgraph filename {graph}",
                 graph=args.savegraph)
        graph.save(args.savegraph)
def main():
    info('interleave-reads.py')
    args = get_parser().parse_args()

    for _ in args.infiles:
        check_file_status(_, args.force)

    check_space(args.infiles, args.force)

    s1_file = args.infiles[0]
    if len(args.infiles) == 2:
        s2_file = args.infiles[1]
    else:
        s2_file = s1_file.replace('_R1_', '_R2_')
        print >> sys.stderr, ("given only one file; "
                              "guessing that R2 file is %s" % s2_file)

    fail = False
    if not os.path.exists(s1_file):
        print >> sys.stderr, "Error! R1 file %s does not exist" % s1_file
        fail = True

    if not os.path.exists(s2_file):
        print >> sys.stderr, "Error! R2 file %s does not exist" % s2_file
        fail = True

    if fail and not args.force:
        sys.exit(1)

    print >> sys.stderr, "Interleaving:\n\t%s\n\t%s" % (s1_file, s2_file)

    counter = 0
    for read1, read2 in itertools.izip(screed.open(s1_file),
                                       screed.open(s2_file)):
        if counter % 100000 == 0:
            print >> sys.stderr, '...', counter, 'pairs'
        counter += 1

        name1 = read1.name
        if not name1.endswith('/1'):
            name1 += '/1'
        name2 = read2.name
        if not name2.endswith('/2'):
            name2 += '/2'

        assert name1[:-2] == name2[:-2], \
            "This doesn't look like paired data! %s %s" % (name1, name2)

        read1.name = name1
        read2.name = name2
        write_record(read1, args.output)
        write_record(read2, args.output)

    print >> sys.stderr, 'final: interleaved %d pairs' % counter
    print >> sys.stderr, 'output written to', args.output
def main():
    info('optimal_args_nodegraph.py', ['graph', 'SeqAn'])
    args = get_parser().parse_args()
    report_on_config(args, graphtype='nodegraph')

    filenames = args.input_filenames
    base = filenames[0]
    for _ in args.input_filenames:
        check_input_files(_, False)

    check_space(args.input_filenames, False)

    print('Counting kmers from sequences in %s' % repr(filenames),
          file=sys.stderr)

    htable = khmer.new_nodegraph(args.ksize, args.max_tablesize, args.n_tables)
    target_method = htable.consume_fasta_with_reads_parser

    for _, filename in enumerate(filenames):
        rparser = khmer.ReadParser(filename)
        threads = []
        print('consuming input', filename, file=sys.stderr)
        for num in xrange(args.threads):
            cur_thread = threading.Thread(
                target=target_method, args=(rparser,))
            threads.append(cur_thread)
            cur_thread.start()

        for thread in threads:
            thread.join()

    unique_kmers = htable.n_unique_kmers()
    print('Total number of unique k-mers: {0}'.format(unique_kmers),
          file=sys.stderr)

    info_optimal = open(base + '.optimal_args', 'w')

    fp_rate = khmer.calc_expected_collisions(htable)
    print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)

    if fp_rate > 0.15:          # 0.18 is ACTUAL MAX. Do not change.
        print("**", file=sys.stderr)
        print("** ERROR: the graph structure is too small for this data set. "
              "Increase table size/# tables.", file=sys.stderr)
        print("**", file=sys.stderr)
        if not False:
            sys.exit(1)

    to_print = graphsize_args_report(unique_kmers, fp_rate)
    print(to_print, file=info_optimal)

    print('optimal arguments were written to', base + '.optimal_args',
          file=sys.stderr)
def main():
    info('make-initial-stoptags.py', ['graph'])
    args = get_parser().parse_args()

    graphbase = args.graphbase

    # @RamRS: This might need some more work
    infiles = [graphbase + '.pt', graphbase + '.tagset']
    if args.stoptags:
        infiles.append(args.stoptags)
    for _ in infiles:
        check_input_files(_, args.force)

    check_space(infiles, args.force)

    print >>sys.stderr, 'loading htable %s.pt' % graphbase
    htable = khmer.load_hashbits(graphbase + '.pt')

    # do we want to load stop tags, and do they exist?
    if args.stoptags:
        print >>sys.stderr, 'loading stoptags from', args.stoptags
        htable.load_stop_tags(args.stoptags)

    print >>sys.stderr, 'loading tagset %s.tagset...' % graphbase
    htable.load_tagset(graphbase + '.tagset')

    ksize = htable.ksize()
    counting = khmer.new_counting_hash(ksize, args.min_tablesize,
                                       args.n_tables)

    # divide up into SUBSET_SIZE fragments
    divvy = htable.divide_tags_into_subsets(args.subset_size)

    # pick off the first one
    if len(divvy) == 1:
        start, end = 0, 0
    else:
        start, end = divvy[:2]

    # partition!
    print >>sys.stderr, 'doing pre-partitioning from', start, 'to', end
    subset = htable.do_subset_partition(start, end)

    # now, repartition...
    print >>sys.stderr, 'repartitioning to find HCKs.'
    htable.repartition_largest_partition(subset, counting,
                                         EXCURSION_DISTANCE,
                                         EXCURSION_KMER_THRESHOLD,
                                         EXCURSION_KMER_COUNT_THRESHOLD)

    print >>sys.stderr, 'saving stop tags'
    htable.save_stop_tags(graphbase + '.stoptags')

    print >> sys.stderr, 'wrote to:', graphbase + '.stoptags'
def main(args):
    info('build-graph.py', ['graph', 'SeqAn'])

    report_on_config(args, hashtype='hashbits')
    base = args.output_filename
    filenames = args.input_filenames

    for fname in args.input_filenames:
        check_input_files(fname, args.force)

    check_space(args.input_filenames, args.force)
    check_space_for_hashtable(
        (float(args.n_tables * args.min_tablesize) / 8.), args.force)

    print >>sys.stderr, 'Saving k-mer presence table to %s' % base
    print >>sys.stderr, 'Loading kmers from sequences in %s' % repr(filenames)
    if args.no_build_tagset:
        print >>sys.stderr, 'We WILL NOT build the tagset.'
    else:
        print >>sys.stderr, 'We WILL build the tagset', \
            ' (for partitioning/traversal).'

    print >>sys.stderr, 'making k-mer presence table'
    htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables)

    functions.build_graph(filenames, htable, args.threads,
                          not args.no_build_tagset)

    print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
        htable.n_unique_kmers())

    print >>sys.stderr, 'saving k-mer presence table in', base + '.pt'
    htable.save(base + '.pt')

    if not args.no_build_tagset:
        print >>sys.stderr, 'saving tagset in', base + '.tagset'
        htable.save_tagset(base + '.tagset')

    info_fp = open(base + '.info', 'w')
    info_fp.write('%d unique k-mers' % htable.n_unique_kmers())

    fp_rate = \
        khmer.calc_expected_collisions(htable, args.force, max_false_pos=.15)
    # 0.18 is ACTUAL MAX. Do not change.

    print >>sys.stderr, 'false positive rate estimated to be %1.3f' % fp_rate
    print >>info_fp, '\nfalse positive rate estimated to be %1.3f' % fp_rate

    print >> sys.stderr, 'wrote to', base + '.info and', base + '.pt'
    if not args.no_build_tagset:
        print >> sys.stderr, 'and ' + base + '.tagset'

    sys.exit(0)
def main():
    info('filter-abund.py', ['counting'])
    args = get_parser().parse_args()

    check_input_files(args.input_table, args.force)
    infiles = args.input_filename
    for filename in infiles:
        check_input_files(filename, args.force)

    check_space(infiles, args.force)

    print('loading counting table:', args.input_table, file=sys.stderr)
    htable = khmer.load_counting_hash(args.input_table)
    ksize = htable.ksize()

    print("K:", ksize, file=sys.stderr)

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
        seqN = seq.replace('N', 'A')

        if args.variable_coverage:  # only trim when sequence has high enough C
            med, _, _ = htable.get_median_count(seqN)
            if med < args.normalize_to:
                return name, seq

        _, trim_at = htable.trim_on_abundance(seqN, args.cutoff)

        if trim_at >= ksize:
            # be sure to not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]

        return None, None

    # the filtering loop
    for infile in infiles:
        print('filtering', infile, file=sys.stderr)
        if args.single_output_filename != '':
            outfile = args.single_output_filename
            outfp = open(outfile, 'a')
        else:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn, n_workers=args.threads)
        tsp.start(verbose_loader(infile), outfp)

        print('output in', outfile, file=sys.stderr)
def main():
    info('filter-abund.py', ['counting'])
    args = get_parser().parse_args()

    counting_ht = args.input_table
    infiles = args.input_filename

    for _ in infiles:
        check_file_status(_, args.force)

    check_space(infiles, args.force)

    print >>sys.stderr, 'loading hashtable'
    htable = khmer.load_counting_hash(counting_ht)
    ksize = htable.ksize()

    print >>sys.stderr, "K:", ksize

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        if args.variable_coverage:  # only trim when sequence has high enough C
            med, _, _ = htable.get_median_count(seq)
            if med < args.normalize_to:
                return name, seq

        trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff)

        if trim_at >= ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    for infile in infiles:
        print >>sys.stderr, 'filtering', infile
        if args.single_output_filename != '':
            outfile = args.single_output_filename
            outfp = open(outfile, 'a')
        else:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn, n_workers=args.threads)
        tsp.start(verbose_loader(infile), outfp)

        print >>sys.stderr, 'output in', outfile
def main():
    info('interleave-reads.py')
    args = sanitize_help(get_parser()).parse_args()

    check_input_files(args.left, args.force)
    check_input_files(args.right, args.force)
    check_space([args.left, args.right], args.force)

    s1_file = args.left
    s2_file = args.right

    fail = False

    print("Interleaving:\n\t%s\n\t%s" % (s1_file, s2_file), file=sys.stderr)

    outfp = get_file_writer(args.output, args.gzip, args.bzip)

    counter = 0
    screed_iter_1 = screed.open(s1_file)
    screed_iter_2 = screed.open(s2_file)
    for read1, read2 in zip_longest(screed_iter_1, screed_iter_2):
        if read1 is None or read2 is None:
            print(("ERROR: Input files contain different number"
                   " of records."), file=sys.stderr)
            sys.exit(1)

        if counter % 100000 == 0:
            print('...', counter, 'pairs', file=sys.stderr)
        counter += 1

        name1 = read1.name
        name2 = read2.name

        if not args.no_reformat:
            if not check_is_left(name1):
                name1 += '/1'
            if not check_is_right(name2):
                name2 += '/2'

            read1.name = name1
            read2.name = name2

            if not check_is_pair(read1, read2):
                print("ERROR: This doesn't look like paired data! "
                      "%s %s" % (read1.name, read2.name), file=sys.stderr)
                sys.exit(1)

        write_record_pair(read1, read2, outfp)

    print('final: interleaved %d pairs' % counter, file=sys.stderr)
    print('output written to', describe_file_handle(outfp), file=sys.stderr)
def main():
    args = sanitize_help(get_parser()).parse_args()

    configure_logging(args.quiet)

    infiles = args.input_filename
    if ('-' in infiles or '/dev/stdin' in infiles) and not \
            args.single_output_file:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    for filename in infiles:
        check_input_files(filename, args.force)

    check_space(infiles, args.force)

    log_info('loading countgraph: {graph}', graph=args.input_graph)
    countgraph = khmer.load_countgraph(args.input_graph)
    ksize = countgraph.ksize()

    log_info("K: {ksize}", ksize=ksize)

    if args.single_output_file:
        outfile = args.single_output_file.name
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)

    # the filtering loop
    for infile in infiles:
        log_info('filtering {infile}', infile=infile)
        if not args.single_output_file:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = open(outfile, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        paired_iter = broken_paired_reader(ReadParser(infile),
                                           min_length=ksize,
                                           force_single=True)

        for n, is_pair, read1, read2 in paired_iter:
            assert not is_pair
            assert read2 is None

            trimmed_record, _ = trim_record(countgraph, read1, args.cutoff,
                                            args.variable_coverage,
                                            args.normalize_to)
            if trimmed_record:
                write_record(trimmed_record, outfp)

        log_info('output in {outfile}', outfile=outfile)
def main(args): info("build-graph.py", ["graph", "SeqAn"]) report_on_config(args, hashtype="hashbits") base = args.output_filename filenames = args.input_filenames for fname in args.input_filenames: check_input_files(fname, args.force) check_space(args.input_filenames, args.force) check_space_for_hashtable((float(args.n_tables * args.min_tablesize) / 8.0), args.force) print("Saving k-mer presence table to %s" % base, file=sys.stderr) print("Loading kmers from sequences in %s" % repr(filenames), file=sys.stderr) if args.no_build_tagset: print("We WILL NOT build the tagset.", file=sys.stderr) else: print("We WILL build the tagset (for partitioning/traversal).", file=sys.stderr) print("making k-mer presence table", file=sys.stderr) htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables) functions.build_graph(filenames, htable, args.threads, not args.no_build_tagset) print("Total number of unique k-mers: {0}".format(htable.n_unique_kmers()), file=sys.stderr) print("saving k-mer presence table in", base + ".pt", file=sys.stderr) htable.save(base + ".pt") if not args.no_build_tagset: print("saving tagset in", base + ".tagset", file=sys.stderr) htable.save_tagset(base + ".tagset") info_fp = open(base + ".info", "w") info_fp.write("%d unique k-mers" % htable.n_unique_kmers()) fp_rate = khmer.calc_expected_collisions(htable, args.force, max_false_pos=0.15) # 0.18 is ACTUAL MAX. Do not change. print("false positive rate estimated to be %1.3f" % fp_rate, file=sys.stderr) print("\nfalse positive rate estimated to be %1.3f" % fp_rate, file=info_fp) print("wrote to", base + ".info and", base + ".pt", file=sys.stderr) if not args.no_build_tagset: print("and " + base + ".tagset", file=sys.stderr) sys.exit(0)
def main():
    info('split-paired-reads.py')
    args = get_parser().parse_args()

    infile = args.infile
    check_file_status(infile, args.force)
    filenames = [infile]
    check_space(filenames, args.force)

    if args.output_directory:
        if not os.path.exists(args.output_directory):
            os.makedirs(args.output_directory)
        out1 = args.output_directory + '/' + os.path.basename(infile) + '.1'
        out2 = args.output_directory + '/' + os.path.basename(infile) + '.2'
    else:
        out1 = os.path.basename(infile) + '.1'
        out2 = os.path.basename(infile) + '.2'

    # OVERRIDE defaults with -1, -2
    if args.output_first:
        out1 = args.output_first
    if args.output_second:
        out2 = args.output_second

    fp_out1 = open(out1, 'w')
    fp_out2 = open(out2, 'w')

    counter1 = 0
    counter2 = 0
    index = None

    for index, record in enumerate(screed.open(infile)):
        if index % 100000 == 0 and index:
            print >> sys.stderr, '...', index

        name = record.name
        if name.endswith('/1'):
            write_record(record, fp_out1)
            counter1 += 1
        elif name.endswith('/2'):
            write_record(record, fp_out2)
            counter2 += 1

    print >> sys.stderr, "DONE; split %d sequences (%d left, %d right)" % \
        (index + 1, counter1, counter2)
    print >> sys.stderr, "/1 reads in %s" % out1
    print >> sys.stderr, "/2 reads in %s" % out2
def main():
    info('extract-paired-reads.py')
    args = get_parser().parse_args()

    check_input_files(args.infile, args.force)
    infiles = [args.infile]
    check_space(infiles, args.force)

    outfile = os.path.basename(args.infile)
    if len(sys.argv) > 2:
        outfile = sys.argv[2]

    single_fp = open(outfile + '.se', 'w')
    paired_fp = open(outfile + '.pe', 'w')

    print >>sys.stderr, 'reading file "%s"' % args.infile
    print >>sys.stderr, 'outputting interleaved pairs to "%s.pe"' % outfile
    print >>sys.stderr, 'outputting orphans to "%s.se"' % outfile

    n_pe = 0
    n_se = 0

    screed_iter = screed.open(args.infile, parse_description=False)
    for index, is_pair, read1, read2 in broken_paired_reader(screed_iter):
        if index % 100000 == 0 and index > 0:
            print >>sys.stderr, '...', index

        if is_pair:
            write_record_pair(read1, read2, paired_fp)
            n_pe += 1
        else:
            write_record(read1, single_fp)
            n_se += 1

    single_fp.close()
    paired_fp.close()

    if n_pe == 0:
        raise Exception("no paired reads!? check file formats...")

    print >>sys.stderr, 'DONE; read %d sequences,' \
        ' %d pairs and %d singletons' % \
        (n_pe * 2 + n_se, n_pe, n_se)

    print >> sys.stderr, 'wrote to: ' + outfile \
        + '.se' + ' and ' + outfile + '.pe'
def main():
    info('count-median.py', ['diginorm'])
    args = get_parser().parse_args()

    htfile = args.ctfile
    input_filename = args.input
    output_filename = args.output

    infiles = [htfile, input_filename]
    for infile in infiles:
        check_input_files(infile, args.force)

    check_space(infiles, args.force)

    print('loading k-mer counting table from', htfile, file=sys.stderr)
    htable = khmer.load_counting_hash(htfile)
    ksize = htable.ksize()

    print('writing to', output_filename, file=sys.stderr)
    output = open(output_filename, 'w')

    if args.csv:
        output = csv.writer(output)
        # write headers:
        output.writerow(['name', 'median', 'average', 'stddev', 'seqlen'])

    parse_description = True            # @legacy behavior: split seq headers
    if args.csv:
        parse_description = False       # only enable if we're doing csv out

    for record in screed.open(input_filename,
                              parse_description=parse_description):
        seq = record.sequence.upper()
        if 'N' in seq:
            seq = seq.replace('N', 'A')

        if ksize <= len(seq):
            medn, ave, stdev = htable.get_median_count(seq)
            ave, stdev = [round(x, 9) for x in (ave, stdev)]
            if args.csv:
                output.writerow([record.name, medn, ave, stdev, len(seq)])
            else:
                print(record.name, medn, ave, stdev, len(seq), file=output)
def main():
    info('count-overlap.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args, hashtype='hashbits')

    for infile in [args.ptfile, args.fafile]:
        check_input_files(infile, args.force)

    check_space([args.ptfile, args.fafile], args.force)

    print('loading k-mer presence table from', args.ptfile, file=sys.stderr)
    ht1 = khmer.load_hashbits(args.ptfile)
    kmer_size = ht1.ksize()

    output = open(args.report_filename, 'w')
    f_curve_obj = open(args.report_filename + '.curve', 'w')
    if args.csv:
        f_curve_obj_csv = csv.writer(f_curve_obj)
        # write headers:
        f_curve_obj_csv.writerow(['input_seq', 'overlap_kmer'])

    ht2 = khmer.new_hashbits(kmer_size, args.min_tablesize, args.n_tables)

    (n_unique, n_overlap, list_curve) = ht2.count_overlap(args.fafile, ht1)

    printout1 = """\
dataset1(pt file): %s
dataset2: %s
# of unique k-mers in dataset2: %d
# of overlap unique k-mers: %d
""" % (args.ptfile, args.fafile, n_unique, n_overlap)
    output.write(printout1)

    for i in range(100):
        if args.csv:
            f_curve_obj_csv.writerow([list_curve[100 + i], list_curve[i]])
        else:
            print(list_curve[100 + i], list_curve[i], file=f_curve_obj)

    print('wrote to: ' + args.report_filename, file=sys.stderr)
def main():
    info('filter-stoptags.py', ['graph'])
    args = get_parser().parse_args()
    stoptags = args.stoptags_file
    infiles = args.input_filenames

    for _ in infiles:
        check_input_files(_, args.force)

    check_space(infiles, args.force)

    print >>sys.stderr, 'loading stop tags, with K', args.ksize
    htable = khmer.new_hashbits(args.ksize, 1, 1)
    htable.load_stop_tags(stoptags)

    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = htable.trim_on_stoptags(seq)

        if trim_at >= args.ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    for infile in infiles:
        print >>sys.stderr, 'filtering', infile
        outfile = os.path.basename(infile) + '.stopfilt'

        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print >>sys.stderr, 'output in', outfile
def main():
    args = sanitize_help(get_parser()).parse_args()
    stoptags = args.stoptags_file
    infiles = args.input_filenames

    for _ in infiles:
        check_input_files(_, args.force)

    check_space(infiles, args.force)

    print('loading stop tags, with K', args.ksize, file=sys.stderr)
    nodegraph = Nodegraph(args.ksize, 1, 1)
    nodegraph.load_stop_tags(stoptags)

    def process_fn(record):
        name = record.name
        seq = record.sequence
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = nodegraph.trim_on_stoptags(seq)

        if trim_at >= args.ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    for infile in infiles:
        print('filtering', infile, file=sys.stderr)
        outfile = os.path.basename(infile) + '.stopfilt'

        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print('output in', outfile, file=sys.stderr)
def main():
    info('merge-stoptags.py')
    args = get_parser().parse_args()

    stdbase = args.stdbase

    # @RamRS: This might need some more work
    infiles = []
    for _ in glob.glob(stdbase + "*/*.stoptags"):
        if os.path.exists(_):
            check_input_files(_, False)
            infiles.append(_)
    check_space(infiles, False)

    ht = khmer.new_hashbits(args.ksize, 1, 1)
    for _ in infiles:
        print >>sys.stderr, 'loading stoptags %s' % _
        ht.load_stop_tags(_, 0)

    print >>sys.stderr, 'writing file merge.stoptags'
    ht.save_stop_tags('merge.stoptags')
    print >>sys.stderr, 'done!'
def main():
    info('count-overlap.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args, hashtype='hashbits')

    for infile in [args.ptfile, args.fafile]:
        check_file_status(infile, args.force)

    check_space([args.ptfile, args.fafile], args.force)

    print >>sys.stderr, 'loading k-mer presence table from', args.ptfile
    ht1 = khmer.load_hashbits(args.ptfile)
    kmer_size = ht1.ksize()

    output = open(args.report_filename, 'w')
    f_curve_obj = open(args.report_filename + '.curve', 'w')

    ht2 = khmer.new_hashbits(kmer_size, args.min_tablesize, args.n_tables)

    (n_unique, n_overlap, list_curve) = ht2.count_overlap(args.fafile, ht1)

    printout1 = """\
dataset1(pt file): %s
dataset2: %s
# of unique k-mers in dataset2: %d
# of overlap unique k-mers: %d
""" % (args.ptfile, args.fafile, n_unique, n_overlap)
    output.write(printout1)

    for i in range(100):
        to_print = str(list_curve[100 + i]) + ' ' + str(list_curve[i]) + '\n'
        f_curve_obj.write(to_print)

    print >> sys.stderr, 'wrote to: ' + args.report_filename
def main():
    info('interleave-reads.py')
    args = get_parser().parse_args()

    for _ in args.infiles:
        check_input_files(_, args.force)

    check_space(args.infiles, args.force)

    s1_file = args.infiles[0]
    if len(args.infiles) == 2:
        s2_file = args.infiles[1]
    else:
        s2_file = s1_file.replace('_R1_', '_R2_')
        if s1_file == s2_file:
            print >>sys.stderr, ("ERROR: given only one filename, that "
                                 "doesn't contain _R1_. Exiting.")
            sys.exit(1)

        print >> sys.stderr, ("given only one file; "
                              "guessing that R2 file is %s" % s2_file)

    fail = False
    if not os.path.exists(s1_file):
        print >> sys.stderr, "Error! R1 file %s does not exist" % s1_file
        fail = True

    if not os.path.exists(s2_file):
        print >> sys.stderr, "Error! R2 file %s does not exist" % s2_file
        fail = True

    if fail and not args.force:
        sys.exit(1)

    print >> sys.stderr, "Interleaving:\n\t%s\n\t%s" % (s1_file, s2_file)

    counter = 0
    screed_iter_1 = screed.open(s1_file, parse_description=False)
    screed_iter_2 = screed.open(s2_file, parse_description=False)
    for read1, read2 in itertools.izip_longest(screed_iter_1, screed_iter_2):
        if read1 is None or read2 is None:
            print >>sys.stderr, ("ERROR: Input files contain different number"
                                 " of records.")
            sys.exit(1)

        if counter % 100000 == 0:
            print >> sys.stderr, '...', counter, 'pairs'
        counter += 1

        name1 = read1.name
        if not check_is_left(name1):
            name1 += '/1'

        name2 = read2.name
        if not check_is_right(name2):
            name2 += '/2'

        read1.name = name1
        read2.name = name2

        if not check_is_pair(read1, read2):
            print >>sys.stderr, "ERROR: This doesn't look like paired data! " \
                "%s %s" % (read1.name, read2.name)
            sys.exit(1)

        write_record_pair(read1, read2, args.output)

    print >> sys.stderr, 'final: interleaved %d pairs' % counter
    print >> sys.stderr, 'output written to', args.output.name
def main():  # pylint: disable=too-many-branches,too-many-statements
    info('normalize-by-median.py', ['diginorm'])
    args = get_parser().parse_args()

    report_on_config(args)

    report_fp = args.report

    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize,
                                  args.force)

    # list to save error files along with throwing exceptions
    if args.force:
        corrupt_files = []

    if args.loadtable:
        print 'loading k-mer counting table from', args.loadtable
        htable = khmer.load_counting_hash(args.loadtable)
    else:
        print 'making k-mer counting table'
        htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                         args.n_tables)

    total = 0
    discarded = 0
    input_filename = None

    for index, input_filename in enumerate(args.input_filenames):
        if args.single_output_filename != '':
            output_name = args.single_output_filename
            outfp = open(args.single_output_filename, 'a')
        else:
            output_name = os.path.basename(input_filename) + '.keep'
            outfp = open(output_name, 'w')

        total_acc = 0
        discarded_acc = 0

        try:
            total_acc, discarded_acc = normalize_by_median(
                input_filename, outfp, htable, args, report_fp)
        except IOError as err:
            handle_error(err, output_name, input_filename, args.fail_save,
                         htable)
            if not args.force:
                print >> sys.stderr, '** Exiting!'
                sys.exit(1)
            else:
                print >> sys.stderr, '*** Skipping error file, moving on...'
                corrupt_files.append(input_filename)
        else:
            if total_acc == 0 and discarded_acc == 0:
                print 'SKIPPED empty file', input_filename
            else:
                total += total_acc
                discarded += discarded_acc
                print 'DONE with {inp}; kept {kept} of {total} or {perc:2}%'\
                    .format(inp=input_filename, kept=total - discarded,
                            total=total,
                            perc=int(100. - discarded / float(total) * 100.))
                print 'output in', output_name

        if (args.dump_frequency > 0 and
                index > 0 and index % args.dump_frequency == 0):
            print 'Backup: Saving k-mer counting file through', input_filename
            if args.savetable:
                hashname = args.savetable
                print '...saving to', hashname
            else:
                hashname = 'backup.ct'
                print 'Nothing given for savetable, saving to', hashname
            htable.save(hashname)

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
            htable.n_unique_kmers())

    if args.savetable:
        print 'Saving k-mer counting table through', input_filename
        print '...saving to', args.savetable
        htable.save(args.savetable)

    fp_rate = khmer.calc_expected_collisions(htable)
    print 'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate)

    if args.force and len(corrupt_files) > 0:
        print >> sys.stderr, "** WARNING: Finished with errors!"
        print >> sys.stderr, "** IOErrors occurred in the following files:"
        print >> sys.stderr, "\t", " ".join(corrupt_files)

    if fp_rate > MAX_FALSE_POSITIVE_RATE:
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the k-mer counting table is too small"
                              " for this data set. Increase tablesize/# "
                              "tables.")
        print >> sys.stderr, "**"
        print >> sys.stderr, "** Do not use these results!!"
        if not args.force:
            sys.exit(1)
def main():  # pylint: disable=too-many-locals,too-many-statements
    args = sanitize_help(get_parser()).parse_args()

    report_on_config(args, graphtype='nodegraph')

    for infile in args.input_filenames:
        check_input_files(infile, args.force)

    check_space(args.input_filenames, args.force)

    print('Saving k-mer nodegraph to %s' % args.graphbase, file=sys.stderr)
    print('Loading kmers from sequences in %s' % repr(args.input_filenames),
          file=sys.stderr)
    print('--', file=sys.stderr)
    print('SUBSET SIZE', args.subset_size, file=sys.stderr)
    print('N THREADS', args.threads, file=sys.stderr)
    print('--', file=sys.stderr)

    # load-graph.py

    print('making nodegraph', file=sys.stderr)
    nodegraph = khmer_args.create_nodegraph(args)

    for _, filename in enumerate(args.input_filenames):
        print('consuming input', filename, file=sys.stderr)
        nodegraph.consume_fasta_and_tag(filename)

    # 0.18 is ACTUAL MAX. Do not change.
    fp_rate = \
        khmer.calc_expected_collisions(
            nodegraph, args.force, max_false_pos=.15)
    print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)

    # partition-graph

    # do we want to exhaustively traverse the graph?
    stop_big_traversals = args.no_big_traverse
    if stop_big_traversals:
        print('** This script brakes for lumps: ',
              'stop_big_traversals is true.', file=sys.stderr)
    else:
        print('** Traverse all the things:',
              ' stop_big_traversals is false.', file=sys.stderr)

    #
    # now, partition!
    #

    # divide the tags up into subsets
    divvy = nodegraph.divide_tags_into_subsets(int(args.subset_size))
    divvy = list(divvy)
    n_subsets = len(divvy)
    divvy.append(0)

    # build a queue of tasks:
    worker_q = queue.Queue()

    # break up the subsets into a list of worker tasks
    for _ in range(0, n_subsets):
        start = divvy[_]
        end = divvy[_ + 1]
        worker_q.put((nodegraph, _, start, end))

    print('enqueued %d subset tasks' % n_subsets, file=sys.stderr)
    open('%s.info' % args.graphbase, 'w').write('%d subsets total\n'
                                                % (n_subsets))

    if n_subsets < args.threads:
        args.threads = n_subsets

    # start threads!
    print('starting %d threads' % args.threads, file=sys.stderr)
    print('---', file=sys.stderr)

    threads = []
    for _ in range(args.threads):
        cur_thread = threading.Thread(target=worker,
                                      args=(worker_q, args.graphbase,
                                            stop_big_traversals))
        threads.append(cur_thread)
        cur_thread.start()

    print('done starting threads', file=sys.stderr)

    # wait for threads
    for _ in threads:
        _.join()

    print('---', file=sys.stderr)
    print('done making subsets! see %s.subset.*.pmap' %
          (args.graphbase,), file=sys.stderr)

    # merge-partitions

    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')

    print('loading %d pmap files (first one: %s)' %
          (len(pmap_files), pmap_files[0]), file=sys.stderr)

    nodegraph = khmer.Nodegraph(args.ksize, 1, 1)

    for pmap_file in pmap_files:
        print('merging', pmap_file, file=sys.stderr)
        nodegraph.merge_subset_from_disk(pmap_file)

    if args.remove_subsets:
        print('removing pmap files', file=sys.stderr)
        for pmap_file in pmap_files:
            os.unlink(pmap_file)

    # annotate-partitions

    for infile in args.input_filenames:
        print('outputting partitions for', infile, file=sys.stderr)
        outfile = os.path.basename(infile) + '.part'
        part_count = nodegraph.output_partitions(infile, outfile)
        print('output %d partitions for %s' % (
            part_count, infile), file=sys.stderr)
        print('partitions are in', outfile, file=sys.stderr)
def main():
    info('split-paired-reads.py')
    args = sanitize_help(get_parser()).parse_args()

    infile = args.infile

    filenames = [infile]
    check_input_files(infile, args.force)
    check_space(filenames, args.force)

    basename = os.path.basename(infile)

    # decide where to put output files - specific directory? or just default?
    if infile in ('/dev/stdin', '-'):
        if not (args.output_first and args.output_second):
            print("Accepting input from stdin; "
                  "output filenames must be provided.", file=sys.stderr)
            sys.exit(1)
    elif args.output_directory:
        if not os.path.exists(args.output_directory):
            os.makedirs(args.output_directory)
        out1 = os.path.join(args.output_directory, basename + '.1')
        out2 = os.path.join(args.output_directory, basename + '.2')
    else:
        out1 = basename + '.1'
        out2 = basename + '.2'

    # OVERRIDE output file locations with -1, -2
    if args.output_first:
        fp_out1 = get_file_writer(args.output_first, args.gzip, args.bzip)
        out1 = fp_out1.name
    else:
        # Use default filename created above
        fp_out1 = get_file_writer(open(out1, 'wb'), args.gzip, args.bzip)
    if args.output_second:
        fp_out2 = get_file_writer(args.output_second, args.gzip, args.bzip)
        out2 = fp_out2.name
    else:
        # Use default filename created above
        fp_out2 = get_file_writer(open(out2, 'wb'), args.gzip, args.bzip)

    # put orphaned reads here, if -0!
    if args.output_orphaned:
        fp_out0 = get_file_writer(args.output_orphaned, args.gzip, args.bzip)
        out0 = describe_file_handle(args.output_orphaned)

    counter1 = 0
    counter2 = 0
    counter3 = 0
    index = None

    screed_iter = screed.open(infile)

    # walk through all the reads in broken-paired mode.
    paired_iter = broken_paired_reader(
        screed_iter, require_paired=not args.output_orphaned)

    try:
        for index, is_pair, record1, record2 in paired_iter:
            if index % 10000 == 0:
                print('...', index, file=sys.stderr)

            if is_pair:
                write_record(record1, fp_out1)
                counter1 += 1
                write_record(record2, fp_out2)
                counter2 += 1
            elif args.output_orphaned:
                write_record(record1, fp_out0)
                counter3 += 1
    except UnpairedReadsError as e:
        print("Unpaired reads found starting at {name}; exiting".format(
            name=e.r1.name), file=sys.stderr)
        sys.exit(1)

    print("DONE; split %d sequences (%d left, %d right, %d orphans)" %
          (counter1 + counter2, counter1, counter2, counter3),
          file=sys.stderr)
    print("/1 reads in %s" % out1, file=sys.stderr)
    print("/2 reads in %s" % out2, file=sys.stderr)
    if args.output_orphaned:
        print("orphans in %s" % out0, file=sys.stderr)
def main():
    args = sanitize_help(get_parser()).parse_args()
    if not args.quiet:
        info('filter-abund-single.py', ['counting', 'SeqAn'])

    configure_logging(args.quiet)
    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)

    if args.savegraph:
        tablesize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, tablesize, args.force)

    report_on_config(args)

    log_info('making countgraph')
    graph = khmer_args.create_countgraph(args)

    # first, load reads into graph
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    log_info('consuming input, round 1 -- {datafile}', datafile=args.datafile)
    for _ in range(args.threads):
        cur_thread = \
            threading.Thread(
                target=graph.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    log_info('Total number of unique k-mers: {nk}', nk=graph.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(graph, args.force)
    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    # the filtering loop
    log_info('filtering {datafile}', datafile=args.datafile)
    if args.outfile is None:
        outfile = os.path.basename(args.datafile) + '.abundfilt'
    else:
        outfile = args.outfile
    outfp = open(outfile, 'wb')
    outfp = get_file_writer(outfp, args.gzip, args.bzip)

    screed_iter = screed.open(args.datafile)
    paired_iter = broken_paired_reader(screed_iter, min_length=graph.ksize(),
                                       force_single=True)

    for n, is_pair, read1, read2 in paired_iter:
        assert not is_pair
        assert read2 is None

        trimmed_record, _ = trim_record(graph, read1, args.cutoff,
                                        args.variable_coverage,
                                        args.normalize_to)
        if trimmed_record:
            write_record(trimmed_record, outfp)

    log_info('output in {outfile}', outfile=outfile)

    if args.savegraph:
        log_info('Saving k-mer countgraph filename {graph}',
                 graph=args.savegraph)
        graph.save(args.savegraph)
def main():  # pylint: disable=too-many-branches,too-many-statements
    info('saturate-by-median.py', ['diginorm'])
    args = get_parser().parse_args()

    report_on_config(args)

    report_fp = args.report
    report_frequency = args.report_frequency

    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, False)
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize, False)

    # list to save error files along with throwing exceptions
    if args.force:
        corrupt_files = []

    if args.loadtable:
        print 'loading k-mer counting table from', args.loadtable
        htable = khmer.load_counting_hash(args.loadtable)
    else:
        print 'making k-mer counting table'
        htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                         args.n_tables)

    total = 0
    discarded = 0

    for index, input_filename in enumerate(args.input_filenames):
        total_acc = 0
        discarded_acc = 0

        try:
            total_acc, discarded_acc = normalize_by_median(
                input_filename, htable, args, report_fp, report_frequency)
        except IOError as err:
            handle_error(err, input_filename)
            if not args.force:
                print >> sys.stderr, '** Exiting!'
                sys.exit(1)
            else:
                print >> sys.stderr, '*** Skipping error file, moving on...'
                corrupt_files.append(input_filename)
        else:
            if total_acc == 0 and discarded_acc == 0:
                print 'SKIPPED empty file', input_filename
            else:
                total += total_acc
                discarded += discarded_acc
                print 'DONE with {inp}; kept {kept} of {total} or {perc:2}%'\
                    .format(inp=input_filename, kept=total - discarded,
                            total=total,
                            perc=int(100. - discarded / float(total) * 100.))

    if args.savetable:
        print 'Saving k-mer counting table through', input_filename
        print '...saving to', args.savetable
        htable.save(args.savetable)

    fp_rate = khmer.calc_expected_collisions(htable)
    print 'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate)

    if args.force and len(corrupt_files) > 0:
        print >> sys.stderr, "** WARNING: Finished with errors!"
        print >> sys.stderr, "** IOErrors occurred in the following files:"
        print >> sys.stderr, "\t", " ".join(corrupt_files)

    if fp_rate > MAX_FALSE_POSITIVE_RATE:
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the k-mer counting table is too small"
                              " for this data set. Increase tablesize/# "
                              "tables.")
        print >> sys.stderr, "**"
        print >> sys.stderr, "** Do not use these results!!"
        sys.exit(1)
def main():
    args = sanitize_help(get_parser()).parse_args()
    if not args.quiet:
        info('filter-abund-single.py', ['counting', 'SeqAn'])

    configure_logging(args.quiet)
    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)

    if args.savegraph:
        tablesize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, tablesize, args.force)

    report_on_config(args)

    log_info('making countgraph')
    graph = khmer_args.create_countgraph(args)

    # first, load reads into graph
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    log_info('consuming input, round 1 -- {datafile}', datafile=args.datafile)
    for _ in range(args.threads):
        cur_thread = \
            threading.Thread(
                target=graph.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    log_info('Total number of unique k-mers: {nk}', nk=graph.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(graph, args.force)
    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
        seqN = seq.replace('N', 'A')

        _, trim_at = graph.trim_on_abundance(seqN, args.cutoff)

        if trim_at >= args.ksize:
            # be sure to not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]

        return None, None

    # the filtering loop
    log_info('filtering {datafile}', datafile=args.datafile)
    if args.outfile is None:
        outfile = os.path.basename(args.datafile) + '.abundfilt'
    else:
        outfile = args.outfile
    outfp = open(outfile, 'wb')
    outfp = get_file_writer(outfp, args.gzip, args.bzip)

    tsp = ThreadedSequenceProcessor(process_fn, verbose=not args.quiet)
    tsp.start(verbose_loader(args.datafile), outfp)

    log_info('output in {outfile}', outfile=outfile)

    if args.savegraph:
        log_info('Saving k-mer countgraph filename {graph}',
                 graph=args.savegraph)
        graph.save(args.savegraph)
def main():
    info('collect-reads.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args)

    base = args.output_countingtable_filename
    filenames = args.input_sequence_filename

    for name in args.input_sequence_filename:
        check_input_files(name, False)

    check_space(args.input_sequence_filename, False)
    check_space_for_hashtable(args.n_tables * args.min_tablesize, False)

    print 'Saving k-mer counting table to %s' % base
    print 'Loading sequences from %s' % repr(filenames)
    if args.output:
        print 'Outputting sequences to', args.output

    print 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize)
    htable.set_use_bigcount(args.bigcount)

    total_coverage = 0.
    n = 0

    for index, filename in enumerate(filenames):
        for record in screed.open(filename):
            seq = record.sequence.upper()
            if 'N' in seq:
                seq = seq.replace('N', 'G')

            try:
                med, _, _ = htable.get_median_count(seq)
            except ValueError:
                continue

            total_coverage += med
            n += 1

            if total_coverage / float(n) > args.coverage:
                print 'reached target average coverage:', \
                    total_coverage / float(n)
                break

            htable.consume(seq)
            if args.output:
                args.output.write(output_single(record))

            if n % 100000 == 0:
                print '...', index, filename, n, total_coverage / float(n)

        if total_coverage / float(n) > args.coverage:
            break

    print 'Collected %d reads' % (n, )

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of k-mers: {0}'.format(
            htable.n_occupied())

    print 'saving', base
    htable.save(base)

    info_fp = open(base + '.info', 'w')
    info_fp.write('through end: %s\n' % filenames[-1])

    # Change 0.2 only if you really grok it. HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(htable, args.force,
                                             max_false_pos=.2)
    print 'fp rate estimated to be %1.3f' % fp_rate
    print >> info_fp, 'fp rate estimated to be %1.3f' % fp_rate

    print 'DONE.'
def main():  # pylint: disable=too-many-locals,too-many-branches
    info('abundance-dist-single.py', ['counting', 'SeqAn'])
    args = get_parser().parse_args()
    report_on_config(args)

    check_file_status(args.input_sequence_filename, args.force)
    check_space([args.input_sequence_filename], args.force)
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize,
                                  args.force)

    if (not args.squash_output and
            os.path.exists(args.output_histogram_filename)):
        print >> sys.stderr, 'ERROR: %s exists; not squashing.' % \
            args.output_histogram_filename
        sys.exit(1)
    else:
        hist_fp = open(args.output_histogram_filename, 'w')

    print >>sys.stderr, 'making k-mer counting table'
    counting_hash = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                            args.n_tables)
    counting_hash.set_use_bigcount(args.bigcount)

    print >> sys.stderr, 'building k-mer tracking table'
    tracking = khmer.new_hashbits(counting_hash.ksize(), args.min_tablesize,
                                  args.n_tables)

    print >>sys.stderr, 'kmer_size:', counting_hash.ksize()
    print >>sys.stderr, 'k-mer counting table sizes:', \
        counting_hash.hashsizes()
    print >>sys.stderr, 'outputting to', args.output_histogram_filename

    # start loading
    rparser = khmer.ReadParser(args.input_sequence_filename)
    threads = []
    print >>sys.stderr, 'consuming input, round 1 --', \
        args.input_sequence_filename
    for _ in xrange(args.threads):
        thread = \
            threading.Thread(
                target=counting_hash.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
            counting_hash.n_unique_kmers())

    abundance_lists = []

    def __do_abundance_dist__(read_parser):
        abundances = counting_hash.abundance_distribution_with_reads_parser(
            read_parser, tracking)
        abundance_lists.append(abundances)

    print >>sys.stderr, 'preparing hist from %s...' % \
        args.input_sequence_filename
    rparser = khmer.ReadParser(args.input_sequence_filename)
    threads = []
    print >>sys.stderr, 'consuming input, round 2 --', \
        args.input_sequence_filename
    for _ in xrange(args.threads):
        thread = \
            threading.Thread(
                target=__do_abundance_dist__,
                args=(rparser, )
            )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    assert len(abundance_lists) == args.threads, len(abundance_lists)
    abundance = {}
    for abundance_list in abundance_lists:
        for i, count in enumerate(abundance_list):
            abundance[i] = abundance.get(i, 0) + count

    total = sum(abundance.values())

    if 0 == total:
        print >> sys.stderr, \
            "ERROR: abundance distribution is uniformly zero; " \
            "nothing to report."
        print >> sys.stderr, "\tPlease verify that the input files are valid."
        sys.exit(1)

    sofar = 0
    for _, i in sorted(abundance.items()):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        print >> hist_fp, _, i, sofar, round(frac, 3)

        if sofar == total:
            break

    if args.savetable:
        print >>sys.stderr, 'Saving k-mer counting table ', args.savetable
        print >>sys.stderr, '...saving to', args.savetable
        counting_hash.save(args.savetable)

    print >> sys.stderr, 'wrote to: ' + args.output_histogram_filename
def main():
    parser = get_parser()
    parser.epilog = parser.epilog.replace(
        ":doc:`partitioning-big-data`",
        "http://khmer.readthedocs.io/en/stable/user/"
        "partitioning-big-data.html")
    args = sanitize_help(parser).parse_args()

    graphbase = args.graphbase

    # @RamRS: This might need some more work
    infiles = [graphbase, graphbase + '.tagset']
    if os.path.exists(graphbase + '.stoptags'):
        infiles.append(graphbase + '.stoptags')
    for _ in infiles:
        check_input_files(_, args.force)

    check_space(infiles, args.force)

    print('loading k-mer nodegraph %s' % graphbase, file=sys.stderr)
    graph = khmer.load_nodegraph(graphbase)

    print('loading tagset %s.tagset...' % graphbase, file=sys.stderr)
    graph.load_tagset(graphbase + '.tagset')

    initial_stoptags = False    # @CTB regularize with make-initial
    if os.path.exists(graphbase + '.stoptags'):
        print('loading stoptags %s.stoptags' % graphbase, file=sys.stderr)
        graph.load_stop_tags(graphbase + '.stoptags')
        initial_stoptags = True

    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')

    print('loading %d pmap files (first one: %s)' %
          (len(pmap_files), pmap_files[0]), file=sys.stderr)
    print('---', file=sys.stderr)
    print('output stoptags will be in', graphbase + '.stoptags',
          file=sys.stderr)
    if initial_stoptags:
        print('(these output stoptags will include the already-loaded set)',
              file=sys.stderr)
    print('---', file=sys.stderr)

    # create countgraph
    ksize = graph.ksize()
    counting = khmer_args.create_countgraph(args, ksize=ksize)

    # load & merge
    for index, subset_file in enumerate(pmap_files):
        print('<-', subset_file, file=sys.stderr)
        subset = graph.load_subset_partitionmap(subset_file)

        print('** repartitioning subset... %s' % subset_file, file=sys.stderr)
        graph.repartition_largest_partition(subset, counting,
                                            EXCURSION_DISTANCE,
                                            EXCURSION_KMER_THRESHOLD,
                                            EXCURSION_KMER_COUNT_THRESHOLD)

        print('** merging subset... %s' % subset_file, file=sys.stderr)
        graph.merge_subset(subset)

        print('** repartitioning, round 2... %s' % subset_file,
              file=sys.stderr)
        size = graph.repartition_largest_partition(
            None, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD,
            EXCURSION_KMER_COUNT_THRESHOLD)

        print('** repartitioned size:', size, file=sys.stderr)

        print('saving stoptags binary', file=sys.stderr)
        graph.save_stop_tags(graphbase + '.stoptags')
        os.rename(subset_file, subset_file + '.processed')
        print('(%d of %d)\n' % (index, len(pmap_files)), file=sys.stderr)

    print('done!', file=sys.stderr)
def main():
    args = sanitize_help(get_parser()).parse_args()

    distfilename = args.prefix + '.dist'

    for infile in args.part_filenames:
        check_input_files(infile, args.force)

    check_space(args.part_filenames, args.force)

    print('---', file=sys.stderr)
    print('reading partitioned files:', repr(args.part_filenames),
          file=sys.stderr)
    if args.output_groups:
        print('outputting to files named "%s.groupN.fa"' % args.prefix,
              file=sys.stderr)
        print('min reads to keep a partition:', args.min_part_size,
              file=sys.stderr)
        print('max size of a group file:', args.max_size, file=sys.stderr)
    else:
        print('NOT outputting groups! Beware!', file=sys.stderr)

    if args.output_unassigned:
        print('outputting unassigned reads to "%s.unassigned.fa"' %
              args.prefix, file=sys.stderr)

    print('partition size distribution will go to %s' % distfilename,
          file=sys.stderr)
    print('---', file=sys.stderr)

    # suffix = None
    is_fastq = None

    with PartitionedReader(args.part_filenames, True, True) as reader:
        for read, _ in reader:
            if is_fastq is None:
                is_fastq = hasattr(read, 'quality')
            else:
                assert hasattr(read, 'quality') == is_fastq,\
                    "Input files must have consistent format."

    if is_fastq:
        suffix = "fq"
    else:
        suffix = "fa"

    # remember folks, generators exhaust themselves
    extractor = PartitionExtractor(args.part_filenames,
                                   args.min_part_size,
                                   args.max_size)

    if args.output_unassigned:
        ofile = open('%s.unassigned.%s' % (args.prefix, suffix), 'wb')
        unassigned_fp = get_file_writer(ofile, args.gzip, args.bzip)
        extractor.process_unassigned(unassigned_fp)
        unassigned_fp.close()
    else:
        extractor.process_unassigned()

    extractor.output_histogram(distfilename)

    if not args.output_groups:
        sys.exit(0)

    extractor.develop_groups()

    print('%d groups' % extractor.group_n, file=sys.stderr)
    if extractor.group_n == 0:
        print('nothing to output; exiting!', file=sys.stderr)
        return

    # open a bunch of output files for the different groups
    group_fps = {}
    for index in range(extractor.group_n):
        fname = '%s.group%04d.%s' % (args.prefix, index, suffix)
        group_fp = get_file_writer(open(fname, 'wb'), args.gzip,
                                   args.bzip)
        group_fps[index] = group_fp

    # write 'em all out!
    # refresh the generator
    read_generator = PartitionExtractor.ReadGroupGenerator(extractor)

    with PartitionedReader(args.part_filenames) as reader:
        for read, group_n in read_generator(reader):
            outfp = group_fps[group_n]
            write_record(read, outfp)

    print('---', file=sys.stderr)
    print('Of %d total seqs,' % read_generator.total_seqs, file=sys.stderr)
    print('extracted %d partitioned seqs into group files,' %
          read_generator.part_seqs, file=sys.stderr)
    print('discarded %d sequences from small partitions (see -m),' %
          read_generator.toosmall_parts, file=sys.stderr)
    print('and found %d unpartitioned sequences (see -U).' %
          extractor.n_unassigned, file=sys.stderr)
    print('', file=sys.stderr)
    print('Created %d group files named %s.groupXXXX.%s' %
          (len(group_fps), args.prefix, suffix), file=sys.stderr)
def main(): info('sweep-reads-buffered.py', ['sweep']) parser = get_parser() args = parser.parse_args() if args.min_tablesize < MIN_HSIZE: args.min_tablesize = MIN_HSIZE if args.ksize < MIN_KSIZE: args.ksize = MIN_KSIZE report_on_config(args, hashtype='hashbits') K = args.ksize HT_SIZE = args.min_tablesize N_HT = args.n_tables traversal_range = args.traversal_range input_fastp = args.input_fastp if not args.outdir: outdir = os.path.dirname(input_fastp) else: outdir = args.outdir max_buffers = args.max_buffers output_pref = args.output_prefix buf_size = args.buffer_size max_reads = args.max_reads check_input_files(args.input_fastp, args.force) check_valid_file_exists(args.input_files) all_input_files = [input_fastp] all_input_files.extend(args.input_files) # Check disk space availability check_space(all_input_files, args.force) # figure out input file type (FA/FQ) -- based on first file ix = iter(screed.open(args.input_files[0])) record = ix.next() del ix extension = 'fa' if hasattr(record, 'quality'): # fastq! extension = 'fq' output_buffer = ReadBufferManager(max_buffers, max_reads, buf_size, output_pref, outdir, extension) # consume the partitioned fasta with which to label the graph ht = khmer.LabelHash(K, HT_SIZE, N_HT) try: print >> sys.stderr, 'consuming input sequences...' if args.label_by_pid: print >> sys.stderr, '...labeling by partition id (pid)' ht.consume_partitioned_fasta_and_tag_with_labels(input_fastp) elif args.label_by_seq: print >> sys.stderr, '...labeling by sequence' for n, record in enumerate(screed.open(input_fastp)): if n % 50000 == 0: print >>sys.stderr, \ '...consumed {n} sequences...'.format(n=n) ht.consume_sequence_and_tag_with_labels(record.sequence, n) else: print >>sys.stderr, \ '...labeling to create groups of size {s}'.format( s=args.group_size) label = -1 g = 0 try: outfp = open( '{pref}_base_{g}.{ext}'.format(pref=output_pref, g=g, ext=extension), 'wb') for n, record in enumerate(screed.open(input_fastp)): if n % args.group_size == 0: label += 1 if label > g: g = label outfp = open( '{pref}_base_{g}.{ext}'.format( pref=output_pref, g=g, ext=extension), 'wb') if n % 50000 == 0: print >>sys.stderr, \ '...consumed {n} sequences...'.format(n=n) ht.consume_sequence_and_tag_with_labels( record.sequence, label) write_record(record, outfp) except IOError as e: print >> sys.stderr, '!! ERROR !!', e print >> sys.stderr, '...error splitting input. exiting...' except IOError as e: print >> sys.stderr, '!! ERROR: !!', e print >> sys.stderr, '...error consuming \ {i}. exiting...'.format(i=input_fastp) print >> sys.stderr, 'done consuming input sequence. \ added {t} tags and {l} \ labels...'.format(t=ht.n_tags(), l=ht.n_labels()) label_dict = defaultdict(int) label_number_dist = [] n_orphaned = 0 n_labeled = 0 n_mlabeled = 0 total_t = time.clock() start_t = time.clock() for read_file in args.input_files: print >> sys.stderr, '** sweeping {read_file} for labels...'.format( read_file=read_file) file_t = 0.0 try: read_fp = screed.open(read_file) except IOError as error: print >> sys.stderr, '!! 
ERROR: !!', error print >> sys.stderr, '*** Could not open {fn}, skipping...'.format( fn=read_file) else: for _, record in enumerate(read_fp): if _ % 50000 == 0: end_t = time.clock() batch_t = end_t - start_t file_t += batch_t print >>sys.stderr, '\tswept {n} reads [{nc} labeled, \ {no} orphaned] \ ** {sec}s ({sect}s total)' \ .format(n=_, nc=n_labeled, no=n_orphaned, sec=batch_t, sect=file_t) start_t = time.clock() seq = record.sequence name = record.name try: labels = ht.sweep_label_neighborhood(seq, traversal_range) except ValueError as e: pass else: if hasattr(record, 'quality'): seq_str = fmt_fastq(name, seq, record.quality, labels) else: seq_str = fmt_fasta(name, seq, labels) label_number_dist.append(len(labels)) if labels: n_labeled += 1 if len(labels) > 1: output_buffer.queue(seq_str, 'multi') n_mlabeled += 1 label_dict['multi'] += 1 else: output_buffer.queue(seq_str, labels[0]) label_dict[labels[0]] += 1 else: n_orphaned += 1 output_buffer.queue(seq_str, 'orphaned') label_dict['orphaned'] += 1 print >> sys.stderr, '** End of file {fn}...'.format(fn=read_file) output_buffer.flush_all() read_fp.close() # gotta output anything left in the buffers at the end! print >> sys.stderr, '** End of run...' output_buffer.flush_all() total_t = time.clock() - total_t if output_buffer.num_write_errors > 0 or output_buffer.num_file_errors > 0: print >> sys.stderr, '! WARNING: Sweep finished with errors !' print >> sys.stderr, '** {writee} reads not written'.format( writee=output_buffer.num_write_errors) print >> sys.stderr, '** {filee} errors opening files'.format( filee=output_buffer.num_file_errors) print >> sys.stderr, 'swept {n_reads} for labels...'.format( n_reads=n_labeled + n_orphaned) print >> sys.stderr, '...with {nc} labeled and {no} orphaned'.format( nc=n_labeled, no=n_orphaned) print >> sys.stderr, '...and {nmc} multilabeled'.format(nmc=n_mlabeled) print >> sys.stderr, '** outputting label number distribution...' fn = os.path.join(outdir, '{pref}.dist.txt'.format(pref=output_pref)) with open(fn, 'wb') as outfp: for nc in label_number_dist: outfp.write('{nc}\n'.format(nc=nc)) fn = os.path.join(outdir, '{pref}.counts.csv'.format(pref=output_pref)) print >> sys.stderr, '** outputting label read counts...' with open(fn, 'wb') as outfp: for k in label_dict: outfp.write('{l},{c}\n'.format(l=k, c=label_dict[k]))
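The per-batch progress reports above time 50,000-read batches with time.clock(), which was deprecated in Python 3.3 and removed in Python 3.8. A hedged sketch of the same bookkeeping written against time.perf_counter(); reads and classify are generic stand-ins for the screed iterator and the label sweep:

import sys
import time

REPORT_BATCH = 50000

def sweep_with_progress(reads, classify):
    """Iterate over reads, reporting per-batch and cumulative wall time."""
    file_t = 0.0
    start_t = time.perf_counter()
    for n, read in enumerate(reads):
        if n and n % REPORT_BATCH == 0:
            batch_t = time.perf_counter() - start_t
            file_t += batch_t
            print('\tswept %d reads ** %.1fs (%.1fs total)' % (n, batch_t, file_t),
                  file=sys.stderr)
            start_t = time.perf_counter()
        classify(read)
    return file_t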
def main(): info('partition-graph.py', ['graph']) args = get_parser().parse_args() basename = args.basename filenames = [basename + '.pt', basename + '.tagset'] for _ in filenames: check_input_files(_, args.force) check_space(filenames, args.force) print >> sys.stderr, '--' print >> sys.stderr, 'SUBSET SIZE', args.subset_size print >> sys.stderr, 'N THREADS', args.threads if args.stoptags: print >> sys.stderr, 'stoptag file:', args.stoptags print >> sys.stderr, '--' print >> sys.stderr, 'loading ht %s.pt' % basename htable = khmer.load_hashbits(basename + '.pt') htable.load_tagset(basename + '.tagset') # do we want to load stop tags, and do they exist? if args.stoptags: print >> sys.stderr, 'loading stoptags from', args.stoptags htable.load_stop_tags(args.stoptags) # do we want to exhaustively traverse the graph? stop_big_traversals = args.no_big_traverse if stop_big_traversals: print >>sys.stderr, '** This script brakes for lumps:', \ ' stop_big_traversals is true.' else: print >>sys.stderr, '** Traverse all the things:', \ ' stop_big_traversals is false.' # # now, partition! # # divide the tags up into subsets divvy = htable.divide_tags_into_subsets(int(args.subset_size)) n_subsets = len(divvy) divvy.append(0) # build a queue of tasks: worker_q = Queue.Queue() # break up the subsets into a list of worker tasks for _ in range(0, n_subsets): start = divvy[_] end = divvy[_ + 1] worker_q.put((htable, _, start, end)) print >> sys.stderr, 'enqueued %d subset tasks' % n_subsets open('%s.info' % basename, 'w').write('%d subsets total\n' % (n_subsets)) n_threads = args.threads if n_subsets < n_threads: n_threads = n_subsets # start threads! print >> sys.stderr, 'starting %d threads' % n_threads print >> sys.stderr, '---' threads = [] for _ in range(n_threads): cur_thrd = threading.Thread(target=worker, args=(worker_q, basename, stop_big_traversals)) threads.append(cur_thrd) cur_thrd.start() print >> sys.stderr, 'done starting threads' # wait for threads for _ in threads: _.join() print >> sys.stderr, '---' print >>sys.stderr, 'done making subsets! see %s.subset.*.pmap' % \ (basename,)
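partition-graph.py hands out tag ranges to worker threads through a shared queue. The sketch below shows that dispatch pattern with the standard library only; the worker callable here is a placeholder for the real subset-partitioning work that writes .pmap files:

import queue
import sys
import threading

def run_subset_workers(divvy, n_threads, worker):
    """Enqueue (index, start, end) tasks from a divvy list and run workers.

    divvy mirrors the divide_tags_into_subsets() output with a trailing 0
    appended, as in the script above; worker(task) is caller-supplied.
    """
    tasks = queue.Queue()
    n_subsets = len(divvy) - 1
    for i in range(n_subsets):
        tasks.put((i, divvy[i], divvy[i + 1]))
    print('enqueued %d subset tasks' % n_subsets, file=sys.stderr)

    def drain():
        while True:
            try:
                task = tasks.get_nowait()
            except queue.Empty:
                return
            worker(task)

    threads = [threading.Thread(target=drain)
               for _ in range(min(n_threads, n_subsets))]
    for t in threads:
        t.start()
    for t in threads:
        t.join()

# example: four fake subsets, two threads
run_subset_workers([0, 10, 20, 30, 0], 2, lambda task: print('partitioning', task))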
def main(): # pylint: disable=too-many-locals,too-many-branches info('extract-partitions.py', ['graph']) args = get_parser().parse_args() distfilename = args.prefix + '.dist' n_unassigned = 0 for infile in args.part_filenames: check_file_status(infile, args.force) check_space(args.part_filenames, args.force) print >> sys.stderr, '---' print >> sys.stderr, 'reading partitioned files:', repr( args.part_filenames) if args.output_groups: print >>sys.stderr, 'outputting to files named "%s.groupN.fa"' % \ args.prefix print >>sys.stderr, 'min reads to keep a partition:', \ args.min_part_size print >> sys.stderr, 'max size of a group file:', args.max_size else: print >> sys.stderr, 'NOT outputting groups! Beware!' if args.output_unassigned: print >>sys.stderr, \ 'outputting unassigned reads to "%s.unassigned.fa"' % \ args.prefix print >>sys.stderr, 'partition size distribution will go to %s' \ % distfilename print >> sys.stderr, '---' # suffix = 'fa' is_fastq = False for index, read, pid in read_partition_file(args.part_filenames[0]): if hasattr(read, 'quality'): suffix = 'fq' is_fastq = True break for filename in args.part_filenames: for index, read, pid in read_partition_file(filename): if is_fastq: assert hasattr(read, 'quality'), \ "all input files must be FASTQ if the first one is" else: assert not hasattr(read, 'quality'), \ "all input files must be FASTA if the first one is" break if args.output_unassigned: unassigned_fp = open('%s.unassigned.%s' % (args.prefix, suffix), 'w') count = {} for filename in args.part_filenames: for index, read, pid in read_partition_file(filename): if index % 100000 == 0: print >> sys.stderr, '...', index count[pid] = count.get(pid, 0) + 1 if pid == 0: n_unassigned += 1 if args.output_unassigned: write_record(read, unassigned_fp) if args.output_unassigned: unassigned_fp.close() if 0 in count: # eliminate unpartitioned sequences del count[0] # develop histogram of partition sizes dist = {} for pid, size in count.items(): dist[size] = dist.get(size, 0) + 1 # output histogram distfp = open(distfilename, 'w') total = 0 wtotal = 0 for counter, index in sorted(dist.items()): total += index wtotal += counter * index distfp.write('%d %d %d %d\n' % (counter, index, total, wtotal)) distfp.close() if not args.output_groups: sys.exit(0) # sort groups by size divvy = sorted(count.items(), key=lambda y: y[1]) divvy = [y for y in divvy if y[1] > args.min_part_size] # divvy up into different groups, based on having max_size sequences # in each group. total = 0 group = set() group_n = 0 group_d = {} for partition_id, n_reads in divvy: group.add(partition_id) total += n_reads if total > args.max_size: for partition_id in group: group_d[partition_id] = group_n # print 'group_d', partition_id, group_n group_n += 1 group = set() total = 0 if group: for partition_id in group: group_d[partition_id] = group_n # print 'group_d', partition_id, group_n group_n += 1 print >> sys.stderr, '%d groups' % group_n if group_n == 0: print >> sys.stderr, 'nothing to output; exiting!' return # open a bunch of output files for the different groups group_fps = {} for _ in range(group_n): group_fp = open('%s.group%04d.%s' % (args.prefix, _, suffix), 'w') group_fps[_] = group_fp # write 'em all out! 
total_seqs = 0 part_seqs = 0 toosmall_parts = 0 for filename in args.part_filenames: for index, read, partition_id in read_partition_file(filename): total_seqs += 1 if index % 100000 == 0: print >> sys.stderr, '...x2', index if partition_id == 0: continue try: group_n = group_d[partition_id] except KeyError: assert count[partition_id] <= args.min_part_size toosmall_parts += 1 continue outfp = group_fps[group_n] write_record(read, outfp) part_seqs += 1 print >> sys.stderr, '---' print >> sys.stderr, 'Of %d total seqs,' % total_seqs print >>sys.stderr, 'extracted %d partitioned seqs into group files,' % \ part_seqs print >>sys.stderr, \ 'discarded %d sequences from small partitions (see -m),' % \ toosmall_parts print >>sys.stderr, 'and found %d unpartitioned sequences (see -U).' % \ n_unassigned print >> sys.stderr, '' print >>sys.stderr, 'Created %d group files named %s.groupXXXX.%s' % \ (len(group_fps), args.prefix, suffix)
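The .dist file written above has four columns per line: partition size, number of partitions of that size, cumulative partition count, and cumulative (size-weighted) read count. A dependency-free sketch of producing that histogram from a pid-to-read-count mapping, assuming unpartitioned reads (pid 0) were already dropped:

def write_partition_histogram(count, distfilename):
    """Write 'size n_partitions cum_partitions cum_reads' lines, smallest first."""
    dist = {}
    for _, size in count.items():
        dist[size] = dist.get(size, 0) + 1

    total = 0       # cumulative number of partitions
    wtotal = 0      # cumulative number of reads (size-weighted)
    with open(distfilename, 'w') as distfp:
        for size, n_parts in sorted(dist.items()):
            total += n_parts
            wtotal += size * n_parts
            distfp.write('%d %d %d %d\n' % (size, n_parts, total, wtotal))

# example: three partitions of size 2 and one of size 5
write_partition_histogram({1: 2, 2: 2, 3: 2, 4: 5}, 'example.dist')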
def main(): # pylint: disable=too-many-branches,too-many-statements parser = sanitize_help(get_parser()) args = parser.parse_args() if not args.quiet: info('normalize-by-median.py', ['diginorm']) configure_logging(args.quiet) report_on_config(args) report_fp = args.report force_single = args.force_single # check for similar filenames # if we're using a single output file only check for identical filenames # otherwise, check for identical BASE names as well. filenames = [] basenames = [] for pathfilename in args.input_filenames: filenames.append(pathfilename) if args.single_output_file: continue # nothing more to worry about basename = os.path.basename(pathfilename) if basename in basenames: log_error('ERROR: Duplicate filename--Cannot handle this!') log_error('** Exiting!') sys.exit(1) basenames.append(basename) # check that files exist and there is sufficient output disk space. check_valid_file_exists(args.input_filenames) check_space(args.input_filenames, args.force) if args.savegraph: graphsize = calculate_graphsize(args, 'countgraph') check_space_for_graph(args.savegraph, graphsize, args.force) # load or create counting table. if args.loadgraph: log_info('loading k-mer countgraph from {graph}', graph=args.loadgraph) countgraph = khmer.load_countgraph(args.loadgraph) else: log_info('making countgraph') countgraph = khmer_args.create_countgraph(args) # create an object to handle diginorm of all files norm = Normalizer(args.cutoff, countgraph) with_diagnostics = WithDiagnostics(norm, report_fp, args.report_frequency) # make a list of all filenames and if they're paired or not; # if we don't know if they're paired, default to allowing but not # forcing pairing. files = [] for element in filenames: files.append([element, args.paired]) if args.unpaired_reads: files.append([args.unpaired_reads, False]) corrupt_files = [] outfp = None output_name = None if args.single_output_file: outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip) else: if '-' in filenames or '/dev/stdin' in filenames: print( "Accepting input from stdin; output filename must " "be provided with '-o'.", file=sys.stderr) sys.exit(1) # # main loop: iterate over all files given, do diginorm. # for filename, require_paired in files: if not args.single_output_file: output_name = os.path.basename(filename) + '.keep' outfp = open(output_name, 'wb') outfp = get_file_writer(outfp, args.gzip, args.bzip) # failsafe context manager in case an input file breaks with catch_io_errors(filename, outfp, args.single_output_file, args.force, corrupt_files): screed_iter = screed.open(filename) reader = broken_paired_reader(screed_iter, min_length=args.ksize, force_single=force_single, require_paired=require_paired) # actually do diginorm for record in with_diagnostics(reader, filename): if record is not None: write_record(record, outfp) log_info('output in {name}', name=describe_file_handle(outfp)) if not args.single_output_file: outfp.close() # finished - print out some diagnostics. 
log_info('Total number of unique k-mers: {umers}', umers=countgraph.n_unique_kmers()) if args.savegraph: log_info('...saving to {name}', name=args.savegraph) countgraph.save(args.savegraph) fp_rate = \ khmer.calc_expected_collisions(countgraph, False, max_false_pos=.8) # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975 log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate) if args.force and len(corrupt_files) > 0: log_error("** WARNING: Finished with errors!") log_error("** I/O Errors occurred in the following files:") log_error("\t" + " ".join(corrupt_files))
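The Normalizer object used above wraps the core digital normalization rule: estimate a read's coverage as the median count of its k-mers and keep the read only while that estimate is below the cutoff, consuming its k-mers as it is kept. A rough sketch of that rule with an exact Counter standing in for the probabilistic countgraph (kmers, median_count and diginorm are illustrative helpers, not khmer APIs):

from collections import Counter

def kmers(seq, k):
    return (seq[i:i + k] for i in range(len(seq) - k + 1))

def median_count(counts, seq, k):
    """Median k-mer count of seq, the coverage estimate diginorm uses."""
    vals = sorted(counts[km] for km in kmers(seq, k))
    return vals[len(vals) // 2] if vals else 0

def diginorm(reads, k=4, cutoff=3):
    """Yield reads whose estimated coverage is still below cutoff.

    Dict-backed sketch of the keep/consume decision; khmer uses a
    probabilistic countgraph instead of an exact Counter.
    """
    counts = Counter()
    for name, seq in reads:
        if median_count(counts, seq, k) < cutoff:
            counts.update(kmers(seq, k))
            yield name, seq

reads = [('r%d' % i, 'ACGTACGTACGT') for i in range(5)]
print([name for name, _ in diginorm(reads)])   # -> ['r0', 'r1']; later copies are dropped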
def main(): info('split-paired-reads.py') args = get_parser().parse_args() infile = args.infile filenames = [infile] check_input_files(infile, args.force) check_space(filenames, args.force) # decide where to put output files - specific directory? or just default? if infile == '/dev/stdin' or infile == '-': if not (args.output_first and args.output_second): print >> sys.stderr, ("Accepting input from stdin; " "output filenames must be provided.") sys.exit(1) elif args.output_directory: if not os.path.exists(args.output_directory): os.makedirs(args.output_directory) out1 = args.output_directory + '/' + os.path.basename(infile) + '.1' out2 = args.output_directory + '/' + os.path.basename(infile) + '.2' else: out1 = os.path.basename(infile) + '.1' out2 = os.path.basename(infile) + '.2' # OVERRIDE output file locations with -1, -2 if args.output_first: fp_out1 = args.output_first out1 = fp_out1.name else: # Use default filename created above fp_out1 = open(out1, 'w') if args.output_second: fp_out2 = args.output_second out2 = fp_out2.name else: # Use default filename created above fp_out2 = open(out2, 'w') counter1 = 0 counter2 = 0 index = None screed_iter = screed.open(infile, parse_description=False) # walk through all the reads in broken-paired mode. paired_iter = broken_paired_reader(screed_iter) for index, is_pair, record1, record2 in paired_iter: if index % 10000 == 0: print('...', index, file=sys.stderr) # are we requiring pairs? if args.force_paired and not is_pair: print('ERROR, %s is not part of a pair' % record1.name, file=sys.stderr) sys.exit(1) if is_pair: write_record(record1, fp_out1) counter1 += 1 write_record(record2, fp_out2) counter2 += 1 else: name = record1.name if check_is_left(name): write_record(record1, fp_out1) counter1 += 1 elif check_is_right(name): write_record(record1, fp_out2) counter2 += 1 else: print("Unrecognized format for read pair information: %s" % name, file=sys.stderr) print("Exiting.", file=sys.stderr) sys.exit(1) print("DONE; split %d sequences (%d left, %d right)" % (counter1 + counter2, counter1, counter2), file=sys.stderr) print("/1 reads in %s" % out1, file=sys.stderr) print("/2 reads in %s" % out2, file=sys.stderr)
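check_is_left and check_is_right decide which output file an orphaned read belongs to from its name alone. A hedged sketch covering the two common conventions, a trailing /1 or /2 on the identifier and a Casava 1.8-style description beginning with 1: or 2:; the real khmer helpers may recognize additional formats:

def is_left(name):
    """True if the read name looks like the first mate of a pair."""
    ident, _, desc = name.partition(' ')
    return ident.endswith('/1') or desc.startswith('1:')

def is_right(name):
    """True if the read name looks like the second mate of a pair."""
    ident, _, desc = name.partition(' ')
    return ident.endswith('/2') or desc.startswith('2:')

for name in ('read17/1', 'read17/2',
             'M00123:idx 1:N:0:ATCACG', 'M00123:idx 2:N:0:ATCACG',
             'unpaired_read'):
    print(name, is_left(name), is_right(name))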
def main(): # pylint: disable=too-many-branches,too-many-statements info('saturate-by-median.py', ['diginorm']) parser = sanitize_help(get_parser()) args = parser.parse_args() report_on_config(args) report_fp = args.report report_frequency = args.report_frequency check_valid_file_exists(args.input_filenames) check_space(args.input_filenames, False) if args.savegraph: check_space_for_graph(args, 'countgraph', False) # list to save error files along with throwing exceptions if args.force: corrupt_files = [] if args.loadgraph: print('loading k-mer countgraph from', args.loadgraph) htable = khmer.load_countgraph(args.loadgraph) else: print('making countgraph') htable = create_countgraph(args) total = 0 discarded = 0 for index, input_filename in enumerate(args.input_filenames): total_acc = 0 discarded_acc = 0 try: total_acc, discarded_acc = normalize_by_median(input_filename, htable, args, report_fp, report_frequency) except IOError as err: handle_error(err, input_filename) if not args.force: print("NOTE: This can be overridden using the --force" " argument", file=sys.stderr) print('** Exiting!', file=sys.stderr) sys.exit(1) else: print('*** Skipping error file, moving on...', file=sys.stderr) corrupt_files.append(input_filename) else: if total_acc == 0 and discarded_acc == 0: print('SKIPPED empty file', input_filename) else: total += total_acc discarded += discarded_acc print('DONE with {inp}; kept {kept} of {total} or {perc:2}%'\ .format(inp=input_filename, kept=total - discarded, total=total, perc=int(100. - discarded / float(total) * 100.))) if args.savegraph: print('Saving k-mer countgraph through', input_filename) print('...saving to', args.savegraph) htable.save(args.savegraph) # re: threshold, see Zhang et al., # http://arxiv.org/abs/1309.2975 fp_rate = khmer.calc_expected_collisions(htable, args.force, max_false_pos=.8) print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate)) if args.force and len(corrupt_files) > 0: print("** WARNING: Finished with errors!", file=sys.stderr) print("** I/O Errors occurred in the following files:", file=sys.stderr) print("\t", " ".join(corrupt_files), file=sys.stderr)
def main(): info('find-knots.py', ['graph']) args = get_parser().parse_args() graphbase = args.graphbase # @RamRS: This might need some more work infiles = [graphbase + '.pt', graphbase + '.tagset'] if os.path.exists(graphbase + '.stoptags'): infiles.append(graphbase + '.stoptags') for _ in infiles: check_input_files(_, False) check_space(infiles) print >> sys.stderr, 'loading k-mer presence table %s.pt' % graphbase htable = khmer.load_hashbits(graphbase + '.pt') print >> sys.stderr, 'loading tagset %s.tagset...' % graphbase htable.load_tagset(graphbase + '.tagset') initial_stoptags = False # @CTB regularize with make-initial if os.path.exists(graphbase + '.stoptags'): print >> sys.stderr, 'loading stoptags %s.stoptags' % graphbase htable.load_stop_tags(graphbase + '.stoptags') initial_stoptags = True pmap_files = glob.glob(args.graphbase + '.subset.*.pmap') print >>sys.stderr, 'loading %d pmap files (first one: %s)' % \ (len(pmap_files), pmap_files[0]) print >> sys.stderr, '---' print >> sys.stderr, 'output stoptags will be in', graphbase + '.stoptags' if initial_stoptags: print >>sys.stderr, \ '(these output stoptags will include the already-loaded set)' print >> sys.stderr, '---' # create counting hash ksize = htable.ksize() counting = khmer.new_counting_hash(ksize, args.min_tablesize, args.n_tables) # load & merge for index, subset_file in enumerate(pmap_files): print >> sys.stderr, '<-', subset_file subset = htable.load_subset_partitionmap(subset_file) print >> sys.stderr, '** repartitioning subset... %s' % subset_file htable.repartition_largest_partition(subset, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD, EXCURSION_KMER_COUNT_THRESHOLD) print >> sys.stderr, '** merging subset... %s' % subset_file htable.merge_subset(subset) print >> sys.stderr, '** repartitioning, round 2... %s' % subset_file size = htable.repartition_largest_partition( None, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD, EXCURSION_KMER_COUNT_THRESHOLD) print >> sys.stderr, '** repartitioned size:', size print >> sys.stderr, 'saving stoptags binary' htable.save_stop_tags(graphbase + '.stoptags') os.rename(subset_file, subset_file + '.processed') print >> sys.stderr, '(%d of %d)\n' % (index, len(pmap_files)) print >> sys.stderr, 'done!'
def main(): info('load-graph.py', ['graph', 'SeqAn']) args = get_parser().parse_args() report_on_config(args, hashtype='hashbits') base = args.output_filename filenames = args.input_filenames for _ in args.input_filenames: check_input_files(_, args.force) check_space(args.input_filenames, args.force) check_space_for_hashtable( (float(args.n_tables * args.min_tablesize) / 8.), args.force) print >>sys.stderr, 'Saving k-mer presence table to %s' % base print >>sys.stderr, 'Loading kmers from sequences in %s' % repr(filenames) if args.no_build_tagset: print >>sys.stderr, 'We WILL NOT build the tagset.' else: print >>sys.stderr, 'We WILL build the tagset', \ ' (for partitioning/traversal).' print >>sys.stderr, 'making k-mer presence table' htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables) if args.no_build_tagset: target_method = htable.consume_fasta_with_reads_parser else: target_method = htable.consume_fasta_and_tag_with_reads_parser for _, filename in enumerate(filenames): rparser = khmer.ReadParser(filename) threads = [] print >>sys.stderr, 'consuming input', filename for num in xrange(args.threads): cur_thread = threading.Thread( target=target_method, args=(rparser,)) threads.append(cur_thread) cur_thread.start() for thread in threads: thread.join() if args.report_total_kmers: print >> sys.stderr, 'Total number of unique k-mers: {0}'.format( htable.n_unique_kmers()) print >>sys.stderr, 'saving k-mer presence table in', base + '.pt' htable.save(base + '.pt') if not args.no_build_tagset: print >>sys.stderr, 'saving tagset in', base + '.tagset' htable.save_tagset(base + '.tagset') info_fp = open(base + '.info', 'w') info_fp.write('%d unique k-mers' % htable.n_unique_kmers()) fp_rate = \ khmer.calc_expected_collisions(htable, args.force, max_false_pos=.15) # 0.18 is ACTUAL MAX. Do not change. print >>sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate if args.write_fp_rate: print >> info_fp, \ '\nfalse positive rate estimated to be %1.3f' % fp_rate print >> sys.stderr, 'wrote to', base + '.info and', base + '.pt' if not args.no_build_tagset: print >> sys.stderr, 'and ' + base + '.tagset'
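The calc_expected_collisions check above compares an estimated false positive rate against max_false_pos. As a rough sanity check only, the rate for N tables of m slots holding n distinct k-mers can be approximated with the textbook Bloom-filter formula below; khmer's own calculation may differ in detail:

import math

def approx_fp_rate(n_unique_kmers, tablesize, n_tables):
    """Approximate false positive rate: (1 - e^(-n/m))^N.

    Each of the N tables acts like one hash function over m slots; this is
    the standard Bloom-filter estimate, offered only as a back-of-the-envelope check.
    """
    load = 1.0 - math.exp(-float(n_unique_kmers) / tablesize)
    return load ** n_tables

# e.g. 50M unique k-mers in 4 tables of 1e9 slots each
print('%.2e' % approx_fp_rate(5e7, 1e9, 4))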
def main(): info('load-into-counting.py', ['counting', 'SeqAn']) args = get_parser().parse_args() report_on_config(args) base = args.output_countingtable_filename filenames = args.input_sequence_filename for name in args.input_sequence_filename: check_input_files(name, args.force) check_space(args.input_sequence_filename, args.force) check_space_for_hashtable(args.n_tables * args.min_tablesize, args.force) check_file_writable(base) check_file_writable(base + ".info") print >> sys.stderr, 'Saving k-mer counting table to %s' % base print >> sys.stderr, 'Loading kmers from sequences in %s' % repr(filenames) # clobber the '.info' file now, as we always open in append mode below if os.path.exists(base + '.info'): os.remove(base + '.info') print >> sys.stderr, 'making k-mer counting table' htable = khmer.new_counting_hash(args.ksize, args.min_tablesize, args.n_tables) htable.set_use_bigcount(args.bigcount) filename = None total_num_reads = 0 for index, filename in enumerate(filenames): rparser = khmer.ReadParser(filename) threads = [] print >> sys.stderr, 'consuming input', filename for _ in xrange(args.threads): cur_thrd = \ threading.Thread( target=htable.consume_fasta_with_reads_parser, args=(rparser, ) ) threads.append(cur_thrd) cur_thrd.start() for thread in threads: thread.join() if index > 0 and index % 10 == 0: check_space_for_hashtable(args.n_tables * args.min_tablesize, args.force) print >> sys.stderr, 'mid-save', base htable.save(base) with open(base + '.info', 'a') as info_fh: print >> info_fh, 'through', filename total_num_reads += rparser.num_reads n_kmers = htable.n_unique_kmers() if args.report_total_kmers: print >> sys.stderr, 'Total number of unique k-mers:', n_kmers with open(base + '.info', 'a') as info_fp: print >> info_fp, 'Total number of unique k-mers:', n_kmers print >> sys.stderr, 'saving', base htable.save(base) # Change max_false_pos=0.2 only if you really grok it. HINT: You don't fp_rate = \ khmer.calc_expected_collisions(htable, args.force, max_false_pos=.2) with open(base + '.info', 'a') as info_fp: print >> info_fp, 'fp rate estimated to be %1.3f\n' % fp_rate if args.summary_info: mr_fmt = args.summary_info.lower() mr_file = base + '.info.' + mr_fmt print >> sys.stderr, "Writing summmary info to", mr_file with open(mr_file, 'w') as mr_fh: if mr_fmt == 'json': mr_data = { "ht_name": os.path.basename(base), "fpr": fp_rate, "num_kmers": n_kmers, "files": filenames, "mrinfo_version": "0.2.0", "num_reads": total_num_reads, } json.dump(mr_data, mr_fh) mr_fh.write('\n') elif mr_fmt == 'tsv': mr_fh.write("ht_name\tfpr\tnum_kmers\tnum_reads\tfiles\n") vals = [ os.path.basename(base), "{:1.3f}".format(fp_rate), str(n_kmers), str(total_num_reads), ";".join(filenames), ] mr_fh.write("\t".join(vals) + "\n") print >> sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate print >> sys.stderr, 'DONE.' print >> sys.stderr, 'wrote to:', base + '.info'
def main(): info('trim-low-abund.py', ['streaming']) parser = sanitize_help(get_parser()) args = parser.parse_args() ### if len(set(args.input_filenames)) != len(args.input_filenames): print("Error: Cannot input the same filename multiple times.", file=sys.stderr) sys.exit(1) if args.trim_at_coverage != DEFAULT_TRIM_AT_COVERAGE and \ not args.variable_coverage: print("Error: --trim-at-coverage/-Z given, but", "--variable-coverage/-V not specified.", file=sys.stderr) sys.exit(1) if args.diginorm_coverage != DEFAULT_DIGINORM_COVERAGE and \ not args.diginorm: print("Error: --diginorm-coverage given, but", "--diginorm not specified.", file=sys.stderr) sys.exit(1) if args.diginorm and args.single_pass: print("Error: --diginorm and --single-pass are incompatible!\n" "You probably want to use normalize-by-median.py instead.", file=sys.stderr) sys.exit(1) ### report_on_config(args) check_valid_file_exists(args.input_filenames) check_space(args.input_filenames, args.force) if args.savegraph: graphsize = calculate_graphsize(args, 'countgraph') check_space_for_graph(args.savegraph, graphsize, args.force) if ('-' in args.input_filenames or '/dev/stdin' in args.input_filenames) \ and not args.output: print("Accepting input from stdin; output filename must " "be provided with -o.", file=sys.stderr) sys.exit(1) if args.loadgraph: print('loading countgraph from', args.loadgraph, file=sys.stderr) ct = khmer.load_countgraph(args.loadgraph) else: print('making countgraph', file=sys.stderr) ct = khmer_args.create_countgraph(args) K = ct.ksize() tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir) print('created temporary directory %s; ' 'use -T to change location' % tempdir, file=sys.stderr) trimmer = Trimmer(ct, not args.variable_coverage, args.cutoff, args.trim_at_coverage) if args.diginorm: trimmer.set_diginorm(args.diginorm_coverage) # ### FIRST PASS ### save_pass2_total = 0 written_bp = 0 written_reads = 0 # only create the file writer once if outfp is specified; otherwise, # create it for each file. if args.output: trimfp = get_file_writer(args.output, args.gzip, args.bzip) pass2list = [] for filename in args.input_filenames: # figure out temporary filename for 2nd pass pass2filename = os.path.basename(filename) + '.pass2' pass2filename = os.path.join(tempdir, pass2filename) pass2fp = open(pass2filename, 'w') # construct output filenames if args.output is None: # note: this will be saved in trimfp. outfp = open(os.path.basename(filename) + '.abundtrim', 'wb') # get file handle w/gzip, bzip trimfp = get_file_writer(outfp, args.gzip, args.bzip) # record all this info pass2list.append((filename, pass2filename, trimfp)) # input file stuff: get a broken_paired reader. screed_iter = screed.open(filename) paired_iter = broken_paired_reader(screed_iter, min_length=K, force_single=args.ignore_pairs) # main loop through the file. n_start = trimmer.n_reads save_start = trimmer.n_saved watermark = REPORT_EVERY_N_READS for read in trimmer.pass1(paired_iter, pass2fp): if (trimmer.n_reads - n_start) > watermark: print('...', filename, trimmer.n_saved, trimmer.n_reads, trimmer.n_bp, written_reads, written_bp, file=sys.stderr) watermark += REPORT_EVERY_N_READS # write out the trimmed/etc sequences that AREN'T going to be # revisited in a 2nd pass. 
write_record(read, trimfp) written_bp += len(read) written_reads += 1 pass2fp.close() print('%s: kept aside %d of %d from first pass, in %s' % (filename, trimmer.n_saved - save_start, trimmer.n_reads - n_start, filename), file=sys.stderr) # first pass goes across all the data, so record relevant stats... n_reads = trimmer.n_reads n_bp = trimmer.n_bp n_skipped = trimmer.n_skipped bp_skipped = trimmer.bp_skipped save_pass2_total = trimmer.n_saved # ### SECOND PASS. ### # nothing should have been skipped yet! assert trimmer.n_skipped == 0 assert trimmer.bp_skipped == 0 if args.single_pass: pass2list = [] # go back through all the files again. for _, pass2filename, trimfp in pass2list: print('second pass: looking at sequences kept aside in %s' % pass2filename, file=sys.stderr) # note that for this second pass, we don't care about paired # reads - they will be output in the same order they're read in, # so pairs will stay together if not orphaned. This is in contrast # to the first loop. Hence, force_single=True below. screed_iter = screed.open(pass2filename, parse_description=False) paired_iter = broken_paired_reader(screed_iter, min_length=K, force_single=True) watermark = REPORT_EVERY_N_READS for read in trimmer.pass2(paired_iter): if (trimmer.n_reads - n_start) > watermark: print('... x 2', trimmer.n_reads - n_start, pass2filename, trimmer.n_saved, trimmer.n_reads, trimmer.n_bp, written_reads, written_bp, file=sys.stderr) watermark += REPORT_EVERY_N_READS write_record(read, trimfp) written_reads += 1 written_bp += len(read) print('removing %s' % pass2filename, file=sys.stderr) os.unlink(pass2filename) # if we created our own trimfps, close 'em. if not args.output: trimfp.close() print('removing temp directory & contents (%s)' % tempdir, file=sys.stderr) shutil.rmtree(tempdir) trimmed_reads = trimmer.trimmed_reads n_passes = 1.0 + (float(save_pass2_total) / n_reads) percent_reads_trimmed = float(trimmed_reads + (n_reads - written_reads)) /\ n_reads * 100.0 print('read %d reads, %d bp' % (n_reads, n_bp,), file=sys.stderr) print('wrote %d reads, %d bp' % (written_reads, written_bp,), file=sys.stderr) print('looked at %d reads twice (%.2f passes)' % (save_pass2_total, n_passes), file=sys.stderr) print('removed %d reads and trimmed %d reads (%.2f%%)' % (n_reads - written_reads, trimmed_reads, percent_reads_trimmed), file=sys.stderr) print('trimmed or removed %.2f%% of bases (%d total)' % ((1 - (written_bp / float(n_bp))) * 100.0, n_bp - written_bp), file=sys.stderr) if args.variable_coverage: percent_reads_hicov = 100.0 * float(n_reads - n_skipped) / n_reads print('%d reads were high coverage (%.2f%%);' % (n_reads - n_skipped, percent_reads_hicov), file=sys.stderr) print('skipped %d reads/%d bases because of low coverage' % (n_skipped, bp_skipped), file=sys.stderr) fp_rate = \ khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8) # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975 print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate), file=sys.stderr) print('output in *.abundtrim', file=sys.stderr) if args.savegraph: print("Saving k-mer countgraph to", args.savegraph, file=sys.stderr) ct.save(args.savegraph)
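Structurally, the script is a two-pass stream: the first pass trims reads whose coverage the countgraph can already vouch for and sets the rest aside in a temporary file, and the second pass revisits only the set-aside reads once the counts have saturated. A schematic, dependency-free sketch of that control flow; seen_enough, trim and consume are stand-ins for the countgraph logic, and pairing, counters and diginorm are omitted:

import os
import tempfile

def two_pass_trim(reads, seen_enough, trim, consume, out):
    """Pass 1: trim confident reads, defer the rest; pass 2: trim deferrals."""
    tmpdir = tempfile.mkdtemp(prefix='twopass')
    pass2path = os.path.join(tmpdir, 'pass2.txt')
    with open(pass2path, 'w') as pass2fp:
        for read in reads:                 # first pass over the full stream
            if seen_enough(read):
                out.append(trim(read))
            else:
                consume(read)              # keep building counts
                pass2fp.write(read + '\n')
    with open(pass2path) as pass2fp:       # second pass over deferred reads only
        for read in pass2fp:
            out.append(trim(read.strip()))
    os.unlink(pass2path)
    os.rmdir(tmpdir)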
def main(): # pylint: disable=too-many-branches,too-many-statements info('normalize-by-median.py', ['diginorm']) args = get_parser().parse_args() report_on_config(args) report_fp = args.report # check for similar filenames filenames = [] for pathfilename in args.input_filenames: filename = pathfilename.split('/')[-1] if (filename in filenames): print >> sys.stderr, "WARNING: At least two input files are named \ %s . (The script normalize-by-median.py can not handle this, only one .keep \ file for one of the input files will be generated.)" % filename else: filenames.append(filename) # check for others check_valid_file_exists(args.input_filenames) check_space(args.input_filenames, args.force) if args.savetable: check_space_for_hashtable(args.n_tables * args.min_tablesize, args.force) # list to save error files along with throwing exceptions corrupt_files = [] if args.loadtable: print 'loading k-mer counting table from', args.loadtable htable = khmer.load_counting_hash(args.loadtable) else: print >> sys.stderr, 'making k-mer counting table' htable = khmer.new_counting_hash(args.ksize, args.min_tablesize, args.n_tables) input_filename = None for index, input_filename in enumerate(args.input_filenames): total_acc, discarded_acc, corrupt_files = \ normalize_by_median_and_check( input_filename, htable, args.single_output_file, args.fail_save, args.paired, args.cutoff, args.force, corrupt_files, report_fp) if (args.dump_frequency > 0 and index > 0 and index % args.dump_frequency == 0): print 'Backup: Saving k-mer counting file through', input_filename if args.savetable: hashname = args.savetable print '...saving to', hashname else: hashname = 'backup.ct' print 'Nothing given for savetable, saving to', hashname htable.save(hashname) if args.paired and args.unpaired_reads: args.paired = False output_name = args.unpaired_reads if not args.single_output_file: output_name = os.path.basename(args.unpaired_reads) + '.keep' outfp = open(output_name, 'w') total_acc, discarded_acc, corrupt_files = \ normalize_by_median_and_check( args.unpaired_reads, htable, args.single_output_file, args.fail_save, args.paired, args.cutoff, args.force, corrupt_files, report_fp) if args.report_total_kmers: print >> sys.stderr, 'Total number of unique k-mers: {0}'.format( htable.n_unique_kmers()) if args.savetable: print 'Saving k-mer counting table through', input_filename print '...saving to', args.savetable htable.save(args.savetable) fp_rate = \ khmer.calc_expected_collisions(htable, args.force, max_false_pos=.8) # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975 print >> sys.stderr, \ 'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate) if args.force and len(corrupt_files) > 0: print >> sys.stderr, "** WARNING: Finished with errors!" print >> sys.stderr, "** IOErrors occurred in the following files:" print >> sys.stderr, "\t", " ".join(corrupt_files)
def main(): info('sample-reads-randomly.py') args = get_parser().parse_args() for _ in args.filenames: check_input_files(_, args.force) check_space(args.filenames, args.force) # seed the random number generator? if args.random_seed: random.seed(args.random_seed) # bound n_samples num_samples = max(args.num_samples, 1) # # Figure out what the output filename is going to be # output_file = args.output_file if output_file: if num_samples > 1: sys.stderr.write( "Error: cannot specify -o with more than one sample.") if not args.force: sys.exit(1) output_filename = output_file.name else: filename = args.filenames[0] output_filename = os.path.basename(filename) + '.subset' if num_samples == 1: print >>sys.stderr, 'Subsampling %d reads using reservoir sampling.' %\ args.num_reads print >>sys.stderr, 'Subsampled reads will be placed in %s' % \ output_filename print >> sys.stderr, '' else: # > 1 print >>sys.stderr, 'Subsampling %d reads, %d times,' \ % (args.num_reads, num_samples), ' using reservoir sampling.' print >>sys.stderr, 'Subsampled reads will be placed in %s.N' \ % output_filename print >> sys.stderr, '' reads = [] for n in range(num_samples): reads.append([]) # read through all the sequences and load/resample the reservoir for filename in args.filenames: print >> sys.stderr, 'opening', filename, 'for reading' screed_iter = screed.open(filename, parse_description=False) for count, (_, ispair, rcrd1, rcrd2) in enumerate( broken_paired_reader(screed_iter, force_single=args.force_single)): if count % 10000 == 0: print >> sys.stderr, '...', count, 'reads scanned' if count >= args.max_reads: print >>sys.stderr, 'reached upper limit of %d reads' % \ args.max_reads, '(see -M); exiting' break # collect first N reads if count < args.num_reads: for n in range(num_samples): reads[n].append((rcrd1, rcrd2)) else: assert len(reads[n]) <= count # use reservoir sampling to replace reads at random # see http://en.wikipedia.org/wiki/Reservoir_sampling for n in range(num_samples): guess = random.randint(1, count) if guess <= args.num_reads: reads[n][guess - 1] = (rcrd1, rcrd2) # output all the subsampled reads: if len(reads) == 1: print >>sys.stderr, 'Writing %d sequences to %s' % \ (len(reads[0]), output_filename) if not output_file: output_file = open(output_filename, 'w') for records in reads[0]: write_record(records[0], output_file) if records[1] is not None: write_record(records[1], output_file) else: for n in range(num_samples): n_filename = output_filename + '.%d' % n print >>sys.stderr, 'Writing %d sequences to %s' % \ (len(reads[n]), n_filename) output_file = open(n_filename, 'w') for records in reads[n]: write_record(records[0], output_file) if records[1] is not None: write_record(records[1], output_file)
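The sampling loop above is reservoir sampling (Algorithm R): keep the first N reads, then give each later read an N/count chance of replacing a random slot. A compact single-reservoir sketch; the script itself keeps num_samples independent reservoirs and stores read pairs rather than single items:

import random

def reservoir_sample(items, n, seed=None):
    """Return n items drawn uniformly at random from an iterable of unknown length."""
    rng = random.Random(seed)
    reservoir = []
    for count, item in enumerate(items, start=1):
        if count <= n:
            reservoir.append(item)
        else:
            # item survives with probability n / count, matching the
            # guess = randint(1, count); guess <= n test in the script above
            guess = rng.randint(1, count)
            if guess <= n:
                reservoir[guess - 1] = item
    return reservoir

print(reservoir_sample(range(1000), 5, seed=42))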
def main(): parser = sanitize_help(get_parser()) args = parser.parse_args() configure_logging(args.quiet) ### if len(set(args.input_filenames)) != len(args.input_filenames): log_error("Error: Cannot input the same filename multiple times.") sys.exit(1) if args.trim_at_coverage != DEFAULT_TRIM_AT_COVERAGE and \ not args.variable_coverage: log_error("Error: --trim-at-coverage/-Z given, but " "--variable-coverage/-V not specified.") sys.exit(1) if args.diginorm_coverage != DEFAULT_DIGINORM_COVERAGE and \ not args.diginorm: log_error("Error: --diginorm-coverage given, but " "--diginorm not specified.") sys.exit(1) if args.diginorm and args.single_pass: log_error("Error: --diginorm and --single-pass are incompatible!\n" "You probably want to use normalize-by-median.py instead.") sys.exit(1) ### graphtype = 'countgraph' if not args.small_count else 'smallcountgraph' report_on_config(args, graphtype=graphtype) check_valid_file_exists(args.input_filenames) check_space(args.input_filenames, args.force) if args.savegraph: graphsize = calculate_graphsize(args, graphtype) check_space_for_graph(args.savegraph, graphsize, args.force) if ('-' in args.input_filenames or '/dev/stdin' in args.input_filenames) \ and not args.output: log_error("Accepting input from stdin; output filename must " "be provided with -o.") sys.exit(1) if args.loadgraph: log_info('loading countgraph from {graph}', graph=args.loadgraph) if args.small_count: ct = SmallCountgraph.load(args.loadgraph) else: ct = Countgraph.load(args.loadgraph) else: log_info('making countgraph') ct = khmer_args.create_countgraph(args) K = ct.ksize() tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir) log_info('created temporary directory {temp};\n' 'use -T to change location', temp=tempdir) trimmer = Trimmer(ct, not args.variable_coverage, args.cutoff, args.trim_at_coverage) if args.diginorm: trimmer.set_diginorm(args.diginorm_coverage) # ### FIRST PASS ### save_pass2_total = 0 written_bp = 0 written_reads = 0 # only create the file writer once if outfp is specified; otherwise, # create it for each file. if args.output: trimfp = get_file_writer(args.output, args.gzip, args.bzip) pass2list = [] for filename in args.input_filenames: # figure out temporary filename for 2nd pass pass2filename = filename.replace(os.path.sep, '-') + '.pass2' pass2filename = os.path.join(tempdir, pass2filename) pass2fp = open(pass2filename, 'w') # construct output filenames if args.output is None: # note: this will be saved in trimfp. outfp = open(os.path.basename(filename) + '.abundtrim', 'wb') # get file handle w/gzip, bzip trimfp = get_file_writer(outfp, args.gzip, args.bzip) # record all this info pass2list.append((filename, pass2filename, trimfp)) # input file stuff: get a broken_paired reader. paired_iter = broken_paired_reader(ReadParser(filename), min_length=K, force_single=args.ignore_pairs) # main loop through the file. n_start = trimmer.n_reads save_start = trimmer.n_saved watermark = REPORT_EVERY_N_READS for read in trimmer.pass1(paired_iter, pass2fp): if (trimmer.n_reads - n_start) > watermark: log_info("... {filename} {n_saved} {n_reads} {n_bp} " "{w_reads} {w_bp}", filename=filename, n_saved=trimmer.n_saved, n_reads=trimmer.n_reads, n_bp=trimmer.n_bp, w_reads=written_reads, w_bp=written_bp) watermark += REPORT_EVERY_N_READS # write out the trimmed/etc sequences that AREN'T going to be # revisited in a 2nd pass. 
write_record(read, trimfp) written_bp += len(read) written_reads += 1 pass2fp.close() log_info("{filename}: kept aside {kept} of {total} from first pass", filename=filename, kept=trimmer.n_saved - save_start, total=trimmer.n_reads - n_start) # first pass goes across all the data, so record relevant stats... n_reads = trimmer.n_reads n_bp = trimmer.n_bp n_skipped = trimmer.n_skipped bp_skipped = trimmer.bp_skipped save_pass2_total = trimmer.n_saved # ### SECOND PASS. ### # nothing should have been skipped yet! assert trimmer.n_skipped == 0 assert trimmer.bp_skipped == 0 if args.single_pass: pass2list = [] # go back through all the files again. for _, pass2filename, trimfp in pass2list: log_info('second pass: looking at sequences kept aside in {pass2}', pass2=pass2filename) # note that for this second pass, we don't care about paired # reads - they will be output in the same order they're read in, # so pairs will stay together if not orphaned. This is in contrast # to the first loop. Hence, force_single=True below. read_parser = ReadParser(pass2filename) paired_iter = broken_paired_reader(read_parser, min_length=K, force_single=True) watermark = REPORT_EVERY_N_READS for read in trimmer.pass2(paired_iter): if (trimmer.n_reads - n_start) > watermark: log_info('... x 2 {a} {b} {c} {d} {e} {f} {g}', a=trimmer.n_reads - n_start, b=pass2filename, c=trimmer.n_saved, d=trimmer.n_reads, e=trimmer.n_bp, f=written_reads, g=written_bp) watermark += REPORT_EVERY_N_READS write_record(read, trimfp) written_reads += 1 written_bp += len(read) read_parser.close() log_info('removing {pass2}', pass2=pass2filename) os.unlink(pass2filename) # if we created our own trimfps, close 'em. if not args.output: trimfp.close() try: log_info('removing temp directory & contents ({temp})', temp=tempdir) shutil.rmtree(tempdir) except OSError as oe: log_info('WARNING: unable to remove {temp} (probably an NFS issue); ' 'please remove manually', temp=tempdir) trimmed_reads = trimmer.trimmed_reads n_passes = 1.0 + (float(save_pass2_total) / n_reads) percent_reads_trimmed = float(trimmed_reads + (n_reads - written_reads)) /\ n_reads * 100.0 log_info('read {read} reads, {bp} bp', read=n_reads, bp=n_bp) log_info('wrote {wr} reads, {wbp} bp', wr=written_reads, wbp=written_bp) log_info('looked at {st} reads twice ({np:.2f} passes)', st=save_pass2_total, np=n_passes) log_info('removed {r} reads and trimmed {t} reads ({p:.2f}%)', r=n_reads - written_reads, t=trimmed_reads, p=percent_reads_trimmed) log_info('trimmed or removed {p:.2f}%% of bases ({bp} total)', p=(1 - (written_bp / float(n_bp))) * 100.0, bp=n_bp - written_bp) if args.variable_coverage: percent_reads_hicov = 100.0 * float(n_reads - n_skipped) / n_reads log_info('{n} reads were high coverage ({p:.2f}%);', n=n_reads - n_skipped, p=percent_reads_hicov) log_info('skipped {r} reads/{bp} bases because of low coverage', r=n_skipped, bp=bp_skipped) fp_rate = \ khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8) # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975 log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate) if args.output is None: log_info('output in *.abundtrim') elif args.output.name == 1: log_info('output streamed to stdout') elif args.output.name: log_info('output in {}'.format(args.output.name)) if args.savegraph: log_info("Saving k-mer countgraph to {graph}", graph=args.savegraph) ct.save(args.savegraph) if args.summary_info is not None: # note that when streaming to stdout the name of args.output will # be set to 1 if args.output 
is not None and args.output.name != 1: base = args.output.name # no explicit name or stdout stream -> use a default name else: base = 'trim-low-abund-{}'.format( time.strftime("%Y-%m-%dT%H:%M:%S")) info = {'fpr': fp_rate, 'reads': n_reads, 'basepairs': n_bp, 'reads_written': written_reads, 'basepairs_written': written_bp, 'reads_skipped': n_skipped, 'basepairs_skipped': bp_skipped, 'reads_removed': n_reads - written_reads, 'reads_trimmed': trimmed_reads, 'basepairs_removed_or_trimmed': n_bp - written_bp } store_provenance_info(info, fname=base, format=args.summary_info)
def main(): info('extract-paired-reads.py') args = get_parser().parse_args() infile = args.infile check_input_files(infile, args.force) check_space([infile], args.force) # decide where to put output files - specific directory? or just default? if infile == '/dev/stdin' or infile == '-': if not (args.output_paired and args.output_single): print( "Accepting input from stdin; output filenames must be " "provided.", file=sys.stderr) sys.exit(1) elif args.output_dir: if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) out1 = args.output_dir + '/' + os.path.basename(infile) + '.se' out2 = args.output_dir + '/' + os.path.basename(infile) + '.pe' else: out1 = os.path.basename(infile) + '.se' out2 = os.path.basename(infile) + '.pe' # OVERRIDE default output file locations with -p, -s if args.output_paired: paired_fp = args.output_paired out2 = paired_fp.name else: # Don't override, just open the default filename from above paired_fp = open(out2, 'w') if args.output_single: single_fp = args.output_single out1 = single_fp.name else: # Don't override, just open the default filename from above single_fp = open(out1, 'w') print('reading file "%s"' % infile, file=sys.stderr) print('outputting interleaved pairs to "%s"' % out2, file=sys.stderr) print('outputting orphans to "%s"' % out1, file=sys.stderr) n_pe = 0 n_se = 0 screed_iter = screed.open(infile, parse_description=False) for index, is_pair, read1, read2 in broken_paired_reader(screed_iter): if index % 100000 == 0 and index > 0: print('...', index, file=sys.stderr) if is_pair: write_record_pair(read1, read2, paired_fp) n_pe += 1 else: write_record(read1, single_fp) n_se += 1 single_fp.close() paired_fp.close() if n_pe == 0: raise Exception("no paired reads!? check file formats...") print('DONE; read %d sequences,' ' %d pairs and %d singletons' % (n_pe * 2 + n_se, n_pe, n_se), file=sys.stderr) print('wrote to: %s and %s' % (out2, out1), file=sys.stderr)
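write_record and write_record_pair emit FASTA or FASTQ depending on whether the record carries a quality string, and pairs are written back to back so they stay interleaved. A simplified sketch of that dispatch on plain dicts (format_record and write_pair are illustrative, not the khmer.utils helpers, which operate on screed records and binary handles):

import sys

def format_record(name, sequence, quality=None):
    """Render one read as FASTA (no quality) or FASTQ (with quality)."""
    if quality is None:
        return '>%s\n%s\n' % (name, sequence)
    return '@%s\n%s\n+\n%s\n' % (name, sequence, quality)

def write_pair(read1, read2, fp):
    """Write two mates back to back so downstream tools see them interleaved."""
    for read in (read1, read2):
        fp.write(format_record(**read))

write_pair({'name': 'r1/1', 'sequence': 'ACGT', 'quality': 'IIII'},
           {'name': 'r1/2', 'sequence': 'TTGA', 'quality': 'IIII'},
           sys.stdout)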
def main(): # pylint: disable=too-many-branches,too-many-statements info('normalize-by-median.py', ['diginorm']) args = get_parser().parse_args() report_on_config(args) report_fp = args.report # check for similar filenames filenames = [] for pathfilename in args.input_filenames: filename = pathfilename.split('/')[-1] if (filename in filenames): print >>sys.stderr, "WARNING: At least two input files are named \ %s . (The script normalize-by-median.py can not handle this, only one .keep \ file for one of the input files will be generated.)" % filename else: filenames.append(filename) # check for others check_valid_file_exists(args.input_filenames) check_space(args.input_filenames, args.force) if args.savetable: check_space_for_hashtable( args.n_tables * args.min_tablesize, args.force) # list to save error files along with throwing exceptions corrupt_files = [] if args.loadtable: print 'loading k-mer counting table from', args.loadtable htable = khmer.load_counting_hash(args.loadtable) else: print >> sys.stderr, 'making k-mer counting table' htable = khmer.new_counting_hash(args.ksize, args.min_tablesize, args.n_tables) input_filename = None for index, input_filename in enumerate(args.input_filenames): total_acc, discarded_acc, corrupt_files = \ normalize_by_median_and_check( input_filename, htable, args.single_output_file, args.fail_save, args.paired, args.cutoff, args.force, corrupt_files, report_fp) if (args.dump_frequency > 0 and index > 0 and index % args.dump_frequency == 0): print 'Backup: Saving k-mer counting file through', input_filename if args.savetable: hashname = args.savetable print '...saving to', hashname else: hashname = 'backup.ct' print 'Nothing given for savetable, saving to', hashname htable.save(hashname) if args.paired and args.unpaired_reads: args.paired = False output_name = args.unpaired_reads if not args.single_output_file: output_name = os.path.basename(args.unpaired_reads) + '.keep' outfp = open(output_name, 'w') total_acc, discarded_acc, corrupt_files = \ normalize_by_median_and_check( args.unpaired_reads, htable, args.single_output_file, args.fail_save, args.paired, args.cutoff, args.force, corrupt_files, report_fp) if args.report_total_kmers: print >> sys.stderr, 'Total number of unique k-mers: {0}'.format( htable.n_unique_kmers()) if args.savetable: print 'Saving k-mer counting table through', input_filename print '...saving to', args.savetable htable.save(args.savetable) fp_rate = \ khmer.calc_expected_collisions(htable, args.force, max_false_pos=.8) # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975 print >> sys.stderr, \ 'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate) if args.force and len(corrupt_files) > 0: print >> sys.stderr, "** WARNING: Finished with errors!" print >> sys.stderr, "** IOErrors occurred in the following files:" print >> sys.stderr, "\t", " ".join(corrupt_files)
def main(): # pylint: disable=too-many-branches,too-many-statements info('normalize-by-median.py', ['diginorm']) args = get_parser().parse_args() report_on_config(args) report_fp = args.report force_single = args.force_single # if optimization args are given, do optimization args = oxutils.do_sanity_checking(args, 0.1) # check for similar filenames # if we're using a single output file only check for identical filenames # otherwise, check for identical BASE names as well. filenames = [] basenames = [] for pathfilename in args.input_filenames: filenames.append(pathfilename) if args.single_output_file: continue # nothing more to worry about basename = os.path.basename(pathfilename) if basename in basenames: print('ERROR: Duplicate filename--Cannot handle this!', file=sys.stderr) print('** Exiting!', file=sys.stderr) sys.exit(1) basenames.append(basename) # check that files exist and there is sufficient output disk space. check_valid_file_exists(args.input_filenames) check_space(args.input_filenames, args.force) if args.savetable: check_space_for_hashtable(args, 'countgraph', args.force) # load or create counting table. if args.loadtable: print('loading k-mer counting table from ' + args.loadtable, file=sys.stderr) htable = khmer.load_counting_hash(args.loadtable) if args.unique_kmers != 0: print('Warning: You have specified a number of unique kmers' ' but are loading a precreated counting table--' 'argument optimization will NOT be done.', file=sys.stderr) else: print('making countgraph', file=sys.stderr) htable = khmer_args.create_countgraph(args) # create an object to handle diginorm of all files norm = Normalizer(args.cutoff, htable) # make a list of all filenames and if they're paired or not; # if we don't know if they're paired, default to allowing but not # forcing pairing. files = [] for element in filenames: files.append([element, args.paired]) if args.unpaired_reads: files.append([args.unpaired_reads, False]) corrupt_files = [] outfp = None output_name = None if args.single_output_file: if args.single_output_file is sys.stdout: output_name = '/dev/stdout' else: output_name = args.single_output_file.name outfp = args.single_output_file # # main loop: iterate over all files given, do diginorm. # for filename, require_paired in files: if not args.single_output_file: output_name = os.path.basename(filename) + '.keep' outfp = open(output_name, 'w') # failsafe context manager in case an input file breaks with catch_io_errors(filename, outfp, args.single_output_file, args.force, corrupt_files): screed_iter = screed.open(filename, parse_description=False) reader = broken_paired_reader(screed_iter, min_length=args.ksize, force_single=force_single, require_paired=require_paired) # actually do diginorm for record in with_diagnostics(filename, norm, reader, report_fp): if record is not None: write_record(record, outfp) print('output in ' + output_name, file=sys.stderr) if output_name is not '/dev/stdout': outfp.close() # finished - print out some diagnostics. 
print('Total number of unique k-mers: {0}' .format(htable.n_unique_kmers()), file=sys.stderr) if args.savetable: print('...saving to ' + args.savetable, file=sys.stderr) htable.save(args.savetable) fp_rate = \ khmer.calc_expected_collisions(htable, args.force, max_false_pos=.8) # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975 print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate), file=sys.stderr) if args.force and len(corrupt_files) > 0: print("** WARNING: Finished with errors!", file=sys.stderr) print("** I/O Errors occurred in the following files:", file=sys.stderr) print("\t", " ".join(corrupt_files), file=sys.stderr)
def main():
    info('sweep-reads-buffered.py', ['sweep'])
    parser = get_parser()
    args = parser.parse_args()

    if args.min_tablesize < MIN_HSIZE:
        args.min_tablesize = MIN_HSIZE
    if args.ksize < MIN_KSIZE:
        args.ksize = MIN_KSIZE

    report_on_config(args, hashtype='hashbits')

    K = args.ksize
    HT_SIZE = args.min_tablesize
    N_HT = args.n_tables

    traversal_range = args.traversal_range
    input_fastp = args.input_fastp

    if not args.outdir:
        outdir = os.path.dirname(input_fastp)
    else:
        outdir = args.outdir

    max_buffers = args.max_buffers
    output_pref = args.output_prefix
    buf_size = args.buffer_size
    max_reads = args.max_reads

    check_input_files(args.input_fastp, args.force)
    check_valid_file_exists(args.input_files)
    all_input_files = [input_fastp]
    all_input_files.extend(args.input_files)

    # Check disk space availability
    check_space(all_input_files, args.force)

    # figure out input file type (FA/FQ) -- based on first file
    ix = iter(screed.open(args.input_files[0]))
    record = ix.next()
    del ix

    extension = 'fa'
    if hasattr(record, 'quality'):      # fastq!
        extension = 'fq'

    output_buffer = ReadBufferManager(
        max_buffers, max_reads, buf_size, output_pref, outdir, extension)

    # consume the partitioned fasta with which to label the graph
    ht = khmer.LabelHash(K, HT_SIZE, N_HT)
    try:
        print >>sys.stderr, 'consuming input sequences...'
        if args.label_by_pid:
            print >>sys.stderr, '...labeling by partition id (pid)'
            ht.consume_partitioned_fasta_and_tag_with_labels(input_fastp)
        elif args.label_by_seq:
            print >>sys.stderr, '...labeling by sequence'
            for n, record in enumerate(screed.open(input_fastp)):
                if n % 50000 == 0:
                    print >>sys.stderr, \
                        '...consumed {n} sequences...'.format(n=n)
                ht.consume_sequence_and_tag_with_labels(record.sequence, n)
        else:
            print >>sys.stderr, \
                '...labeling to create groups of size {s}'.format(
                    s=args.group_size)
            label = -1
            g = 0
            try:
                outfp = open('{pref}_base_{g}.{ext}'.format(pref=output_pref,
                                                            g=g,
                                                            ext=extension),
                             'wb')
                for n, record in enumerate(screed.open(input_fastp)):
                    if n % args.group_size == 0:
                        label += 1
                        if label > g:
                            g = label
                            outfp = open('{pref}_base_{g}.{ext}'.format(
                                pref=output_pref, g=g, ext=extension), 'wb')
                    if n % 50000 == 0:
                        print >>sys.stderr, \
                            '...consumed {n} sequences...'.format(n=n)
                    ht.consume_sequence_and_tag_with_labels(record.sequence,
                                                            label)
                    write_record(record, outfp)
            except IOError as e:
                print >>sys.stderr, '!! ERROR !!', e
                print >>sys.stderr, '...error splitting input. exiting...'
    except IOError as e:
        print >>sys.stderr, '!! ERROR: !!', e
        print >>sys.stderr, \
            '...error consuming {i}. exiting...'.format(i=input_fastp)

    print >>sys.stderr, 'done consuming input sequence. ' \
        'added {t} tags and {l} labels...'.format(t=ht.n_tags(),
                                                  l=ht.n_labels())

    label_dict = defaultdict(int)
    label_number_dist = []

    n_orphaned = 0
    n_labeled = 0
    n_mlabeled = 0

    total_t = time.clock()
    start_t = time.clock()
    for read_file in args.input_files:
        print >>sys.stderr, '** sweeping {read_file} for labels...'.format(
            read_file=read_file)
        file_t = 0.0
        try:
            read_fp = screed.open(read_file)
        except IOError as error:
            print >>sys.stderr, '!! ERROR: !!', error
            print >>sys.stderr, '*** Could not open {fn}, skipping...'.format(
                fn=read_file)
        else:
            for _, record in enumerate(read_fp):
                if _ % 50000 == 0:
                    end_t = time.clock()
                    batch_t = end_t - start_t
                    file_t += batch_t
                    print >>sys.stderr, \
                        '\tswept {n} reads [{nc} labeled, {no} orphaned] ' \
                        '** {sec}s ({sect}s total)'.format(
                            n=_, nc=n_labeled, no=n_orphaned,
                            sec=batch_t, sect=file_t)
                    start_t = time.clock()
                seq = record.sequence
                name = record.name

                try:
                    labels = ht.sweep_label_neighborhood(seq, traversal_range)
                except ValueError as e:
                    pass
                else:
                    if hasattr(record, 'quality'):
                        seq_str = fmt_fastq(name, seq, record.quality, labels)
                    else:
                        seq_str = fmt_fasta(name, seq, labels)
                    label_number_dist.append(len(labels))
                    if labels:
                        n_labeled += 1
                        if len(labels) > 1:
                            output_buffer.queue(seq_str, 'multi')
                            n_mlabeled += 1
                            label_dict['multi'] += 1
                        else:
                            output_buffer.queue(seq_str, labels[0])
                            label_dict[labels[0]] += 1
                    else:
                        n_orphaned += 1
                        output_buffer.queue(seq_str, 'orphaned')
                        label_dict['orphaned'] += 1
            print >>sys.stderr, '** End of file {fn}...'.format(fn=read_file)
            output_buffer.flush_all()
            read_fp.close()

    # gotta output anything left in the buffers at the end!
    print >>sys.stderr, '** End of run...'
    output_buffer.flush_all()
    total_t = time.clock() - total_t

    if output_buffer.num_write_errors > 0 or \
       output_buffer.num_file_errors > 0:
        print >>sys.stderr, '! WARNING: Sweep finished with errors !'
        print >>sys.stderr, '** {writee} reads not written'.format(
            writee=output_buffer.num_write_errors)
        print >>sys.stderr, '** {filee} errors opening files'.format(
            filee=output_buffer.num_file_errors)

    print >>sys.stderr, 'swept {n_reads} for labels...'.format(
        n_reads=n_labeled + n_orphaned)
    print >>sys.stderr, '...with {nc} labeled and {no} orphaned'.format(
        nc=n_labeled, no=n_orphaned)
    print >>sys.stderr, '...and {nmc} multilabeled'.format(nmc=n_mlabeled)

    print >>sys.stderr, '** outputting label number distribution...'
    fn = os.path.join(outdir, '{pref}.dist.txt'.format(pref=output_pref))
    with open(fn, 'wb') as outfp:
        for nc in label_number_dist:
            outfp.write('{nc}\n'.format(nc=nc))

    fn = os.path.join(outdir, '{pref}.counts.csv'.format(pref=output_pref))
    print >>sys.stderr, '** outputting label read counts...'
    with open(fn, 'wb') as outfp:
        for k in label_dict:
            outfp.write('{l},{c}\n'.format(l=k, c=label_dict[k]))
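The fmt_fasta and fmt_fastq formatters used in the sweep loop are defined elsewhere in the script; a minimal sketch, assuming the swept labels are simply tab-joined onto the record header, could look like this:

# Hypothetical sketch of the record formatters used by the sweep above; the
# actual khmer helpers may encode labels in the header differently.
def fmt_fasta(name, seq, labels=None):
    labels = '\t'.join(str(l) for l in labels) if labels else ''
    return '>{name}\t{labels}\n{seq}\n'.format(name=name, labels=labels,
                                               seq=seq)


def fmt_fastq(name, seq, quality, labels=None):
    labels = '\t'.join(str(l) for l in labels) if labels else ''
    return '@{name}\t{labels}\n{seq}\n+\n{qual}\n'.format(
        name=name, labels=labels, seq=seq, qual=quality)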
def main():
    info('trim-low-abund.py', ['streaming'])
    parser = sanitize_help(get_parser())
    args = parser.parse_args()

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        print("Error: Cannot input the same filename multiple times.",
              file=sys.stderr)
        sys.exit(1)

    ###

    report_on_config(args)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    if ('-' in args.input_filenames or '/dev/stdin' in args.input_filenames) \
       and not args.output:
        print("Accepting input from stdin; output filename must "
              "be provided with -o.", file=sys.stderr)
        sys.exit(1)

    if args.loadgraph:
        print('loading countgraph from', args.loadgraph, file=sys.stderr)
        ct = khmer.load_countgraph(args.loadgraph)
    else:
        print('making countgraph', file=sys.stderr)
        ct = khmer_args.create_countgraph(args)

    K = ct.ksize()
    CUTOFF = args.cutoff
    NORMALIZE_LIMIT = args.normalize_to

    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    print('created temporary directory %s; '
          'use -T to change location' % tempdir, file=sys.stderr)

    # ### FIRST PASS ###

    save_pass2_total = 0

    n_bp = 0
    n_reads = 0
    written_bp = 0
    written_reads = 0
    trimmed_reads = 0

    pass2list = []
    for filename in args.input_filenames:
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        if args.output is None:
            trimfp = get_file_writer(
                open(os.path.basename(filename) + '.abundtrim', 'wb'),
                args.gzip, args.bzip)
        else:
            trimfp = get_file_writer(args.output, args.gzip, args.bzip)

        pass2list.append((filename, pass2filename, trimfp))

        screed_iter = screed.open(filename)
        pass2fp = open(pass2filename, 'w')

        save_pass2 = 0
        n = 0

        paired_iter = broken_paired_reader(screed_iter, min_length=K,
                                           force_single=args.ignore_pairs)
        for n, is_pair, read1, read2 in paired_iter:
            if n % 10000 == 0:
                print('...', n, filename, save_pass2, n_reads, n_bp,
                      written_reads, written_bp, file=sys.stderr)

            # we want to track paired reads here, to make sure that pairs
            # are not split between first pass and second pass.

            if is_pair:
                n_reads += 2
                n_bp += len(read1.sequence) + len(read2.sequence)

                seq1 = read1.sequence.replace('N', 'A')
                seq2 = read2.sequence.replace('N', 'A')

                med1, _, _ = ct.get_median_count(seq1)
                med2, _, _ = ct.get_median_count(seq2)

                if med1 < NORMALIZE_LIMIT or med2 < NORMALIZE_LIMIT:
                    ct.consume(seq1)
                    ct.consume(seq2)
                    write_record_pair(read1, read2, pass2fp)
                    save_pass2 += 2
                else:
                    _, trim_at1 = ct.trim_on_abundance(seq1, CUTOFF)
                    _, trim_at2 = ct.trim_on_abundance(seq2, CUTOFF)

                    if trim_at1 >= K:
                        read1 = trim_record(read1, trim_at1)

                    if trim_at2 >= K:
                        read2 = trim_record(read2, trim_at2)

                    if trim_at1 != len(seq1):
                        trimmed_reads += 1
                    if trim_at2 != len(seq2):
                        trimmed_reads += 1

                    write_record_pair(read1, read2, trimfp)
                    written_reads += 2
                    written_bp += trim_at1 + trim_at2
            else:
                n_reads += 1
                n_bp += len(read1.sequence)

                seq = read1.sequence.replace('N', 'A')

                med, _, _ = ct.get_median_count(seq)

                # has this portion of the graph saturated? if not,
                # consume & save => pass2.
                if med < NORMALIZE_LIMIT:
                    ct.consume(seq)
                    write_record(read1, pass2fp)
                    save_pass2 += 1
                else:                       # trim!!
                    _, trim_at = ct.trim_on_abundance(seq, CUTOFF)
                    if trim_at >= K:
                        new_read = trim_record(read1, trim_at)
                        write_record(new_read, trimfp)

                        written_reads += 1
                        written_bp += trim_at

                        if trim_at != len(read1.sequence):
                            trimmed_reads += 1

        pass2fp.close()

        print('%s: kept aside %d of %d from first pass, in %s' %
              (filename, save_pass2, n, filename),
              file=sys.stderr)
        save_pass2_total += save_pass2

    # ### SECOND PASS. ###

    skipped_n = 0
    skipped_bp = 0
    for _, pass2filename, trimfp in pass2list:
        print('second pass: looking at sequences kept aside in %s' %
              pass2filename, file=sys.stderr)

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in contrast
        # to the first loop.

        for n, read in enumerate(screed.open(pass2filename)):
            if n % 10000 == 0:
                print('... x 2', n, pass2filename,
                      written_reads, written_bp, file=sys.stderr)

            seq = read.sequence.replace('N', 'A')
            med, _, _ = ct.get_median_count(seq)

            # do we retain low-abundance components unchanged?
            if med < NORMALIZE_LIMIT and args.variable_coverage:
                write_record(read, trimfp)

                written_reads += 1
                written_bp += len(read.sequence)
                skipped_n += 1
                skipped_bp += len(read.sequence)

            # otherwise, examine/trim/truncate.
            else:    # med >= NORMALIZE LIMIT or not args.variable_coverage
                _, trim_at = ct.trim_on_abundance(seq, CUTOFF)
                if trim_at >= K:
                    new_read = trim_record(read, trim_at)
                    write_record(new_read, trimfp)

                    written_reads += 1
                    written_bp += trim_at

                    if trim_at != len(read.sequence):
                        trimmed_reads += 1

        print('removing %s' % pass2filename, file=sys.stderr)
        os.unlink(pass2filename)

    print('removing temp directory & contents (%s)' % tempdir,
          file=sys.stderr)
    shutil.rmtree(tempdir)

    n_passes = 1.0 + (float(save_pass2_total) / n_reads)
    percent_reads_trimmed = float(trimmed_reads + (n_reads - written_reads)) /\
        n_reads * 100.0

    print('read %d reads, %d bp' % (n_reads, n_bp,), file=sys.stderr)
    print('wrote %d reads, %d bp' % (written_reads, written_bp,),
          file=sys.stderr)
    print('looked at %d reads twice (%.2f passes)' % (save_pass2_total,
                                                      n_passes),
          file=sys.stderr)
    print('removed %d reads and trimmed %d reads (%.2f%%)' %
          (n_reads - written_reads, trimmed_reads, percent_reads_trimmed),
          file=sys.stderr)
    print('trimmed or removed %.2f%% of bases (%d total)' %
          ((1 - (written_bp / float(n_bp))) * 100.0, n_bp - written_bp),
          file=sys.stderr)

    if args.variable_coverage:
        percent_reads_hicov = 100.0 * float(n_reads - skipped_n) / n_reads
        print('%d reads were high coverage (%.2f%%);' % (n_reads - skipped_n,
                                                         percent_reads_hicov),
              file=sys.stderr)
        print('skipped %d reads/%d bases because of low coverage' %
              (skipped_n, skipped_bp), file=sys.stderr)

    fp_rate = \
        khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate),
          file=sys.stderr)

    print('output in *.abundtrim', file=sys.stderr)

    if args.savegraph:
        print("Saving k-mer countgraph to", args.savegraph,
              file=sys.stderr)
        ct.save(args.savegraph)
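trim_record comes from khmer's utility code and is not shown in this excerpt; a minimal sketch, assuming it simply slices the sequence (and the FASTQ quality string, when present) at trim_at, would be:

# Minimal sketch (assumption) of the trim_record helper used above: cut both
# the sequence and, for FASTQ records, the quality string at trim_at.
import copy


def trim_record(read, trim_at):
    new_read = copy.copy(read)
    new_read.sequence = read.sequence[:trim_at]
    if hasattr(read, 'quality'):
        new_read.quality = read.quality[:trim_at]
    return new_read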
def main():  # pylint: disable=too-many-locals,too-many-statements
    info('do-partition.py', ['graph'])
    args = get_parser().parse_args()

    report_on_config(args, hashtype='hashbits')

    for infile in args.input_filenames:
        check_file_status(infile, args.force)

    check_space(args.input_filenames, args.force)

    print >>sys.stderr, 'Saving k-mer presence table to %s' % args.graphbase
    print >>sys.stderr, 'Loading kmers from sequences in %s' % \
        repr(args.input_filenames)
    print >>sys.stderr, '--'
    print >>sys.stderr, 'SUBSET SIZE', args.subset_size
    print >>sys.stderr, 'N THREADS', args.threads
    print >>sys.stderr, '--'

    # load-graph

    print >>sys.stderr, 'making k-mer presence table'
    htable = khmer.new_hashbits(args.ksize, args.min_tablesize,
                                args.n_tables)

    for _, filename in enumerate(args.input_filenames):
        print >>sys.stderr, 'consuming input', filename
        htable.consume_fasta_and_tag(filename)

    fp_rate = khmer.calc_expected_collisions(htable)
    print >>sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate
    if fp_rate > 0.15:          # 0.18 is ACTUAL MAX. Do not change.
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the graph structure is too small for"
                              " this data set. Increase k-mer presence table "
                              "size/num of tables.")
        print >> sys.stderr, "**"
        if not args.force:
            sys.exit(1)

    # partition-graph

    # do we want to exhaustively traverse the graph?
    stop_big_traversals = args.no_big_traverse
    if stop_big_traversals:
        print >>sys.stderr, '** This script brakes for lumps:', \
            ' stop_big_traversals is true.'
    else:
        print >>sys.stderr, '** Traverse all the things:', \
            ' stop_big_traversals is false.'

    #
    # now, partition!
    #

    # divide the tags up into subsets
    divvy = htable.divide_tags_into_subsets(int(args.subset_size))
    n_subsets = len(divvy)
    divvy.append(0)

    # build a queue of tasks:
    worker_q = Queue.Queue()

    # break up the subsets into a list of worker tasks
    for _ in range(0, n_subsets):
        start = divvy[_]
        end = divvy[_ + 1]
        worker_q.put((htable, _, start, end))

    print >>sys.stderr, 'enqueued %d subset tasks' % n_subsets
    open('%s.info' % args.graphbase, 'w').write('%d subsets total\n'
                                                % (n_subsets))

    if n_subsets < args.threads:
        args.threads = n_subsets

    # start threads!
    print >>sys.stderr, 'starting %d threads' % args.threads
    print >>sys.stderr, '---'

    threads = []
    for _ in range(args.threads):
        cur_thread = threading.Thread(target=worker,
                                      args=(worker_q, args.graphbase,
                                            stop_big_traversals))
        threads.append(cur_thread)
        cur_thread.start()

    assert threading.active_count() == args.threads + 1

    print >>sys.stderr, 'done starting threads'

    # wait for threads
    for _ in threads:
        _.join()

    print >>sys.stderr, '---'
    print >>sys.stderr, 'done making subsets! see %s.subset.*.pmap' % \
        (args.graphbase,)

    # merge-partitions

    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')
    print >>sys.stderr, 'loading %d pmap files (first one: %s)' % \
        (len(pmap_files), pmap_files[0])

    htable = khmer.new_hashbits(args.ksize, 1, 1)

    for pmap_file in pmap_files:
        print >>sys.stderr, 'merging', pmap_file
        htable.merge_subset_from_disk(pmap_file)

    if args.remove_subsets:
        print >>sys.stderr, 'removing pmap files'
        for pmap_file in pmap_files:
            os.unlink(pmap_file)

    # annotate-partitions

    for infile in args.input_filenames:
        print >>sys.stderr, 'outputting partitions for', infile
        outfile = os.path.basename(infile) + '.part'
        part_count = htable.output_partitions(infile, outfile)
        print >>sys.stderr, 'output %d partitions for %s' % (
            part_count, infile)
        print >>sys.stderr, 'partitions are in', outfile
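The worker target handed to threading.Thread above is defined outside this excerpt. A plausible sketch, assuming each queued task is an (htable, index, start, end) tuple, that each finished subset is saved as <graphbase>.subset.<index>.pmap, and that do_subset_partition takes the arguments shown (the exact signature is an assumption):

# Plausible sketch (assumption) of the thread worker used by do-partition.py:
# pull (htable, index, start, end) tasks off the queue, partition that subset
# of tags, and save the result to <basename>.subset.<index>.pmap.
import gc
import os
import sys
import Queue


def worker(queue, basename, stop_big_traversals):
    while True:
        try:
            (htable, index, start, stop) = queue.get(False)
        except Queue.Empty:
            print >>sys.stderr, 'exiting: queue is empty'
            return

        outfile = basename + '.subset.%d.pmap' % (index,)
        if os.path.exists(outfile):
            print >>sys.stderr, 'SKIPPING', outfile, '-- already exists'
            continue

        print >>sys.stderr, 'starting:', basename, index
        subset = htable.do_subset_partition(start, stop, stop_big_traversals)
        htable.save_subset_partitionmap(subset, outfile)
        del subset
        gc.collect()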
def main():
    info('filter-abund-single.py', ['counting', 'SeqAn'])
    args = get_parser().parse_args()
    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize,
                                  args.force)
    report_on_config(args)

    print('making k-mer counting table', file=sys.stderr)
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables)

    # first, load reads into hash table
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    print('consuming input, round 1 --', args.datafile, file=sys.stderr)
    for _ in range(args.threads):
        cur_thread = \
            threading.Thread(
                target=htable.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    if args.report_total_kmers:
        print('Total number of unique k-mers: {0}'.format(
            htable.n_unique_kmers()), file=sys.stderr)

    fp_rate = khmer.calc_expected_collisions(htable, args.force)
    print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
        seqN = seq.replace('N', 'A')

        _, trim_at = htable.trim_on_abundance(seqN, args.cutoff)

        if trim_at >= args.ksize:
            # be sure to not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]

        return None, None

    # the filtering loop
    print('filtering', args.datafile, file=sys.stderr)
    outfile = os.path.basename(args.datafile) + '.abundfilt'
    outfp = open(outfile, 'w')

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(args.datafile), outfp)

    print('output in', outfile, file=sys.stderr)

    if args.savetable:
        print('Saving k-mer counting table filename', args.savetable,
              file=sys.stderr)
        print('...saving to', args.savetable, file=sys.stderr)
        htable.save(args.savetable)
    print('wrote to: ', outfile, file=sys.stderr)
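As a mental model for the trim_on_abundance call that drives both filter scripts, the pure-Python sketch below scans k-mers left to right and trims at the first one whose count falls below the cutoff. The exact trim position used by khmer may differ, so treat this as an approximation of the semantics rather than its implementation.

# Rough pure-Python model (assumption) of trim_on_abundance semantics, using a
# plain dict of k-mer counts in place of the counting table.
def trim_on_abundance_model(counts, seq, ksize, cutoff):
    for start in range(len(seq) - ksize + 1):
        kmer = seq[start:start + ksize]
        if counts.get(kmer, 0) < cutoff:
            # keep everything up to, but not including, the offending k-mer
            trim_at = start + ksize - 1
            return seq[:trim_at], trim_at
    return seq, len(seq)

Under this model, a read whose first k-mer is already below the cutoff yields trim_at < ksize, which is why process_fn drops such reads entirely rather than writing a fragment shorter than k.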