def main():
    args = sanitize_help(get_parser()).parse_args()
    configure_logging(args.quiet)

    infiles = args.input_filename
    if ('-' in infiles or '/dev/stdin' in infiles) and not \
       args.single_output_file:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    for filename in infiles:
        check_input_files(filename, args.force)

    check_space(infiles, args.force)

    log_info('loading countgraph: {graph}', graph=args.input_graph)
    countgraph = khmer.load_countgraph(args.input_graph)
    ksize = countgraph.ksize()

    log_info("K: {ksize}", ksize=ksize)

    if args.single_output_file:
        outfile = args.single_output_file.name
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)

    # the filtering loop
    for infile in infiles:
        log_info('filtering {infile}', infile=infile)
        if not args.single_output_file:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = open(outfile, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        paired_iter = broken_paired_reader(ReadParser(infile),
                                           min_length=ksize,
                                           force_single=True)

        for n, is_pair, read1, read2 in paired_iter:
            assert not is_pair
            assert read2 is None

            trimmed_record, _ = trim_record(countgraph, read1, args.cutoff,
                                            args.variable_coverage,
                                            args.normalize_to)
            if trimmed_record:
                write_record(trimmed_record, outfp)

        log_info('output in {outfile}', outfile=outfile)
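
# `trim_record` bundles the variable-coverage check and abundance trimming
# that an older version of this script (the ThreadedSequenceProcessor
# variant further below) spells out inline.  A rough sketch of the
# behaviour the loop above relies on -- illustrative only, not khmer's
# implementation:
def trim_record_sketch(countgraph, read, cutoff, variable_coverage,
                       normalize_to):
    seq = read.sequence.replace('N', 'A')   # the graph holds no 'N' k-mers
    if variable_coverage:
        med, _, _ = countgraph.get_median_count(seq)
        if med < normalize_to:
            return read, False   # coverage too low to trim confidently
    _, trim_at = countgraph.trim_on_abundance(seq, cutoff)
    if trim_at < countgraph.ksize():
        return None, True        # no full k-mer survives; drop the read
    read.sequence = read.sequence[:trim_at]  # keep the original 'N's
    return read, trim_at < len(seq)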
@contextmanager
def catch_io_errors(ifile, out1, out2, out3, out4, out5, out6, force,
                    corrupt_files):
    """Context manager to do boilerplate handling of IOErrors."""
    # requires `from contextlib import contextmanager`; without the
    # decorator the bare `yield` below cannot back a `with` block.
    try:
        yield
    except (IOError, OSError, ValueError) as error:
        log_error('** ERROR: {error}', error=str(error))
        log_error('** Failed on {name}: ', name=ifile)
        if not force:
            log_error('** Exiting!')
            sys.exit(1)
        else:
            log_error('*** Skipping error file, moving on...')
            corrupt_files.append(ifile)
@contextmanager
def catch_io_errors(ifile, out, single_out, force, corrupt_files):
    """Context manager to do boilerplate handling of IOErrors."""
    try:
        yield
    except (IOError, OSError, ValueError) as error:
        log_error('** ERROR: {error}', error=str(error))
        log_error('** Failed on {name}: ', name=ifile)
        if not single_out:
            os.remove(out.name)
        if not force:
            log_error('** Exiting!')
            sys.exit(1)
        else:
            log_error('*** Skipping error file, moving on...')
            corrupt_files.append(ifile)
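
# Both catch_io_errors variants depend on contextlib.contextmanager: a
# decorated generator becomes usable in a `with` block, and an exception
# raised inside the block surfaces at the `yield`.  A self-contained
# example of the pattern (names are illustrative):
from contextlib import contextmanager


@contextmanager
def record_errors(name, errors):
    try:
        yield
    except (IOError, OSError) as error:
        print('failed on %s: %s' % (name, error))
        errors.append(name)


bad_files = []
with record_errors('reads.fq', bad_files):
    raise IOError('truncated file')   # caught and recorded, not fatal
print(bad_files)                      # -> ['reads.fq']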
def main():  # pylint: disable=too-many-branches,too-many-statements
    start_time = time.time()
    parser = sanitize_help(get_parser())
    args = parser.parse_args()
    configure_logging(args.quiet)

    report_on_config(args)

    report_fp = args.report
    force_single = args.force_single

    # check for similar filenames
    # if we're using a single output file only check for identical filenames
    # otherwise, check for identical BASE names as well.
    filenames = []
    basenames = []
    for pathfilename in args.input_filenames:
        filenames.append(pathfilename)
        if args.single_output_file:
            continue  # nothing more to worry about

        basename = os.path.basename(pathfilename)
        if basename in basenames:
            log_error('ERROR: Duplicate filename--Cannot handle this!')
            log_error('** Exiting!')
            sys.exit(1)

        basenames.append(basename)

    # check that files exist and there is sufficient output disk space.
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph is not None:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    # load the first counting table; note there is no 'else' branch, so
    # countgraph1 is only defined when --loadgraph is given.
    if args.loadgraph:
        log_info('loading k-mer countgraph from {graph}',
                 graph=args.loadgraph)
        countgraph1 = Countgraph.load(args.loadgraph)

    # load second counting table.
    if args.loadgraph2:
        log_info('loading k-mer countgraph from {graph}',
                 graph=args.loadgraph2)
        countgraph2 = Countgraph.load(args.loadgraph2)

    # make a list of all filenames and if they're paired or not;
    # if we don't know if they're paired, default to allowing but not
    # forcing pairing.
    files = []
    for element in filenames:
        files.append([element, args.paired])
    if args.unpaired_reads:
        files.append([args.unpaired_reads, False])

    #
    # main loop: iterate over all files given, do diginorm.
    #

    for filename, require_paired in files:
        if not args.single_output_file:
            output_name = os.path.basename(filename) + '.keep'
            outfp = open(output_name, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        screed_iter = clean_input_reads(screed.open(filename))
        reader = broken_paired_reader(screed_iter, min_length=args.ksize,
                                      force_single=force_single,
                                      require_paired=require_paired)

        # actually do diginorm; `snarf` is assumed to be defined elsewhere
        # in this module.
        for _, is_paired, read0, read1 in reader:
            for record in snarf(is_paired, read0, read1,
                                countgraph1, countgraph2):
                if record is not None:
                    write_record(record, outfp)

    print("--- %s seconds ---" % (time.time() - start_time))
def main():  # pylint: disable=too-many-branches,too-many-statements
    parser = sanitize_help(get_parser())
    args = parser.parse_args()
    configure_logging(args.quiet)

    report_on_config(args)

    report_fp = args.report
    force_single = args.force_single

    # check for similar filenames
    # if we're using a single output file only check for identical filenames
    # otherwise, check for identical BASE names as well.
    filenames = []
    basenames = []
    for pathfilename in args.input_filenames:
        filenames.append(pathfilename)
        if args.single_output_file:
            continue  # nothing more to worry about

        basename = os.path.basename(pathfilename)
        if basename in basenames:
            log_error('ERROR: Duplicate filename--Cannot handle this!')
            log_error('** Exiting!')
            sys.exit(1)

        basenames.append(basename)

    # check that files exist and there is sufficient output disk space.
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph is not None:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    # load or create counting table.
    if args.loadgraph:
        log_info('loading k-mer countgraph from {graph}',
                 graph=args.loadgraph)
        countgraph = khmer.load_countgraph(args.loadgraph)
    else:
        log_info('making countgraph')
        countgraph = khmer_args.create_countgraph(args)

    # create an object to handle diginorm of all files
    norm = Normalizer(args.cutoff, countgraph)
    with_diagnostics = WithDiagnostics(norm, report_fp, args.report_frequency)

    # make a list of all filenames and if they're paired or not;
    # if we don't know if they're paired, default to allowing but not
    # forcing pairing.
    files = []
    for element in filenames:
        files.append([element, args.paired])
    if args.unpaired_reads:
        files.append([args.unpaired_reads, False])

    corrupt_files = []
    outfp = None
    output_name = None

    if args.single_output_file:
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)
    else:
        if '-' in filenames or '/dev/stdin' in filenames:
            print("Accepting input from stdin; output filename must "
                  "be provided with '-o'.", file=sys.stderr)
            sys.exit(1)

    #
    # main loop: iterate over all files given, do diginorm.
    #

    for filename, require_paired in files:
        if not args.single_output_file:
            output_name = os.path.basename(filename) + '.keep'
            outfp = open(output_name, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        # failsafe context manager in case an input file breaks
        with catch_io_errors(filename, outfp, args.single_output_file,
                             args.force, corrupt_files):
            screed_iter = clean_input_reads(screed.open(filename))
            reader = broken_paired_reader(screed_iter, min_length=args.ksize,
                                          force_single=force_single,
                                          require_paired=require_paired)

            # actually do diginorm
            for record in with_diagnostics(reader, filename):
                if record is not None:
                    write_record(record, outfp)

            log_info('output in {name}', name=describe_file_handle(outfp))
            if not args.single_output_file:
                outfp.close()

    # finished - print out some diagnostics.
    log_info('Total number of unique k-mers: {umers}',
             umers=countgraph.n_unique_kmers())

    if args.savegraph is not None:
        log_info('...saving to {name}', name=args.savegraph)
        countgraph.save(args.savegraph)

    fp_rate = \
        khmer.calc_expected_collisions(countgraph, False, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    if args.force and len(corrupt_files) > 0:
        log_error("** WARNING: Finished with errors!")
        log_error("** I/O Errors occurred in the following files:")
        log_error("\t" + " ".join(corrupt_files))
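
# The Normalizer wrapped by WithDiagnostics applies the digital
# normalization rule: keep a read only while the estimated coverage of its
# locus -- taken as the median count of its k-mers -- is still below the
# cutoff.  A minimal sketch of that rule, using countgraph methods that
# appear elsewhere in these scripts:
def keep_read(countgraph, cutoff, read):
    med, _, _ = countgraph.get_median_count(read.sequence)
    if med < cutoff:
        countgraph.consume(read.sequence)  # count it, then keep it
        return True
    return False  # locus already at target coverage; discard the read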
def main():  # pylint: disable=too-many-locals,too-many-branches
    args = sanitize_help(get_parser()).parse_args()
    graph_type = 'smallcountgraph' if args.small_count else 'countgraph'
    configure_logging(args.quiet)
    report_on_config(args, graph_type)

    check_input_files(args.input_sequence_filename, args.force)
    if args.savegraph is not None:
        graphsize = calculate_graphsize(args, graph_type)
        check_space_for_graph(args.savegraph, graphsize, args.force)

    if (not args.squash_output and
            os.path.exists(args.output_histogram_filename)):
        log_error('ERROR: {output} exists; not squashing.',
                  output=args.output_histogram_filename)
        sys.exit(1)
    else:
        hist_fp = open(args.output_histogram_filename, 'w')
        hist_fp_csv = csv.writer(hist_fp)
        # write headers:
        hist_fp_csv.writerow(
            ['abundance', 'count', 'cumulative', 'cumulative_fraction'])

    log_info('making countgraph')
    # In case the user specified a maximum memory usage, use 8/(9+eps) of
    # that for the countgraph and 1/(9+eps) for the tracking nodegraph.
    # `eps` is used to account for the memory used by the python interpreter.
    countgraph = khmer_args.create_countgraph(args, multiplier=8 / (9. + 0.3))
    countgraph.set_use_bigcount(args.bigcount)

    log_info('building k-mer tracking graph')
    tracking = khmer_args.create_matching_nodegraph(countgraph)

    log_info('kmer_size: {ksize}', ksize=countgraph.ksize())
    log_info('k-mer countgraph sizes: {sizes}', sizes=countgraph.hashsizes())
    log_info('outputting to {output}', output=args.output_histogram_filename)

    # start loading
    rparser = khmer.ReadParser(args.input_sequence_filename)
    threads = []
    log_info('consuming input, round 1 -- {input}',
             input=args.input_sequence_filename)
    for _ in range(args.threads):
        thread = \
            threading.Thread(
                target=countgraph.consume_seqfile_with_reads_parser,
                args=(rparser, )
            )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    log_info('Total number of unique k-mers: {nk}',
             nk=countgraph.n_unique_kmers())

    abundance_lists = []

    def __do_abundance_dist__(read_parser):
        abundances = countgraph.abundance_distribution_with_reads_parser(
            read_parser, tracking)
        abundance_lists.append(abundances)

    log_info('preparing hist from {seqfile}...',
             seqfile=args.input_sequence_filename)
    rparser = khmer.ReadParser(args.input_sequence_filename)
    threads = []
    log_info('consuming input, round 2 -- {filename}',
             filename=args.input_sequence_filename)
    for _ in range(args.threads):
        thread = \
            threading.Thread(
                target=__do_abundance_dist__,
                args=(rparser, )
            )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    assert len(abundance_lists) == args.threads, len(abundance_lists)
    abundance = {}
    for abundance_list in abundance_lists:
        for i, count in enumerate(abundance_list):
            abundance[i] = abundance.get(i, 0) + count

    total = sum(abundance.values())

    if 0 == total:
        log_error("ERROR: abundance distribution is uniformly zero; "
                  "nothing to report.")
        log_error("\tPlease verify that the input files are valid.")
        sys.exit(1)

    sofar = 0
    for _, i in sorted(abundance.items()):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        hist_fp_csv.writerow([_, i, sofar, round(frac, 3)])

        if sofar == total:
            break

    if args.savegraph is not None:
        log_info('Saving k-mer countgraph to {savegraph}',
                 savegraph=args.savegraph)
        countgraph.save(args.savegraph)

    log_info('wrote to: {output}', output=args.output_histogram_filename)
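
# Both consuming rounds above use the same fan-out pattern: several worker
# threads share one khmer.ReadParser and pull reads from it until the file
# is exhausted.  A minimal sketch of that pattern -- `run_threaded` is
# illustrative, not part of khmer:
import threading

import khmer


def run_threaded(target, filename, n_threads):
    """Fan one shared ReadParser out to n_threads worker threads."""
    rparser = khmer.ReadParser(filename)
    threads = [threading.Thread(target=target, args=(rparser,))
               for _ in range(n_threads)]
    for thread in threads:
        thread.start()
    for thread in threads:   # wait until all input has been consumed
        thread.join()

# e.g. run_threaded(countgraph.consume_seqfile_with_reads_parser,
#                   'reads.fq', 4)   # hypothetical filename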
def main():
    parser = sanitize_help(get_parser())
    args = parser.parse_args()
    if not args.quiet:
        info('trim-low-abund.py', ['streaming'])

    configure_logging(args.quiet)

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        log_error("Error: Cannot input the same filename multiple times.")
        sys.exit(1)

    if args.trim_at_coverage != DEFAULT_TRIM_AT_COVERAGE and \
       not args.variable_coverage:
        log_error("Error: --trim-at-coverage/-Z given, but "
                  "--variable-coverage/-V not specified.")
        sys.exit(1)

    if args.diginorm_coverage != DEFAULT_DIGINORM_COVERAGE and \
       not args.diginorm:
        log_error("Error: --diginorm-coverage given, but "
                  "--diginorm not specified.")
        sys.exit(1)

    if args.diginorm and args.single_pass:
        log_error("Error: --diginorm and --single-pass are incompatible!\n"
                  "You probably want to use normalize-by-median.py instead.")
        sys.exit(1)

    ###

    report_on_config(args)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    if ('-' in args.input_filenames or '/dev/stdin' in args.input_filenames) \
       and not args.output:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    if args.loadgraph:
        log_info('loading countgraph from {graph}', graph=args.loadgraph)
        ct = khmer.load_countgraph(args.loadgraph)
    else:
        log_info('making countgraph')
        ct = khmer_args.create_countgraph(args)

    K = ct.ksize()

    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    log_info('created temporary directory {temp};\n'
             'use -T to change location', temp=tempdir)

    trimmer = Trimmer(ct, not args.variable_coverage, args.cutoff,
                      args.trim_at_coverage)
    if args.diginorm:
        trimmer.set_diginorm(args.diginorm_coverage)

    # ### FIRST PASS ###

    save_pass2_total = 0

    written_bp = 0
    written_reads = 0

    # only create the file writer once if outfp is specified; otherwise,
    # create it for each file.
    if args.output:
        trimfp = get_file_writer(args.output, args.gzip, args.bzip)

    pass2list = []
    for filename in args.input_filenames:
        # figure out temporary filename for 2nd pass
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        pass2fp = open(pass2filename, 'w')

        # construct output filenames
        if args.output is None:
            # note: this will be saved in trimfp.
            outfp = open(os.path.basename(filename) + '.abundtrim', 'wb')

            # get file handle w/gzip, bzip
            trimfp = get_file_writer(outfp, args.gzip, args.bzip)

        # record all this info
        pass2list.append((filename, pass2filename, trimfp))

        # input file stuff: get a broken_paired reader.
        screed_iter = screed.open(filename)
        paired_iter = broken_paired_reader(screed_iter, min_length=K,
                                           force_single=args.ignore_pairs)

        # main loop through the file.
        n_start = trimmer.n_reads
        save_start = trimmer.n_saved

        watermark = REPORT_EVERY_N_READS
        for read in trimmer.pass1(paired_iter, pass2fp):
            if (trimmer.n_reads - n_start) > watermark:
                log_info("... {filename} {n_saved} {n_reads} {n_bp} "
                         "{w_reads} {w_bp}", filename=filename,
                         n_saved=trimmer.n_saved, n_reads=trimmer.n_reads,
                         n_bp=trimmer.n_bp, w_reads=written_reads,
                         w_bp=written_bp)
                watermark += REPORT_EVERY_N_READS

            # write out the trimmed/etc sequences that AREN'T going to be
            # revisited in a 2nd pass.
            write_record(read, trimfp)
            written_bp += len(read)
            written_reads += 1
        pass2fp.close()

        log_info("{filename}: kept aside {kept} of {total} from first pass",
                 filename=filename, kept=trimmer.n_saved - save_start,
                 total=trimmer.n_reads - n_start)

    # first pass goes across all the data, so record relevant stats...
    n_reads = trimmer.n_reads
    n_bp = trimmer.n_bp
    n_skipped = trimmer.n_skipped
    bp_skipped = trimmer.bp_skipped
    save_pass2_total = trimmer.n_saved

    # ### SECOND PASS. ###

    # nothing should have been skipped yet!
    assert trimmer.n_skipped == 0
    assert trimmer.bp_skipped == 0

    if args.single_pass:
        pass2list = []

    # go back through all the files again.
    for _, pass2filename, trimfp in pass2list:
        log_info('second pass: looking at sequences kept aside in {pass2}',
                 pass2=pass2filename)

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in contrast
        # to the first loop.  Hence, force_single=True below.

        screed_iter = screed.open(pass2filename, parse_description=False)
        paired_iter = broken_paired_reader(screed_iter, min_length=K,
                                           force_single=True)

        watermark = REPORT_EVERY_N_READS
        for read in trimmer.pass2(paired_iter):
            if (trimmer.n_reads - n_start) > watermark:
                log_info('... x 2 {a} {b} {c} {d} {e} {f} {g}',
                         a=trimmer.n_reads - n_start,
                         b=pass2filename, c=trimmer.n_saved,
                         d=trimmer.n_reads, e=trimmer.n_bp,
                         f=written_reads, g=written_bp)
                watermark += REPORT_EVERY_N_READS

            write_record(read, trimfp)
            written_reads += 1
            written_bp += len(read)

        log_info('removing {pass2}', pass2=pass2filename)
        os.unlink(pass2filename)

        # if we created our own trimfps, close 'em.
        if not args.output:
            trimfp.close()

    log_info('removing temp directory & contents ({temp})', temp=tempdir)
    shutil.rmtree(tempdir)

    trimmed_reads = trimmer.trimmed_reads

    n_passes = 1.0 + (float(save_pass2_total) / n_reads)
    percent_reads_trimmed = float(trimmed_reads + (n_reads - written_reads)) /\
        n_reads * 100.0

    log_info('read {read} reads, {bp} bp', read=n_reads, bp=n_bp)
    log_info('wrote {wr} reads, {wbp} bp', wr=written_reads, wbp=written_bp)
    log_info('looked at {st} reads twice ({np:.2f} passes)',
             st=save_pass2_total, np=n_passes)
    log_info('removed {r} reads and trimmed {t} reads ({p:.2f}%)',
             r=n_reads - written_reads, t=trimmed_reads,
             p=percent_reads_trimmed)
    log_info('trimmed or removed {p:.2f}% of bases ({bp} total)',
             p=(1 - (written_bp / float(n_bp))) * 100.0, bp=n_bp - written_bp)

    if args.variable_coverage:
        percent_reads_hicov = 100.0 * float(n_reads - n_skipped) / n_reads
        log_info('{n} reads were high coverage ({p:.2f}%);',
                 n=n_reads - n_skipped, p=percent_reads_hicov)
        log_info('skipped {r} reads/{bp} bases because of low coverage',
                 r=n_skipped, bp=bp_skipped)

    fp_rate = \
        khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    log_info('output in *.abundtrim')

    if args.savegraph:
        log_info("Saving k-mer countgraph to {graph}", graph=args.savegraph)
        ct.save(args.savegraph)
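
# The first/second pass structure above is a semi-streaming rule: reads
# whose coverage estimate is already at or above the trim threshold can be
# trimmed immediately, while low-coverage reads are set aside (later reads
# may still raise their k-mer counts) and re-examined once all input has
# been counted.  A conceptual sketch -- `coverage_of` and
# `trim_low_abund_kmers` are hypothetical helpers, not khmer's Trimmer API:
def pass1_sketch(reads, countgraph, trim_at_coverage, pass2fp):
    """Trim reads that are confidently high-coverage; defer the rest."""
    for read in reads:
        countgraph.consume(read.sequence)
        if coverage_of(countgraph, read) >= trim_at_coverage:
            yield trim_low_abund_kmers(countgraph, read)
        else:
            write_record(read, pass2fp)  # counts may still grow; revisit


def pass2_sketch(saved_reads, countgraph):
    """All counts are final now, so every deferred read can be trimmed."""
    for read in saved_reads:
        yield trim_low_abund_kmers(countgraph, read)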
def main():
    args = sanitize_help(get_parser()).parse_args()
    configure_logging(args.quiet)

    infiles = [args.input_count_graph_filename,
               args.input_sequence_filename]
    for infile in infiles:
        check_input_files(infile, False)

    log_info('Loading counting graph from {graph}',
             graph=args.input_count_graph_filename)
    countgraph = khmer.load_countgraph(args.input_count_graph_filename)

    if not countgraph.get_use_bigcount() and args.bigcount:
        log_warn("WARNING: The loaded graph has bigcount DISABLED while "
                 "bigcount reporting is ENABLED--counts higher than 255 will "
                 "not be reported.")

    countgraph.set_use_bigcount(args.bigcount)

    kmer_size = countgraph.ksize()
    hashsizes = countgraph.hashsizes()
    tracking = khmer._Nodegraph(  # pylint: disable=protected-access
        kmer_size, hashsizes)

    log_info('K: {ksize}', ksize=kmer_size)
    log_info('outputting to {output}', output=args.output_histogram_filename)

    if args.output_histogram_filename in ('-', '/dev/stdout'):
        pass
    elif os.path.exists(args.output_histogram_filename):
        if not args.squash_output:
            log_error('ERROR: {output} exists; not squashing.',
                      output=args.output_histogram_filename)
            sys.exit(1)

        log_info('** squashing existing file {output}',
                 output=args.output_histogram_filename)

    log_info('preparing hist...')
    abundances = countgraph.abundance_distribution(
        args.input_sequence_filename, tracking)
    total = sum(abundances)

    if 0 == total:
        log_error("ERROR: abundance distribution is uniformly zero; "
                  "nothing to report.")
        log_error("\tPlease verify that the input files are valid.")
        sys.exit(1)

    if args.output_histogram_filename in ('-', '/dev/stdout'):
        countgraph_fp = sys.stdout
    else:
        countgraph_fp = open(args.output_histogram_filename, 'w')
    countgraph_fp_csv = csv.writer(countgraph_fp)
    # write headers:
    countgraph_fp_csv.writerow(
        ['abundance', 'count', 'cumulative', 'cumulative_fraction'])

    sofar = 0
    for _, i in enumerate(abundances):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        countgraph_fp_csv.writerow([_, i, sofar, round(frac, 3)])

        if sofar == total:
            break
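
# The histogram loop above emits one CSV row per abundance value.  A
# self-contained sketch of the same cumulative bookkeeping, using a
# made-up abundance list (index = abundance, value = number of distinct
# k-mers seen at that abundance):
import csv
import sys

abundances = [0, 120, 45, 10, 0, 1]   # hypothetical distribution
total = sum(abundances)

writer = csv.writer(sys.stdout)
writer.writerow(['abundance', 'count', 'cumulative', 'cumulative_fraction'])
sofar = 0
for abund, count in enumerate(abundances):
    if count == 0:
        continue  # mirrors the default behaviour without --output-zero
    sofar += count
    writer.writerow([abund, count, sofar, round(sofar / float(total), 3)])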
def main():
    args = sanitize_help(get_parser()).parse_args()
    if not args.quiet:
        info('filter-abund.py', ['counting'])
    configure_logging(args.quiet)

    infiles = args.input_filename
    if ('-' in infiles or '/dev/stdin' in infiles) and not \
       args.single_output_file:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    for filename in infiles:
        check_input_files(filename, args.force)

    check_space(infiles, args.force)

    log_info('loading countgraph: {graph}', graph=args.input_graph)
    countgraph = khmer.load_countgraph(args.input_graph)
    ksize = countgraph.ksize()

    log_info("K: {ksize}", ksize=ksize)

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
        seqN = seq.replace('N', 'A')

        if args.variable_coverage:  # only trim when sequence has high enough C
            med, _, _ = countgraph.get_median_count(seqN)
            if med < args.normalize_to:
                return name, seq

        _, trim_at = countgraph.trim_on_abundance(seqN, args.cutoff)

        if trim_at >= ksize:
            # be sure to not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]

        return None, None

    if args.single_output_file:
        outfile = args.single_output_file.name
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)

    # the filtering loop
    for infile in infiles:
        log_info('filtering {infile}', infile=infile)
        if not args.single_output_file:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = open(outfile, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        tsp = ThreadedSequenceProcessor(process_fn, n_workers=args.threads,
                                        verbose=not args.quiet)
        tsp.start(verbose_loader(infile), outfp)

        log_info('output in {outfile}', outfile=outfile)
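
# `trim_on_abundance` returns the trimmed sequence and the trim position;
# the caller above keeps `seq[:trim_at]` only when at least one full k-mer
# survives (`trim_at >= ksize`).  A rough pure-Python sketch of the
# semantics, under the assumption that trimming keeps the longest prefix
# whose every complete k-mer meets the cutoff (`get_count` is a
# hypothetical count lookup):
def trim_on_abundance_sketch(seq, ksize, cutoff, get_count):
    for start in range(len(seq) - ksize + 1):
        if get_count(seq[start:start + ksize]) < cutoff:
            # drop the low-abundance k-mer; keep the clean prefix
            return seq[:start + ksize - 1], start + ksize - 1
    return seq, len(seq)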