def main():
    args = sanitize_help(get_parser()).parse_args()
    configure_logging(args.quiet)

    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)
    if args.savegraph:
        tablesize = calculate_graphsize(args, "countgraph")
        check_space_for_graph(args.savegraph, tablesize, args.force)
    report_on_config(args)

    log_info("making countgraph")
    graph = khmer_args.create_countgraph(args)

    # first, load reads into graph
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    log_info("consuming input, round 1 -- {datafile}", datafile=args.datafile)
    for _ in range(args.threads):
        cur_thread = threading.Thread(
            target=graph.consume_fasta_with_reads_parser, args=(rparser,))
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    log_info("Total number of unique k-mers: {nk}", nk=graph.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(graph, args.force)
    log_info("fp rate estimated to be {fpr:1.3f}", fpr=fp_rate)

    # the filtering loop
    log_info("filtering {datafile}", datafile=args.datafile)
    if args.outfile is None:
        outfile = os.path.basename(args.datafile) + ".abundfilt"
    else:
        outfile = args.outfile
    outfp = open(outfile, "wb")
    outfp = get_file_writer(outfp, args.gzip, args.bzip)

    paired_iter = broken_paired_reader(ReadParser(args.datafile),
                                       min_length=graph.ksize(),
                                       force_single=True)

    for n, is_pair, read1, read2 in paired_iter:
        assert not is_pair
        assert read2 is None

        trimmed_record, _ = trim_record(graph, read1, args.cutoff,
                                        args.variable_coverage,
                                        args.normalize_to)
        if trimmed_record:
            write_record(trimmed_record, outfp)

    log_info("output in {outfile}", outfile=outfile)

    if args.savegraph:
        log_info("Saving k-mer countgraph filename {graph}",
                 graph=args.savegraph)
        graph.save(args.savegraph)
def test_create_countgraph_4_multiplier():
    ksize = khmer_args.DEFAULT_K
    n_tables = khmer_args.DEFAULT_N_TABLES
    max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE
    max_mem = 1e7

    args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem, 0)

    countgraph = khmer_args.create_countgraph(args, multiplier=2.0)
    assert sum(countgraph.hashsizes()) < max_mem / 2.0, \
        sum(countgraph.hashsizes())
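# NOTE: the FakeArgparseObject helper used by the tests above and below is not
# defined anywhere in this section. The sketch below is only an assumption
# about what such a stand-in for parsed command-line arguments might look
# like; the attribute names are guessed from how create_countgraph() and the
# tests use them, and the constructor arity evidently varied between versions
# of the test suite, so treat this as illustrative rather than authoritative.
class FakeArgparseObject(object):
    def __init__(self, ksize, n_tables, max_tablesize, max_memory_usage,
                 threads=1, small_count=False, force=False):
        # mimic the attributes argparse would set on the args namespace
        self.ksize = ksize
        self.n_tables = n_tables
        self.max_tablesize = max_tablesize
        self.max_memory_usage = max_memory_usage
        self.threads = threads
        self.small_count = small_count
        self.force = force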
def test_create_countgraph_3():
    # tests too-big ksize

    ksize = khmer_args.DEFAULT_K
    n_tables = khmer_args.DEFAULT_N_TABLES
    max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE
    max_mem = 1e7

    args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem, 0)

    old_stderr = sys.stderr
    sys.stderr = capture = StringIO()

    try:
        khmer_args.create_countgraph(args, ksize=35)
        assert 0, "should not reach this"
    except SystemExit:
        err = capture.getvalue()
        assert 'khmer only supports k-mer sizes <= 32.' in err, err
    finally:
        sys.stderr = old_stderr
def test_create_countgraph_2():
    # tests overriding ksize by passing it into create_countgraph explicitly.
    ksize = khmer_args.DEFAULT_K
    n_tables = khmer_args.DEFAULT_N_TABLES
    max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE
    max_mem = 1e7

    args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem, 0)

    countgraph = khmer_args.create_countgraph(args, ksize=15)
    assert countgraph.ksize() == 15
def test_create_countgraph_4():
    # tests too-big n_tables WITHOUT force

    ksize = khmer_args.DEFAULT_K
    n_tables = 21  # some number larger than 20
    max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE
    max_mem = 1e7

    args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem, 0, 0)

    old_stderr = sys.stderr
    sys.stderr = capture = StringIO()

    try:
        khmer_args.create_countgraph(args, ksize=None)
        assert 0, "should not reach this"
    except SystemExit:
        err = capture.getvalue()
        assert 'khmer only supports number of tables <= 20.' in err, err
    finally:
        sys.stderr = old_stderr
def main():
    info('make-initial-stoptags.py', ['graph'])
    args = get_parser().parse_args()

    graphbase = args.graphbase

    # @RamRS: This might need some more work
    infiles = [graphbase + '.pt', graphbase + '.tagset']
    if args.stoptags:
        infiles.append(args.stoptags)
    for _ in infiles:
        check_input_files(_, args.force)

    check_space(infiles, args.force)

    print('loading htable %s.pt' % graphbase, file=sys.stderr)
    htable = khmer.load_hashbits(graphbase + '.pt')

    # do we want to load stop tags, and do they exist?
    if args.stoptags:
        print('loading stoptags from', args.stoptags, file=sys.stderr)
        htable.load_stop_tags(args.stoptags)

    print('loading tagset %s.tagset...' % graphbase, file=sys.stderr)
    htable.load_tagset(graphbase + '.tagset')

    ksize = htable.ksize()
    counting = khmer_args.create_countgraph(args)

    # divide up into SUBSET_SIZE fragments
    divvy = htable.divide_tags_into_subsets(args.subset_size)

    # pick off the first one
    if len(divvy) == 1:
        start, end = 0, 0
    else:
        start, end = divvy[:2]

    # partition!
    print('doing pre-partitioning from', start, 'to', end, file=sys.stderr)
    subset = htable.do_subset_partition(start, end)

    # now, repartition...
    print('repartitioning to find HCKs.', file=sys.stderr)
    htable.repartition_largest_partition(subset, counting,
                                         EXCURSION_DISTANCE,
                                         EXCURSION_KMER_THRESHOLD,
                                         EXCURSION_KMER_COUNT_THRESHOLD)

    print('saving stop tags', file=sys.stderr)
    htable.save_stop_tags(graphbase + '.stoptags')

    print('wrote to:', graphbase + '.stoptags', file=sys.stderr)
def test_create_countgraph_1():
    ksize = khmer_args.DEFAULT_K
    n_tables = khmer_args.DEFAULT_N_TABLES
    max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE
    max_mem = 1e7

    args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem, 0)

    countgraph = khmer_args.create_countgraph(args)

    expected_hashsz = utils.longify([2499997, 2499989, 2499983, 2499967])
    assert countgraph.hashsizes() == expected_hashsz, countgraph.hashsizes()
    assert sum(countgraph.hashsizes()) < max_mem, sum(countgraph.hashsizes())
def test_create_countgraph_4():
    # tests too-big n_tables WITHOUT force

    ksize = khmer_args.DEFAULT_K
    n_tables = 21  # some number larger than 20
    max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE
    max_mem = 1e7

    args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem, 0,
                              False, 0)

    old_stderr = sys.stderr
    sys.stderr = capture = StringIO()

    try:
        khmer_args.create_countgraph(args, ksize=None)
        assert 0, "should not reach this"
    except SystemExit:
        err = capture.getvalue()
        assert 'khmer only supports number of tables <= 20.' in err, err
    finally:
        sys.stderr = old_stderr
def test_create_countgraph_5():
    # tests too-big n_tables WITH force

    ksize = khmer_args.DEFAULT_K
    n_tables = 21  # some number larger than 20
    max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE
    max_mem = 1e7

    args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem, 0, 1)

    old_stderr = sys.stderr
    sys.stderr = capture = StringIO()

    try:
        khmer_args.create_countgraph(args, ksize=None)
        message = "Warning: Maximum recommended number of tables is 20, " + \
                  "discarded by force nonetheless!"
        assert message in capture.getvalue()
    except SystemExit as e:
        print(str(e))
    finally:
        sys.stderr = old_stderr
def main():
    args = sanitize_help(get_parser()).parse_args()
    graphbase = args.graphbase

    # @RamRS: This might need some more work
    infiles = [graphbase, graphbase + '.tagset']
    if args.stoptags:
        infiles.append(args.stoptags)
    for _ in infiles:
        check_input_files(_, args.force)

    print('loading nodegraph %s.pt' % graphbase, file=sys.stderr)
    nodegraph = Nodegraph.load(graphbase)

    # do we want to load stop tags, and do they exist?
    if args.stoptags:
        print('loading stoptags from', args.stoptags, file=sys.stderr)
        nodegraph.load_stop_tags(args.stoptags)

    print('loading tagset %s.tagset...' % graphbase, file=sys.stderr)
    nodegraph.load_tagset(graphbase + '.tagset')

    counting = khmer_args.create_countgraph(args)

    # divide up into SUBSET_SIZE fragments
    divvy = nodegraph.divide_tags_into_subsets(args.subset_size)
    divvy = list(divvy)

    # pick off the first one
    if len(divvy) == 1:
        start, end = 0, 0
    else:
        start, end = divvy[:2]

    # partition!
    print('doing pre-partitioning from', start, 'to', end, file=sys.stderr)
    subset = nodegraph.do_subset_partition(start, end)

    # now, repartition...
    print('repartitioning to find HCKs.', file=sys.stderr)
    nodegraph.repartition_largest_partition(counting,
                                            EXCURSION_DISTANCE,
                                            EXCURSION_KMER_THRESHOLD,
                                            EXCURSION_KMER_COUNT_THRESHOLD,
                                            subs=subset)

    print('saving stop tags', file=sys.stderr)
    nodegraph.save_stop_tags(graphbase + '.stoptags')

    print('wrote to:', graphbase + '.stoptags', file=sys.stderr)
def test_create_countgraph_5():
    # tests too-big n_tables WITH force

    ksize = khmer_args.DEFAULT_K
    n_tables = 21  # some number larger than 20
    max_tablesize = khmer_args.DEFAULT_MAX_TABLESIZE
    max_mem = 1e7

    args = FakeArgparseObject(ksize, n_tables, max_tablesize, max_mem, 0,
                              False, 1)

    old_stderr = sys.stderr
    sys.stderr = capture = StringIO()

    try:
        khmer_args.create_countgraph(args, ksize=None)
        message = "Warning: Maximum recommended number of tables is 20, " + \
                  "discarded by force nonetheless!"
        assert message in capture.getvalue()
    except SystemExit as e:
        print(str(e))
    finally:
        sys.stderr = old_stderr
def main():
    parser = build_nodegraph_args()
    parser.add_argument('-o', '--outfile',
                        help='output file; default is "infile".sweep2')
    parser.add_argument('-q', '--quiet')
    parser.add_argument('input_filename')
    parser.add_argument('read_filename')

    args = parser.parse_args()

    inp = args.input_filename
    readsfile = args.read_filename

    outfile = os.path.basename(readsfile) + '.sweep2'
    if args.outfile:
        outfile = args.outfile
    outfp = open(outfile, 'w')

    # create a countgraph data structure
    ht = khmer_args.create_countgraph(args)

    # load contigs, connect into N partitions
    print('loading input reads from', inp)
    ht.consume_seqfile(inp)

    print('starting sweep.')

    m = 0
    K = ht.ksize()
    instream = screed.open(readsfile)
    for n, is_pair, read1, read2 in broken_paired_reader(instream):
        if n % 10000 == 0:
            print('...', n, m)

        if is_pair:
            count1 = ht.get_median_count(read1.sequence)[0]
            count2 = ht.get_median_count(read2.sequence)[0]
            if count1 or count2:
                m += 1
                write_record_pair(read1, read2, outfp)
        else:
            count = ht.get_median_count(read1.sequence)[0]
            if count:
                m += 1
                write_record(read1, outfp)
def main():
    parser = build_nodegraph_args()
    parser.add_argument('-o', '--outfile',
                        help='output file; default is "infile".sweep2')
    parser.add_argument('-q', '--quiet')
    parser.add_argument('input_filename')
    parser.add_argument('read_filename')

    args = parser.parse_args()

    inp = args.input_filename
    readsfile = args.read_filename

    outfile = os.path.basename(readsfile) + '.sweep2'
    if args.outfile:
        outfile = args.outfile
    outfp = open(outfile, 'w')

    # create a countgraph data structure
    ht = khmer_args.create_countgraph(args)

    # load contigs, connect into N partitions
    print('loading input reads from', inp)
    ht.consume_fasta(inp)

    print('starting sweep.')

    m = 0
    K = ht.ksize()
    instream = screed.open(readsfile)
    for n, is_pair, read1, read2 in broken_paired_reader(instream):
        if n % 10000 == 0:
            print('...', n, m)

        if is_pair:
            count1 = ht.get_median_count(read1.sequence)[0]
            count2 = ht.get_median_count(read2.sequence)[0]
            if count1 or count2:
                m += 1
                write_record_pair(read1, read2, outfp)
        else:
            count = ht.get_median_count(read1.sequence)[0]
            if count:
                m += 1
                write_record(read1, outfp)
def main():
    parser = build_counting_args()
    parser.add_argument('--min-abundance', default=50, type=int)
    parser.add_argument('input_files', nargs='+')
    parser.add_argument('-o', '--out', type=argparse.FileType('wb'),
                        default=sys.stdout)
    args = parser.parse_args()

    countgraph = khmer_args.create_countgraph(args, multiplier=1.1)

    count = 0
    for fn in args.input_files:
        short = os.path.basename(fn)
        for n, record in enumerate(screed.open(fn)):
            if n % 100000 == 0:
                print('Processed {n} reads...'.format(n=n), file=sys.stderr)
            countgraph.consume(record.sequence)
            if countgraph.median_at_least(record.sequence,
                                          args.min_abundance):
                args.out.write('>{fn}:{name}:{c}\n{seq}\n'.format(
                    fn=short, c=count, name=record.name,
                    seq=record.sequence))
                count += 1
def main(): p = build_counting_args(descr='Streaming assembly with tracking info') p.add_argument('fastq_files', nargs='+') p.add_argument('-o', type=argparse.FileType('w'), default='assembly-stats.csv') args = p.parse_args() cg = create_countgraph(args) kept = 0 hdn = khmer.HashSet(args.ksize) lh = khmer._GraphLabels(cg) next_label = 1 next_orf = 1 output = set() statswriter = csv.DictWriter(args.o, delimiter=',', fieldnames=[ 'read_n', 'action', 'cov', 'n_hdn', 'contig_n', 'orf_n', 'new' ]) for filename in args.fastq_files: for n, record in enumerate(screed.open(filename)): if n and n % 10000 == 0: print('...', n, file=sys.stderr) if len(record.sequence) < args.ksize: continue cov, _, _ = cg.get_median_count(record.sequence) if cov < 20: kept += 1 cg.consume(record.sequence) statswriter.writerow({ 'read_n': n, 'action': 'c', 'cov': cov, 'n_hdn': None, 'contig_n': None, 'orf_n': None, 'new': None }) elif cov < 30: #print('intermediate', next_label, file=sys.stderr) seq, pos = cg.trim_on_abundance(record.sequence, 3) if len(seq) < args.ksize: continue cg.consume(seq) hdn = cg.find_high_degree_nodes(seq) lh.label_across_high_degree_nodes(seq, hdn, next_label) next_label += 1 statswriter.writerow({ 'read_n': n, 'action': 'l', 'cov': cov, 'n_hdn': len(hdn), 'contig_n': None, 'orf_n': None, 'new': None }) elif cov == 30: contigs = lh.assemble_labeled_path( record.sequence[:args.ksize]) for contig_n, contig in enumerate(contigs): statswriter.writerow({ 'read_n': n, 'action': 'a', 'cov': cov, 'n_hdn': None, 'contig_n': contig_n, 'orf_n': None, 'new': None }) for t in translate(contig): for orf_n, o in enumerate(extract_orfs(t)): if hash(o) not in output: new = True output.add(hash(o)) print('>orf%d\n%s' % (next_orf, o)) next_orf += 1 else: new = False statswriter.writerow({ 'read_n': n, 'action': 'a', 'cov': cov, 'n_hdn': None, 'contig_n': contig_n, 'orf_n': orf_n, 'new': new })
def main(): info('collect-reads.py', ['counting']) args = sanitize_help(get_parser()).parse_args() report_on_config(args) base = args.output_countgraph_filename filenames = args.input_sequence_filename for name in args.input_sequence_filename: check_input_files(name, False) check_space(args.input_sequence_filename, False) tablesize = calculate_graphsize(args, 'countgraph') check_space_for_graph(args.output_countgraph_filename, tablesize, False) print('Saving k-mer countgraph to %s' % base) print('Loading sequences from %s' % repr(filenames)) if args.output: print('Outputting sequences to', args.output) print('making countgraph', file=sys.stderr) htable = khmer_args.create_countgraph(args) htable.set_use_bigcount(args.bigcount) total_coverage = 0. n = 0 for index, filename in enumerate(filenames): for record in screed.open(filename): seq = record.sequence.upper() if 'N' in seq: seq = seq.replace('N', 'A') try: med, _, _ = htable.get_median_count(seq) except ValueError: continue total_coverage += med n += 1 if total_coverage / float(n) > args.coverage: print('reached target average coverage:', \ total_coverage / float(n)) break htable.consume(seq) if args.output: args.output.write(output_single(record)) if n % 100000 == 0: print('...', index, filename, n, total_coverage / float(n)) if total_coverage / float(n) > args.coverage: break print('Collected %d reads' % (n, )) if args.report_total_kmers: print('Total number of k-mers: {0}'.format(htable.n_occupied()), file=sys.stderr) print('saving', base) htable.save(base) info_fp = open(base + '.info', 'w') info_fp.write('through end: %s\n' % filenames[-1]) # Change 0.2 only if you really grok it. HINT: You don't. fp_rate = khmer.calc_expected_collisions(htable, False, max_false_pos=.2) print('fp rate estimated to be %1.3f' % fp_rate) print('fp rate estimated to be %1.3f' % fp_rate, file=info_fp) print('DONE.')
def main(): # pylint: disable=too-many-branches,too-many-statements parser = sanitize_help(get_parser()) args = parser.parse_args() configure_logging(args.quiet) report_on_config(args) report_fp = args.report force_single = args.force_single # check for similar filenames # if we're using a single output file only check for identical filenames # otherwise, check for identical BASE names as well. filenames = [] basenames = [] for pathfilename in args.input_filenames: filenames.append(pathfilename) if args.single_output_file: continue # nothing more to worry about basename = os.path.basename(pathfilename) if basename in basenames: log_error('ERROR: Duplicate filename--Cannot handle this!') log_error('** Exiting!') sys.exit(1) basenames.append(basename) # check that files exist and there is sufficient output disk space. check_valid_file_exists(args.input_filenames) check_space(args.input_filenames, args.force) if args.savegraph is not None: graphsize = calculate_graphsize(args, 'countgraph') check_space_for_graph(args.savegraph, graphsize, args.force) # load or create counting table. if args.loadgraph: log_info('loading k-mer countgraph from {graph}', graph=args.loadgraph) countgraph = Countgraph.load(args.loadgraph) else: log_info('making countgraph') countgraph = khmer_args.create_countgraph(args) # create an object to handle diginorm of all files norm = Normalizer(args.cutoff, countgraph) with_diagnostics = WithDiagnostics(norm, report_fp, args.report_frequency) # make a list of all filenames and if they're paired or not; # if we don't know if they're paired, default to allowing but not # forcing pairing. files = [] for element in filenames: files.append([element, args.paired]) if args.unpaired_reads: files.append([args.unpaired_reads, False]) corrupt_files = [] outfp = None output_name = None if args.single_output_file: outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip) else: if '-' in filenames or '/dev/stdin' in filenames: print("Accepting input from stdin; output filename must " "be provided with '-o'.", file=sys.stderr) sys.exit(1) # # main loop: iterate over all files given, do diginorm. # for filename, require_paired in files: if not args.single_output_file: output_name = os.path.basename(filename) + '.keep' outfp = open(output_name, 'wb') outfp = get_file_writer(outfp, args.gzip, args.bzip) # failsafe context manager in case an input file breaks with catch_io_errors(filename, outfp, args.single_output_file, args.force, corrupt_files): screed_iter = clean_input_reads(screed.open(filename)) reader = broken_paired_reader(screed_iter, min_length=args.ksize, force_single=force_single, require_paired=require_paired) # actually do diginorm for record in with_diagnostics(reader, filename): if record is not None: write_record(record, outfp) log_info('output in {name}', name=describe_file_handle(outfp)) if not args.single_output_file: outfp.close() # finished - print out some diagnostics. 
log_info('Total number of unique k-mers: {umers}', umers=countgraph.n_unique_kmers()) if args.savegraph is not None: log_info('...saving to {name}', name=args.savegraph) countgraph.save(args.savegraph) fp_rate = \ khmer.calc_expected_collisions(countgraph, False, max_false_pos=.8) # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975 log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate) if args.force and len(corrupt_files) > 0: log_error("** WARNING: Finished with errors!") log_error("** I/O Errors occurred in the following files:") log_error("\t" + " ".join(corrupt_files))
def main(): args = sanitize_help(get_parser()).parse_args() if not args.quiet: info('filter-abund-single.py', ['counting', 'SeqAn']) configure_logging(args.quiet) check_input_files(args.datafile, args.force) check_space([args.datafile], args.force) if args.savegraph: tablesize = calculate_graphsize(args, 'countgraph') check_space_for_graph(args.savegraph, tablesize, args.force) report_on_config(args) log_info('making countgraph') graph = khmer_args.create_countgraph(args) # first, load reads into graph rparser = khmer.ReadParser(args.datafile) threads = [] log_info('consuming input, round 1 -- {datafile}', datafile=args.datafile) for _ in range(args.threads): cur_thread = \ threading.Thread( target=graph.consume_fasta_with_reads_parser, args=(rparser, ) ) threads.append(cur_thread) cur_thread.start() for _ in threads: _.join() log_info('Total number of unique k-mers: {nk}', nk=graph.n_unique_kmers()) fp_rate = khmer.calc_expected_collisions(graph, args.force) log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate) # now, trim. # the filtering function. def process_fn(record): name = record.name seq = record.sequence seqN = seq.replace('N', 'A') _, trim_at = graph.trim_on_abundance(seqN, args.cutoff) if trim_at >= args.ksize: # be sure to not to change the 'N's in the trimmed sequence - # so, return 'seq' and not 'seqN'. return name, seq[:trim_at] return None, None # the filtering loop log_info('filtering {datafile}', datafile=args.datafile) if args.outfile is None: outfile = os.path.basename(args.datafile) + '.abundfilt' else: outfile = args.outfile outfp = open(outfile, 'wb') outfp = get_file_writer(outfp, args.gzip, args.bzip) tsp = ThreadedSequenceProcessor(process_fn, verbose=not args.quiet) tsp.start(verbose_loader(args.datafile), outfp) log_info('output in {outfile}', outfile=outfile) if args.savegraph: log_info('Saving k-mer countgraph filename {graph}', graph=args.savegraph) graph.save(args.savegraph)
def main(): # pylint: disable=too-many-branches,too-many-statements info('saturate-by-median.py', ['diginorm']) parser = sanitize_help(get_parser()) args = parser.parse_args() report_on_config(args) report_fp = args.report report_frequency = args.report_frequency check_valid_file_exists(args.input_filenames) check_space(args.input_filenames, False) if args.savegraph: check_space_for_graph(args, 'countgraph', False) # list to save error files along with throwing exceptions if args.force: corrupt_files = [] if args.loadgraph: print('loading k-mer countgraph from', args.loadgraph) htable = khmer.load_countgraph(args.loadgraph) else: print('making countgraph') htable = create_countgraph(args) total = 0 discarded = 0 for index, input_filename in enumerate(args.input_filenames): total_acc = 0 discarded_acc = 0 try: total_acc, discarded_acc = normalize_by_median(input_filename, htable, args, report_fp, report_frequency) except IOError as err: handle_error(err, input_filename) if not args.force: print("NOTE: This can be overridden using the --force" " argument", file=sys.stderr) print('** Exiting!', file=sys.stderr) sys.exit(1) else: print('*** Skipping error file, moving on...', file=sys.stderr) corrupt_files.append(input_filename) else: if total_acc == 0 and discarded_acc == 0: print('SKIPPED empty file', input_filename) else: total += total_acc discarded += discarded_acc print('DONE with {inp}; kept {kept} of {total} or {perc:2}%'\ .format(inp=input_filename, kept=total - discarded, total=total, perc=int(100. - discarded / float(total) * 100.))) if args.savegraph: print('Saving k-mer countgraph through', input_filename) print('...saving to', args.savegraph) htable.save(args.savegraph) # re: threshold, see Zhang et al., # http://arxiv.org/abs/1309.2975 fp_rate = khmer.calc_expected_collisions(htable, args.force, max_false_pos=.8) print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate)) if args.force and len(corrupt_files) > 0: print("** WARNING: Finished with errors!", file=sys.stderr) print("** I/O Errors occurred in the following files:", file=sys.stderr) print("\t", " ".join(corrupt_files), file=sys.stderr)
def main(): # pylint: disable=too-many-locals,too-many-branches args = sanitize_help(get_parser()).parse_args() graph_type = 'smallcountgraph' if args.small_count else 'countgraph' configure_logging(args.quiet) report_on_config(args, graph_type) check_input_files(args.input_sequence_filename, args.force) if args.savegraph is not None: graphsize = calculate_graphsize(args, graph_type) check_space_for_graph(args.savegraph, graphsize, args.force) if (not args.squash_output and os.path.exists(args.output_histogram_filename)): log_error('ERROR: {output} exists; not squashing.', output=args.output_histogram_filename) sys.exit(1) else: hist_fp = open(args.output_histogram_filename, 'w') hist_fp_csv = csv.writer(hist_fp) # write headers: hist_fp_csv.writerow( ['abundance', 'count', 'cumulative', 'cumulative_fraction']) log_info('making countgraph') # In case the user specified a maximum memory usage, use 8/(9+eps) of that # for the countgraph and 1/(9+eps) for the tracking nodegraph # `eps` is used to account for the memory used by the python interpreter countgraph = khmer_args.create_countgraph(args, multiplier=8 / (9. + 0.3)) countgraph.set_use_bigcount(args.bigcount) log_info('building k-mer tracking graph') tracking = khmer_args.create_matching_nodegraph(countgraph) log_info('kmer_size: {ksize}', ksize=countgraph.ksize()) log_info('k-mer countgraph sizes: {sizes}', sizes=countgraph.hashsizes()) log_info('outputting to {output}', output=args.output_histogram_filename) # start loading rparser = khmer.ReadParser(args.input_sequence_filename) threads = [] log_info('consuming input, round 1 -- {input}', input=args.input_sequence_filename) for _ in range(args.threads): thread = \ threading.Thread( target=countgraph.consume_seqfile_with_reads_parser, args=(rparser, ) ) threads.append(thread) thread.start() for thread in threads: thread.join() log_info('Total number of unique k-mers: {nk}', nk=countgraph.n_unique_kmers()) abundance_lists = [] def __do_abundance_dist__(read_parser): abundances = countgraph.abundance_distribution_with_reads_parser( read_parser, tracking) abundance_lists.append(abundances) log_info('preparing hist from {seqfile}...', seqfile=args.input_sequence_filename) rparser = khmer.ReadParser(args.input_sequence_filename) threads = [] log_info('consuming input, round 2 -- {filename}', filename=args.input_sequence_filename) for _ in range(args.threads): thread = \ threading.Thread( target=__do_abundance_dist__, args=(rparser, ) ) threads.append(thread) thread.start() for thread in threads: thread.join() assert len(abundance_lists) == args.threads, len(abundance_lists) abundance = {} for abundance_list in abundance_lists: for i, count in enumerate(abundance_list): abundance[i] = abundance.get(i, 0) + count total = sum(abundance.values()) if 0 == total: log_error("ERROR: abundance distribution is uniformly zero; " "nothing to report.") log_error("\tPlease verify that the input files are valid.") sys.exit(1) sofar = 0 for _, i in sorted(abundance.items()): if i == 0 and not args.output_zero: continue sofar += i frac = sofar / float(total) hist_fp_csv.writerow([_, i, sofar, round(frac, 3)]) if sofar == total: break if args.savegraph is not None: log_info('Saving k-mer countgraph to {savegraph}', savegraph=args.savegraph) countgraph.save(args.savegraph) log_info('wrote to: {output}', output=args.output_histogram_filename)
def main(): parser = sanitize_help(get_parser()) args = parser.parse_args() if not args.quiet: info('trim-low-abund.py', ['streaming']) configure_logging(args.quiet) ### if len(set(args.input_filenames)) != len(args.input_filenames): log_error("Error: Cannot input the same filename multiple times.") sys.exit(1) if args.trim_at_coverage != DEFAULT_TRIM_AT_COVERAGE and \ not args.variable_coverage: log_error("Error: --trim-at-coverage/-Z given, but " "--variable-coverage/-V not specified.") sys.exit(1) if args.diginorm_coverage != DEFAULT_DIGINORM_COVERAGE and \ not args.diginorm: log_error("Error: --diginorm-coverage given, but " "--diginorm not specified.") sys.exit(1) if args.diginorm and args.single_pass: log_error("Error: --diginorm and --single-pass are incompatible!\n" "You probably want to use normalize-by-median.py instead.") sys.exit(1) ### report_on_config(args) check_valid_file_exists(args.input_filenames) check_space(args.input_filenames, args.force) if args.savegraph: graphsize = calculate_graphsize(args, 'countgraph') check_space_for_graph(args.savegraph, graphsize, args.force) if ('-' in args.input_filenames or '/dev/stdin' in args.input_filenames) \ and not args.output: log_error("Accepting input from stdin; output filename must " "be provided with -o.") sys.exit(1) if args.loadgraph: log_info('loading countgraph from {graph}', graph=args.loadgraph) ct = khmer.load_countgraph(args.loadgraph) else: log_info('making countgraph') ct = khmer_args.create_countgraph(args) K = ct.ksize() tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir) log_info( 'created temporary directory {temp};\n' 'use -T to change location', temp=tempdir) trimmer = Trimmer(ct, not args.variable_coverage, args.cutoff, args.trim_at_coverage) if args.diginorm: trimmer.set_diginorm(args.diginorm_coverage) # ### FIRST PASS ### save_pass2_total = 0 written_bp = 0 written_reads = 0 # only create the file writer once if outfp is specified; otherwise, # create it for each file. if args.output: trimfp = get_file_writer(args.output, args.gzip, args.bzip) pass2list = [] for filename in args.input_filenames: # figure out temporary filename for 2nd pass pass2filename = os.path.basename(filename) + '.pass2' pass2filename = os.path.join(tempdir, pass2filename) pass2fp = open(pass2filename, 'w') # construct output filenames if args.output is None: # note: this will be saved in trimfp. outfp = open(os.path.basename(filename) + '.abundtrim', 'wb') # get file handle w/gzip, bzip trimfp = get_file_writer(outfp, args.gzip, args.bzip) # record all this info pass2list.append((filename, pass2filename, trimfp)) # input file stuff: get a broken_paired reader. screed_iter = screed.open(filename) paired_iter = broken_paired_reader(screed_iter, min_length=K, force_single=args.ignore_pairs) # main loop through the file. n_start = trimmer.n_reads save_start = trimmer.n_saved watermark = REPORT_EVERY_N_READS for read in trimmer.pass1(paired_iter, pass2fp): if (trimmer.n_reads - n_start) > watermark: log_info( "... {filename} {n_saved} {n_reads} {n_bp} " "{w_reads} {w_bp}", filename=filename, n_saved=trimmer.n_saved, n_reads=trimmer.n_reads, n_bp=trimmer.n_bp, w_reads=written_reads, w_bp=written_bp) watermark += REPORT_EVERY_N_READS # write out the trimmed/etc sequences that AREN'T going to be # revisited in a 2nd pass. 
write_record(read, trimfp) written_bp += len(read) written_reads += 1 pass2fp.close() log_info("{filename}: kept aside {kept} of {total} from first pass", filename=filename, kept=trimmer.n_saved - save_start, total=trimmer.n_reads - n_start) # first pass goes across all the data, so record relevant stats... n_reads = trimmer.n_reads n_bp = trimmer.n_bp n_skipped = trimmer.n_skipped bp_skipped = trimmer.bp_skipped save_pass2_total = trimmer.n_saved # ### SECOND PASS. ### # nothing should have been skipped yet! assert trimmer.n_skipped == 0 assert trimmer.bp_skipped == 0 if args.single_pass: pass2list = [] # go back through all the files again. for _, pass2filename, trimfp in pass2list: log_info('second pass: looking at sequences kept aside in {pass2}', pass2=pass2filename) # note that for this second pass, we don't care about paired # reads - they will be output in the same order they're read in, # so pairs will stay together if not orphaned. This is in contrast # to the first loop. Hence, force_single=True below. screed_iter = screed.open(pass2filename, parse_description=False) paired_iter = broken_paired_reader(screed_iter, min_length=K, force_single=True) watermark = REPORT_EVERY_N_READS for read in trimmer.pass2(paired_iter): if (trimmer.n_reads - n_start) > watermark: log_info('... x 2 {a} {b} {c} {d} {e} {f} {g}', a=trimmer.n_reads - n_start, b=pass2filename, c=trimmer.n_saved, d=trimmer.n_reads, e=trimmer.n_bp, f=written_reads, g=written_bp) watermark += REPORT_EVERY_N_READS write_record(read, trimfp) written_reads += 1 written_bp += len(read) log_info('removing {pass2}', pass2=pass2filename) os.unlink(pass2filename) # if we created our own trimfps, close 'em. if not args.output: trimfp.close() log_info('removing temp directory & contents ({temp})', temp=tempdir) shutil.rmtree(tempdir) trimmed_reads = trimmer.trimmed_reads n_passes = 1.0 + (float(save_pass2_total) / n_reads) percent_reads_trimmed = float(trimmed_reads + (n_reads - written_reads)) /\ n_reads * 100.0 log_info('read {read} reads, {bp} bp', read=n_reads, bp=n_bp) log_info('wrote {wr} reads, {wbp} bp', wr=written_reads, wbp=written_bp) log_info('looked at {st} reads twice ({np:.2f} passes)', st=save_pass2_total, np=n_passes) log_info('removed {r} reads and trimmed {t} reads ({p:.2f}%)', r=n_reads - written_reads, t=trimmed_reads, p=percent_reads_trimmed) log_info('trimmed or removed {p:.2f}%% of bases ({bp} total)', p=(1 - (written_bp / float(n_bp))) * 100.0, bp=n_bp - written_bp) if args.variable_coverage: percent_reads_hicov = 100.0 * float(n_reads - n_skipped) / n_reads log_info('{n} reads were high coverage ({p:.2f}%);', n=n_reads - n_skipped, p=percent_reads_hicov) log_info('skipped {r} reads/{bp} bases because of low coverage', r=n_skipped, bp=bp_skipped) fp_rate = \ khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8) # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975 log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate) log_info('output in *.abundtrim') if args.savegraph: log_info("Saving k-mer countgraph to {graph}", graph=args.savegraph) ct.save(args.savegraph)
def main(): # pylint: disable=too-many-locals,too-many-branches info('abundance-dist-single.py', ['counting', 'SeqAn']) args = get_parser().parse_args() report_on_config(args) check_input_files(args.input_sequence_filename, args.force) check_space([args.input_sequence_filename], args.force) if args.savetable: check_space_for_hashtable(args, 'countgraph', args.force) if (not args.squash_output and os.path.exists(args.output_histogram_filename)): print('ERROR: %s exists; not squashing.' % args.output_histogram_filename, file=sys.stderr) sys.exit(1) else: hist_fp = open(args.output_histogram_filename, 'w') if args.csv: hist_fp_csv = csv.writer(hist_fp) # write headers: hist_fp_csv.writerow( ['abundance', 'count', 'cumulative', 'cumulative_fraction']) print('making countgraph', file=sys.stderr) counting_hash = khmer_args.create_countgraph(args, multiplier=1.1) counting_hash.set_use_bigcount(args.bigcount) print('building k-mer tracking table', file=sys.stderr) tracking = khmer_args.create_nodegraph(args, multiplier=1.1) print('kmer_size:', counting_hash.ksize(), file=sys.stderr) print('k-mer counting table sizes:', counting_hash.hashsizes(), file=sys.stderr) print('outputting to', args.output_histogram_filename, file=sys.stderr) # start loading rparser = khmer.ReadParser(args.input_sequence_filename) threads = [] print('consuming input, round 1 --', args.input_sequence_filename, file=sys.stderr) for _ in range(args.threads): thread = \ threading.Thread( target=counting_hash.consume_fasta_with_reads_parser, args=(rparser, ) ) threads.append(thread) thread.start() for thread in threads: thread.join() if args.report_total_kmers: print('Total number of unique k-mers: {0}'.format( counting_hash.n_unique_kmers()), file=sys.stderr) abundance_lists = [] def __do_abundance_dist__(read_parser): abundances = counting_hash.abundance_distribution_with_reads_parser( read_parser, tracking) abundance_lists.append(abundances) print('preparing hist from %s...' % args.input_sequence_filename, file=sys.stderr) rparser = khmer.ReadParser(args.input_sequence_filename) threads = [] print('consuming input, round 2 --', args.input_sequence_filename, file=sys.stderr) for _ in range(args.threads): thread = \ threading.Thread( target=__do_abundance_dist__, args=(rparser, ) ) threads.append(thread) thread.start() for thread in threads: thread.join() assert len(abundance_lists) == args.threads, len(abundance_lists) abundance = {} for abundance_list in abundance_lists: for i, count in enumerate(abundance_list): abundance[i] = abundance.get(i, 0) + count total = sum(abundance.values()) if 0 == total: print( "ERROR: abundance distribution is uniformly zero; " "nothing to report.", file=sys.stderr) print("\tPlease verify that the input files are valid.", file=sys.stderr) sys.exit(1) sofar = 0 for _, i in sorted(abundance.items()): if i == 0 and not args.output_zero: continue sofar += i frac = sofar / float(total) if args.csv: hist_fp_csv.writerow([_, i, sofar, round(frac, 3)]) else: print(_, i, sofar, round(frac, 3), file=hist_fp) if sofar == total: break if args.savetable: print('Saving k-mer counting table ', args.savetable, file=sys.stderr) print('...saving to', args.savetable, file=sys.stderr) counting_hash.save(args.savetable) print('wrote to: ' + args.output_histogram_filename, file=sys.stderr)
def main(): info('find-knots.py', ['graph']) args = get_parser().parse_args() graphbase = args.graphbase # @RamRS: This might need some more work infiles = [graphbase + '.pt', graphbase + '.tagset'] if os.path.exists(graphbase + '.stoptags'): infiles.append(graphbase + '.stoptags') for _ in infiles: check_input_files(_, args.force) check_space(infiles, args.force) print('loading k-mer presence table %s.pt' % graphbase, file=sys.stderr) htable = khmer.load_hashbits(graphbase + '.pt') print('loading tagset %s.tagset...' % graphbase, file=sys.stderr) htable.load_tagset(graphbase + '.tagset') initial_stoptags = False # @CTB regularize with make-initial if os.path.exists(graphbase + '.stoptags'): print('loading stoptags %s.stoptags' % graphbase, file=sys.stderr) htable.load_stop_tags(graphbase + '.stoptags') initial_stoptags = True pmap_files = glob.glob(args.graphbase + '.subset.*.pmap') print('loading %d pmap files (first one: %s)' % (len(pmap_files), pmap_files[0]), file=sys.stderr) print('---', file=sys.stderr) print('output stoptags will be in', graphbase + '.stoptags', file=sys.stderr) if initial_stoptags: print( '(these output stoptags will include the already-loaded set)', file=sys.stderr) print('---', file=sys.stderr) # create counting hash ksize = htable.ksize() counting = khmer_args.create_countgraph(args, ksize=ksize) # load & merge for index, subset_file in enumerate(pmap_files): print('<-', subset_file, file=sys.stderr) subset = htable.load_subset_partitionmap(subset_file) print('** repartitioning subset... %s' % subset_file, file=sys.stderr) htable.repartition_largest_partition(subset, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD, EXCURSION_KMER_COUNT_THRESHOLD) print('** merging subset... %s' % subset_file, file=sys.stderr) htable.merge_subset(subset) print('** repartitioning, round 2... %s' % subset_file, file=sys.stderr) size = htable.repartition_largest_partition( None, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD, EXCURSION_KMER_COUNT_THRESHOLD) print('** repartitioned size:', size, file=sys.stderr) print('saving stoptags binary', file=sys.stderr) htable.save_stop_tags(graphbase + '.stoptags') os.rename(subset_file, subset_file + '.processed') print('(%d of %d)\n' % (index, len(pmap_files)), file=sys.stderr) print('done!', file=sys.stderr)
def main(): info('correct-reads.py', ['streaming']) args = sanitize_help(get_parser()).parse_args() ### if len(set(args.input_filenames)) != len(args.input_filenames): print("Error: Cannot input the same filename multiple times.", file=sys.stderr) sys.exit(1) ### report_on_config(args) check_valid_file_exists(args.input_filenames) check_space(args.input_filenames, args.force) tablesize = calculate_graphsize(args, 'countgraph') if args.savegraph: check_space_for_graph(args.savegraph, tablesize, args.force) K = args.ksize CUTOFF = args.cutoff NORMALIZE_LIMIT = args.normalize_to if args.loadgraph: print('loading k-mer countgraph from', args.loadgraph, file=sys.stderr) ct = Countgraph.load(args.loadgraph) else: print('making k-mer countgraph', file=sys.stderr) ct = create_countgraph(args, multiplier=8 / (9. + 0.3)) tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir) print('created temporary directory %s; use -T to change location' % tempdir, file=sys.stderr) aligner = khmer.ReadAligner(ct, args.cutoff, args.bits_theta) # ### FIRST PASS ### save_pass2_total = 0 n_bp = 0 n_reads = 0 written_bp = 0 written_reads = 0 corrected_reads = 0 pass2list = [] for filename in args.input_filenames: pass2filename = os.path.basename(filename) + '.pass2' pass2filename = os.path.join(tempdir, pass2filename) if args.out is None: corrfp = open(os.path.basename(filename) + '.corr', 'w') else: corrfp = args.out pass2list.append((filename, pass2filename, corrfp)) screed_iter = screed.open(filename, parse_description=False) pass2fp = open(pass2filename, 'w') save_pass2 = 0 n = 0 paired_iter = broken_paired_reader(screed_iter, min_length=K, force_single=args.ignore_pairs) for n, is_pair, read1, read2 in paired_iter: if n % 10000 == 0: print('...', n, filename, save_pass2, n_reads, n_bp, written_reads, written_bp, file=sys.stderr) # we want to track paired reads here, to make sure that pairs # are not split between first pass and second pass. if is_pair: n_reads += 2 n_bp += len(read1.sequence) + len(read2.sequence) seq1 = read1.sequence.replace('N', 'A') seq2 = read2.sequence.replace('N', 'A') med1, _, _ = ct.get_median_count(seq1) med2, _, _ = ct.get_median_count(seq2) if med1 < NORMALIZE_LIMIT or med2 < NORMALIZE_LIMIT: ct.consume(seq1) ct.consume(seq2) write_record_pair(read1, read2, pass2fp) save_pass2 += 2 else: is_aligned, new_seq1 = correct_sequence(aligner, seq1) if is_aligned: if new_seq1 != read1.sequence: corrected_reads += 1 read1.sequence = new_seq1 if hasattr(read1, 'quality'): fix_quality(read1) is_aligned, new_seq2 = correct_sequence(aligner, seq2) if is_aligned: if new_seq2 != read2.sequence: corrected_reads += 1 read2.sequence = new_seq2 if hasattr(read2, 'quality'): fix_quality(read2) write_record_pair(read1, read2, corrfp) written_reads += 2 written_bp += len(read1) written_bp += len(read2) else: n_reads += 1 n_bp += len(read1.sequence) seq = read1.sequence.replace('N', 'A') med, _, _ = ct.get_median_count(seq) # has this portion of the graph saturated? if not, # consume & save => pass2. if med < NORMALIZE_LIMIT: ct.consume(seq) write_record(read1, pass2fp) save_pass2 += 1 else: # trim!! 
is_aligned, new_seq = correct_sequence(aligner, seq) if is_aligned: if new_seq != read1.sequence: corrected_reads += 1 read1.sequence = new_seq if hasattr(read1, 'quality'): fix_quality(read1) write_record(read1, corrfp) written_reads += 1 written_bp += len(new_seq) pass2fp.close() print('%s: kept aside %d of %d from first pass, in %s' % (filename, save_pass2, n, filename), file=sys.stderr) save_pass2_total += save_pass2 # ### SECOND PASS. ### skipped_n = 0 skipped_bp = 0 for _, pass2filename, corrfp in pass2list: print(('second pass: looking at sequences kept aside in %s') % pass2filename, file=sys.stderr) # note that for this second pass, we don't care about paired # reads - they will be output in the same order they're read in, # so pairs will stay together if not orphaned. This is in contrast # to the first loop. for n, read in enumerate( screed.open(pass2filename, parse_description=False)): if n % 10000 == 0: print('... x 2', n, pass2filename, written_reads, written_bp, file=sys.stderr) seq = read.sequence.replace('N', 'A') med, _, _ = ct.get_median_count(seq) # do we retain low-abundance components unchanged? if med < NORMALIZE_LIMIT and args.variable_coverage: write_record(read, corrfp) written_reads += 1 written_bp += len(read.sequence) skipped_n += 1 skipped_bp += len(read.sequence) # otherwise, examine/correct. else: # med >= NORMALIZE LIMIT or not args.variable_coverage is_aligned, new_seq = correct_sequence(aligner, seq) if is_aligned: if new_seq != read.sequence: corrected_reads += 1 read.sequence = new_seq if hasattr(read, 'quality'): fix_quality(read) write_record(read, corrfp) written_reads += 1 written_bp += len(new_seq) print('removing %s' % pass2filename, file=sys.stderr) os.unlink(pass2filename) print('removing temp directory & contents (%s)' % tempdir, file=sys.stderr) shutil.rmtree(tempdir) n_passes = 1.0 + (float(save_pass2_total) / n_reads) percent_reads_corrected = float(corrected_reads + (n_reads - written_reads)) /\ n_reads * 100.0 print('read %d reads, %d bp' % ( n_reads, n_bp, ), file=sys.stderr) print('wrote %d reads, %d bp' % ( written_reads, written_bp, ), file=sys.stderr) print('looked at %d reads twice (%.2f passes)' % (save_pass2_total, n_passes), file=sys.stderr) print('removed %d reads and corrected %d reads (%.2f%%)' % (n_reads - written_reads, corrected_reads, percent_reads_corrected), file=sys.stderr) print('removed %.2f%% of bases (%d total)' % ((1 - (written_bp / float(n_bp))) * 100.0, n_bp - written_bp), file=sys.stderr) if args.variable_coverage: percent_reads_hicov = 100.0 * float(n_reads - skipped_n) / n_reads print('%d reads were high coverage (%.2f%%);' % (n_reads - skipped_n, percent_reads_hicov), file=sys.stderr) print(('skipped %d reads/%d bases because of low coverage') % (skipped_n, skipped_bp), file=sys.stderr) fp_rate = \ khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8) # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975 print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate), file=sys.stderr) print('output in *.corr', file=sys.stderr) if args.savegraph: print("Saving k-mer countgraph to", args.savegraph, file=sys.stderr) ct.save(args.savegraph)
def main(): # pylint: disable=too-many-locals,too-many-branches args = sanitize_help(get_parser()).parse_args() graph_type = 'smallcountgraph' if args.small_count else 'countgraph' configure_logging(args.quiet) report_on_config(args, graph_type) check_input_files(args.input_sequence_filename, args.force) if args.savegraph is not None: graphsize = calculate_graphsize(args, graph_type) check_space_for_graph(args.savegraph, graphsize, args.force) if (not args.squash_output and os.path.exists(args.output_histogram_filename)): log_error('ERROR: {output} exists; not squashing.', output=args.output_histogram_filename) sys.exit(1) else: hist_fp = open(args.output_histogram_filename, 'w') hist_fp_csv = csv.writer(hist_fp) # write headers: hist_fp_csv.writerow(['abundance', 'count', 'cumulative', 'cumulative_fraction']) log_info('making countgraph') # In case the user specified a maximum memory usage, use 8/(9+eps) of that # for the countgraph and 1/(9+eps) for the tracking nodegraph # `eps` is used to account for the memory used by the python interpreter countgraph = khmer_args.create_countgraph(args, multiplier=8 / (9. + 0.3)) log_info('building k-mer tracking graph') tracking = khmer_args.create_matching_nodegraph(countgraph) log_info('kmer_size: {ksize}', ksize=countgraph.ksize()) log_info('k-mer countgraph sizes: {sizes}', sizes=countgraph.hashsizes()) log_info('outputting to {output}', output=args.output_histogram_filename) # start loading rparser = khmer.ReadParser(args.input_sequence_filename) threads = [] log_info('consuming input, round 1 -- {input}', input=args.input_sequence_filename) for _ in range(args.threads): thread = \ threading.Thread( target=countgraph.consume_seqfile, args=(rparser, ) ) threads.append(thread) thread.start() for thread in threads: thread.join() log_info('Total number of unique k-mers: {nk}', nk=countgraph.n_unique_kmers()) abundance_lists = [] def __do_abundance_dist__(read_parser): abundances = countgraph.abundance_distribution( read_parser, tracking) abundance_lists.append(abundances) log_info('preparing hist from {seqfile}...', seqfile=args.input_sequence_filename) rparser = khmer.ReadParser(args.input_sequence_filename) threads = [] log_info('consuming input, round 2 -- {filename}', filename=args.input_sequence_filename) for _ in range(args.threads): thread = \ threading.Thread( target=__do_abundance_dist__, args=(rparser, ) ) threads.append(thread) thread.start() for thread in threads: thread.join() assert len(abundance_lists) == args.threads, len(abundance_lists) abundance = {} for abundance_list in abundance_lists: for i, count in enumerate(abundance_list): abundance[i] = abundance.get(i, 0) + count total = sum(abundance.values()) if 0 == total: log_error("ERROR: abundance distribution is uniformly zero; " "nothing to report.") log_error("\tPlease verify that the input files are valid.") sys.exit(1) sofar = 0 for _, i in sorted(abundance.items()): if i == 0 and not args.output_zero: continue sofar += i frac = sofar / float(total) hist_fp_csv.writerow([_, i, sofar, round(frac, 3)]) if sofar == total: break if args.savegraph is not None: log_info('Saving k-mer countgraph to {savegraph}', savegraph=args.savegraph) countgraph.save(args.savegraph) log_info('wrote to: {output}', output=args.output_histogram_filename)
def main(): # pylint: disable=too-many-locals,too-many-branches info('abundance-dist-single.py', ['counting', 'SeqAn']) args = sanitize_help(get_parser()).parse_args() report_on_config(args) check_input_files(args.input_sequence_filename, args.force) if args.savegraph: graphsize = calculate_graphsize(args, 'countgraph') check_space_for_graph(args.savegraph, graphsize, args.force) if (not args.squash_output and os.path.exists(args.output_histogram_filename)): print('ERROR: %s exists; not squashing.' % args.output_histogram_filename, file=sys.stderr) sys.exit(1) else: hist_fp = open(args.output_histogram_filename, 'w') hist_fp_csv = csv.writer(hist_fp) # write headers: hist_fp_csv.writerow(['abundance', 'count', 'cumulative', 'cumulative_fraction']) print('making countgraph', file=sys.stderr) countgraph = khmer_args.create_countgraph(args, multiplier=1.1) countgraph.set_use_bigcount(args.bigcount) print('building k-mer tracking graph', file=sys.stderr) tracking = khmer_args.create_nodegraph(args, multiplier=1.1) print('kmer_size:', countgraph.ksize(), file=sys.stderr) print('k-mer countgraph sizes:', countgraph.hashsizes(), file=sys.stderr) print('outputting to', args.output_histogram_filename, file=sys.stderr) # start loading rparser = khmer.ReadParser(args.input_sequence_filename) threads = [] print('consuming input, round 1 --', args.input_sequence_filename, file=sys.stderr) for _ in range(args.threads): thread = \ threading.Thread( target=countgraph.consume_fasta_with_reads_parser, args=(rparser, ) ) threads.append(thread) thread.start() for thread in threads: thread.join() print('Total number of unique k-mers: {0}'.format( countgraph.n_unique_kmers()), file=sys.stderr) abundance_lists = [] def __do_abundance_dist__(read_parser): abundances = countgraph.abundance_distribution_with_reads_parser( read_parser, tracking) abundance_lists.append(abundances) print('preparing hist from %s...' % args.input_sequence_filename, file=sys.stderr) rparser = khmer.ReadParser(args.input_sequence_filename) threads = [] print('consuming input, round 2 --', args.input_sequence_filename, file=sys.stderr) for _ in range(args.threads): thread = \ threading.Thread( target=__do_abundance_dist__, args=(rparser, ) ) threads.append(thread) thread.start() for thread in threads: thread.join() assert len(abundance_lists) == args.threads, len(abundance_lists) abundance = {} for abundance_list in abundance_lists: for i, count in enumerate(abundance_list): abundance[i] = abundance.get(i, 0) + count total = sum(abundance.values()) if 0 == total: print("ERROR: abundance distribution is uniformly zero; " "nothing to report.", file=sys.stderr) print( "\tPlease verify that the input files are valid.", file=sys.stderr) sys.exit(1) sofar = 0 for _, i in sorted(abundance.items()): if i == 0 and not args.output_zero: continue sofar += i frac = sofar / float(total) hist_fp_csv.writerow([_, i, sofar, round(frac, 3)]) if sofar == total: break if args.savegraph: print('Saving k-mer countgraph ', args.savegraph, file=sys.stderr) print('...saving to', args.savegraph, file=sys.stderr) countgraph.save(args.savegraph) print('wrote to: ' + args.output_histogram_filename, file=sys.stderr)
def main(): info('collect-reads.py', ['counting']) args = get_parser().parse_args() report_on_config(args) base = args.output_countgraph_filename filenames = args.input_sequence_filename for name in args.input_sequence_filename: check_input_files(name, False) check_space(args.input_sequence_filename, False) tablesize = calculate_graphsize(args, 'countgraph') check_space_for_graph(args.output_countgraph_filename, tablesize, False) print('Saving k-mer countgraph to %s' % base) print('Loading sequences from %s' % repr(filenames)) if args.output: print('Outputting sequences to', args.output) print('making countgraph', file=sys.stderr) htable = khmer_args.create_countgraph(args) htable.set_use_bigcount(args.bigcount) total_coverage = 0. n = 0 for index, filename in enumerate(filenames): for record in screed.open(filename): seq = record.sequence.upper() if 'N' in seq: seq = seq.replace('N', 'A') try: med, _, _ = htable.get_median_count(seq) except ValueError: continue total_coverage += med n += 1 if total_coverage / float(n) > args.coverage: print('reached target average coverage:', \ total_coverage / float(n)) break htable.consume(seq) if args.output: args.output.write(output_single(record)) if n % 100000 == 0: print('...', index, filename, n, total_coverage / float(n)) if total_coverage / float(n) > args.coverage: break print('Collected %d reads' % (n,)) if args.report_total_kmers: print('Total number of k-mers: {0}'.format( htable.n_occupied()), file=sys.stderr) print('saving', base) htable.save(base) info_fp = open(base + '.info', 'w') info_fp.write('through end: %s\n' % filenames[-1]) # Change 0.2 only if you really grok it. HINT: You don't. fp_rate = khmer.calc_expected_collisions(htable, False, max_false_pos=.2) print('fp rate estimated to be %1.3f' % fp_rate) print('fp rate estimated to be %1.3f' % fp_rate, file=info_fp) print('DONE.')
def main(): # pylint: disable=too-many-branches,too-many-statements info('normalize-by-median.py', ['diginorm']) args = get_parser().parse_args() report_on_config(args) report_fp = args.report force_single = args.force_single # check for similar filenames # if we're using a single output file only check for identical filenames # otherwise, check for identical BASE names as well. filenames = [] basenames = [] for pathfilename in args.input_filenames: filenames.append(pathfilename) if args.single_output_file: continue # nothing more to worry about basename = os.path.basename(pathfilename) if basename in basenames: print('ERROR: Duplicate filename--Cannot handle this!', file=sys.stderr) print('** Exiting!', file=sys.stderr) sys.exit(1) basenames.append(basename) # check that files exist and there is sufficient output disk space. check_valid_file_exists(args.input_filenames) check_space(args.input_filenames, args.force) if args.savetable: check_space_for_hashtable(args, 'countgraph', args.force) # load or create counting table. if args.loadtable: print('loading k-mer counting table from ' + args.loadtable, file=sys.stderr) htable = khmer.load_counting_hash(args.loadtable) else: print('making countgraph', file=sys.stderr) htable = khmer_args.create_countgraph(args) input_filename = None # create an object to handle diginorm of all files norm = Normalizer(args.cutoff, htable) # make a list of all filenames and if they're paired or not; # if we don't know if they're paired, default to allowing but not # forcing pairing. files = [] for e in filenames: files.append([e, args.paired]) if args.unpaired_reads: files.append([args.unpaired_reads, False]) corrupt_files = [] outfp = None output_name = None if args.single_output_file: if args.single_output_file is sys.stdout: output_name = '/dev/stdout' else: output_name = args.single_output_file.name outfp = args.single_output_file # # main loop: iterate over all files given, do diginorm. # for filename, require_paired in files: if not args.single_output_file: output_name = os.path.basename(filename) + '.keep' outfp = open(output_name, 'w') # failsafe context manager in case an input file breaks with CatchIOErrors(filename, outfp, args.single_output_file, args.force, corrupt_files): screed_iter = screed.open(filename, parse_description=False) reader = broken_paired_reader(screed_iter, min_length=args.ksize, force_single=force_single, require_paired=require_paired) # actually do diginorm for record in WithDiagnostics(filename, norm, reader, report_fp): if record is not None: write_record(record, outfp) print('output in ' + output_name, file=sys.stderr) if output_name is not '/dev/stdout': outfp.close() # finished - print out some diagnostics. print('Total number of unique k-mers: {0}' .format(htable.n_unique_kmers()), file=sys.stderr) if args.savetable: print('...saving to ' + args.savetable, file=sys.stderr) htable.save(args.savetable) fp_rate = \ khmer.calc_expected_collisions(htable, args.force, max_false_pos=.8) # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975 print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate), file=sys.stderr) if args.force and len(corrupt_files) > 0: print("** WARNING: Finished with errors!", file=sys.stderr) print("** I/O Errors occurred in the following files:", file=sys.stderr) print("\t", " ".join(corrupt_files), file=sys.stderr)
def main(): info('trim-low-abund.py', ['streaming']) parser = sanitize_help(get_parser()) args = parser.parse_args() ### if len(set(args.input_filenames)) != len(args.input_filenames): print("Error: Cannot input the same filename multiple times.", file=sys.stderr) sys.exit(1) ### report_on_config(args) check_valid_file_exists(args.input_filenames) check_space(args.input_filenames, args.force) if args.savegraph: graphsize = calculate_graphsize(args, 'countgraph') check_space_for_graph(args.savegraph, graphsize, args.force) if ('-' in args.input_filenames or '/dev/stdin' in args.input_filenames) \ and not args.output: print("Accepting input from stdin; output filename must " "be provided with -o.", file=sys.stderr) sys.exit(1) if args.loadgraph: print('loading countgraph from', args.loadgraph, file=sys.stderr) ct = khmer.load_countgraph(args.loadgraph) else: print('making countgraph', file=sys.stderr) ct = khmer_args.create_countgraph(args) K = ct.ksize() CUTOFF = args.cutoff NORMALIZE_LIMIT = args.normalize_to tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir) print('created temporary directory %s; ' 'use -T to change location' % tempdir, file=sys.stderr) # ### FIRST PASS ### save_pass2_total = 0 n_bp = 0 n_reads = 0 written_bp = 0 written_reads = 0 trimmed_reads = 0 pass2list = [] for filename in args.input_filenames: pass2filename = os.path.basename(filename) + '.pass2' pass2filename = os.path.join(tempdir, pass2filename) if args.output is None: trimfp = get_file_writer(open(os.path.basename(filename) + '.abundtrim', 'wb'), args.gzip, args.bzip) else: trimfp = get_file_writer(args.output, args.gzip, args.bzip) pass2list.append((filename, pass2filename, trimfp)) screed_iter = screed.open(filename) pass2fp = open(pass2filename, 'w') save_pass2 = 0 n = 0 paired_iter = broken_paired_reader(screed_iter, min_length=K, force_single=args.ignore_pairs) for n, is_pair, read1, read2 in paired_iter: if n % 10000 == 0: print('...', n, filename, save_pass2, n_reads, n_bp, written_reads, written_bp, file=sys.stderr) # we want to track paired reads here, to make sure that pairs # are not split between first pass and second pass. if is_pair: n_reads += 2 n_bp += len(read1.sequence) + len(read2.sequence) seq1 = read1.sequence.replace('N', 'A') seq2 = read2.sequence.replace('N', 'A') med1, _, _ = ct.get_median_count(seq1) med2, _, _ = ct.get_median_count(seq2) if med1 < NORMALIZE_LIMIT or med2 < NORMALIZE_LIMIT: ct.consume(seq1) ct.consume(seq2) write_record_pair(read1, read2, pass2fp) save_pass2 += 2 else: _, trim_at1 = ct.trim_on_abundance(seq1, CUTOFF) _, trim_at2 = ct.trim_on_abundance(seq2, CUTOFF) if trim_at1 >= K: read1 = trim_record(read1, trim_at1) if trim_at2 >= K: read2 = trim_record(read2, trim_at2) if trim_at1 != len(seq1): trimmed_reads += 1 if trim_at2 != len(seq2): trimmed_reads += 1 write_record_pair(read1, read2, trimfp) written_reads += 2 written_bp += trim_at1 + trim_at2 else: n_reads += 1 n_bp += len(read1.sequence) seq = read1.sequence.replace('N', 'A') med, _, _ = ct.get_median_count(seq) # has this portion of the graph saturated? if not, # consume & save => pass2. if med < NORMALIZE_LIMIT: ct.consume(seq) write_record(read1, pass2fp) save_pass2 += 1 else: # trim!! 
_, trim_at = ct.trim_on_abundance(seq, CUTOFF) if trim_at >= K: new_read = trim_record(read1, trim_at) write_record(new_read, trimfp) written_reads += 1 written_bp += trim_at if trim_at != len(read1.sequence): trimmed_reads += 1 pass2fp.close() print('%s: kept aside %d of %d from first pass, in %s' % (filename, save_pass2, n, filename), file=sys.stderr) save_pass2_total += save_pass2 # ### SECOND PASS. ### skipped_n = 0 skipped_bp = 0 for _, pass2filename, trimfp in pass2list: print('second pass: looking at sequences kept aside in %s' % pass2filename, file=sys.stderr) # note that for this second pass, we don't care about paired # reads - they will be output in the same order they're read in, # so pairs will stay together if not orphaned. This is in contrast # to the first loop. for n, read in enumerate(screed.open(pass2filename)): if n % 10000 == 0: print('... x 2', n, pass2filename, written_reads, written_bp, file=sys.stderr) seq = read.sequence.replace('N', 'A') med, _, _ = ct.get_median_count(seq) # do we retain low-abundance components unchanged? if med < NORMALIZE_LIMIT and args.variable_coverage: write_record(read, trimfp) written_reads += 1 written_bp += len(read.sequence) skipped_n += 1 skipped_bp += len(read.sequence) # otherwise, examine/trim/truncate. else: # med >= NORMALIZE LIMIT or not args.variable_coverage _, trim_at = ct.trim_on_abundance(seq, CUTOFF) if trim_at >= K: new_read = trim_record(read, trim_at) write_record(new_read, trimfp) written_reads += 1 written_bp += trim_at if trim_at != len(read.sequence): trimmed_reads += 1 print('removing %s' % pass2filename, file=sys.stderr) os.unlink(pass2filename) print('removing temp directory & contents (%s)' % tempdir, file=sys.stderr) shutil.rmtree(tempdir) n_passes = 1.0 + (float(save_pass2_total) / n_reads) percent_reads_trimmed = float(trimmed_reads + (n_reads - written_reads)) /\ n_reads * 100.0 print('read %d reads, %d bp' % (n_reads, n_bp,), file=sys.stderr) print('wrote %d reads, %d bp' % (written_reads, written_bp,), file=sys.stderr) print('looked at %d reads twice (%.2f passes)' % (save_pass2_total, n_passes), file=sys.stderr) print('removed %d reads and trimmed %d reads (%.2f%%)' % (n_reads - written_reads, trimmed_reads, percent_reads_trimmed), file=sys.stderr) print('trimmed or removed %.2f%% of bases (%d total)' % ((1 - (written_bp / float(n_bp))) * 100.0, n_bp - written_bp), file=sys.stderr) if args.variable_coverage: percent_reads_hicov = 100.0 * float(n_reads - skipped_n) / n_reads print('%d reads were high coverage (%.2f%%);' % (n_reads - skipped_n, percent_reads_hicov), file=sys.stderr) print('skipped %d reads/%d bases because of low coverage' % (skipped_n, skipped_bp), file=sys.stderr) fp_rate = \ khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8) # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975 print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate), file=sys.stderr) print('output in *.abundtrim', file=sys.stderr) if args.savegraph: print("Saving k-mer countgraph to", args.savegraph, file=sys.stderr) ct.save(args.savegraph)
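# Both passes above funnel into the same call, ct.trim_on_abundance(seq,
# CUTOFF). Judging from the bookkeeping around it (`trim_at >= K` to keep a
# read, `written_bp += trim_at`, `trim_at != len(seq)` meaning "trimmed"),
# trim_at is the length of the kept prefix, ending just before the first base
# that is covered only by low-count k-mers. A rough pure-Python equivalent
# over a plain dict of counts -- a simplified stand-in, not khmer's
# implementation:
def toy_trim_on_abundance(counts, seq, k, cutoff):
    """Return (kept_prefix, trim_at); trim_at == len(seq) means untouched."""
    for i in range(len(seq) - k + 1):
        if counts.get(seq[i:i + k], 0) < cutoff:
            trim_at = i + k - 1          # keep bases up to the last "good" one
            return seq[:trim_at], trim_at
    return seq, len(seq)

counts = {"ACGT": 5, "CGTA": 5, "GTAC": 1}
print(toy_trim_on_abundance(counts, "ACGTAC", 4, 2))   # -> ('ACGTA', 5)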
def main(): info('filter-abund-single.py', ['counting', 'SeqAn']) args = get_parser().parse_args() check_input_files(args.datafile, args.force) check_space([args.datafile], args.force) if args.savetable: check_space_for_hashtable(args, 'countgraph', args.force) report_on_config(args) print('making countgraph', file=sys.stderr) htable = khmer_args.create_countgraph(args) # first, load reads into hash table rparser = khmer.ReadParser(args.datafile) threads = [] print('consuming input, round 1 --', args.datafile, file=sys.stderr) for _ in range(args.threads): cur_thread = \ threading.Thread( target=htable.consume_fasta_with_reads_parser, args=(rparser, ) ) threads.append(cur_thread) cur_thread.start() for _ in threads: _.join() if args.report_total_kmers: print('Total number of unique k-mers: {0}'.format( htable.n_unique_kmers()), file=sys.stderr) fp_rate = khmer.calc_expected_collisions(htable, args.force) print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr) # now, trim. # the filtering function. def process_fn(record): name = record.name seq = record.sequence seqN = seq.replace('N', 'A') _, trim_at = htable.trim_on_abundance(seqN, args.cutoff) if trim_at >= args.ksize: # be sure to not to change the 'N's in the trimmed sequence - # so, return 'seq' and not 'seqN'. return name, seq[:trim_at] return None, None # the filtering loop print('filtering', args.datafile, file=sys.stderr) outfile = os.path.basename(args.datafile) + '.abundfilt' outfp = open(outfile, 'w') tsp = ThreadedSequenceProcessor(process_fn) tsp.start(verbose_loader(args.datafile), outfp) print('output in', outfile, file=sys.stderr) if args.savetable: print('Saving k-mer counting table filename', args.savetable, file=sys.stderr) print('...saving to', args.savetable, file=sys.stderr) htable.save(args.savetable) print('wrote to: ', outfile, file=sys.stderr)
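# One detail of process_fn above is easy to miss: the abundance lookup is done
# on a copy where every 'N' has been replaced by 'A' (so the k-mers are well
# defined), but the slice that gets written out comes from the original
# sequence, so the 'N's survive in the output. A tiny sketch of that pattern;
# trim_point() below is a hypothetical stand-in for the
# htable.trim_on_abundance(...) position, not a khmer call:
def trim_point(seq_no_n):
    # hypothetical stand-in for the abundance-based trim position
    return len(seq_no_n) - 2

def filter_one(name, seq, ksize):
    seq_no_n = seq.replace('N', 'A')     # counts are computed on this copy
    trim_at = trim_point(seq_no_n)
    if trim_at >= ksize:
        return name, seq[:trim_at]       # the original bases (incl. N) are written
    return None, None

print(filter_one('read1', 'ACGTNACGT', 4))   # -> ('read1', 'ACGTNAC')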
def main(): info('load-into-counting.py', ['counting', 'SeqAn']) args = sanitize_help(get_parser()).parse_args() report_on_config(args) base = args.output_countgraph_filename filenames = args.input_sequence_filename for name in args.input_sequence_filename: check_input_files(name, args.force) tablesize = calculate_graphsize(args, 'countgraph') check_space_for_graph(args.output_countgraph_filename, tablesize, args.force) check_file_writable(base) check_file_writable(base + ".info") print('Saving k-mer countgraph to %s' % base, file=sys.stderr) print('Loading kmers from sequences in %s' % repr(filenames), file=sys.stderr) # clobber the '.info' file now, as we always open in append mode below if os.path.exists(base + '.info'): os.remove(base + '.info') print('making countgraph', file=sys.stderr) countgraph = khmer_args.create_countgraph(args) countgraph.set_use_bigcount(args.bigcount) filename = None total_num_reads = 0 for index, filename in enumerate(filenames): rparser = khmer.ReadParser(filename) threads = [] print('consuming input', filename, file=sys.stderr) for _ in range(args.threads): cur_thrd = \ threading.Thread( target=countgraph.consume_fasta_with_reads_parser, args=(rparser, ) ) threads.append(cur_thrd) cur_thrd.start() for thread in threads: thread.join() if index > 0 and index % 10 == 0: tablesize = calculate_graphsize(args, 'countgraph') check_space_for_graph(base, tablesize, args.force) print('mid-save', base, file=sys.stderr) countgraph.save(base) with open(base + '.info', 'a') as info_fh: print('through', filename, file=info_fh) total_num_reads += rparser.num_reads n_kmers = countgraph.n_unique_kmers() print('Total number of unique k-mers:', n_kmers, file=sys.stderr) with open(base + '.info', 'a') as info_fp: print('Total number of unique k-mers:', n_kmers, file=info_fp) print('saving', base, file=sys.stderr) countgraph.save(base) # Change max_false_pos=0.2 only if you really grok it. HINT: You don't fp_rate = \ khmer.calc_expected_collisions( countgraph, args.force, max_false_pos=.2) with open(base + '.info', 'a') as info_fp: print('fp rate estimated to be %1.3f\n' % fp_rate, file=info_fp) if args.summary_info: mr_fmt = args.summary_info.lower() mr_file = base + '.info.' + mr_fmt print("Writing summmary info to", mr_file, file=sys.stderr) with open(mr_file, 'w') as mr_fh: if mr_fmt == 'json': mr_data = { "ht_name": os.path.basename(base), "fpr": fp_rate, "num_kmers": n_kmers, "files": filenames, "mrinfo_version": "0.2.0", "num_reads": total_num_reads, } json.dump(mr_data, mr_fh) mr_fh.write('\n') elif mr_fmt == 'tsv': mr_fh.write("ht_name\tfpr\tnum_kmers\tnum_reads\tfiles\n") vals = [ os.path.basename(base), "{:1.3f}".format(fp_rate), str(n_kmers), str(total_num_reads), ";".join(filenames), ] mr_fh.write("\t".join(vals) + "\n") print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr) print('DONE.', file=sys.stderr) print('wrote to:', base + '.info', file=sys.stderr)
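# The loading loop above starts several threads that all consume from one
# shared ReadParser and update one shared countgraph; khmer does the sharing
# and the updates in thread-safe C++. The sketch below shows only the shape of
# that producer/worker pattern in pure Python -- a shared queue of sequences,
# worker threads counting k-mers, and a lock around the shared Counter -- and
# is not how khmer itself is implemented:
import threading
import queue
from collections import Counter

def consume_worker(work_q, counts, lock, k):
    """Pull sequences off a shared queue and count their k-mers."""
    while True:
        seq = work_q.get()
        if seq is None:            # sentinel: no more work
            work_q.task_done()
            return
        local = Counter(seq[i:i + k] for i in range(len(seq) - k + 1))
        with lock:                 # khmer avoids this lock in C++
            counts.update(local)
        work_q.task_done()

counts, lock, work_q = Counter(), threading.Lock(), queue.Queue()
threads = [threading.Thread(target=consume_worker,
                            args=(work_q, counts, lock, 4)) for _ in range(4)]
for t in threads:
    t.start()
for seq in ("ACGTACGT", "TTTTACGT"):
    work_q.put(seq)
for _ in threads:
    work_q.put(None)               # one sentinel per worker
for t in threads:
    t.join()
print(counts.most_common(3))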
def main(): parser = sanitize_help(get_parser()) args = parser.parse_args() if not args.quiet: info('trim-low-abund.py', ['streaming']) configure_logging(args.quiet) ### if len(set(args.input_filenames)) != len(args.input_filenames): log_error("Error: Cannot input the same filename multiple times.") sys.exit(1) if args.trim_at_coverage != DEFAULT_TRIM_AT_COVERAGE and \ not args.variable_coverage: log_error("Error: --trim-at-coverage/-Z given, but " "--variable-coverage/-V not specified.") sys.exit(1) if args.diginorm_coverage != DEFAULT_DIGINORM_COVERAGE and \ not args.diginorm: log_error("Error: --diginorm-coverage given, but " "--diginorm not specified.") sys.exit(1) if args.diginorm and args.single_pass: log_error("Error: --diginorm and --single-pass are incompatible!\n" "You probably want to use normalize-by-median.py instead.") sys.exit(1) ### report_on_config(args) check_valid_file_exists(args.input_filenames) check_space(args.input_filenames, args.force) if args.savegraph: graphsize = calculate_graphsize(args, 'countgraph') check_space_for_graph(args.savegraph, graphsize, args.force) if ('-' in args.input_filenames or '/dev/stdin' in args.input_filenames) \ and not args.output: log_error("Accepting input from stdin; output filename must " "be provided with -o.") sys.exit(1) if args.loadgraph: log_info('loading countgraph from {graph}', graph=args.loadgraph) ct = khmer.load_countgraph(args.loadgraph) else: log_info('making countgraph') ct = khmer_args.create_countgraph(args) K = ct.ksize() tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir) log_info('created temporary directory {temp};\n' 'use -T to change location', temp=tempdir) trimmer = Trimmer(ct, not args.variable_coverage, args.cutoff, args.trim_at_coverage) if args.diginorm: trimmer.set_diginorm(args.diginorm_coverage) # ### FIRST PASS ### save_pass2_total = 0 written_bp = 0 written_reads = 0 # only create the file writer once if outfp is specified; otherwise, # create it for each file. if args.output: trimfp = get_file_writer(args.output, args.gzip, args.bzip) pass2list = [] for filename in args.input_filenames: # figure out temporary filename for 2nd pass pass2filename = os.path.basename(filename) + '.pass2' pass2filename = os.path.join(tempdir, pass2filename) pass2fp = open(pass2filename, 'w') # construct output filenames if args.output is None: # note: this will be saved in trimfp. outfp = open(os.path.basename(filename) + '.abundtrim', 'wb') # get file handle w/gzip, bzip trimfp = get_file_writer(outfp, args.gzip, args.bzip) # record all this info pass2list.append((filename, pass2filename, trimfp)) # input file stuff: get a broken_paired reader. screed_iter = screed.open(filename) paired_iter = broken_paired_reader(screed_iter, min_length=K, force_single=args.ignore_pairs) # main loop through the file. n_start = trimmer.n_reads save_start = trimmer.n_saved watermark = REPORT_EVERY_N_READS for read in trimmer.pass1(paired_iter, pass2fp): if (trimmer.n_reads - n_start) > watermark: log_info("... {filename} {n_saved} {n_reads} {n_bp} " "{w_reads} {w_bp}", filename=filename, n_saved=trimmer.n_saved, n_reads=trimmer.n_reads, n_bp=trimmer.n_bp, w_reads=written_reads, w_bp=written_bp) watermark += REPORT_EVERY_N_READS # write out the trimmed/etc sequences that AREN'T going to be # revisited in a 2nd pass. 
write_record(read, trimfp) written_bp += len(read) written_reads += 1 pass2fp.close() log_info("{filename}: kept aside {kept} of {total} from first pass", filename=filename, kept=trimmer.n_saved - save_start, total=trimmer.n_reads - n_start) # first pass goes across all the data, so record relevant stats... n_reads = trimmer.n_reads n_bp = trimmer.n_bp n_skipped = trimmer.n_skipped bp_skipped = trimmer.bp_skipped save_pass2_total = trimmer.n_saved # ### SECOND PASS. ### # nothing should have been skipped yet! assert trimmer.n_skipped == 0 assert trimmer.bp_skipped == 0 if args.single_pass: pass2list = [] # go back through all the files again. for _, pass2filename, trimfp in pass2list: log_info('second pass: looking at sequences kept aside in {pass2}', pass2=pass2filename) # note that for this second pass, we don't care about paired # reads - they will be output in the same order they're read in, # so pairs will stay together if not orphaned. This is in contrast # to the first loop. Hence, force_single=True below. screed_iter = screed.open(pass2filename, parse_description=False) paired_iter = broken_paired_reader(screed_iter, min_length=K, force_single=True) watermark = REPORT_EVERY_N_READS for read in trimmer.pass2(paired_iter): if (trimmer.n_reads - n_start) > watermark: log_info('... x 2 {a} {b} {c} {d} {e} {f} {g}', a=trimmer.n_reads - n_start, b=pass2filename, c=trimmer.n_saved, d=trimmer.n_reads, e=trimmer.n_bp, f=written_reads, g=written_bp) watermark += REPORT_EVERY_N_READS write_record(read, trimfp) written_reads += 1 written_bp += len(read) log_info('removing {pass2}', pass2=pass2filename) os.unlink(pass2filename) # if we created our own trimfps, close 'em. if not args.output: trimfp.close() log_info('removing temp directory & contents ({temp})', temp=tempdir) shutil.rmtree(tempdir) trimmed_reads = trimmer.trimmed_reads n_passes = 1.0 + (float(save_pass2_total) / n_reads) percent_reads_trimmed = float(trimmed_reads + (n_reads - written_reads)) /\ n_reads * 100.0 log_info('read {read} reads, {bp} bp', read=n_reads, bp=n_bp) log_info('wrote {wr} reads, {wbp} bp', wr=written_reads, wbp=written_bp) log_info('looked at {st} reads twice ({np:.2f} passes)', st=save_pass2_total, np=n_passes) log_info('removed {r} reads and trimmed {t} reads ({p:.2f}%)', r=n_reads - written_reads, t=trimmed_reads, p=percent_reads_trimmed) log_info('trimmed or removed {p:.2f}%% of bases ({bp} total)', p=(1 - (written_bp / float(n_bp))) * 100.0, bp=n_bp - written_bp) if args.variable_coverage: percent_reads_hicov = 100.0 * float(n_reads - n_skipped) / n_reads log_info('{n} reads were high coverage ({p:.2f}%);', n=n_reads - n_skipped, p=percent_reads_hicov) log_info('skipped {r} reads/{bp} bases because of low coverage', r=n_skipped, bp=bp_skipped) fp_rate = \ khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8) # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975 log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate) log_info('output in *.abundtrim') if args.savegraph: log_info("Saving k-mer countgraph to {graph}", graph=args.savegraph) ct.save(args.savegraph)
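# Both trim-low-abund variants finish by reporting
# calc_expected_collisions(..., max_false_pos=.8). For a structure built from
# Z independent tables of m slots each, holding N distinct k-mers, the usual
# estimate (the kind of analysis in the Zhang et al. preprint cited in the
# comment above) is that a never-inserted k-mer looks present with probability
# roughly (1 - e^(-N/m))^Z, since it must collide in every table at once. The
# helper below computes that textbook estimate; it is an approximation of what
# the reported number means, not khmer's exact internal calculation:
import math

def expected_fp_rate(n_unique_kmers, table_size, n_tables):
    """Collision estimate for n_tables single-hash tables of table_size slots."""
    per_table = 1.0 - math.exp(-n_unique_kmers / float(table_size))
    return per_table ** n_tables

# e.g. 50M unique k-mers in 4 tables of 100M slots each
print('%1.3f' % expected_fp_rate(5e7, 1e8, 4))   # ~0.024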
def main(): info('load-into-counting.py', ['counting', 'SeqAn']) args = get_parser().parse_args() report_on_config(args) base = args.output_countingtable_filename filenames = args.input_sequence_filename for name in args.input_sequence_filename: check_input_files(name, args.force) check_space(args.input_sequence_filename, args.force) check_space_for_hashtable(args, 'countgraph', args.force) check_file_writable(base) check_file_writable(base + ".info") print('Saving k-mer counting table to %s' % base, file=sys.stderr) print('Loading kmers from sequences in %s' % repr(filenames), file=sys.stderr) # clobber the '.info' file now, as we always open in append mode below if os.path.exists(base + '.info'): os.remove(base + '.info') print('making countgraph', file=sys.stderr) htable = khmer_args.create_countgraph(args) htable.set_use_bigcount(args.bigcount) filename = None total_num_reads = 0 for index, filename in enumerate(filenames): rparser = khmer.ReadParser(filename) threads = [] print('consuming input', filename, file=sys.stderr) for _ in range(args.threads): cur_thrd = \ threading.Thread( target=htable.consume_fasta_with_reads_parser, args=(rparser, ) ) threads.append(cur_thrd) cur_thrd.start() for thread in threads: thread.join() if index > 0 and index % 10 == 0: check_space_for_hashtable(args, 'countgraph', args.force) print('mid-save', base, file=sys.stderr) htable.save(base) with open(base + '.info', 'a') as info_fh: print('through', filename, file=info_fh) total_num_reads += rparser.num_reads n_kmers = htable.n_unique_kmers() if args.report_total_kmers: print('Total number of unique k-mers:', n_kmers, file=sys.stderr) with open(base + '.info', 'a') as info_fp: print('Total number of unique k-mers:', n_kmers, file=info_fp) print('saving', base, file=sys.stderr) htable.save(base) # Change max_false_pos=0.2 only if you really grok it. HINT: You don't fp_rate = \ khmer.calc_expected_collisions(htable, args.force, max_false_pos=.2) with open(base + '.info', 'a') as info_fp: print('fp rate estimated to be %1.3f\n' % fp_rate, file=info_fp) if args.summary_info: mr_fmt = args.summary_info.lower() mr_file = base + '.info.' + mr_fmt print("Writing summmary info to", mr_file, file=sys.stderr) with open(mr_file, 'w') as mr_fh: if mr_fmt == 'json': mr_data = { "ht_name": os.path.basename(base), "fpr": fp_rate, "num_kmers": n_kmers, "files": filenames, "mrinfo_version": "0.2.0", "num_reads": total_num_reads, } json.dump(mr_data, mr_fh) mr_fh.write('\n') elif mr_fmt == 'tsv': mr_fh.write("ht_name\tfpr\tnum_kmers\tnum_reads\tfiles\n") vals = [ os.path.basename(base), "{:1.3f}".format(fp_rate), str(n_kmers), str(total_num_reads), ";".join(filenames), ] mr_fh.write("\t".join(vals) + "\n") print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr) print('DONE.', file=sys.stderr) print('wrote to:', base + '.info', file=sys.stderr)
def main(): p = build_counting_args(descr='Streaming assembly with tracking info') p.add_argument('fastq_files', nargs='+') p.add_argument('--prefix', default='transcriptome') args = p.parse_args() cg = create_countgraph(args) asm = khmer.JunctionCountAssembler(cg) tr_fn = '{0}.transcripts.fa'.format(args.prefix) orf_fn = '{0}.orfs.fa'.format(args.prefix) stats_fn = '{0}.stats.fa'.format(args.prefix) with open(tr_fn, 'w') as tr_fp,\ open(orf_fn, 'w') as orf_fp,\ open(stats_fn, 'w') as stats_fp: kept = 0 next_contig = 1 next_orf = 1 output = set() statswriter = csv.DictWriter( stats_fp, delimiter=',', fieldnames=['read_n', 'action', 'cov', 'n_junctions', 'contig_n']) for filename in args.fastq_files: for n, record in enumerate(screed.open(filename)): if n and n % 10000 == 0: print('...', n, file=sys.stderr) if len(record.sequence) < args.ksize: continue cov, _, _ = cg.get_median_count(record.sequence) if cov < 20: kept += 1 cg.consume(record.sequence) statswriter.writerow({ 'read_n': n, 'action': 'c', 'cov': cov, 'n_junctions': None, 'contig_n': None }) elif cov < 30: seq, pos = cg.trim_on_abundance(record.sequence, 3) if len(seq) < args.ksize: continue n_junctions = asm.consume(seq) statswriter.writerow({ 'read_n': n, 'action': 't', 'cov': cov, 'n_junctions': n_junctions, 'contig_n': None }) elif cov == 30: contigs = asm.assemble(record.sequence[:args.ksize]) for contig_n, contig in enumerate(contigs): statswriter.writerow({ 'read_n': n, 'action': 'a', 'cov': cov, 'n_junctions': None, 'contig_n': (next_contig, contig_n) }) tr_fp.write('>contig%d\n%s\n' % (next_contig, contig)) next_contig += 1 for t in translate(contig): for orf_n, o in enumerate(extract_orfs(t)): if hash(o) not in output: new = True output.add(hash(o)) orf_fp.write('>orf%d\n%s\n' % (next_orf, o)) next_orf += 1 else: new = False else: statswriter.writerow({ 'read_n': n, 'action': 's', 'cov': cov, 'n_junctions': None, 'contig_n': None })
def main(): info('find-knots.py', ['graph']) parser = get_parser() parser.epilog = parser.epilog.replace( ":doc:`partitioning-big-data`", "http://khmer.readthedocs.org/en/stable/user/" "partitioning-big-data.html") args = sanitize_help(parser).parse_args() graphbase = args.graphbase # @RamRS: This might need some more work infiles = [graphbase, graphbase + '.tagset'] if os.path.exists(graphbase + '.stoptags'): infiles.append(graphbase + '.stoptags') for _ in infiles: check_input_files(_, args.force) check_space(infiles, args.force) print('loading k-mer nodegraph %s' % graphbase, file=sys.stderr) graph = khmer.load_nodegraph(graphbase) print('loading tagset %s.tagset...' % graphbase, file=sys.stderr) graph.load_tagset(graphbase + '.tagset') initial_stoptags = False # @CTB regularize with make-initial if os.path.exists(graphbase + '.stoptags'): print('loading stoptags %s.stoptags' % graphbase, file=sys.stderr) graph.load_stop_tags(graphbase + '.stoptags') initial_stoptags = True pmap_files = glob.glob(args.graphbase + '.subset.*.pmap') print('loading %d pmap files (first one: %s)' % (len(pmap_files), pmap_files[0]), file=sys.stderr) print('---', file=sys.stderr) print('output stoptags will be in', graphbase + '.stoptags', file=sys.stderr) if initial_stoptags: print('(these output stoptags will include the already-loaded set)', file=sys.stderr) print('---', file=sys.stderr) # create countgraph ksize = graph.ksize() counting = khmer_args.create_countgraph(args, ksize=ksize) # load & merge for index, subset_file in enumerate(pmap_files): print('<-', subset_file, file=sys.stderr) subset = graph.load_subset_partitionmap(subset_file) print('** repartitioning subset... %s' % subset_file, file=sys.stderr) graph.repartition_largest_partition(subset, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD, EXCURSION_KMER_COUNT_THRESHOLD) print('** merging subset... %s' % subset_file, file=sys.stderr) graph.merge_subset(subset) print('** repartitioning, round 2... %s' % subset_file, file=sys.stderr) size = graph.repartition_largest_partition( None, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD, EXCURSION_KMER_COUNT_THRESHOLD) print('** repartitioned size:', size, file=sys.stderr) print('saving stoptags binary', file=sys.stderr) graph.save_stop_tags(graphbase + '.stoptags') os.rename(subset_file, subset_file + '.processed') print('(%d of %d)\n' % (index, len(pmap_files)), file=sys.stderr) print('done!', file=sys.stderr)
def main(): args = sanitize_help(get_parser()).parse_args() configure_logging(args.quiet) report_on_config(args) base = args.output_countgraph_filename filenames = args.input_sequence_filename for name in args.input_sequence_filename: check_input_files(name, args.force) tablesize = calculate_graphsize(args, 'countgraph') check_space_for_graph(args.output_countgraph_filename, tablesize, args.force) info_filename = base + ".info" check_file_writable(base) check_file_writable(info_filename) log_info('Saving k-mer countgraph to {base}', base=base) log_info('Loading kmers from sequences in {filenames}', filenames=repr(filenames)) # clobber the '.info' file now, as we always open in append mode below with open(info_filename, 'w') as info_fp: print('khmer version:', khmer.__version__, file=info_fp) log_info('making countgraph') countgraph = khmer_args.create_countgraph(args) filename = None total_num_reads = 0 for index, filename in enumerate(filenames): rparser = khmer.ReadParser(filename) threads = [] log_info('consuming input {input}', input=filename) for _ in range(args.threads): cur_thrd = \ threading.Thread( target=countgraph.consume_seqfile_with_reads_parser, args=(rparser, ) ) threads.append(cur_thrd) cur_thrd.start() for thread in threads: thread.join() if index > 0 and index % 10 == 0: tablesize = calculate_graphsize(args, 'countgraph') check_space_for_graph(base, tablesize, args.force) log_info('mid-save {base}', base=base) countgraph.save(base) with open(info_filename, 'a') as info_fh: print('through', filename, file=info_fh) total_num_reads += rparser.num_reads n_kmers = countgraph.n_unique_kmers() log_info('Total number of unique k-mers: {nk}', nk=n_kmers) with open(info_filename, 'a') as info_fp: print('Total number of unique k-mers:', n_kmers, file=info_fp) log_info('saving {base}', base=base) countgraph.save(base) # Change max_false_pos=0.2 only if you really grok it. HINT: You don't fp_rate = \ khmer.calc_expected_collisions( countgraph, args.force, max_false_pos=.2) with open(info_filename, 'a') as info_fp: print('fp rate estimated to be %1.3f\n' % fp_rate, file=info_fp) if args.summary_info: mr_fmt = args.summary_info.lower() mr_file = base + '.info.' + mr_fmt log_info("Writing summmary info to {mr_file}", mr_file=mr_file) with open(mr_file, 'w') as mr_fh: if mr_fmt == 'json': mr_data = { "ht_name": os.path.basename(base), "fpr": fp_rate, "num_kmers": n_kmers, "files": filenames, "mrinfo_version": "0.2.0", "num_reads": total_num_reads, } json.dump(mr_data, mr_fh) mr_fh.write('\n') elif mr_fmt == 'tsv': mr_fh.write("ht_name\tfpr\tnum_kmers\tnum_reads\tfiles\n") vals = [ os.path.basename(base), "{:1.3f}".format(fp_rate), str(n_kmers), str(total_num_reads), ";".join(filenames), ] mr_fh.write("\t".join(vals) + "\n") log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate) log_info('DONE.') log_info('wrote to: {filename}', filename=info_filename)
def main():
    args = sanitize_help(get_parser()).parse_args()
    configure_logging(args.quiet)

    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)

    if args.savegraph:
        tablesize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, tablesize, args.force)

    report_on_config(args)

    log_info('making countgraph')
    graph = khmer_args.create_countgraph(args)

    # first, load reads into graph
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    log_info('consuming input, round 1 -- {datafile}', datafile=args.datafile)
    for _ in range(args.threads):
        cur_thread = \
            threading.Thread(
                target=graph.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    log_info('Total number of unique k-mers: {nk}', nk=graph.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(graph, args.force)
    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    # the filtering loop
    log_info('filtering {datafile}', datafile=args.datafile)
    if args.outfile is None:
        outfile = os.path.basename(args.datafile) + '.abundfilt'
    else:
        outfile = args.outfile
    outfp = open(outfile, 'wb')
    outfp = get_file_writer(outfp, args.gzip, args.bzip)

    paired_iter = broken_paired_reader(ReadParser(args.datafile),
                                       min_length=graph.ksize(),
                                       force_single=True)

    for n, is_pair, read1, read2 in paired_iter:
        assert not is_pair
        assert read2 is None

        trimmed_record, _ = trim_record(graph, read1, args.cutoff,
                                        args.variable_coverage,
                                        args.normalize_to)
        if trimmed_record:
            write_record(trimmed_record, outfp)

    log_info('output in {outfile}', outfile=outfile)

    if args.savegraph:
        log_info('Saving k-mer countgraph to {graph}', graph=args.savegraph)
        graph.save(args.savegraph)
def main(): args = sanitize_help(get_parser()).parse_args() if not args.quiet: info('load-into-counting.py', ['counting', 'SeqAn']) configure_logging(args.quiet) report_on_config(args) base = args.output_countgraph_filename filenames = args.input_sequence_filename for name in args.input_sequence_filename: check_input_files(name, args.force) tablesize = calculate_graphsize(args, 'countgraph') check_space_for_graph(args.output_countgraph_filename, tablesize, args.force) info_filename = base + ".info" check_file_writable(base) check_file_writable(info_filename) log_info('Saving k-mer countgraph to {base}', base=base) log_info('Loading kmers from sequences in {filenames}', filenames=repr(filenames)) # clobber the '.info' file now, as we always open in append mode below with open(info_filename, 'w') as info_fp: print('khmer version:', khmer.__version__, file=info_fp) log_info('making countgraph') countgraph = khmer_args.create_countgraph(args) countgraph.set_use_bigcount(args.bigcount) filename = None total_num_reads = 0 for index, filename in enumerate(filenames): rparser = khmer.ReadParser(filename) threads = [] log_info('consuming input {input}', input=filename) for _ in range(args.threads): cur_thrd = \ threading.Thread( target=countgraph.consume_fasta_with_reads_parser, args=(rparser, ) ) threads.append(cur_thrd) cur_thrd.start() for thread in threads: thread.join() if index > 0 and index % 10 == 0: tablesize = calculate_graphsize(args, 'countgraph') check_space_for_graph(base, tablesize, args.force) log_info('mid-save {base}', base=base) countgraph.save(base) with open(info_filename, 'a') as info_fh: print('through', filename, file=info_fh) total_num_reads += rparser.num_reads n_kmers = countgraph.n_unique_kmers() log_info('Total number of unique k-mers: {nk}', nk=n_kmers) with open(info_filename, 'a') as info_fp: print('Total number of unique k-mers:', n_kmers, file=info_fp) log_info('saving {base}', base=base) countgraph.save(base) # Change max_false_pos=0.2 only if you really grok it. HINT: You don't fp_rate = \ khmer.calc_expected_collisions( countgraph, args.force, max_false_pos=.2) with open(info_filename, 'a') as info_fp: print('fp rate estimated to be %1.3f\n' % fp_rate, file=info_fp) if args.summary_info: mr_fmt = args.summary_info.lower() mr_file = base + '.info.' + mr_fmt log_info("Writing summmary info to {mr_file}", mr_file=mr_file) with open(mr_file, 'w') as mr_fh: if mr_fmt == 'json': mr_data = { "ht_name": os.path.basename(base), "fpr": fp_rate, "num_kmers": n_kmers, "files": filenames, "mrinfo_version": "0.2.0", "num_reads": total_num_reads, } json.dump(mr_data, mr_fh) mr_fh.write('\n') elif mr_fmt == 'tsv': mr_fh.write("ht_name\tfpr\tnum_kmers\tnum_reads\tfiles\n") vals = [ os.path.basename(base), "{:1.3f}".format(fp_rate), str(n_kmers), str(total_num_reads), ";".join(filenames), ] mr_fh.write("\t".join(vals) + "\n") log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate) log_info('DONE.') log_info('wrote to: {filename}', filename=info_filename)
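# Every one of these loaders calls calculate_graphsize(args, 'countgraph')
# before check_space_for_graph, so the disk check is driven by the same memory
# budget that sizes the tables. The arithmetic is simple if you take a
# countgraph to use roughly one byte per table slot and split the -M memory
# budget evenly across the tables -- that per-slot figure is my assumption for
# illustration; khmer's own accounting may differ:
def approx_tablesize(max_memory_bytes, n_tables, bytes_per_slot=1.0):
    """Rough per-table slot count for a memory budget (assumes ~1 byte/slot
    for a countgraph; illustration only, not khmer's calculation)."""
    return int(max_memory_bytes / (n_tables * bytes_per_slot))

print(approx_tablesize(16e9, 4))   # ~4e9 slots per table for a 16 GB budget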
def main(): args = sanitize_help(get_parser()).parse_args() if not args.quiet: info('filter-abund-single.py', ['counting', 'SeqAn']) configure_logging(args.quiet) check_input_files(args.datafile, args.force) check_space([args.datafile], args.force) if args.savegraph: tablesize = calculate_graphsize(args, 'countgraph') check_space_for_graph(args.savegraph, tablesize, args.force) report_on_config(args) log_info('making countgraph') graph = khmer_args.create_countgraph(args) # first, load reads into graph rparser = khmer.ReadParser(args.datafile) threads = [] log_info('consuming input, round 1 -- {datafile}', datafile=args.datafile) for _ in range(args.threads): cur_thread = \ threading.Thread( target=graph.consume_fasta_with_reads_parser, args=(rparser, ) ) threads.append(cur_thread) cur_thread.start() for _ in threads: _.join() log_info('Total number of unique k-mers: {nk}', nk=graph.n_unique_kmers()) fp_rate = khmer.calc_expected_collisions(graph, args.force) log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate) # now, trim. # the filtering function. def process_fn(record): name = record.name seq = record.sequence seqN = seq.replace('N', 'A') if args.variable_coverage: # only trim when sequence has high enough C med, _, _ = graph.get_median_count(seqN) if med < args.normalize_to: return name, seq _, trim_at = graph.trim_on_abundance(seqN, args.cutoff) if trim_at >= args.ksize: # be sure to not to change the 'N's in the trimmed sequence - # so, return 'seq' and not 'seqN'. return name, seq[:trim_at] return None, None # the filtering loop log_info('filtering {datafile}', datafile=args.datafile) if args.outfile is None: outfile = os.path.basename(args.datafile) + '.abundfilt' else: outfile = args.outfile outfp = open(outfile, 'wb') outfp = get_file_writer(outfp, args.gzip, args.bzip) tsp = ThreadedSequenceProcessor(process_fn, verbose=not args.quiet) tsp.start(verbose_loader(args.datafile), outfp) log_info('output in {outfile}', outfile=outfile) if args.savegraph: log_info('Saving k-mer countgraph filename {graph}', graph=args.savegraph) graph.save(args.savegraph)
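# The --variable-coverage branch above is the whole difference from the plain
# filter: a read whose median k-mer count is still below --normalize-to is
# passed through untouched, and only reads that have reached that coverage are
# trimmed. The decision reduces to one predicate (a sketch; median_count here
# stands in for graph.get_median_count(seq)[0]):
def should_trim(median_count, variable_coverage, normalize_to):
    """With -V, leave low-coverage reads alone; otherwise always trim."""
    if variable_coverage and median_count < normalize_to:
        return False      # not enough data yet to judge this region
    return True

print(should_trim(5, True, 20))    # -> False: kept as-is under -V
print(should_trim(5, False, 20))   # -> True: plain filtering always trims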
def main(): parser = get_parser() parser.epilog = parser.epilog.replace( ":doc:`partitioning-big-data`", "http://khmer.readthedocs.io/en/stable/user/" "partitioning-big-data.html" ) args = sanitize_help(parser).parse_args() graphbase = args.graphbase # @RamRS: This might need some more work infiles = [graphbase, graphbase + '.tagset'] if os.path.exists(graphbase + '.stoptags'): infiles.append(graphbase + '.stoptags') for _ in infiles: check_input_files(_, args.force) check_space(infiles, args.force) print('loading k-mer nodegraph %s' % graphbase, file=sys.stderr) graph = khmer.load_nodegraph(graphbase) print('loading tagset %s.tagset...' % graphbase, file=sys.stderr) graph.load_tagset(graphbase + '.tagset') initial_stoptags = False # @CTB regularize with make-initial if os.path.exists(graphbase + '.stoptags'): print('loading stoptags %s.stoptags' % graphbase, file=sys.stderr) graph.load_stop_tags(graphbase + '.stoptags') initial_stoptags = True pmap_files = glob.glob(args.graphbase + '.subset.*.pmap') print('loading %d pmap files (first one: %s)' % (len(pmap_files), pmap_files[0]), file=sys.stderr) print('---', file=sys.stderr) print('output stoptags will be in', graphbase + '.stoptags', file=sys.stderr) if initial_stoptags: print( '(these output stoptags will include the already-loaded set)', file=sys.stderr) print('---', file=sys.stderr) # create countgraph ksize = graph.ksize() counting = khmer_args.create_countgraph(args, ksize=ksize) # load & merge for index, subset_file in enumerate(pmap_files): print('<-', subset_file, file=sys.stderr) subset = graph.load_subset_partitionmap(subset_file) print('** repartitioning subset... %s' % subset_file, file=sys.stderr) graph.repartition_largest_partition(subset, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD, EXCURSION_KMER_COUNT_THRESHOLD) print('** merging subset... %s' % subset_file, file=sys.stderr) graph.merge_subset(subset) print('** repartitioning, round 2... %s' % subset_file, file=sys.stderr) size = graph.repartition_largest_partition( None, counting, EXCURSION_DISTANCE, EXCURSION_KMER_THRESHOLD, EXCURSION_KMER_COUNT_THRESHOLD) print('** repartitioned size:', size, file=sys.stderr) print('saving stoptags binary', file=sys.stderr) graph.save_stop_tags(graphbase + '.stoptags') os.rename(subset_file, subset_file + '.processed') print('(%d of %d)\n' % (index, len(pmap_files)), file=sys.stderr) print('done!', file=sys.stderr)
def main(): p = build_counting_args(descr='Streaming assembly with tracking info') p.add_argument('fastq_files', nargs='+') p.add_argument('--prefix', default='transcriptome') args = p.parse_args() cg = create_countgraph(args) asm = khmer.JunctionCountAssembler(cg) tr_fn = '{0}.transcripts.fa'.format(args.prefix) orf_fn = '{0}.orfs.fa'.format(args.prefix) stats_fn = '{0}.stats.fa'.format(args.prefix) with open(tr_fn, 'w') as tr_fp,\ open(orf_fn, 'w') as orf_fp,\ open(stats_fn, 'w') as stats_fp: kept = 0 next_contig = 1 next_orf = 1 output = set() statswriter = csv.DictWriter(stats_fp, delimiter=',', fieldnames=['read_n', 'action', 'cov', 'n_junctions', 'contig_n']) for filename in args.fastq_files: for n, record in enumerate(screed.open(filename)): if n and n % 10000 == 0: print('...', n, file=sys.stderr) if len(record.sequence) < args.ksize: continue cov, _, _ = cg.get_median_count(record.sequence) if cov < 20: kept += 1 cg.consume(record.sequence) statswriter.writerow({'read_n': n, 'action': 'c', 'cov': cov, 'n_junctions': None, 'contig_n': None}) elif cov < 30: seq, pos = cg.trim_on_abundance(record.sequence, 3) if len(seq) < args.ksize: continue n_junctions = asm.consume(seq) statswriter.writerow({'read_n': n, 'action': 't', 'cov': cov, 'n_junctions': n_junctions, 'contig_n': None}) elif cov == 30: contigs = asm.assemble(record.sequence[:args.ksize]) for contig_n, contig in enumerate(contigs): statswriter.writerow({'read_n': n, 'action': 'a', 'cov': cov, 'n_junctions': None, 'contig_n': (next_contig, contig_n)}) tr_fp.write('>contig%d\n%s\n' % (next_contig, contig)) next_contig += 1 for t in translate(contig): for orf_n, o in enumerate(extract_orfs(t)): if hash(o) not in output: new = True output.add(hash(o)) orf_fp.write('>orf%d\n%s\n' % (next_orf, o)) next_orf += 1 else: new = False else: statswriter.writerow({'read_n': n, 'action': 's', 'cov': cov, 'n_junctions': None, 'contig_n': None})
def main(): # pylint: disable=too-many-branches,too-many-statements parser = sanitize_help(get_parser()) args = parser.parse_args() configure_logging(args.quiet) report_on_config(args) report_fp = args.report force_single = args.force_single # check for similar filenames # if we're using a single output file only check for identical filenames # otherwise, check for identical BASE names as well. filenames = [] basenames = [] for pathfilename in args.input_filenames: filenames.append(pathfilename) if args.single_output_file: continue # nothing more to worry about basename = os.path.basename(pathfilename) if basename in basenames: log_error('ERROR: Duplicate filename--Cannot handle this!') log_error('** Exiting!') sys.exit(1) basenames.append(basename) # check that files exist and there is sufficient output disk space. check_valid_file_exists(args.input_filenames) check_space(args.input_filenames, args.force) if args.savegraph is not None: graphsize = calculate_graphsize(args, 'countgraph') check_space_for_graph(args.savegraph, graphsize, args.force) # load or create counting table. if args.loadgraph: log_info('loading k-mer countgraph from {graph}', graph=args.loadgraph) countgraph = khmer.load_countgraph(args.loadgraph) else: log_info('making countgraph') countgraph = khmer_args.create_countgraph(args) # create an object to handle diginorm of all files norm = Normalizer(args.cutoff, countgraph) with_diagnostics = WithDiagnostics(norm, report_fp, args.report_frequency) # make a list of all filenames and if they're paired or not; # if we don't know if they're paired, default to allowing but not # forcing pairing. files = [] for element in filenames: files.append([element, args.paired]) if args.unpaired_reads: files.append([args.unpaired_reads, False]) corrupt_files = [] outfp = None output_name = None if args.single_output_file: outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip) else: if '-' in filenames or '/dev/stdin' in filenames: print( "Accepting input from stdin; output filename must " "be provided with '-o'.", file=sys.stderr) sys.exit(1) # # main loop: iterate over all files given, do diginorm. # for filename, require_paired in files: if not args.single_output_file: output_name = os.path.basename(filename) + '.keep' outfp = open(output_name, 'wb') outfp = get_file_writer(outfp, args.gzip, args.bzip) # failsafe context manager in case an input file breaks with catch_io_errors(filename, outfp, args.single_output_file, args.force, corrupt_files): screed_iter = clean_input_reads(screed.open(filename)) reader = broken_paired_reader(screed_iter, min_length=args.ksize, force_single=force_single, require_paired=require_paired) # actually do diginorm for record in with_diagnostics(reader, filename): if record is not None: write_record(record, outfp) log_info('output in {name}', name=describe_file_handle(outfp)) if not args.single_output_file: outfp.close() # finished - print out some diagnostics. 
log_info('Total number of unique k-mers: {umers}', umers=countgraph.n_unique_kmers()) if args.savegraph is not None: log_info('...saving to {name}', name=args.savegraph) countgraph.save(args.savegraph) fp_rate = \ khmer.calc_expected_collisions(countgraph, False, max_false_pos=.8) # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975 log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate) if args.force and len(corrupt_files) > 0: log_error("** WARNING: Finished with errors!") log_error("** I/O Errors occurred in the following files:") log_error("\t" + " ".join(corrupt_files))
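# The diginorm loop above pulls reads through broken_paired_reader, which
# yields (n, is_pair, read1, read2): true pairs when two consecutive records
# belong together, singletons otherwise, or always singletons under
# force_single. The generator below mimics that shape, pairing only on a
# trailing '/1', '/2' name convention -- a simplification; the real reader
# understands other naming schemes and also honours min_length and
# require_paired:
def toy_broken_paired_reader(records, force_single=False):
    """Yield (n, is_pair, read1, read2) for consecutive 'x/1', 'x/2' reads."""
    n = 0
    prev = None
    for rec in records:
        if force_single:
            yield n, False, rec, None
            n += 1
            continue
        if prev is None:
            prev = rec
            continue
        if prev['name'].rsplit('/', 1)[0] == rec['name'].rsplit('/', 1)[0]:
            yield n, True, prev, rec
            n += 2
            prev = None
        else:
            yield n, False, prev, None
            n += 1
            prev = rec
    if prev is not None:
        yield n, False, prev, None

reads = [{'name': 'a/1'}, {'name': 'a/2'}, {'name': 'b/1'}]
for item in toy_broken_paired_reader(reads):
    print(item)
# (0, True, {'name': 'a/1'}, {'name': 'a/2'})
# (2, False, {'name': 'b/1'}, None)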
def main(): info("collect-reads.py", ["counting"]) args = get_parser().parse_args() report_on_config(args) base = args.output_countingtable_filename filenames = args.input_sequence_filename for name in args.input_sequence_filename: check_input_files(name, False) check_space(args.input_sequence_filename, False) check_space_for_hashtable(args, "countgraph", False) print("Saving k-mer counting table to %s" % base) print("Loading sequences from %s" % repr(filenames)) if args.output: print("Outputting sequences to", args.output) print("making countgraph", file=sys.stderr) htable = khmer_args.create_countgraph(args) htable.set_use_bigcount(args.bigcount) total_coverage = 0.0 n = 0 for index, filename in enumerate(filenames): for record in screed.open(filename): seq = record.sequence.upper() if "N" in seq: seq = seq.replace("N", "A") try: med, _, _ = htable.get_median_count(seq) except ValueError: continue total_coverage += med n += 1 if total_coverage / float(n) > args.coverage: print("reached target average coverage:", total_coverage / float(n)) break htable.consume(seq) if args.output: args.output.write(output_single(record)) if n % 100000 == 0: print("...", index, filename, n, total_coverage / float(n)) if total_coverage / float(n) > args.coverage: break print("Collected %d reads" % (n,)) if args.report_total_kmers: print("Total number of k-mers: {0}".format(htable.n_occupied()), file=sys.stderr) print("saving", base) htable.save(base) info_fp = open(base + ".info", "w") info_fp.write("through end: %s\n" % filenames[-1]) # Change 0.2 only if you really grok it. HINT: You don't. fp_rate = khmer.calc_expected_collisions(htable, False, max_false_pos=0.2) print("fp rate estimated to be %1.3f" % fp_rate) print("fp rate estimated to be %1.3f" % fp_rate, file=info_fp) print("DONE.")
def main(): info('trim-low-abund.py', ['streaming']) parser = sanitize_help(get_parser()) args = parser.parse_args() ### if len(set(args.input_filenames)) != len(args.input_filenames): print("Error: Cannot input the same filename multiple times.", file=sys.stderr) sys.exit(1) ### report_on_config(args) check_valid_file_exists(args.input_filenames) check_space(args.input_filenames, args.force) if args.savegraph: graphsize = calculate_graphsize(args, 'countgraph') check_space_for_graph(args.savegraph, graphsize, args.force) if ('-' in args.input_filenames or '/dev/stdin' in args.input_filenames) \ and not args.output: print( "Accepting input from stdin; output filename must " "be provided with -o.", file=sys.stderr) sys.exit(1) if args.loadgraph: print('loading countgraph from', args.loadgraph, file=sys.stderr) ct = khmer.load_countgraph(args.loadgraph) else: print('making countgraph', file=sys.stderr) ct = khmer_args.create_countgraph(args) K = ct.ksize() CUTOFF = args.cutoff NORMALIZE_LIMIT = args.normalize_to tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir) print('created temporary directory %s; ' 'use -T to change location' % tempdir, file=sys.stderr) # ### FIRST PASS ### save_pass2_total = 0 n_bp = 0 n_reads = 0 written_bp = 0 written_reads = 0 trimmed_reads = 0 pass2list = [] for filename in args.input_filenames: pass2filename = os.path.basename(filename) + '.pass2' pass2filename = os.path.join(tempdir, pass2filename) if args.output is None: trimfp = get_file_writer( open(os.path.basename(filename) + '.abundtrim', 'wb'), args.gzip, args.bzip) else: trimfp = get_file_writer(args.output, args.gzip, args.bzip) pass2list.append((filename, pass2filename, trimfp)) screed_iter = screed.open(filename) pass2fp = open(pass2filename, 'w') save_pass2 = 0 n = 0 paired_iter = broken_paired_reader(screed_iter, min_length=K, force_single=args.ignore_pairs) for n, is_pair, read1, read2 in paired_iter: if n % 10000 == 0: print('...', n, filename, save_pass2, n_reads, n_bp, written_reads, written_bp, file=sys.stderr) # we want to track paired reads here, to make sure that pairs # are not split between first pass and second pass. if is_pair: n_reads += 2 n_bp += len(read1.sequence) + len(read2.sequence) seq1 = read1.sequence.replace('N', 'A') seq2 = read2.sequence.replace('N', 'A') med1, _, _ = ct.get_median_count(seq1) med2, _, _ = ct.get_median_count(seq2) if med1 < NORMALIZE_LIMIT or med2 < NORMALIZE_LIMIT: ct.consume(seq1) ct.consume(seq2) write_record_pair(read1, read2, pass2fp) save_pass2 += 2 else: _, trim_at1 = ct.trim_on_abundance(seq1, CUTOFF) _, trim_at2 = ct.trim_on_abundance(seq2, CUTOFF) if trim_at1 >= K: read1 = trim_record(read1, trim_at1) if trim_at2 >= K: read2 = trim_record(read2, trim_at2) if trim_at1 != len(seq1): trimmed_reads += 1 if trim_at2 != len(seq2): trimmed_reads += 1 write_record_pair(read1, read2, trimfp) written_reads += 2 written_bp += trim_at1 + trim_at2 else: n_reads += 1 n_bp += len(read1.sequence) seq = read1.sequence.replace('N', 'A') med, _, _ = ct.get_median_count(seq) # has this portion of the graph saturated? if not, # consume & save => pass2. if med < NORMALIZE_LIMIT: ct.consume(seq) write_record(read1, pass2fp) save_pass2 += 1 else: # trim!! 
_, trim_at = ct.trim_on_abundance(seq, CUTOFF) if trim_at >= K: new_read = trim_record(read1, trim_at) write_record(new_read, trimfp) written_reads += 1 written_bp += trim_at if trim_at != len(read1.sequence): trimmed_reads += 1 pass2fp.close() print('%s: kept aside %d of %d from first pass, in %s' % (filename, save_pass2, n, filename), file=sys.stderr) save_pass2_total += save_pass2 # ### SECOND PASS. ### skipped_n = 0 skipped_bp = 0 for _, pass2filename, trimfp in pass2list: print('second pass: looking at sequences kept aside in %s' % pass2filename, file=sys.stderr) # note that for this second pass, we don't care about paired # reads - they will be output in the same order they're read in, # so pairs will stay together if not orphaned. This is in contrast # to the first loop. for n, read in enumerate(screed.open(pass2filename)): if n % 10000 == 0: print('... x 2', n, pass2filename, written_reads, written_bp, file=sys.stderr) seq = read.sequence.replace('N', 'A') med, _, _ = ct.get_median_count(seq) # do we retain low-abundance components unchanged? if med < NORMALIZE_LIMIT and args.variable_coverage: write_record(read, trimfp) written_reads += 1 written_bp += len(read.sequence) skipped_n += 1 skipped_bp += len(read.sequence) # otherwise, examine/trim/truncate. else: # med >= NORMALIZE LIMIT or not args.variable_coverage _, trim_at = ct.trim_on_abundance(seq, CUTOFF) if trim_at >= K: new_read = trim_record(read, trim_at) write_record(new_read, trimfp) written_reads += 1 written_bp += trim_at if trim_at != len(read.sequence): trimmed_reads += 1 print('removing %s' % pass2filename, file=sys.stderr) os.unlink(pass2filename) print('removing temp directory & contents (%s)' % tempdir, file=sys.stderr) shutil.rmtree(tempdir) n_passes = 1.0 + (float(save_pass2_total) / n_reads) percent_reads_trimmed = float(trimmed_reads + (n_reads - written_reads)) /\ n_reads * 100.0 print('read %d reads, %d bp' % ( n_reads, n_bp, ), file=sys.stderr) print('wrote %d reads, %d bp' % ( written_reads, written_bp, ), file=sys.stderr) print('looked at %d reads twice (%.2f passes)' % (save_pass2_total, n_passes), file=sys.stderr) print('removed %d reads and trimmed %d reads (%.2f%%)' % (n_reads - written_reads, trimmed_reads, percent_reads_trimmed), file=sys.stderr) print('trimmed or removed %.2f%% of bases (%d total)' % ((1 - (written_bp / float(n_bp))) * 100.0, n_bp - written_bp), file=sys.stderr) if args.variable_coverage: percent_reads_hicov = 100.0 * float(n_reads - skipped_n) / n_reads print('%d reads were high coverage (%.2f%%);' % (n_reads - skipped_n, percent_reads_hicov), file=sys.stderr) print('skipped %d reads/%d bases because of low coverage' % (skipped_n, skipped_bp), file=sys.stderr) fp_rate = \ khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8) # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975 print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate), file=sys.stderr) print('output in *.abundtrim', file=sys.stderr) if args.savegraph: print("Saving k-mer countgraph to", args.savegraph, file=sys.stderr) ct.save(args.savegraph)
def main(): # pylint: disable=too-many-branches,too-many-statements info('normalize-by-median.py', ['diginorm']) args = get_parser().parse_args() report_on_config(args) report_fp = args.report force_single = args.force_single # if optimization args are given, do optimization args = oxutils.do_sanity_checking(args, 0.1) # check for similar filenames # if we're using a single output file only check for identical filenames # otherwise, check for identical BASE names as well. filenames = [] basenames = [] for pathfilename in args.input_filenames: filenames.append(pathfilename) if args.single_output_file: continue # nothing more to worry about basename = os.path.basename(pathfilename) if basename in basenames: print('ERROR: Duplicate filename--Cannot handle this!', file=sys.stderr) print('** Exiting!', file=sys.stderr) sys.exit(1) basenames.append(basename) # check that files exist and there is sufficient output disk space. check_valid_file_exists(args.input_filenames) check_space(args.input_filenames, args.force) if args.savetable: check_space_for_hashtable(args, 'countgraph', args.force) # load or create counting table. if args.loadtable: print('loading k-mer counting table from ' + args.loadtable, file=sys.stderr) htable = khmer.load_counting_hash(args.loadtable) if args.unique_kmers != 0: print('Warning: You have specified a number of unique kmers' ' but are loading a precreated counting table--' 'argument optimization will NOT be done.', file=sys.stderr) else: print('making countgraph', file=sys.stderr) htable = khmer_args.create_countgraph(args) # create an object to handle diginorm of all files norm = Normalizer(args.cutoff, htable) # make a list of all filenames and if they're paired or not; # if we don't know if they're paired, default to allowing but not # forcing pairing. files = [] for element in filenames: files.append([element, args.paired]) if args.unpaired_reads: files.append([args.unpaired_reads, False]) corrupt_files = [] outfp = None output_name = None if args.single_output_file: if args.single_output_file is sys.stdout: output_name = '/dev/stdout' else: output_name = args.single_output_file.name outfp = args.single_output_file # # main loop: iterate over all files given, do diginorm. # for filename, require_paired in files: if not args.single_output_file: output_name = os.path.basename(filename) + '.keep' outfp = open(output_name, 'w') # failsafe context manager in case an input file breaks with catch_io_errors(filename, outfp, args.single_output_file, args.force, corrupt_files): screed_iter = screed.open(filename, parse_description=False) reader = broken_paired_reader(screed_iter, min_length=args.ksize, force_single=force_single, require_paired=require_paired) # actually do diginorm for record in with_diagnostics(filename, norm, reader, report_fp): if record is not None: write_record(record, outfp) print('output in ' + output_name, file=sys.stderr) if output_name is not '/dev/stdout': outfp.close() # finished - print out some diagnostics. 
print('Total number of unique k-mers: {0}' .format(htable.n_unique_kmers()), file=sys.stderr) if args.savetable: print('...saving to ' + args.savetable, file=sys.stderr) htable.save(args.savetable) fp_rate = \ khmer.calc_expected_collisions(htable, args.force, max_false_pos=.8) # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975 print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate), file=sys.stderr) if args.force and len(corrupt_files) > 0: print("** WARNING: Finished with errors!", file=sys.stderr) print("** I/O Errors occurred in the following files:", file=sys.stderr) print("\t", " ".join(corrupt_files), file=sys.stderr)
def main(): info('correct-reads.py', ['streaming']) args = sanitize_help(get_parser()).parse_args() ### if len(set(args.input_filenames)) != len(args.input_filenames): print("Error: Cannot input the same filename multiple times.", file=sys.stderr) sys.exit(1) ### report_on_config(args) check_valid_file_exists(args.input_filenames) check_space(args.input_filenames, args.force) tablesize = calculate_graphsize(args, 'countgraph') if args.savegraph: check_space_for_graph(args.savegraph, tablesize, args.force) K = args.ksize CUTOFF = args.cutoff NORMALIZE_LIMIT = args.normalize_to if args.loadgraph: print('loading k-mer countgraph from', args.loadgraph, file=sys.stderr) ct = khmer.load_countgraph(args.loadgraph) else: print('making k-mer countgraph', file=sys.stderr) ct = create_countgraph(args, multiplier=8 / (9. + 0.3)) tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir) print('created temporary directory %s; use -T to change location' % tempdir, file=sys.stderr) aligner = khmer.ReadAligner(ct, args.cutoff, args.bits_theta) # ### FIRST PASS ### save_pass2_total = 0 n_bp = 0 n_reads = 0 written_bp = 0 written_reads = 0 corrected_reads = 0 pass2list = [] for filename in args.input_filenames: pass2filename = os.path.basename(filename) + '.pass2' pass2filename = os.path.join(tempdir, pass2filename) if args.out is None: corrfp = open(os.path.basename(filename) + '.corr', 'w') else: corrfp = args.out pass2list.append((filename, pass2filename, corrfp)) screed_iter = screed.open(filename, parse_description=False) pass2fp = open(pass2filename, 'w') save_pass2 = 0 n = 0 paired_iter = broken_paired_reader(screed_iter, min_length=K, force_single=args.ignore_pairs) for n, is_pair, read1, read2 in paired_iter: if n % 10000 == 0: print('...', n, filename, save_pass2, n_reads, n_bp, written_reads, written_bp, file=sys.stderr) # we want to track paired reads here, to make sure that pairs # are not split between first pass and second pass. if is_pair: n_reads += 2 n_bp += len(read1.sequence) + len(read2.sequence) seq1 = read1.sequence.replace('N', 'A') seq2 = read2.sequence.replace('N', 'A') med1, _, _ = ct.get_median_count(seq1) med2, _, _ = ct.get_median_count(seq2) if med1 < NORMALIZE_LIMIT or med2 < NORMALIZE_LIMIT: ct.consume(seq1) ct.consume(seq2) write_record_pair(read1, read2, pass2fp) save_pass2 += 2 else: is_aligned, new_seq1 = correct_sequence(aligner, seq1) if is_aligned: if new_seq1 != read1.sequence: corrected_reads += 1 read1.sequence = new_seq1 if hasattr(read1, 'quality'): fix_quality(read1) is_aligned, new_seq2 = correct_sequence(aligner, seq2) if is_aligned: if new_seq2 != read2.sequence: corrected_reads += 1 read2.sequence = new_seq2 if hasattr(read2, 'quality'): fix_quality(read2) write_record_pair(read1, read2, corrfp) written_reads += 2 written_bp += len(read1) written_bp += len(read2) else: n_reads += 1 n_bp += len(read1.sequence) seq = read1.sequence.replace('N', 'A') med, _, _ = ct.get_median_count(seq) # has this portion of the graph saturated? if not, # consume & save => pass2. if med < NORMALIZE_LIMIT: ct.consume(seq) write_record(read1, pass2fp) save_pass2 += 1 else: # trim!! 
                    is_aligned, new_seq = correct_sequence(aligner, seq)
                    if is_aligned:
                        if new_seq != read1.sequence:
                            corrected_reads += 1
                        read1.sequence = new_seq
                        if hasattr(read1, 'quality'):
                            fix_quality(read1)

                        write_record(read1, corrfp)
                        written_reads += 1
                        written_bp += len(new_seq)

        pass2fp.close()

        print('%s: kept aside %d of %d from first pass, in %s' %
              (filename, save_pass2, n, filename), file=sys.stderr)
        save_pass2_total += save_pass2

    # ### SECOND PASS. ###

    skipped_n = 0
    skipped_bp = 0
    for _, pass2filename, corrfp in pass2list:
        print('second pass: looking at sequences kept aside in %s' %
              pass2filename, file=sys.stderr)

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in
        # contrast to the first loop.

        for n, read in enumerate(screed.open(pass2filename,
                                             parse_description=False)):
            if n % 10000 == 0:
                print('... x 2', n, pass2filename,
                      written_reads, written_bp, file=sys.stderr)

            seq = read.sequence.replace('N', 'A')
            med, _, _ = ct.get_median_count(seq)

            # do we retain low-abundance components unchanged?
            if med < NORMALIZE_LIMIT and args.variable_coverage:
                write_record(read, corrfp)
                written_reads += 1
                written_bp += len(read.sequence)
                skipped_n += 1
                skipped_bp += len(read.sequence)

            # otherwise, examine/correct.
            else:  # med >= NORMALIZE_LIMIT, or not args.variable_coverage
                is_aligned, new_seq = correct_sequence(aligner, seq)
                if is_aligned:
                    if new_seq != read.sequence:
                        corrected_reads += 1
                    read.sequence = new_seq
                    if hasattr(read, 'quality'):
                        fix_quality(read)

                    write_record(read, corrfp)
                    written_reads += 1
                    written_bp += len(new_seq)

        print('removing %s' % pass2filename, file=sys.stderr)
        os.unlink(pass2filename)

    print('removing temp directory & contents (%s)' % tempdir,
          file=sys.stderr)
    shutil.rmtree(tempdir)

    n_passes = 1.0 + (float(save_pass2_total) / n_reads)
    percent_reads_corrected = float(corrected_reads +
                                    (n_reads - written_reads)) /\
        n_reads * 100.0

    print('read %d reads, %d bp' % (n_reads, n_bp,), file=sys.stderr)
    print('wrote %d reads, %d bp' % (written_reads, written_bp,),
          file=sys.stderr)
    print('looked at %d reads twice (%.2f passes)' %
          (save_pass2_total, n_passes), file=sys.stderr)
    print('removed %d reads and corrected %d reads (%.2f%%)' %
          (n_reads - written_reads, corrected_reads,
           percent_reads_corrected), file=sys.stderr)
    print('removed %.2f%% of bases (%d total)' %
          ((1 - (written_bp / float(n_bp))) * 100.0,
           n_bp - written_bp), file=sys.stderr)

    if args.variable_coverage:
        percent_reads_hicov = 100.0 * float(n_reads - skipped_n) / n_reads
        print('%d reads were high coverage (%.2f%%);' %
              (n_reads - skipped_n, percent_reads_hicov), file=sys.stderr)
        print('skipped %d reads/%d bases because of low coverage' %
              (skipped_n, skipped_bp), file=sys.stderr)

    fp_rate = \
        khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate),
          file=sys.stderr)

    print('output in *.corr', file=sys.stderr)

    if args.savegraph:
        print("Saving k-mer countgraph to", args.savegraph,
              file=sys.stderr)
        ct.save(args.savegraph)
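# The first pass above defers any read whose median k-mer count is still
# below NORMALIZE_LIMIT: its graph neighborhood is not yet saturated, so the
# read is consumed and revisited in the second pass.  A minimal standalone
# sketch of that saturation test, assuming a khmer countgraph `ct` that has
# already consumed some reads; the helper name and the default limit of 20
# are illustrative, not part of the script above.
def is_saturated(ct, sequence, normalize_limit=20):
    """Return True once the graph region covering `sequence` is saturated.

    The median k-mer count of the read serves as its coverage estimate:
    reads from unsaturated regions should be consumed and deferred, reads
    from saturated regions can be corrected or trimmed immediately.
    """
    cleaned = sequence.replace('N', 'A')  # khmer k-mers cannot contain N
    med, _, _ = ct.get_median_count(cleaned)
    return med >= normalize_limit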
def main():
    p = build_counting_args(descr="Streaming assembly with tracking info")
    p.add_argument("fastq_files", nargs="+")
    p.add_argument("--prefix", default="transcriptome")
    args = p.parse_args()

    cg = create_countgraph(args)
    asm = khmer.JunctionCountAssembler(cg)

    tr_fn = "{0}.transcripts.fa".format(args.prefix)
    orf_fn = "{0}.orfs.fa".format(args.prefix)
    stats_fn = "{0}.stats.fa".format(args.prefix)

    with open(tr_fn, "w") as tr_fp, open(orf_fn, "w") as orf_fp, \
            open(stats_fn, "w") as stats_fp:
        kept = 0
        next_contig = 1
        next_orf = 1
        output = set()
        statswriter = csv.DictWriter(
            stats_fp, delimiter=",",
            fieldnames=["read_n", "action", "cov", "n_junctions", "contig_n"])

        for filename in args.fastq_files:
            for n, record in enumerate(screed.open(filename)):
                if n and n % 10000 == 0:
                    print("...", n, file=sys.stderr)

                if len(record.sequence) < args.ksize:
                    continue

                cov, _, _ = cg.get_median_count(record.sequence)
                if cov < 20:
                    kept += 1
                    cg.consume(record.sequence)
                    statswriter.writerow({"read_n": n, "action": "c",
                                          "cov": cov, "n_junctions": None,
                                          "contig_n": None})
                elif cov < 30:
                    seq, pos = cg.trim_on_abundance(record.sequence, 3)
                    if len(seq) < args.ksize:
                        continue
                    n_junctions = asm.consume(seq)
                    statswriter.writerow({"read_n": n, "action": "t",
                                          "cov": cov,
                                          "n_junctions": n_junctions,
                                          "contig_n": None})
                elif cov == 30:
                    contigs = asm.assemble(record.sequence[:args.ksize])
                    for contig_n, contig in enumerate(contigs):
                        statswriter.writerow({"read_n": n, "action": "a",
                                              "cov": cov,
                                              "n_junctions": None,
                                              "contig_n": (next_contig,
                                                           contig_n)})
                        tr_fp.write(">contig%d\n%s\n" % (next_contig, contig))
                        next_contig += 1

                        for t in translate(contig):
                            for orf_n, o in enumerate(extract_orfs(t)):
                                if hash(o) not in output:
                                    new = True
                                    output.add(hash(o))
                                    orf_fp.write(">orf%d\n%s\n" %
                                                 (next_orf, o))
                                    next_orf += 1
                                else:
                                    new = False
                else:
                    statswriter.writerow({"read_n": n, "action": "s",
                                          "cov": cov, "n_junctions": None,
                                          "contig_n": None})
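# The streaming assembler above routes each read by its median k-mer
# coverage: low-coverage reads are consumed into the countgraph, moderately
# covered reads are trimmed and fed to the junction assembler, and the read
# that first reaches the assembly threshold triggers assembly; anything
# higher is skipped.  A minimal sketch of that banding logic, factored out
# for clarity -- the 20/30 thresholds are the hard-coded values used above,
# and this helper is illustrative, not part of the script.
def choose_action(cov, consume_below=20, junction_below=30):
    """Map a read's median k-mer coverage to the action code recorded in
    the stats CSV: 'c' consume, 't' trim + count junctions, 'a' assemble,
    's' skip."""
    if cov < consume_below:
        return 'c'
    elif cov < junction_below:
        return 't'
    elif cov == junction_below:
        return 'a'
    return 's'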