Example #1
def main(args):
    info('build-graph.py', ['graph', 'SeqAn'])

    report_on_config(args, hashtype='nodegraph')
    base = args.output_filename
    filenames = args.input_filenames

    for fname in args.input_filenames:
        check_input_files(fname, args.force)

    # if optimization args are given, do optimization
    args = functions.do_sanity_checking(args, 0.01)

    check_space(args.input_filenames, args.force)
    check_space_for_hashtable(args, 'nodegraph', args.force)

    print('Saving k-mer presence table to %s' % base, file=sys.stderr)
    print('Loading kmers from sequences in %s' %
          repr(filenames), file=sys.stderr)
    if args.no_build_tagset:
        print('We WILL NOT build the tagset.', file=sys.stderr)
    else:
        print('We WILL build the tagset (for partitioning/traversal).',
              file=sys.stderr)

    print('making nodegraph', file=sys.stderr)
    htable = khmer_args.create_nodegraph(args)

    functions.build_graph(filenames, htable, args.threads,
                          not args.no_build_tagset)

    print('Total number of unique k-mers: {0}'.format(htable.n_unique_kmers()),
          file=sys.stderr)

    print('saving k-mer presence table in', base + '.pt', file=sys.stderr)
    htable.save(base + '.pt')

    if not args.no_build_tagset:
        print('saving tagset in', base + '.tagset', file=sys.stderr)
        htable.save_tagset(base + '.tagset')

    info_fp = open(base + '.info', 'w')
    info_fp.write('%d unique k-mers' % htable.n_unique_kmers())

    fp_rate = \
        khmer.calc_expected_collisions(htable, args.force, max_false_pos=.15)
    # 0.18 is ACTUAL MAX. Do not change.

    print('false positive rate estimated to be %1.3f' % fp_rate,
          file=sys.stderr)
    print('\nfalse positive rate estimated to be %1.3f' % fp_rate,
          file=info_fp)

    print('wrote to', base + '.info and', base + '.pt', file=sys.stderr)
    if not args.no_build_tagset:
        print('and ' + base + '.tagset', file=sys.stderr)

    sys.exit(0)
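
A minimal sketch of how an entry point like the one above would be wired up;
unlike the later examples, this main() takes args as a parameter, and
get_parser() is assumed to be the script's own argument-parser factory (it
appears explicitly in the examples below):

if __name__ == '__main__':
    main(get_parser().parse_args())
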
Example #2
def main(args):
    info('build-graph.py', ['graph', 'SeqAn'])

    report_on_config(args, hashtype='hashbits')
    base = args.output_filename
    filenames = args.input_filenames

    for fname in args.input_filenames:
        check_input_files(fname, args.force)

    check_space(args.input_filenames, args.force)
    check_space_for_hashtable(
        (float(args.n_tables * args.min_tablesize) / 8.), args.force)

    print >>sys.stderr, 'Saving k-mer presence table to %s' % base
    print >>sys.stderr, 'Loading kmers from sequences in %s' % repr(filenames)
    if args.no_build_tagset:
        print >>sys.stderr, 'We WILL NOT build the tagset.'
    else:
        print >>sys.stderr, 'We WILL build the tagset ' \
                            '(for partitioning/traversal).'

    print >>sys.stderr, 'making k-mer presence table'
    htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables)

    functions.build_graph(filenames, htable, args.threads,
                          not args.no_build_tagset)

    print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
        htable.n_unique_kmers())

    print >>sys.stderr, 'saving k-mer presence table in', base + '.pt'
    htable.save(base + '.pt')

    if not args.no_build_tagset:
        print >>sys.stderr, 'saving tagset in', base + '.tagset'
        htable.save_tagset(base + '.tagset')

    info_fp = open(base + '.info', 'w')
    info_fp.write('%d unique k-mers' % htable.n_unique_kmers())

    fp_rate = \
        khmer.calc_expected_collisions(htable, args.force, max_false_pos=.15)
    # 0.18 is ACTUAL MAX. Do not change.

    print >>sys.stderr, 'false positive rate estimated to be %1.3f' % fp_rate
    print >>info_fp, '\nfalse positive rate estimated to be %1.3f' % fp_rate

    print >> sys.stderr, 'wrote to', base + '.info and', base + '.pt'
    if not args.no_build_tagset:
        print >> sys.stderr, 'and ' + base + '.tagset'

    sys.exit(0)
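
The check_space_for_hashtable() call above sizes the presence table by hand:
a hashbits table stores one bit per slot, so n_tables * min_tablesize bits
cost one eighth as many bytes. A worked example with purely illustrative
sizes:

n_tables = 4              # illustrative value
min_tablesize = int(1e9)  # illustrative value
required_bytes = float(n_tables * min_tablesize) / 8.0
# 4e9 bits -> 5e8 bytes, i.e. roughly 0.5 GB of disk
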
Example #3
def main(args):
    info("build-graph.py", ["graph", "SeqAn"])

    report_on_config(args, hashtype="hashbits")
    base = args.output_filename
    filenames = args.input_filenames

    for fname in args.input_filenames:
        check_input_files(fname, args.force)

    check_space(args.input_filenames, args.force)
    check_space_for_hashtable((float(args.n_tables * args.min_tablesize) / 8.0), args.force)

    print("Saving k-mer presence table to %s" % base, file=sys.stderr)
    print("Loading kmers from sequences in %s" % repr(filenames), file=sys.stderr)
    if args.no_build_tagset:
        print("We WILL NOT build the tagset.", file=sys.stderr)
    else:
        print("We WILL build the tagset (for partitioning/traversal).", file=sys.stderr)

    print("making k-mer presence table", file=sys.stderr)
    htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables)

    functions.build_graph(filenames, htable, args.threads, not args.no_build_tagset)

    print("Total number of unique k-mers: {0}".format(htable.n_unique_kmers()), file=sys.stderr)

    print("saving k-mer presence table in", base + ".pt", file=sys.stderr)
    htable.save(base + ".pt")

    if not args.no_build_tagset:
        print("saving tagset in", base + ".tagset", file=sys.stderr)
        htable.save_tagset(base + ".tagset")

    info_fp = open(base + ".info", "w")
    info_fp.write("%d unique k-mers" % htable.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(htable, args.force, max_false_pos=0.15)
    # 0.18 is ACTUAL MAX. Do not change.

    print("false positive rate estimated to be %1.3f" % fp_rate, file=sys.stderr)
    print("\nfalse positive rate estimated to be %1.3f" % fp_rate, file=info_fp)

    print("wrote to", base + ".info and", base + ".pt", file=sys.stderr)
    if not args.no_build_tagset:
        print("and " + base + ".tagset", file=sys.stderr)

    sys.exit(0)
Example #4
def main():
    info('filter-abund-single.py', ['counting', 'SeqAn'])
    args = get_parser().parse_args()
    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)
    if args.savetable:
        check_space_for_hashtable(args, 'countgraph', args.force)
    report_on_config(args)

    print('making countgraph', file=sys.stderr)
    htable = khmer_args.create_countgraph(args)

    # first, load reads into hash table
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    print('consuming input, round 1 --', args.datafile, file=sys.stderr)
    for _ in range(args.threads):
        cur_thread = \
            threading.Thread(
                target=htable.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(cur_thread)
        cur_thread.start()

    for thread in threads:
        thread.join()

    if args.report_total_kmers:
        print('Total number of unique k-mers: {0}'.format(
            htable.n_unique_kmers()), file=sys.stderr)

    fp_rate = khmer.calc_expected_collisions(htable, args.force)
    print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
        seqN = seq.replace('N', 'A')

        _, trim_at = htable.trim_on_abundance(seqN, args.cutoff)

        if trim_at >= args.ksize:
            # be sure not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]

        return None, None

    # the filtering loop
    print('filtering', args.datafile, file=sys.stderr)
    outfile = os.path.basename(args.datafile) + '.abundfilt'
    outfp = open(outfile, 'w')

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(args.datafile), outfp)

    print('output in', outfile, file=sys.stderr)

    if args.savetable:
        print('Saving k-mer counting table filename',
              args.savetable, file=sys.stderr)
        print('...saving to', args.savetable, file=sys.stderr)
        htable.save(args.savetable)
    print('wrote to: ', outfile, file=sys.stderr)
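
The load phase above is the shared-parser threading idiom used throughout
these scripts: every worker thread drains the same khmer.ReadParser, which
portions out reads until the input is exhausted. A stripped-down sketch of
just that pattern, with a generic consume callable standing in for
htable.consume_fasta_with_reads_parser:

import threading

def consume_in_threads(consume, rparser, n_threads):
    # all threads share one parser; each consume(rparser) call returns
    # once the parser has no more reads to hand out
    threads = [threading.Thread(target=consume, args=(rparser,))
               for _ in range(n_threads)]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
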
Example #5
def main():  # pylint: disable=too-many-branches,too-many-statements
    info('normalize-by-median.py', ['diginorm'])
    args = get_parser().parse_args()

    report_on_config(args)

    report_fp = args.report

    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize,
                                  args.force)

    # list to save error files along with throwing exceptions
    if args.force:
        corrupt_files = []

    if args.loadtable:
        print 'loading k-mer counting table from', args.loadtable
        htable = khmer.load_counting_hash(args.loadtable)
    else:
        print 'making k-mer counting table'
        htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                         args.n_tables)

    total = 0
    discarded = 0
    input_filename = None

    if args.single_output_filename:
        output_name = args.single_output_filename
        if args.append:
            outfp = open(args.single_output_filename, 'a')
        else:
            outfp = open(args.single_output_filename, 'w')

    for index, input_filename in enumerate(args.input_filenames):
        if not args.single_output_filename:
            output_name = os.path.basename(input_filename) + '.keep'
            outfp = open(output_name, 'w')

        total_acc = 0
        discarded_acc = 0

        try:
            total_acc, discarded_acc = normalize_by_median(
                input_filename, outfp, htable, args, report_fp)
        except IOError as err:
            handle_error(err, output_name, input_filename, args.fail_save,
                         htable)
            if not args.force:
                print >> sys.stderr, '** Exiting!'

                sys.exit(1)
            else:
                print >> sys.stderr, '*** Skipping error file, moving on...'
                corrupt_files.append(input_filename)
        else:
            if total_acc == 0 and discarded_acc == 0:
                print 'SKIPPED empty file', input_filename
            else:
                total += total_acc
                discarded += discarded_acc
                print 'DONE with {inp}; kept {kept} of {total} or {perc:2}%'\
                      .format(inp=input_filename, kept=total - discarded,
                              total=total, perc=int(100. - discarded /
                                                    float(total) * 100.))
                print 'output in', output_name

        if (args.dump_frequency > 0 and index > 0
                and index % args.dump_frequency == 0):
            print 'Backup: Saving k-mer counting file through', input_filename
            if args.savetable:
                hashname = args.savetable
                print '...saving to', hashname
            else:
                hashname = 'backup.ct'
                print 'Nothing given for savetable, saving to', hashname
            htable.save(hashname)

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
            htable.n_unique_kmers())

    if args.savetable:
        print 'Saving k-mer counting table through', input_filename
        print '...saving to', args.savetable
        htable.save(args.savetable)

    fp_rate = khmer.calc_expected_collisions(htable)
    print 'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate)

    if args.force and len(corrupt_files) > 0:
        print >> sys.stderr, "** WARNING: Finished with errors!"
        print >> sys.stderr, "** IOErrors occurred in the following files:"
        print >> sys.stderr, "\t", " ".join(corrupt_files)

    if fp_rate > MAX_FALSE_POSITIVE_RATE:
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the k-mer counting table is too small"
                              " for this data set. Increase tablesize/# "
                              "tables.")
        print >> sys.stderr, "**"
        print >> sys.stderr, "** Do not use these results!!"
        if not args.force:
            sys.exit(1)
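
The backup logic above checkpoints the counting table after every
args.dump_frequency-th input file, skipping the first; a tiny self-contained
sketch of just the trigger condition:

dump_frequency = 2  # illustrative value
for index in range(6):
    if dump_frequency > 0 and index > 0 and index % dump_frequency == 0:
        print('would checkpoint after input file', index)  # fires at 2 and 4
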
Example #6
def main():
    info('filter-abund-single.py', ['counting', 'SeqAn'])
    args = get_parser().parse_args()
    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)
    if args.savetable:
        check_space_for_hashtable(
            args.n_tables * args.min_tablesize, args.force)
    report_on_config(args)

    print >>sys.stderr, 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables)

    # first, load reads into hash table
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    print >>sys.stderr, 'consuming input, round 1 --', args.datafile
    for _ in xrange(args.threads):
        cur_thread = \
            threading.Thread(
                target=htable.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(cur_thread)
        cur_thread.start()

    for thread in threads:
        thread.join()

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
            htable.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(htable, args.force)
    print >>sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff)

        if trim_at >= args.ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    print >>sys.stderr, 'filtering', args.datafile
    outfile = os.path.basename(args.datafile) + '.abundfilt'
    outfp = open(outfile, 'w')

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(args.datafile), outfp)

    print >>sys.stderr, 'output in', outfile

    if args.savetable:
        print >>sys.stderr, 'Saving k-mer counting table filename', \
            args.savetable
        print >>sys.stderr, '...saving to', args.savetable
        htable.save(args.savetable)
    print >>sys.stderr, 'wrote to: ', outfile
Example #7
def main():
    info('load-graph.py', ['graph', 'SeqAn'])
    args = get_parser().parse_args()
    report_on_config(args, hashtype='hashbits')

    base = args.output_filename
    filenames = args.input_filenames

    for fname in args.input_filenames:
        check_file_status(fname, args.force)

    check_space(args.input_filenames, args.force)
    check_space_for_hashtable(
        (float(args.n_tables * args.min_tablesize) / 8.), args.force)

    print >>sys.stderr, 'Saving k-mer presence table to %s' % base
    print >>sys.stderr, 'Loading kmers from sequences in %s' % repr(filenames)
    if args.no_build_tagset:
        print >>sys.stderr, 'We WILL NOT build the tagset.'
    else:
        print >>sys.stderr, 'We WILL build the tagset ' \
                            '(for partitioning/traversal).'

    print >>sys.stderr, 'making k-mer presence table'
    htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables)

    if args.no_build_tagset:
        target_method = htable.consume_fasta_with_reads_parser
    else:
        target_method = htable.consume_fasta_and_tag_with_reads_parser

    for filename in filenames:
        rparser = khmer.ReadParser(filename)
        threads = []
        print >>sys.stderr, 'consuming input', filename
        for num in xrange(args.threads):
            cur_thread = threading.Thread(
                target=target_method, args=(rparser,))
            threads.append(cur_thread)
            cur_thread.start()

        for thread in threads:
            thread.join()

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
            htable.n_unique_kmers())

    print >>sys.stderr, 'saving k-mer presence table in', base + '.pt'
    htable.save(base + '.pt')

    if not args.no_build_tagset:
        print >>sys.stderr, 'saving tagset in', base + '.tagset'
        htable.save_tagset(base + '.tagset')

    info_fp = open(base + '.info', 'w')
    info_fp.write('%d unique k-mers' % htable.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(htable)
    print >>sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate
    if args.write_fp_rate:
        print >> info_fp, \
            '\nfalse positive rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.15:          # 0.18 is ACTUAL MAX. Do not change.
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the graph structure is too small for "
                              "this data set. Increase table size/# tables.")
        print >> sys.stderr, "**"
        if not args.force:
            sys.exit(1)

    print >> sys.stderr, 'wrote to', base + '.info and', base + '.pt'
    if not args.no_build_tagset:
        print >> sys.stderr, 'and ' + base + '.tagset'
Example #8
def main():  # pylint: disable=too-many-locals,too-many-branches
    info('abundance-dist-single.py', ['counting', 'SeqAn'])
    args = get_parser().parse_args()
    report_on_config(args)

    check_input_files(args.input_sequence_filename, args.force)
    check_space([args.input_sequence_filename], args.force)
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize,
                                  args.force)

    if (not args.squash_output and
            os.path.exists(args.output_histogram_filename)):
        print('ERROR: %s exists; not squashing.' %
              args.output_histogram_filename, file=sys.stderr)
        sys.exit(1)
    else:
        hist_fp = open(args.output_histogram_filename, 'w')
        if args.csv:
            hist_fp_csv = csv.writer(hist_fp)
            # write headers:
            hist_fp_csv.writerow(['abundance', 'count', 'cumulative',
                                  'cumulative_fraction'])

    print('making k-mer counting table', file=sys.stderr)
    counting_hash = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                            args.n_tables)
    counting_hash.set_use_bigcount(args.bigcount)

    print('building k-mer tracking table', file=sys.stderr)
    tracking = khmer.new_hashbits(counting_hash.ksize(), args.min_tablesize,
                                  args.n_tables)

    print('kmer_size:', counting_hash.ksize(), file=sys.stderr)
    print('k-mer counting table sizes:',
          counting_hash.hashsizes(), file=sys.stderr)
    print('outputting to', args.output_histogram_filename, file=sys.stderr)

    # start loading
    rparser = khmer.ReadParser(args.input_sequence_filename)
    threads = []
    print('consuming input, round 1 --',
          args.input_sequence_filename, file=sys.stderr)
    for _ in range(args.threads):
        thread = \
            threading.Thread(
                target=counting_hash.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    if args.report_total_kmers:
        print('Total number of unique k-mers: {0}'.format(
            counting_hash.n_unique_kmers()), file=sys.stderr)

    abundance_lists = []

    def __do_abundance_dist__(read_parser):
        abundances = counting_hash.abundance_distribution_with_reads_parser(
            read_parser, tracking)
        abundance_lists.append(abundances)

    print('preparing hist from %s...' %
          args.input_sequence_filename, file=sys.stderr)
    rparser = khmer.ReadParser(args.input_sequence_filename)
    threads = []
    print('consuming input, round 2 --',
          args.input_sequence_filename, file=sys.stderr)
    for _ in range(args.threads):
        thread = \
            threading.Thread(
                target=__do_abundance_dist__,
                args=(rparser, )
            )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    assert len(abundance_lists) == args.threads, len(abundance_lists)
    abundance = {}
    for abundance_list in abundance_lists:
        for i, count in enumerate(abundance_list):
            abundance[i] = abundance.get(i, 0) + count

    total = sum(abundance.values())

    if 0 == total:
        print("ERROR: abundance distribution is uniformly zero; "
              "nothing to report.", file=sys.stderr)
        print(
            "\tPlease verify that the input files are valid.", file=sys.stderr)
        sys.exit(1)

    sofar = 0
    for abund, count in sorted(abundance.items()):
        if count == 0 and not args.output_zero:
            continue

        sofar += count
        frac = sofar / float(total)

        if args.csv:
            hist_fp_csv.writerow([abund, count, sofar, round(frac, 3)])
        else:
            print(abund, count, sofar, round(frac, 3), file=hist_fp)

        if sofar == total:
            break

    if args.savetable:
        print('Saving k-mer counting table ', args.savetable, file=sys.stderr)
        print('...saving to', args.savetable, file=sys.stderr)
        counting_hash.save(args.savetable)

    print('wrote to: ' + args.output_histogram_filename, file=sys.stderr)
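
The reduction above folds one abundance list per worker thread into a single
histogram keyed by abundance; the same merge in miniature, with toy data:

abundance_lists = [[0, 3, 1], [0, 2, 4]]  # toy per-thread distributions
abundance = {}
for abundance_list in abundance_lists:
    for i, count in enumerate(abundance_list):
        abundance[i] = abundance.get(i, 0) + count
assert abundance == {0: 0, 1: 5, 2: 5}
# the cumulative-fraction column then follows as sofar / float(total)
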
Example #9
def main():

    info("load-into-counting.py", ["counting", "SeqAn"])

    args = get_parser().parse_args()
    report_on_config(args)

    base = args.output_countingtable_filename
    filenames = args.input_sequence_filename

    for name in args.input_sequence_filename:
        check_input_files(name, args.force)

    check_space(args.input_sequence_filename, args.force)
    check_space_for_hashtable(args.n_tables * args.min_tablesize, args.force)

    check_file_writable(base)
    check_file_writable(base + ".info")

    print("Saving k-mer counting table to %s" % base, file=sys.stderr)
    print("Loading kmers from sequences in %s" % repr(filenames), file=sys.stderr)

    # clobber the '.info' file now, as we always open in append mode below
    if os.path.exists(base + ".info"):
        os.remove(base + ".info")

    print("making k-mer counting table", file=sys.stderr)
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize, args.n_tables)
    htable.set_use_bigcount(args.bigcount)

    filename = None

    total_num_reads = 0

    for index, filename in enumerate(filenames):

        rparser = khmer.ReadParser(filename)
        threads = []
        print("consuming input", filename, file=sys.stderr)
        for _ in range(args.threads):
            cur_thrd = threading.Thread(target=htable.consume_fasta_with_reads_parser, args=(rparser,))
            threads.append(cur_thrd)
            cur_thrd.start()

        for thread in threads:
            thread.join()

        if index > 0 and index % 10 == 0:
            check_space_for_hashtable(args.n_tables * args.min_tablesize, args.force)
            print("mid-save", base, file=sys.stderr)
            htable.save(base)
        with open(base + ".info", "a") as info_fh:
            print("through", filename, file=info_fh)
        total_num_reads += rparser.num_reads

    n_kmers = htable.n_unique_kmers()
    if args.report_total_kmers:
        print("Total number of unique k-mers:", n_kmers, file=sys.stderr)
        with open(base + ".info", "a") as info_fp:
            print("Total number of unique k-mers:", n_kmers, file=info_fp)

    print("saving", base, file=sys.stderr)
    htable.save(base)

    # Change max_false_pos=0.2 only if you really grok it. HINT: You don't
    fp_rate = khmer.calc_expected_collisions(htable, args.force, max_false_pos=0.2)

    with open(base + ".info", "a") as info_fp:
        print("fp rate estimated to be %1.3f\n" % fp_rate, file=info_fp)

    if args.summary_info:
        mr_fmt = args.summary_info.lower()
        mr_file = base + ".info." + mr_fmt
        print("Writing summmary info to", mr_file, file=sys.stderr)
        with open(mr_file, "w") as mr_fh:
            if mr_fmt == "json":
                mr_data = {
                    "ht_name": os.path.basename(base),
                    "fpr": fp_rate,
                    "num_kmers": n_kmers,
                    "files": filenames,
                    "mrinfo_version": "0.2.0",
                    "num_reads": total_num_reads,
                }
                json.dump(mr_data, mr_fh)
                mr_fh.write("\n")
            elif mr_fmt == "tsv":
                mr_fh.write("ht_name\tfpr\tnum_kmers\tnum_reads\tfiles\n")
                vals = [
                    os.path.basename(base),
                    "{:1.3f}".format(fp_rate),
                    str(n_kmers),
                    str(total_num_reads),
                    ";".join(filenames),
                ]
                mr_fh.write("\t".join(vals) + "\n")

    print("fp rate estimated to be %1.3f" % fp_rate, file=sys.stderr)

    print("DONE.", file=sys.stderr)
    print("wrote to:", base + ".info", file=sys.stderr)
Example #10
def main():
    info('filter-abund-single.py', ['counting', 'SeqAn'])
    args = get_parser().parse_args()
    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize,
                                  args.force)
    report_on_config(args)

    print >> sys.stderr, 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables)

    # first, load reads into hash table
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    print >> sys.stderr, 'consuming input, round 1 --', args.datafile
    for _ in xrange(args.threads):
        cur_thread = \
            threading.Thread(
                target=htable.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(cur_thread)
        cur_thread.start()

    for thread in threads:
        thread.join()

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
            htable.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(htable, args.force)
    print >> sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
        seqN = seq.replace('N', 'A')

        _, trim_at = htable.trim_on_abundance(seqN, args.cutoff)

        if trim_at >= args.ksize:
            # be sure not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]

        return None, None

    # the filtering loop
    print >> sys.stderr, 'filtering', args.datafile
    outfile = os.path.basename(args.datafile) + '.abundfilt'
    outfp = open(outfile, 'w')

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(args.datafile), outfp)

    print >> sys.stderr, 'output in', outfile

    if args.savetable:
        print >>sys.stderr, 'Saving k-mer counting table filename', \
            args.savetable
        print >> sys.stderr, '...saving to', args.savetable
        htable.save(args.savetable)
    print >> sys.stderr, 'wrote to: ', outfile
Example #11
def main():

    info('load-into-counting.py', ['counting', 'SeqAn'])

    args = get_parser().parse_args()
    report_on_config(args)

    base = args.output_countingtable_filename
    filenames = args.input_sequence_filename

    for name in args.input_sequence_filename:
        check_input_files(name, args.force)

    check_space(args.input_sequence_filename, args.force)
    check_space_for_hashtable(args.n_tables * args.min_tablesize, args.force)

    check_file_writable(base)
    check_file_writable(base + ".info")

    print >>sys.stderr, 'Saving k-mer counting table to %s' % base
    print >>sys.stderr, 'Loading kmers from sequences in %s' % repr(filenames)

    # clobber the '.info' file now, as we always open in append mode below
    if os.path.exists(base + '.info'):
        os.remove(base + '.info')

    print >>sys.stderr, 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables)
    htable.set_use_bigcount(args.bigcount)

    filename = None

    for index, filename in enumerate(filenames):

        rparser = khmer.ReadParser(filename)
        threads = []
        print >>sys.stderr, 'consuming input', filename
        for _ in xrange(args.threads):
            cur_thrd = \
                threading.Thread(
                    target=htable.consume_fasta_with_reads_parser,
                    args=(rparser, )
                )
            threads.append(cur_thrd)
            cur_thrd.start()

        for thread in threads:
            thread.join()

        if index > 0 and index % 10 == 0:
            check_space_for_hashtable(args.n_tables * args.min_tablesize,
                                      args.force)
            print >>sys.stderr, 'mid-save', base
            htable.save(base)
        with open(base + '.info', 'a') as info_fh:
            print >> info_fh, 'through', filename

    n_kmers = htable.n_unique_kmers()
    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers:', n_kmers
        with open(base + '.info', 'a') as info_fp:
            print >>info_fp, 'Total number of unique k-mers:', n_kmers

    print >>sys.stderr, 'saving', base
    htable.save(base)

    # Change max_false_pos=0.2 only if you really grok it. HINT: You don't
    fp_rate = \
        khmer.calc_expected_collisions(htable, args.force, max_false_pos=.2)

    with open(base + '.info', 'a') as info_fp:
        print >> info_fp, 'fp rate estimated to be %1.3f\n' % fp_rate

    if args.summary_info:
        mr_fmt = args.summary_info.lower()
        mr_file = base + '.info.' + mr_fmt
        print >> sys.stderr, "Writing summmary info to", mr_file
        with open(mr_file, 'w') as mr_fh:
            if mr_fmt == 'json':
                mr_data = {
                    "ht_name": os.path.basename(base),
                    "fpr": fp_rate,
                    "num_kmers": n_kmers,
                    "files": filenames,
                    "mrinfo_version": "0.1.0",
                }
                json.dump(mr_data, mr_fh)
                mr_fh.write('\n')
            elif mr_fmt == 'tsv':
                mr_fh.write("ht_name\tfpr\tnum_kmers\tfiles\n")
                mr_fh.write("{b:s}\t{fpr:1.3f}\t{k:d}\t{fls:s}\n".format(
                    b=os.path.basename(base), fpr=fp_rate, k=n_kmers,
                    fls=";".join(filenames)))

    print >> sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate

    print >>sys.stderr, 'DONE.'
    print >>sys.stderr, 'wrote to:', base + '.info'
Example #12
def main():  # pylint: disable=too-many-branches,too-many-statements
    info('normalize-by-median.py', ['diginorm'])
    args = get_parser().parse_args()

    report_on_config(args)

    report_fp = args.report
    force_single = args.force_single

    # check for similar filenames
    # if we're using a single output file only check for identical filenames
    # otherwise, check for identical BASE names as well.
    filenames = []
    basenames = []
    for pathfilename in args.input_filenames:
        filenames.append(pathfilename)
        if args.single_output_file:
            continue  # nothing more to worry about

        basename = os.path.basename(pathfilename)
        if basename in basenames:
            print('ERROR: Duplicate filename--Cannot handle this!',
                  file=sys.stderr)
            print('** Exiting!', file=sys.stderr)
            sys.exit(1)

        basenames.append(basename)

    # check that files exist and there is sufficient output disk space.
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize,
                                  args.force)

    # load or create counting table.
    if args.loadtable:
        print('loading k-mer counting table from ' + args.loadtable,
              file=sys.stderr)
        htable = khmer.load_counting_hash(args.loadtable)
    else:
        print('making k-mer counting table', file=sys.stderr)
        htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                         args.n_tables)

    input_filename = None

    # create an object to handle diginorm of all files
    norm = Normalizer(args.cutoff, htable)

    # make a list of all filenames and if they're paired or not;
    # if we don't know if they're paired, default to allowing but not
    # forcing pairing.
    files = []
    for e in filenames:
        files.append([e, args.paired])
    if args.unpaired_reads:
        files.append([args.unpaired_reads, False])

    corrupt_files = []
    outfp = None
    output_name = None

    if args.single_output_file:
        if args.single_output_file is sys.stdout:
            output_name = '/dev/stdout'
        else:
            output_name = args.single_output_file.name
        outfp = args.single_output_file

    #
    # main loop: iterate over all files given, do diginorm.
    #

    for filename, require_paired in files:
        if not args.single_output_file:
            output_name = os.path.basename(filename) + '.keep'
            outfp = open(output_name, 'w')

        # failsafe context manager in case an input file breaks
        with CatchIOErrors(filename, outfp, args.single_output_file,
                           args.force, corrupt_files):

            screed_iter = screed.open(filename, parse_description=False)
            reader = broken_paired_reader(screed_iter,
                                          min_length=args.ksize,
                                          force_single=force_single,
                                          require_paired=require_paired)

            # actually do diginorm
            for record in WithDiagnostics(filename, norm, reader, report_fp):
                if record is not None:
                    write_record(record, outfp)

            print('output in ' + output_name, file=sys.stderr)
            if output_name != '/dev/stdout':
                outfp.close()

    # finished - print out some diagnostics.

    print('Total number of unique k-mers: {0}'.format(htable.n_unique_kmers()),
          file=sys.stderr)

    if args.savetable:
        print('...saving to ' + args.savetable, file=sys.stderr)
        htable.save(args.savetable)

    fp_rate = \
        khmer.calc_expected_collisions(htable, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975

    print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate),
          file=sys.stderr)

    if args.force and len(corrupt_files) > 0:
        print("** WARNING: Finished with errors!", file=sys.stderr)
        print("** IOErrors occurred in the following files:", file=sys.stderr)
        print("\t", " ".join(corrupt_files), file=sys.stderr)
Example #13
def main():  # pylint: disable=too-many-locals,too-many-branches
    info('abundance-dist-single.py', ['counting', 'SeqAn'])
    args = get_parser().parse_args()
    report_on_config(args)

    check_input_files(args.input_sequence_filename, args.force)
    check_space([args.input_sequence_filename], args.force)
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize,
                                  args.force)

    if (not args.squash_output
            and os.path.exists(args.output_histogram_filename)):
        print >> sys.stderr, 'ERROR: %s exists; not squashing.' % \
            args.output_histogram_filename
        sys.exit(1)
    else:
        hist_fp = open(args.output_histogram_filename, 'w')
        if args.csv:
            hist_fp_csv = csv.writer(hist_fp)
            # write headers:
            hist_fp_csv.writerow(
                ['abundance', 'count', 'cumulative', 'cumulative_fraction'])

    print >> sys.stderr, 'making k-mer counting table'
    counting_hash = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                            args.n_tables)
    counting_hash.set_use_bigcount(args.bigcount)

    print >> sys.stderr, 'building k-mer tracking table'
    tracking = khmer.new_hashbits(counting_hash.ksize(), args.min_tablesize,
                                  args.n_tables)

    print >> sys.stderr, 'kmer_size:', counting_hash.ksize()
    print >>sys.stderr, 'k-mer counting table sizes:', \
        counting_hash.hashsizes()
    print >> sys.stderr, 'outputting to', args.output_histogram_filename

    # start loading
    rparser = khmer.ReadParser(args.input_sequence_filename)
    threads = []
    print >>sys.stderr, 'consuming input, round 1 --', \
        args.input_sequence_filename
    for _ in xrange(args.threads):
        thread = \
            threading.Thread(
                target=counting_hash.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
            counting_hash.n_unique_kmers())

    abundance_lists = []

    def __do_abundance_dist__(read_parser):
        abundances = counting_hash.abundance_distribution_with_reads_parser(
            read_parser, tracking)
        abundance_lists.append(abundances)

    print >>sys.stderr, 'preparing hist from %s...' % \
        args.input_sequence_filename
    rparser = khmer.ReadParser(args.input_sequence_filename)
    threads = []
    print >>sys.stderr, 'consuming input, round 2 --', \
        args.input_sequence_filename
    for _ in xrange(args.threads):
        thread = \
            threading.Thread(
                target=__do_abundance_dist__,
                args=(rparser, )
            )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    assert len(abundance_lists) == args.threads, len(abundance_lists)
    abundance = {}
    for abundance_list in abundance_lists:
        for i, count in enumerate(abundance_list):
            abundance[i] = abundance.get(i, 0) + count

    total = sum(abundance.values())

    if 0 == total:
        print >> sys.stderr, \
            "ERROR: abundance distribution is uniformly zero; " \
            "nothing to report."
        print >> sys.stderr, "\tPlease verify that the input files are valid."
        sys.exit(1)

    sofar = 0
    for abund, count in sorted(abundance.items()):
        if count == 0 and not args.output_zero:
            continue

        sofar += count
        frac = sofar / float(total)

        if args.csv:
            hist_fp_csv.writerow([abund, count, sofar, round(frac, 3)])
        else:
            print >> hist_fp, abund, count, sofar, round(frac, 3)

        if sofar == total:
            break

    if args.savetable:
        print >> sys.stderr, 'Saving k-mer counting table ', args.savetable
        print >> sys.stderr, '...saving to', args.savetable
        counting_hash.save(args.savetable)

    print >> sys.stderr, 'wrote to: ' + args.output_histogram_filename
Example #14
def main():

    info('collect-reads.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args)

    base = args.output_countingtable_filename
    filenames = args.input_sequence_filename

    for name in args.input_sequence_filename:
        check_input_files(name, False)

    check_space(args.input_sequence_filename, False)
    check_space_for_hashtable(args.n_tables * args.min_tablesize, False)

    print 'Saving k-mer counting table to %s' % base
    print 'Loading sequences from %s' % repr(filenames)
    if args.output:
        print 'Outputting sequences to', args.output

    print 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables)
    htable.set_use_bigcount(args.bigcount)

    total_coverage = 0.
    n = 0

    for index, filename in enumerate(filenames):
        for record in screed.open(filename):
            seq = record.sequence.upper()
            if 'N' in seq:
                seq = seq.replace('N', 'G')

            try:
                med, _, _ = htable.get_median_count(seq)
            except ValueError:
                continue

            total_coverage += med
            n += 1

            if total_coverage / float(n) > args.coverage:
                print 'reached target average coverage:', \
                      total_coverage / float(n)
                break

            htable.consume(seq)
            if args.output:
                args.output.write(output_single(record))

            if n % 100000 == 0:
                print '...', index, filename, n, total_coverage / float(n)

        if n and total_coverage / float(n) > args.coverage:
            break

    print 'Collected %d reads' % (n,)

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of k-mers: {0}'.format(
            htable.n_occupied())

    print 'saving', base
    htable.save(base)

    info_fp = open(base + '.info', 'w')
    info_fp.write('through end: %s\n' % filenames[-1])

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(htable, args.force,
                                             max_false_pos=.2)
    print 'fp rate estimated to be %1.3f' % fp_rate
    print >> info_fp, 'fp rate estimated to be %1.3f' % fp_rate

    print 'DONE.'
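
The collection loop above stops once the running mean of per-read median
k-mer counts exceeds args.coverage; the same stopping rule in miniature,
with toy numbers:

coverage_target = 20
meds = [5, 12, 31, 40]  # toy per-read median counts
total_coverage, n = 0.0, 0
for med in meds:
    total_coverage += med
    n += 1
    if total_coverage / n > coverage_target:
        break  # triggers on the fourth read: 88 / 4 = 22 > 20
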
Example #15
def main():  # pylint: disable=too-many-branches,too-many-statements
    info('normalize-by-median.py', ['diginorm'])
    args = get_parser().parse_args()

    report_on_config(args)

    report_fp = args.report

    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savetable:
        check_space_for_hashtable(
            args.n_tables * args.min_tablesize, args.force)

    # list to save error files along with throwing exceptions
    if args.force:
        corrupt_files = []

    if args.loadtable:
        print 'loading k-mer counting table from', args.loadtable
        htable = khmer.load_counting_hash(args.loadtable)
    else:
        print 'making k-mer counting table'
        htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                         args.n_tables)

    total = 0
    discarded = 0
    input_filename = None

    if args.single_output_filename:
        output_name = args.single_output_filename
        if args.append:
            outfp = open(args.single_output_filename, 'a')
        else:
            outfp = open(args.single_output_filename, 'w')

    for index, input_filename in enumerate(args.input_filenames):
        if not args.single_output_filename:
            output_name = os.path.basename(input_filename) + '.keep'
            outfp = open(output_name, 'w')

        total_acc = 0
        discarded_acc = 0

        try:
            total_acc, discarded_acc = normalize_by_median(input_filename,
                                                           outfp, htable, args,
                                                           report_fp)
        except IOError as err:
            handle_error(err, output_name, input_filename, args.fail_save,
                         htable)
            if not args.force:
                print >> sys.stderr, '** Exiting!'

                sys.exit(1)
            else:
                print >> sys.stderr, '*** Skipping error file, moving on...'
                corrupt_files.append(input_filename)
        else:
            if total_acc == 0 and discarded_acc == 0:
                print 'SKIPPED empty file', input_filename
            else:
                total += total_acc
                discarded += discarded_acc
                print 'DONE with {inp}; kept {kept} of {total} or {perc:2}%'\
                      .format(inp=input_filename, kept=total - discarded,
                              total=total, perc=int(100. - discarded /
                                                    float(total) * 100.))
                print 'output in', output_name

        if (args.dump_frequency > 0 and
                index > 0 and index % args.dump_frequency == 0):
            print 'Backup: Saving k-mer counting file through', input_filename
            if args.savetable:
                hashname = args.savetable
                print '...saving to', hashname
            else:
                hashname = 'backup.ct'
                print 'Nothing given for savetable, saving to', hashname
            htable.save(hashname)

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
            htable.n_unique_kmers())

    if args.savetable:
        print 'Saving k-mer counting table through', input_filename
        print '...saving to', args.savetable
        htable.save(args.savetable)

    fp_rate = khmer.calc_expected_collisions(htable)
    print 'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate)

    if args.force and len(corrupt_files) > 0:
        print >> sys.stderr, "** WARNING: Finished with errors!"
        print >> sys.stderr, "** IOErrors occurred in the following files:"
        print >> sys.stderr, "\t", " ".join(corrupt_files)

    if fp_rate > MAX_FALSE_POSITIVE_RATE:
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the k-mer counting table is too small"
                              " for this data set. Increase tablesize/# "
                              "tables.")
        print >> sys.stderr, "**"
        print >> sys.stderr, "** Do not use these results!!"
        if not args.force:
            sys.exit(1)
Example #16
def main():  # pylint: disable=too-many-branches,too-many-statements
    info('normalize-by-median.py', ['diginorm'])
    args = get_parser().parse_args()

    report_on_config(args)

    report_fp = args.report

    # check for similar filenames
    filenames = []
    for pathfilename in args.input_filenames:
        filename = pathfilename.split('/')[-1]
        if (filename in filenames):
            print >> sys.stderr, "WARNING: At least two input files are named \
%s . (The script normalize-by-median.py can not handle this, only one .keep \
file for one of the input files will be generated.)" % filename
        else:
            filenames.append(filename)

    # check for others
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize,
                                  args.force)

    # list to save error files along with throwing exceptions
    corrupt_files = []

    if args.loadtable:
        print 'loading k-mer counting table from', args.loadtable
        htable = khmer.load_counting_hash(args.loadtable)
    else:
        print >> sys.stderr, 'making k-mer counting table'
        htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                         args.n_tables)

    input_filename = None

    for index, input_filename in enumerate(args.input_filenames):
        total_acc, discarded_acc, corrupt_files = \
            normalize_by_median_and_check(
                input_filename, htable, args.single_output_file,
                args.fail_save, args.paired, args.cutoff, args.force,
                corrupt_files, report_fp)

        if (args.dump_frequency > 0 and index > 0
                and index % args.dump_frequency == 0):
            print 'Backup: Saving k-mer counting file through', input_filename
            if args.savetable:
                hashname = args.savetable
                print '...saving to', hashname
            else:
                hashname = 'backup.ct'
                print 'Nothing given for savetable, saving to', hashname
            htable.save(hashname)

    if args.paired and args.unpaired_reads:
        args.paired = False
        output_name = args.unpaired_reads
        if not args.single_output_file:
            output_name = os.path.basename(args.unpaired_reads) + '.keep'
        outfp = open(output_name, 'w')
        total_acc, discarded_acc, corrupt_files = \
            normalize_by_median_and_check(
                args.unpaired_reads, htable, args.single_output_file,
                args.fail_save, args.paired, args.cutoff, args.force,
                corrupt_files, report_fp)

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
            htable.n_unique_kmers())

    if args.savetable:
        print 'Saving k-mer counting table through', input_filename
        print '...saving to', args.savetable
        htable.save(args.savetable)

    fp_rate = \
        khmer.calc_expected_collisions(htable, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975

    print >> sys.stderr, \
        'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate)

    if args.force and len(corrupt_files) > 0:
        print >> sys.stderr, "** WARNING: Finished with errors!"
        print >> sys.stderr, "** IOErrors occurred in the following files:"
        print >> sys.stderr, "\t", " ".join(corrupt_files)
Example #17
def main():

    info("collect-reads.py", ["counting"])
    args = get_parser().parse_args()
    report_on_config(args)

    base = args.output_countingtable_filename
    filenames = args.input_sequence_filename

    for name in args.input_sequence_filename:
        check_input_files(name, False)

    check_space(args.input_sequence_filename, False)
    check_space_for_hashtable(args, "countgraph", False)

    print("Saving k-mer counting table to %s" % base)
    print("Loading sequences from %s" % repr(filenames))
    if args.output:
        print("Outputting sequences to", args.output)

    print("making countgraph", file=sys.stderr)
    htable = khmer_args.create_countgraph(args)
    htable.set_use_bigcount(args.bigcount)

    total_coverage = 0.0
    n = 0

    for index, filename in enumerate(filenames):
        for record in screed.open(filename):
            seq = record.sequence.upper()
            if "N" in seq:
                seq = seq.replace("N", "A")

            try:
                med, _, _ = htable.get_median_count(seq)
            except ValueError:
                continue

            total_coverage += med
            n += 1

            if total_coverage / float(n) > args.coverage:
                print("reached target average coverage:", total_coverage / float(n))
                break

            htable.consume(seq)
            if args.output:
                args.output.write(output_single(record))

            if n % 100000 == 0:
                print("...", index, filename, n, total_coverage / float(n))

        if n and total_coverage / float(n) > args.coverage:
            break

    print("Collected %d reads" % (n,))

    if args.report_total_kmers:
        print("Total number of k-mers: {0}".format(htable.n_occupied()), file=sys.stderr)

    print("saving", base)
    htable.save(base)

    info_fp = open(base + ".info", "w")
    info_fp.write("through end: %s\n" % filenames[-1])

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(htable, False, max_false_pos=0.2)
    print("fp rate estimated to be %1.3f" % fp_rate)
    print("fp rate estimated to be %1.3f" % fp_rate, file=info_fp)

    print("DONE.")
Example #18
def main():  # pylint: disable=too-many-branches,too-many-statements
    info('normalize-by-median.py', ['diginorm'])
    args = get_parser().parse_args()

    report_on_config(args)

    report_fp = args.report

    # check for similar filenames
    filenames = []
    for pathfilename in args.input_filenames:
        filename = pathfilename.split('/')[-1]
        if (filename in filenames):
            print >>sys.stderr, "WARNING: At least two input files are named \
%s . (The script normalize-by-median.py can not handle this, only one .keep \
file for one of the input files will be generated.)" % filename
        else:
            filenames.append(filename)

    # check for others
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savetable:
        check_space_for_hashtable(
            args.n_tables * args.min_tablesize, args.force)

    # list to save error files along with throwing exceptions
    if args.force:
        corrupt_files = []

    if args.loadtable:
        print 'loading k-mer counting table from', args.loadtable
        htable = khmer.load_counting_hash(args.loadtable)
    else:
        print >> sys.stderr, 'making k-mer counting table'
        htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                         args.n_tables)

    total = 0
    discarded = 0
    input_filename = None

    if args.single_output_file:
        outfp = args.single_output_file
        if args.single_output_file is sys.stdout:
            output_name = '/dev/stdout'
        else:
            output_name = args.single_output_file.name

    for index, input_filename in enumerate(args.input_filenames):
        if not args.single_output_file:
            output_name = os.path.basename(input_filename) + '.keep'
            outfp = open(output_name, 'w')

        total_acc = 0
        discarded_acc = 0

        try:
            total_acc, discarded_acc = normalize_by_median(input_filename,
                                                           outfp, htable, args,
                                                           report_fp)
        except IOError as err:
            handle_error(err, output_name, input_filename, args.fail_save,
                         htable)
            if not args.force:
                print >> sys.stderr, '** Exiting!'

                sys.exit(1)
            else:
                print >> sys.stderr, '*** Skipping error file, moving on...'
                corrupt_files.append(input_filename)
        else:
            if total_acc == 0 and discarded_acc == 0:
                print >> sys.stderr, 'SKIPPED empty file', input_filename
            else:
                total += total_acc
                discarded += discarded_acc
                print >> sys.stderr, \
                    'DONE with {inp}; kept {kept} of {total} or {perc:2}%'\
                    .format(inp=input_filename, kept=total - discarded,
                            total=total, perc=int(100. - discarded /
                                                  float(total) * 100.))
                print >> sys.stderr, 'output in', output_name

        if (args.dump_frequency > 0 and
                index > 0 and index % args.dump_frequency == 0):
            print 'Backup: Saving k-mer counting file through', input_filename
            if args.savetable:
                hashname = args.savetable
                print '...saving to', hashname
            else:
                hashname = 'backup.ct'
                print 'Nothing given for savetable, saving to', hashname
            htable.save(hashname)

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
            htable.n_unique_kmers())

    if args.savetable:
        print 'Saving k-mer counting table through', input_filename
        print '...saving to', args.savetable
        htable.save(args.savetable)

    fp_rate = \
        khmer.calc_expected_collisions(htable, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975

    print >> sys.stderr, \
        'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate)

    if args.force and len(corrupt_files) > 0:
        print >> sys.stderr, "** WARNING: Finished with errors!"
        print >> sys.stderr, "** IOErrors occurred in the following files:"
        print >> sys.stderr, "\t", " ".join(corrupt_files)
Example #19
def main():  # pylint: disable=too-many-branches,too-many-statements
    info('saturate-by-median.py', ['diginorm'])
    args = get_parser().parse_args()

    report_on_config(args)

    report_fp = args.report
    report_frequency = args.report_frequency

    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, False)
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize, False)

    # list to save error files along with throwing exceptions
    if args.force:
        corrupt_files = []

    if args.loadtable:
        print 'loading k-mer counting table from', args.loadtable
        htable = khmer.load_counting_hash(args.loadtable)
    else:
        print 'making k-mer counting table'
        htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                         args.n_tables)

    total = 0
    discarded = 0

    for index, input_filename in enumerate(args.input_filenames):
        total_acc = 0
        discarded_acc = 0

        try:
            total_acc, discarded_acc = normalize_by_median(
                input_filename, htable, args, report_fp, report_frequency)
        except IOError as err:
            handle_error(err, input_filename)
            if not args.force:
                print >> sys.stderr, '** Exiting!'
                sys.exit(1)
            else:
                print >> sys.stderr, '*** Skipping error file, moving on...'
                corrupt_files.append(input_filename)
        else:
            if total_acc == 0 and discarded_acc == 0:
                print 'SKIPPED empty file', input_filename
            else:
                total += total_acc
                discarded += discarded_acc
                print 'DONE with {inp}; kept {kept} of {total} or {perc:2}%'\
                    .format(inp=input_filename,
                            kept=total - discarded, total=total,
                            perc=int(100. - discarded / float(total) * 100.))

    if args.savetable:
        print 'Saving k-mer counting table through', input_filename
        print '...saving to', args.savetable
        htable.save(args.savetable)

    # re: threshold, see Zhang et al.,
    # http://arxiv.org/abs/1309.2975
    fp_rate = khmer.calc_expected_collisions(htable,
                                             args.force,
                                             max_false_pos=.8)
    print 'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate)

    if args.force and len(corrupt_files) > 0:
        print >> sys.stderr, "** WARNING: Finished with errors!"
        print >> sys.stderr, "** IOErrors occurred in the following files:"
        print >> sys.stderr, "\t", " ".join(corrupt_files)
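Both saturate-by-median.py and the normalize-by-median.py variants in this collection delegate the per-read decision to a normalize_by_median helper. The core rule is small enough to sketch self-containedly; the dict-based counter and helper names below are illustrative stand-ins for khmer's counting table, not its API:

from collections import defaultdict

def median_kmer_count(counts, seq, k):
    # Median of the counts of every k-mer in the read.
    vals = sorted(counts[seq[i:i + k]] for i in range(len(seq) - k + 1))
    return vals[len(vals) // 2]

def keep_read(counts, seq, k, cutoff):
    # Digital normalization: keep the read only while its region of the
    # graph is below the target coverage; keeping it updates the counts.
    if median_kmer_count(counts, seq, k) < cutoff:
        for i in range(len(seq) - k + 1):
            counts[seq[i:i + k]] += 1
        return True   # kept: would be written to the .keep file
    return False      # discarded as redundant coverage

counts = defaultdict(int)
print(keep_read(counts, 'ACGTACGTACGT', 4, cutoff=20))  # True on first sight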
Example #20
def main():  # pylint: disable=too-many-branches,too-many-statements
    info('normalize-by-median.py', ['diginorm'])
    args = get_parser().parse_args()

    report_on_config(args)

    report_fp = args.report
    force_single = args.force_single

    # check for similar filenames
    # if we're using a single output file only check for identical filenames
    # otherwise, check for identical BASE names as well.
    filenames = []
    basenames = []
    for pathfilename in args.input_filenames:
        filenames.append(pathfilename)
        if args.single_output_file:
            continue  # nothing more to worry about

        basename = os.path.basename(pathfilename)
        if basename in basenames:
            print('ERROR: Duplicate filename--Cannot handle this!',
                  file=sys.stderr)
            print('** Exiting!', file=sys.stderr)
            sys.exit(1)

        basenames.append(basename)

    # check that files exist and there is sufficient output disk space.
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savetable:
        check_space_for_hashtable(args, 'countgraph', args.force)

    # load or create counting table.
    if args.loadtable:
        print('loading k-mer counting table from ' + args.loadtable,
              file=sys.stderr)
        htable = khmer.load_counting_hash(args.loadtable)
    else:
        print('making countgraph', file=sys.stderr)
        htable = khmer_args.create_countgraph(args)

    input_filename = None

    # create an object to handle diginorm of all files
    norm = Normalizer(args.cutoff, htable)

    # make a list of all filenames and if they're paired or not;
    # if we don't know if they're paired, default to allowing but not
    # forcing pairing.
    files = []
    for e in filenames:
        files.append([e, args.paired])
    if args.unpaired_reads:
        files.append([args.unpaired_reads, False])

    corrupt_files = []
    outfp = None
    output_name = None

    if args.single_output_file:
        if args.single_output_file is sys.stdout:
            output_name = '/dev/stdout'
        else:
            output_name = args.single_output_file.name
        outfp = args.single_output_file

    #
    # main loop: iterate over all files given, do diginorm.
    #

    for filename, require_paired in files:
        if not args.single_output_file:
            output_name = os.path.basename(filename) + '.keep'
            outfp = open(output_name, 'w')

        # failsafe context manager in case an input file breaks
        with CatchIOErrors(filename, outfp, args.single_output_file,
                           args.force, corrupt_files):

            screed_iter = screed.open(filename, parse_description=False)
            reader = broken_paired_reader(screed_iter, min_length=args.ksize,
                                          force_single=force_single,
                                          require_paired=require_paired)

            # actually do diginorm
            for record in WithDiagnostics(filename, norm, reader, report_fp):
                if record is not None:
                    write_record(record, outfp)

            print('output in ' + output_name, file=sys.stderr)
            if output_name != '/dev/stdout':
                outfp.close()

    # finished - print out some diagnostics.

    print('Total number of unique k-mers: {0}'
          .format(htable.n_unique_kmers()),
          file=sys.stderr)

    if args.savetable:
        print('...saving to ' + args.savetable, file=sys.stderr)
        htable.save(args.savetable)

    fp_rate = \
        khmer.calc_expected_collisions(htable, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975

    print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate),
          file=sys.stderr)

    if args.force and len(corrupt_files) > 0:
        print("** WARNING: Finished with errors!", file=sys.stderr)
        print("** I/O Errors occurred in the following files:",
              file=sys.stderr)
        print("\t", " ".join(corrupt_files), file=sys.stderr)
Example #21
def main():
    info('trim-low-abund.py', ['streaming'])
    parser = get_parser()
    args = parser.parse_args()

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        print >>sys.stderr, \
            "Error: Cannot input the same filename multiple times."
        sys.exit(1)

    ###

    report_on_config(args)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize,
                                  args.force)

    K = args.ksize

    CUTOFF = args.cutoff
    NORMALIZE_LIMIT = args.normalize_to

    if args.loadtable:
        print >> sys.stderr, 'loading k-mer counting table from', args.loadtable
        ct = khmer.load_counting_hash(args.loadtable)
    else:
        print >> sys.stderr, 'making k-mer counting table'
        ct = khmer.new_counting_hash(K, args.min_tablesize, args.n_tables)

    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    print >>sys.stderr, 'created temporary directory %s; ' \
                        'use -T to change location' % tempdir

    # ### FIRST PASS ###

    save_pass2_total = 0

    n_bp = 0
    n_reads = 0
    written_bp = 0
    written_reads = 0
    trimmed_reads = 0

    pass2list = []
    for filename in args.input_filenames:
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        if args.out is None:
            trimfp = open(os.path.basename(filename) + '.abundtrim', 'w')
        else:
            trimfp = args.out

        pass2list.append((filename, pass2filename, trimfp))

        screed_iter = screed.open(filename, parse_description=False)
        pass2fp = open(pass2filename, 'w')

        save_pass2 = 0
        n = 0

        paired_iter = broken_paired_reader(screed_iter,
                                           min_length=K,
                                           force_single=args.ignore_pairs)
        for n, is_pair, read1, read2 in paired_iter:
            if n % 10000 == 0:
                print >>sys.stderr, '...', n, filename, save_pass2, \
                    n_reads, n_bp, written_reads, written_bp

            # we want to track paired reads here, to make sure that pairs
            # are not split between first pass and second pass.

            if is_pair:
                n_reads += 2
                n_bp += len(read1.sequence) + len(read2.sequence)

                seq1 = read1.sequence.replace('N', 'A')
                seq2 = read2.sequence.replace('N', 'A')

                med1, _, _ = ct.get_median_count(seq1)
                med2, _, _ = ct.get_median_count(seq2)

                if med1 < NORMALIZE_LIMIT or med2 < NORMALIZE_LIMIT:
                    ct.consume(seq1)
                    ct.consume(seq2)
                    write_record_pair(read1, read2, pass2fp)
                    save_pass2 += 2
                else:
                    _, trim_at1 = ct.trim_on_abundance(seq1, CUTOFF)
                    _, trim_at2 = ct.trim_on_abundance(seq2, CUTOFF)

                    if trim_at1 >= K:
                        read1 = trim_record(read1, trim_at1)

                    if trim_at2 >= K:
                        read2 = trim_record(read2, trim_at2)

                    if trim_at1 != len(seq1):
                        trimmed_reads += 1
                    if trim_at2 != len(seq2):
                        trimmed_reads += 1

                    write_record_pair(read1, read2, trimfp)
                    written_reads += 2
                    written_bp += trim_at1 + trim_at2
            else:
                n_reads += 1
                n_bp += len(read1.sequence)

                seq = read1.sequence.replace('N', 'A')

                med, _, _ = ct.get_median_count(seq)

                # has this portion of the graph saturated? if not,
                # consume & save => pass2.
                if med < NORMALIZE_LIMIT:
                    ct.consume(seq)
                    write_record(read1, pass2fp)
                    save_pass2 += 1
                else:  # trim!!
                    _, trim_at = ct.trim_on_abundance(seq, CUTOFF)
                    if trim_at >= K:
                        new_read = trim_record(read1, trim_at)
                        write_record(new_read, trimfp)

                        written_reads += 1
                        written_bp += trim_at

                        if trim_at != len(read1.sequence):
                            trimmed_reads += 1

        pass2fp.close()

        print >>sys.stderr, '%s: kept aside %d of %d from first pass, in %s' \
            % (filename, save_pass2, n, pass2filename)
        save_pass2_total += save_pass2

    # ### SECOND PASS. ###

    skipped_n = 0
    skipped_bp = 0
    for _, pass2filename, trimfp in pass2list:
        print >> sys.stderr, ('second pass: looking at sequences kept aside '
                              'in %s') % pass2filename

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in contrast
        # to the first loop.

        for n, read in enumerate(
                screed.open(pass2filename, parse_description=False)):
            if n % 10000 == 0:
                print >>sys.stderr, '... x 2', n, pass2filename, \
                    written_reads, written_bp

            seq = read.sequence.replace('N', 'A')
            med, _, _ = ct.get_median_count(seq)

            # do we retain low-abundance components unchanged?
            if med < NORMALIZE_LIMIT and args.variable_coverage:
                write_record(read, trimfp)

                written_reads += 1
                written_bp += len(read.sequence)
                skipped_n += 1
                skipped_bp += len(read.sequence)

            # otherwise, examine/trim/truncate.
            else:  # med >= NORMALIZE_LIMIT or not args.variable_coverage
                _, trim_at = ct.trim_on_abundance(seq, CUTOFF)
                if trim_at >= K:
                    new_read = trim_record(read, trim_at)
                    write_record(new_read, trimfp)

                    written_reads += 1
                    written_bp += trim_at

                    if trim_at != len(read.sequence):
                        trimmed_reads += 1

        print >> sys.stderr, 'removing %s' % pass2filename
        os.unlink(pass2filename)

    print >> sys.stderr, 'removing temp directory & contents (%s)' % tempdir
    shutil.rmtree(tempdir)

    n_passes = 1.0 + (float(save_pass2_total) / n_reads)
    percent_reads_trimmed = float(trimmed_reads + (n_reads - written_reads)) /\
        n_reads * 100.0

    print >> sys.stderr, 'read %d reads, %d bp' % (
        n_reads,
        n_bp,
    )
    print >> sys.stderr, 'wrote %d reads, %d bp' % (
        written_reads,
        written_bp,
    )
    print >>sys.stderr, 'looked at %d reads twice (%.2f passes)' % \
        (save_pass2_total, n_passes)
    print >>sys.stderr, 'removed %d reads and trimmed %d reads (%.2f%%)' % \
        (n_reads - written_reads, trimmed_reads, percent_reads_trimmed)
    print >>sys.stderr, 'trimmed or removed %.2f%% of bases (%d total)' % \
        ((1 - (written_bp / float(n_bp))) * 100.0, n_bp - written_bp)

    if args.variable_coverage:
        percent_reads_hicov = 100.0 * float(n_reads - skipped_n) / n_reads
        print >>sys.stderr, '%d reads were high coverage (%.2f%%);' % \
            (n_reads - skipped_n, percent_reads_hicov)
        print >> sys.stderr, ('skipped %d reads/%d bases because of low '
                              'coverage') % (skipped_n, skipped_bp)

    fp_rate = \
        khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    print >>sys.stderr, \
        'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate)

    print >> sys.stderr, 'output in *.abundtrim'

    if args.savetable:
        print >> sys.stderr, "Saving k-mer counting table to", args.savetable
        ct.save(args.savetable)
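The first pass above leans on two khmer calls: get_median_count decides whether a read's graph neighborhood is already saturated, and trim_on_abundance locates where the read's k-mer counts fall below the cutoff. Here is a dict-based sketch of the trimming call, under the semantics implied by the checks in the script (trim_at >= K keeps the read; trim_at == len(seq) means nothing was trimmed) -- assumed behavior, not khmer's implementation:

def trim_on_abundance_sketch(counts, seq, k, cutoff):
    # Cut the read just before the first k-mer whose count is below the
    # cutoff; keep the k-1 bases it shares with the last good k-mer.
    for i in range(len(seq) - k + 1):
        if counts.get(seq[i:i + k], 0) < cutoff:
            trim_at = i + k - 1
            return seq[:trim_at], trim_at
    return seq, len(seq)   # every k-mer met the cutoff: untouched

Under these semantics a read whose very first k-mer is low-abundance yields trim_at = k - 1, which fails the trim_at >= K test and drops the read entirely, matching the script's behavior.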
Example #22
def main():  # pylint: disable=too-many-branches,too-many-statements
    info('saturate-by-median.py', ['diginorm'])
    args = get_parser().parse_args()

    report_on_config(args)

    report_fp = args.report
    report_frequency = args.report_frequency

    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, False)
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize, False)

    # list to save error files along with throwing exceptions
    if args.force:
        corrupt_files = []

    if args.loadtable:
        print 'loading k-mer counting table from', args.loadtable
        htable = khmer.load_counting_hash(args.loadtable)
    else:
        print 'making k-mer counting table'
        htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                         args.n_tables)

    total = 0
    discarded = 0

    for index, input_filename in enumerate(args.input_filenames):
        total_acc = 0
        discarded_acc = 0

        try:
            total_acc, discarded_acc = normalize_by_median(input_filename,
                                                           htable, args,
                                                           report_fp,
                                                           report_frequency)
        except IOError as err:
            handle_error(err, input_filename)
            if not args.force:
                print >> sys.stderr, '** Exiting!'
                sys.exit(1)
            else:
                print >> sys.stderr, '*** Skipping error file, moving on...'
                corrupt_files.append(input_filename)
        else:
            if total_acc == 0 and discarded_acc == 0:
                print 'SKIPPED empty file', input_filename
            else:
                total += total_acc
                discarded += discarded_acc
                print 'DONE with {inp}; kept {kept} of {total} or {perc:2}%'\
                    .format(inp=input_filename,
                            kept=total - discarded, total=total,
                            perc=int(100. - discarded / float(total) * 100.))

    if args.savetable:
        print 'Saving k-mer counting table through', input_filename
        print '...saving to', args.savetable
        htable.save(args.savetable)

    # re: threshold, see Zhang et al.,
    # http://arxiv.org/abs/1309.2975
    fp_rate = \
        khmer.calc_expected_collisions(htable, args.force, max_false_pos=.8)
    print 'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate)

    if args.force and len(corrupt_files) > 0:
        print >> sys.stderr, "** WARNING: Finished with errors!"
        print >> sys.stderr, "** IOErrors occurred in the following files:"
        print >> sys.stderr, "\t", " ".join(corrupt_files)
Example #23
def main():

    info('collect-reads.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args)

    base = args.output_countingtable_filename
    filenames = args.input_sequence_filename

    for name in args.input_sequence_filename:
        check_input_files(name, False)

    check_space(args.input_sequence_filename, False)
    check_space_for_hashtable(args.n_tables * args.min_tablesize, False)

    print 'Saving k-mer counting table to %s' % base
    print 'Loading sequences from %s' % repr(filenames)
    if args.output:
        print 'Outputting sequences to', args.output

    print 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize)
    htable.set_use_bigcount(args.bigcount)

    total_coverage = 0.
    n = 0

    for index, filename in enumerate(filenames):
        for record in screed.open(filename):
            seq = record.sequence.upper()
            if 'N' in seq:
                seq = seq.replace('N', 'A')

            try:
                med, _, _ = htable.get_median_count(seq)
            except ValueError:
                continue

            total_coverage += med
            n += 1

            if total_coverage / float(n) > args.coverage:
                print 'reached target average coverage:', \
                      total_coverage / float(n)
                break

            htable.consume(seq)
            if args.output:
                args.output.write(output_single(record))

            if n % 100000 == 0:
                print '...', index, filename, n, total_coverage / float(n)

        if total_coverage / float(n) > args.coverage:
            break

    print 'Collected %d reads' % (n, )

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of k-mers: {0}'.format(
            htable.n_occupied())

    print 'saving', base
    htable.save(base)

    info_fp = open(base + '.info', 'w')
    info_fp.write('through end: %s\n' % filenames[-1])

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(htable,
                                             args.force,
                                             max_false_pos=.2)
    print 'fp rate estimated to be %1.3f' % fp_rate
    print >> info_fp, 'fp rate estimated to be %1.3f' % fp_rate

    print 'DONE.'
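The stopping rule in collect-reads.py is a running mean: total_coverage accumulates each read's median k-mer count, and collection halts once the mean exceeds the target. Isolated as a sketch with illustrative names (the script increments n before dividing, so it never hits n == 0; the sketch guards explicitly):

def reached_target(total_coverage, n_reads, target):
    # True once the running average of per-read median k-mer counts
    # meets the requested coverage.
    return n_reads > 0 and total_coverage / float(n_reads) > target

assert not reached_target(0.0, 0, 20)    # nothing seen yet
assert reached_target(2100.0, 100, 20)   # mean 21 > 20: stop collecting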
Example #24
def main():
    info('trim-low-abund.py', ['streaming'])
    parser = get_parser()
    args = parser.parse_args()

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        print >>sys.stderr, \
            "Error: Cannot input the same filename multiple times."
        sys.exit(1)

    ###

    report_on_config(args)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savetable:
        check_space_for_hashtable(
            args.n_tables * args.min_tablesize, args.force)

    K = args.ksize

    CUTOFF = args.cutoff
    NORMALIZE_LIMIT = args.normalize_to

    if args.loadtable:
        print >>sys.stderr, 'loading k-mer counting table from', args.loadtable
        ct = khmer.load_counting_hash(args.loadtable)
    else:
        print >>sys.stderr, 'making k-mer counting table'
        ct = khmer.new_counting_hash(K, args.min_tablesize, args.n_tables)

    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    print >>sys.stderr, 'created temporary directory %s; ' \
                        'use -T to change location' % tempdir

    # ### FIRST PASS ###

    save_pass2_total = 0

    n_bp = 0
    n_reads = 0
    written_bp = 0
    written_reads = 0
    trimmed_reads = 0

    pass2list = []
    for filename in args.input_filenames:
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        trimfilename = os.path.basename(filename) + '.abundtrim'

        pass2list.append((filename, pass2filename, trimfilename))

        screed_iter = screed.open(filename, parse_description=False)
        pass2fp = open(pass2filename, 'w')
        trimfp = open(trimfilename, 'w')

        save_pass2 = 0
        n = 0

        paired_iter = broken_paired_reader(screed_iter, min_length=K,
                                           force_single=args.ignore_pairs)
        for n, is_pair, read1, read2 in paired_iter:
            if n % 10000 == 0:
                print >>sys.stderr, '...', n, filename, save_pass2, \
                    n_reads, n_bp, written_reads, written_bp

            # we want to track paired reads here, to make sure that pairs
            # are not split between first pass and second pass.

            if is_pair:
                n_reads += 2
                n_bp += len(read1.sequence) + len(read2.sequence)

                seq1 = read1.sequence.replace('N', 'A')
                seq2 = read2.sequence.replace('N', 'A')

                med1, _, _ = ct.get_median_count(seq1)
                med2, _, _ = ct.get_median_count(seq2)

                if med1 < NORMALIZE_LIMIT or med2 < NORMALIZE_LIMIT:
                    ct.consume(seq1)
                    ct.consume(seq2)
                    write_record_pair(read1, read2, pass2fp)
                    save_pass2 += 2
                else:
                    _, trim_at1 = ct.trim_on_abundance(seq1, CUTOFF)
                    _, trim_at2 = ct.trim_on_abundance(seq2, CUTOFF)

                    if trim_at1 >= K:
                        read1 = trim_record(read1, trim_at1)

                    if trim_at2 >= K:
                        read2 = trim_record(read2, trim_at2)

                    if trim_at1 != len(seq1):
                        trimmed_reads += 1
                    if trim_at2 != len(seq2):
                        trimmed_reads += 1

                    write_record_pair(read1, read2, trimfp)
                    written_reads += 2
                    written_bp += trim_at1 + trim_at2
            else:
                n_reads += 1
                n_bp += len(read1.sequence)

                seq = read1.sequence.replace('N', 'A')

                med, _, _ = ct.get_median_count(seq)

                # has this portion of the graph saturated? if not,
                # consume & save => pass2.
                if med < NORMALIZE_LIMIT:
                    ct.consume(seq)
                    write_record(read1, pass2fp)
                    save_pass2 += 1
                else:                       # trim!!
                    _, trim_at = ct.trim_on_abundance(seq, CUTOFF)
                    if trim_at >= K:
                        new_read = trim_record(read1, trim_at)
                        write_record(new_read, trimfp)

                        written_reads += 1
                        written_bp += trim_at

                        if trim_at != len(read1.sequence):
                            trimmed_reads += 1

        pass2fp.close()
        trimfp.close()

        print '%s: kept aside %d of %d from first pass, in %s' % \
              (filename, save_pass2, n, pass2filename)
        save_pass2_total += save_pass2

    # ### SECOND PASS. ###

    skipped_n = 0
    skipped_bp = 0
    for _, pass2filename, trimfilename in pass2list:
        print 'second pass: looking at sequences kept aside in %s' % \
              pass2filename

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in contrast
        # to the first loop.

        trimfp = open(trimfilename, 'a')
        for n, read in enumerate(screed.open(pass2filename,
                                             parse_description=False)):
            if n % 10000 == 0:
                print >>sys.stderr, '... x 2', n, pass2filename, \
                    written_reads, written_bp

            seq = read.sequence.replace('N', 'A')
            med, _, _ = ct.get_median_count(seq)

            # do we retain low-abundance components unchanged?
            if med < NORMALIZE_LIMIT and args.variable_coverage:
                write_record(read, trimfp)

                written_reads += 1
                written_bp += len(read.sequence)
                skipped_n += 1
                skipped_bp += len(read.sequence)

            # otherwise, examine/trim/truncate.
            else:    # med >= NORMALIZE_LIMIT or not args.variable_coverage
                _, trim_at = ct.trim_on_abundance(seq, CUTOFF)
                if trim_at >= K:
                    new_read = trim_record(read, trim_at)
                    write_record(new_read, trimfp)

                    written_reads += 1
                    written_bp += trim_at

                    if trim_at != len(read.sequence):
                        trimmed_reads += 1

        print >>sys.stderr, 'removing %s' % pass2filename
        os.unlink(pass2filename)

    print >>sys.stderr, 'removing temp directory & contents (%s)' % tempdir
    shutil.rmtree(tempdir)

    n_passes = 1.0 + (float(save_pass2_total) / n_reads)
    percent_reads_trimmed = float(trimmed_reads + (n_reads - written_reads)) /\
        n_reads * 100.0

    print 'read %d reads, %d bp' % (n_reads, n_bp,)
    print 'wrote %d reads, %d bp' % (written_reads, written_bp,)
    print 'looked at %d reads twice (%.2f passes)' % (save_pass2_total,
                                                      n_passes)
    print 'removed %d reads and trimmed %d reads (%.2f%%)' % \
        (n_reads - written_reads, trimmed_reads, percent_reads_trimmed)
    print 'trimmed or removed %.2f%% of bases (%d total)' % \
        ((1 - (written_bp / float(n_bp))) * 100.0, n_bp - written_bp)

    if args.variable_coverage:
        percent_reads_hicov = 100.0 * float(n_reads - skipped_n) / n_reads
        print '%d reads were high coverage (%.2f%%);' % (n_reads - skipped_n,
                                                         percent_reads_hicov)
        print 'skipped %d reads/%d bases because of low coverage' % \
              (skipped_n, skipped_bp)

    fp_rate = khmer.calc_expected_collisions(ct)
    print >>sys.stderr, \
        'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate)

    if fp_rate > MAX_FALSE_POSITIVE_RATE:
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the k-mer counting table is too small"
                              " for this data set. Increase tablesize/# "
                              "tables.")
        print >> sys.stderr, "**"
        print >> sys.stderr, "** Do not use these results!!"
        sys.exit(1)

    print 'output in *.abundtrim'

    if args.savetable:
        print >>sys.stderr, "Saving k-mer counting table to", args.savetable
        ct.save(args.savetable)
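Examples #21 and #24 share the same two-pass streaming shape: pass one consumes and defers reads whose graph region has not yet saturated, pass two trims the deferred reads once the counting table has seen all the data. A compressed, self-contained sketch of that control flow (dict-based, illustrative names; pairing and file I/O omitted, and reads are assumed to be at least k bases long, as the scripts enforce with min_length=K):

from collections import defaultdict

def _kmers(seq, k):
    return [seq[i:i + k] for i in range(len(seq) - k + 1)]

def two_pass_trim(reads, k, cutoff, normalize_limit):
    counts = defaultdict(int)

    def median(seq):
        vals = sorted(counts[km] for km in _kmers(seq, k))
        return vals[len(vals) // 2]

    def trim(seq):
        for i, km in enumerate(_kmers(seq, k)):
            if counts[km] < cutoff:
                return seq[:i + k - 1]   # cut before the low-count k-mer
        return seq

    deferred, output = [], []
    for seq in reads:                    # FIRST PASS
        if median(seq) < normalize_limit:
            for km in _kmers(seq, k):    # region not saturated: learn it
                counts[km] += 1
            deferred.append(seq)         # revisit once counts are complete
        else:
            output.append(trim(seq))     # saturated already: trim now
    for seq in deferred:                 # SECOND PASS
        output.append(trim(seq))
    return output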
Example #25
def main():  # pylint: disable=too-many-branches,too-many-statements
    info('normalize-by-median.py', ['diginorm'])
    args = get_parser().parse_args()

    report_on_config(args)

    report_fp = args.report

    # check for similar filenames
    filenames = []
    for pathfilename in args.input_filenames:
        filename = pathfilename.split('/')[-1]
        if filename in filenames:
            print >>sys.stderr, (
                "WARNING: At least two input files are named %s. "
                "(normalize-by-median.py cannot handle this; only one .keep "
                "file will be generated for one of them.)" % filename)
        else:
            filenames.append(filename)

    # check for others
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savetable:
        check_space_for_hashtable(
            args.n_tables * args.min_tablesize, args.force)

    # list to save error files along with throwing exceptions
    corrupt_files = []

    if args.loadtable:
        print 'loading k-mer counting table from', args.loadtable
        htable = khmer.load_counting_hash(args.loadtable)
    else:
        print >> sys.stderr, 'making k-mer counting table'
        htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                         args.n_tables)

    input_filename = None

    for index, input_filename in enumerate(args.input_filenames):
        total_acc, discarded_acc, corrupt_files = \
            normalize_by_median_and_check(
                input_filename, htable, args.single_output_file,
                args.fail_save, args.paired, args.cutoff, args.force,
                corrupt_files, report_fp)

        if (args.dump_frequency > 0 and
                index > 0 and index % args.dump_frequency == 0):
            print 'Backup: Saving k-mer counting file through', input_filename
            if args.savetable:
                hashname = args.savetable
                print '...saving to', hashname
            else:
                hashname = 'backup.ct'
                print 'Nothing given for savetable, saving to', hashname
            htable.save(hashname)

    if args.paired and args.unpaired_reads:
        args.paired = False
        output_name = args.unpaired_reads
        if not args.single_output_file:
            output_name = os.path.basename(args.unpaired_reads) + '.keep'
        outfp = open(output_name, 'w')
        total_acc, discarded_acc, corrupt_files = \
            normalize_by_median_and_check(
                args.unpaired_reads, htable, args.single_output_file,
                args.fail_save, args.paired, args.cutoff, args.force,
                corrupt_files, report_fp)

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
            htable.n_unique_kmers())

    if args.savetable:
        print 'Saving k-mer counting table through', input_filename
        print '...saving to', args.savetable
        htable.save(args.savetable)

    fp_rate = \
        khmer.calc_expected_collisions(htable, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975

    print >> sys.stderr, \
        'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate)

    if args.force and len(corrupt_files) > 0:
        print >> sys.stderr, "** WARNING: Finished with errors!"
        print >> sys.stderr, "** IOErrors occurred in the following files:"
        print >> sys.stderr, "\t", " ".join(corrupt_files)
Example #26
def main():  # pylint: disable=too-many-branches,too-many-statements
    info("saturate-by-median.py", ["diginorm"])
    args = get_parser().parse_args()

    report_on_config(args)

    report_fp = args.report
    report_frequency = args.report_frequency

    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, False)
    if args.savetable:
        check_space_for_hashtable(args, "countgraph", False)

    # list to save error files along with throwing exceptions
    if args.force:
        corrupt_files = []

    if args.loadtable:
        print("loading k-mer counting table from", args.loadtable)
        htable = khmer.load_counting_hash(args.loadtable)
    else:
        print("making countgraph")
        htable = create_countgraph(args)

    total = 0
    discarded = 0

    for index, input_filename in enumerate(args.input_filenames):
        total_acc = 0
        discarded_acc = 0

        try:
            total_acc, discarded_acc = normalize_by_median(input_filename, htable, args, report_fp, report_frequency)
        except IOError as err:
            handle_error(err, input_filename)
            if not args.force:
                print("** Exiting!", file=sys.stderr)
                sys.exit(1)
            else:
                print("*** Skipping error file, moving on...", file=sys.stderr)
                corrupt_files.append(input_filename)
        else:
            if total_acc == 0 and discarded_acc == 0:
                print("SKIPPED empty file", input_filename)
            else:
                total += total_acc
                discarded += discarded_acc
                print(
                    "DONE with {inp}; kept {kept} of {total} or {perc:2}%".format(
                        inp=input_filename,
                        kept=total - discarded,
                        total=total,
                        perc=int(100.0 - discarded / float(total) * 100.0),
                    )
                )

    if args.savetable:
        print("Saving k-mer counting table through", input_filename)
        print("...saving to", args.savetable)
        htable.save(args.savetable)

    # re: threshold, see Zhang et al.,
    # http://arxiv.org/abs/1309.2975
    fp_rate = khmer.calc_expected_collisions(htable, args.force, max_false_pos=0.8)
    print("fp rate estimated to be {fpr:1.3f}".format(fpr=fp_rate))

    if args.force and len(corrupt_files) > 0:
        print("** WARNING: Finished with errors!", file=sys.stderr)
        print("** I/O Errors occurred in the following files:", file=sys.stderr)
        print("\t", " ".join(corrupt_files), file=sys.stderr)