示例#1
0
def main():
    info('filter-abund.py', ['counting'])
    args = sanitize_help(get_parser()).parse_args()

    check_input_files(args.input_graph, args.force)
    infiles = args.input_filename
    if ('-' in infiles or '/dev/stdin' in infiles) and not \
       args.single_output_file:
        print("Accepting input from stdin; output filename must "
              "be provided with -o.", file=sys.stderr)
        sys.exit(1)

    for filename in infiles:
        check_input_files(filename, args.force)

    check_space(infiles, args.force)

    print('loading countgraph:', args.input_graph,
          file=sys.stderr)
    countgraph = khmer.load_countgraph(args.input_graph)
    ksize = countgraph.ksize()

    print("K:", ksize, file=sys.stderr)

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
        seqN = seq.replace('N', 'A')

        if args.variable_coverage:  # only trim when sequence has high enough C
            med, _, _ = countgraph.get_median_count(seqN)
            if med < args.normalize_to:
                return name, seq

        _, trim_at = countgraph.trim_on_abundance(seqN, args.cutoff)

        if trim_at >= ksize:
            # be sure to not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]

        return None, None

    if args.single_output_file:
        outfile = args.single_output_file.name
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)

    # the filtering loop
    for infile in infiles:
        print('filtering', infile, file=sys.stderr)
        if not args.single_output_file:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = open(outfile, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        tsp = ThreadedSequenceProcessor(process_fn, n_workers=args.threads)
        tsp.start(verbose_loader(infile), outfp)

        print('output in', outfile, file=sys.stderr)
示例#2
0
def test_get_file_writer_fail():
    somefile = utils.get_temp_filename("potato")
    somefile = open(somefile, "w")
    stopped = True
    try:
        get_file_writer(somefile, True, True)
        stopped = False
    except ValueError as err:
        assert "Cannot specify both bzip and gzip" in str(err), str(err)

    assert stopped, "Expected exception"
示例#3
0
def test_get_file_writer_fail():
    somefile = utils.get_temp_filename("potato")
    somefile = open(somefile, "w")
    stopped = True
    try:
        get_file_writer(somefile, True, True)
        stopped = False
    except ValueError as err:
        assert "Cannot specify both bzip and gzip" in str(err), str(err)

    assert stopped, "Expected exception"
示例#4
0
def main():
    args = sanitize_help(get_parser()).parse_args()
    if not args.quiet:
        info('filter-abund.py', ['counting'])

    configure_logging(args.quiet)

    infiles = args.input_filename
    if ('-' in infiles or '/dev/stdin' in infiles) and not \
       args.single_output_file:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    for filename in infiles:
        check_input_files(filename, args.force)

    check_space(infiles, args.force)

    log_info('loading countgraph: {graph}', graph=args.input_graph)
    countgraph = khmer.load_countgraph(args.input_graph)
    ksize = countgraph.ksize()

    log_info("K: {ksize}", ksize=ksize)

    if args.single_output_file:
        outfile = args.single_output_file.name
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)

    # the filtering loop
    for infile in infiles:
        log_info('filtering {infile}', infile=infile)
        if not args.single_output_file:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = open(outfile, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        paired_iter = broken_paired_reader(ReadParser(infile),
                                           min_length=ksize,
                                           force_single=True)

        for n, is_pair, read1, read2 in paired_iter:
            assert not is_pair
            assert read2 is None

            trimmed_record, _ = trim_record(countgraph, read1, args.cutoff,
                                            args.variable_coverage,
                                            args.normalize_to)
            if trimmed_record:
                write_record(trimmed_record, outfp)

        log_info('output in {outfile}', outfile=outfile)
示例#5
0
def main():
    args = sanitize_help(get_parser()).parse_args()

    configure_logging(args.quiet)

    infiles = args.input_filename
    if ('-' in infiles or '/dev/stdin' in infiles) and not \
       args.single_output_file:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    for filename in infiles:
        check_input_files(filename, args.force)

    check_space(infiles, args.force)

    log_info('loading countgraph: {graph}', graph=args.input_graph)
    countgraph = khmer.load_countgraph(args.input_graph)
    ksize = countgraph.ksize()

    log_info("K: {ksize}", ksize=ksize)

    if args.single_output_file:
        outfile = args.single_output_file.name
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)

    # the filtering loop
    for infile in infiles:
        log_info('filtering {infile}', infile=infile)
        if not args.single_output_file:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = open(outfile, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        paired_iter = broken_paired_reader(ReadParser(infile),
                                           min_length=ksize,
                                           force_single=True)

        for n, is_pair, read1, read2 in paired_iter:
            assert not is_pair
            assert read2 is None

            trimmed_record, _ = trim_record(countgraph, read1, args.cutoff,
                                            args.variable_coverage,
                                            args.normalize_to)
            if trimmed_record:
                write_record(trimmed_record, outfp)

        log_info('output in {outfile}', outfile=outfile)
示例#6
0
def main():
    args = sanitize_help(get_parser()).parse_args()

    print('fastq from ', args.input_sequence, file=sys.stderr)
    outfp = get_file_writer(args.output, args.gzip, args.bzip)
    n_count = 0
    for n, record in enumerate(screed.open(args.input_sequence)):
        if n % 10000 == 0:
            print('...', n, file=sys.stderr)

        sequence = record['sequence']

        if 'N' in sequence:
            if not args.n_keep:
                n_count += 1
                continue

        del record['quality']
        write_record(record, outfp)

    print('\n' + 'lines from ' + args.input_sequence, file=sys.stderr)

    if not args.n_keep:
        print(str(n_count) + ' lines dropped.', file=sys.stderr)

    else:
        print('No lines dropped from file.', file=sys.stderr)

    print('Wrote output to', describe_file_handle(args.output),
          file=sys.stderr)
示例#7
0
def main():
    args = sanitize_help(get_parser()).parse_args()

    print('fastq from ', args.input_sequence, file=sys.stderr)
    outfp = get_file_writer(args.output, args.gzip, args.bzip)
    n_count = 0
    for n, record in enumerate(screed.open(args.input_sequence)):
        if n % 10000 == 0:
            print('...', n, file=sys.stderr)

        sequence = record['sequence']

        if 'N' in sequence:
            if not args.n_keep:
                n_count += 1
                continue

        del record['quality']
        write_record(record, outfp)

    print('\n' + 'lines from ' + args.input_sequence, file=sys.stderr)

    if not args.n_keep:
        print(str(n_count) + ' lines dropped.', file=sys.stderr)

    else:
        print('No lines dropped from file.', file=sys.stderr)

    print('Wrote output to',
          describe_file_handle(args.output),
          file=sys.stderr)
示例#8
0
def main():
    args = sanitize_help(get_parser()).parse_args()
    outfp = get_file_writer(args.output, args.gzip, args.bzip)
    for filename in args.input_filenames:
        for record in screed.open(filename):
            if len(record['sequence']) >= args.length:
                write_record(record, outfp)
    print('wrote to: ' + args.output.name, file=sys.stderr)
示例#9
0
def main():
    args = sanitize_help(get_parser()).parse_args()
    outfp = get_file_writer(args.output, args.gzip, args.bzip)
    for filename in args.input_filenames:
        for record in screed.open(filename):
            if len(record['sequence']) >= args.length:
                write_record(record, outfp)
    print('wrote to: ' + args.output.name, file=sys.stderr)
示例#10
0
def main():
    args = sanitize_help(get_parser()).parse_args()

    configure_logging(args.quiet)
    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)

    if args.savegraph:
        tablesize = calculate_graphsize(args, "countgraph")
        check_space_for_graph(args.savegraph, tablesize, args.force)

    report_on_config(args)

    log_info("making countgraph")
    graph = khmer_args.create_countgraph(args)

    # first, load reads into graph
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    log_info("consuming input, round 1 -- {datafile}", datafile=args.datafile)
    for _ in range(args.threads):
        cur_thread = threading.Thread(target=graph.consume_fasta_with_reads_parser, args=(rparser,))
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    log_info("Total number of unique k-mers: {nk}", nk=graph.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(graph, args.force)
    log_info("fp rate estimated to be {fpr:1.3f}", fpr=fp_rate)

    # the filtering loop
    log_info("filtering {datafile}", datafile=args.datafile)
    if args.outfile is None:
        outfile = os.path.basename(args.datafile) + ".abundfilt"
    else:
        outfile = args.outfile
    outfp = open(outfile, "wb")
    outfp = get_file_writer(outfp, args.gzip, args.bzip)

    paired_iter = broken_paired_reader(ReadParser(args.datafile), min_length=graph.ksize(), force_single=True)

    for n, is_pair, read1, read2 in paired_iter:
        assert not is_pair
        assert read2 is None

        trimmed_record, _ = trim_record(graph, read1, args.cutoff, args.variable_coverage, args.normalize_to)
        if trimmed_record:
            print((trimmed_record,))
            write_record(trimmed_record, outfp)

    log_info("output in {outfile}", outfile=outfile)

    if args.savegraph:
        log_info("Saving k-mer countgraph filename {graph}", graph=args.savegraph)
        graph.save(args.savegraph)
示例#11
0
def main():
    info('interleave-reads.py')
    args = sanitize_help(get_parser()).parse_args()

    check_input_files(args.left, args.force)
    check_input_files(args.right, args.force)
    check_space([args.left, args.right], args.force)

    s1_file = args.left
    s2_file = args.right

    print("Interleaving:\n\t%s\n\t%s" % (s1_file, s2_file), file=sys.stderr)

    outfp = get_file_writer(args.output, args.gzip, args.bzip)

    counter = 0
    screed_iter_1 = screed.open(s1_file)
    screed_iter_2 = screed.open(s2_file)
    for read1, read2 in zip_longest(screed_iter_1, screed_iter_2):
        if read1 is None or read2 is None:
            print(("ERROR: Input files contain different number"
                   " of records."),
                  file=sys.stderr)
            sys.exit(1)

        if counter % 100000 == 0:
            print('...', counter, 'pairs', file=sys.stderr)
        counter += 1

        name1 = read1.name
        name2 = read2.name

        if not args.no_reformat:
            if not check_is_left(name1):
                name1 += '/1'
            if not check_is_right(name2):
                name2 += '/2'

            read1.name = name1
            read2.name = name2

            if not check_is_pair(read1, read2):
                print("ERROR: This doesn't look like paired data! "
                      "%s %s" % (read1.name, read2.name),
                      file=sys.stderr)
                sys.exit(1)

        write_record_pair(read1, read2, outfp)

    print('final: interleaved %d pairs' % counter, file=sys.stderr)
    print('output written to', describe_file_handle(outfp), file=sys.stderr)
示例#12
0
def main():
    info('interleave-reads.py')
    args = sanitize_help(get_parser()).parse_args()

    check_input_files(args.left, args.force)
    check_input_files(args.right, args.force)
    check_space([args.left, args.right], args.force)

    s1_file = args.left
    s2_file = args.right

    fail = False

    print("Interleaving:\n\t%s\n\t%s" % (s1_file, s2_file), file=sys.stderr)

    outfp = get_file_writer(args.output, args.gzip, args.bzip)

    counter = 0
    screed_iter_1 = screed.open(s1_file)
    screed_iter_2 = screed.open(s2_file)
    for read1, read2 in zip_longest(screed_iter_1, screed_iter_2):
        if read1 is None or read2 is None:
            print(("ERROR: Input files contain different number"
                   " of records."), file=sys.stderr)
            sys.exit(1)

        if counter % 100000 == 0:
            print('...', counter, 'pairs', file=sys.stderr)
        counter += 1

        name1 = read1.name
        name2 = read2.name

        if not args.no_reformat:
            if not check_is_left(name1):
                name1 += '/1'
            if not check_is_right(name2):
                name2 += '/2'

            read1.name = name1
            read2.name = name2

            if not check_is_pair(read1, read2):
                print("ERROR: This doesn't look like paired data! "
                      "%s %s" % (read1.name, read2.name), file=sys.stderr)
                sys.exit(1)

        write_record_pair(read1, read2, outfp)

    print('final: interleaved %d pairs' % counter, file=sys.stderr)
    print('output written to', describe_file_handle(outfp), file=sys.stderr)
示例#13
0
def main():
    args = sanitize_help(get_parser()).parse_args()

    infile = args.infile

    filenames = [infile]
    check_input_files(infile, args.force)
    check_space(filenames, args.force)

    basename = os.path.basename(infile)

    # decide where to put output files - specific directory? or just default?
    if infile in ('/dev/stdin', '-'):
        # seqan only treats '-' as "read from stdin"
        infile = '-'
        if not (args.output_first and args.output_second):
            print("Accepting input from stdin; "
                  "output filenames must be provided.", file=sys.stderr)
            sys.exit(1)
    elif args.output_directory:
        if not os.path.exists(args.output_directory):
            os.makedirs(args.output_directory)
        out1 = os.path.join(args.output_directory, basename + '.1')
        out2 = os.path.join(args.output_directory, basename + '.2')
    else:
        out1 = basename + '.1'
        out2 = basename + '.2'

    # OVERRIDE output file locations with -1, -2
    if args.output_first:
        fp_out1 = get_file_writer(args.output_first, args.gzip, args.bzip)
        out1 = fp_out1.name
    else:
        # Use default filename created above
        fp_out1 = get_file_writer(open(out1, 'wb'), args.gzip, args.bzip)
    if args.output_second:
        fp_out2 = get_file_writer(args.output_second, args.gzip, args.bzip)
        out2 = fp_out2.name
    else:
        # Use default filename created above
        fp_out2 = get_file_writer(open(out2, 'wb'), args.gzip, args.bzip)

    # put orphaned reads here, if -0!
    if args.output_orphaned:
        fp_out0 = get_file_writer(args.output_orphaned, args.gzip, args.bzip)
        out0 = describe_file_handle(args.output_orphaned)

    counter1 = 0
    counter2 = 0
    counter3 = 0
    index = None

    # walk through all the reads in broken-paired mode.
    paired_iter = broken_paired_reader(ReadParser(infile),
                                       require_paired=not args.output_orphaned)

    try:
        for index, is_pair, record1, record2 in paired_iter:
            if index % 10000 == 0:
                print('...', index, file=sys.stderr)

            if is_pair:
                write_record(record1, fp_out1)
                counter1 += 1
                write_record(record2, fp_out2)
                counter2 += 1
            elif args.output_orphaned:
                write_record(record1, fp_out0)
                counter3 += 1
    except UnpairedReadsError as e:
        print("Unpaired reads found starting at {name}; exiting".format(
            name=e.read1.name), file=sys.stderr)
        sys.exit(1)

    print("DONE; split %d sequences (%d left, %d right, %d orphans)" %
          (counter1 + counter2, counter1, counter2, counter3), file=sys.stderr)
    print("/1 reads in %s" % out1, file=sys.stderr)
    print("/2 reads in %s" % out2, file=sys.stderr)
    if args.output_orphaned:
        print("orphans in %s" % out0, file=sys.stderr)
示例#14
0
def main():
    args = sanitize_help(get_parser()).parse_args()
    if not args.quiet:
        info('filter-abund-single.py', ['counting', 'SeqAn'])

    configure_logging(args.quiet)
    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)

    if args.savegraph:
        tablesize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, tablesize, args.force)

    report_on_config(args)

    log_info('making countgraph')
    graph = khmer_args.create_countgraph(args)

    # first, load reads into graph
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    log_info('consuming input, round 1 -- {datafile}', datafile=args.datafile)
    for _ in range(args.threads):
        cur_thread = \
            threading.Thread(
                target=graph.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    log_info('Total number of unique k-mers: {nk}', nk=graph.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(graph, args.force)
    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
        seqN = seq.replace('N', 'A')

        _, trim_at = graph.trim_on_abundance(seqN, args.cutoff)

        if trim_at >= args.ksize:
            # be sure to not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]

        return None, None

    # the filtering loop
    log_info('filtering {datafile}', datafile=args.datafile)
    if args.outfile is None:
        outfile = os.path.basename(args.datafile) + '.abundfilt'
    else:
        outfile = args.outfile
    outfp = open(outfile, 'wb')
    outfp = get_file_writer(outfp, args.gzip, args.bzip)

    tsp = ThreadedSequenceProcessor(process_fn, verbose=not args.quiet)
    tsp.start(verbose_loader(args.datafile), outfp)

    log_info('output in {outfile}', outfile=outfile)

    if args.savegraph:
        log_info('Saving k-mer countgraph filename {graph}',
                 graph=args.savegraph)
        graph.save(args.savegraph)
示例#15
0
def main():  # pylint: disable=too-many-branches,too-many-statements
    parser = sanitize_help(get_parser())
    args = parser.parse_args()

    configure_logging(args.quiet)
    report_on_config(args)

    report_fp = args.report
    force_single = args.force_single

    # check for similar filenames
    # if we're using a single output file only check for identical filenames
    # otherwise, check for identical BASE names as well.
    filenames = []
    basenames = []
    for pathfilename in args.input_filenames:
        filenames.append(pathfilename)
        if args.single_output_file:
            continue  # nothing more to worry about

        basename = os.path.basename(pathfilename)
        if basename in basenames:
            log_error('ERROR: Duplicate filename--Cannot handle this!')
            log_error('** Exiting!')
            sys.exit(1)

        basenames.append(basename)

    # check that files exist and there is sufficient output disk space.
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph is not None:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    # load or create counting table.
    if args.loadgraph:
        log_info('loading k-mer countgraph from {graph}',
                 graph=args.loadgraph)
        countgraph = Countgraph.load(args.loadgraph)
    else:
        log_info('making countgraph')
        countgraph = khmer_args.create_countgraph(args)

    # create an object to handle diginorm of all files
    norm = Normalizer(args.cutoff, countgraph)
    with_diagnostics = WithDiagnostics(norm, report_fp, args.report_frequency)

    # make a list of all filenames and if they're paired or not;
    # if we don't know if they're paired, default to allowing but not
    # forcing pairing.
    files = []
    for element in filenames:
        files.append([element, args.paired])
    if args.unpaired_reads:
        files.append([args.unpaired_reads, False])

    corrupt_files = []
    outfp = None
    output_name = None

    if args.single_output_file:
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)
    else:
        if '-' in filenames or '/dev/stdin' in filenames:
            print("Accepting input from stdin; output filename must "
                  "be provided with '-o'.", file=sys.stderr)
            sys.exit(1)

    #
    # main loop: iterate over all files given, do diginorm.
    #

    for filename, require_paired in files:
        if not args.single_output_file:
            output_name = os.path.basename(filename) + '.keep'
            outfp = open(output_name, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        # failsafe context manager in case an input file breaks
        with catch_io_errors(filename, outfp, args.single_output_file,
                             args.force, corrupt_files):
            screed_iter = clean_input_reads(screed.open(filename))
            reader = broken_paired_reader(screed_iter, min_length=args.ksize,
                                          force_single=force_single,
                                          require_paired=require_paired)

            # actually do diginorm
            for record in with_diagnostics(reader, filename):
                if record is not None:
                    write_record(record, outfp)

            log_info('output in {name}', name=describe_file_handle(outfp))
            if not args.single_output_file:
                outfp.close()

    # finished - print out some diagnostics.

    log_info('Total number of unique k-mers: {umers}',
             umers=countgraph.n_unique_kmers())

    if args.savegraph is not None:
        log_info('...saving to {name}', name=args.savegraph)
        countgraph.save(args.savegraph)

    fp_rate = \
        khmer.calc_expected_collisions(countgraph, False, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975

    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    if args.force and len(corrupt_files) > 0:
        log_error("** WARNING: Finished with errors!")
        log_error("** I/O Errors occurred in the following files:")
        log_error("\t" + " ".join(corrupt_files))
示例#16
0
def main():  # pylint: disable=too-many-locals,too-many-branches
    info('extract-partitions.py', ['graph'])
    args = sanitize_help(get_parser()).parse_args()

    distfilename = args.prefix + '.dist'

    n_unassigned = 0

    for infile in args.part_filenames:
        check_input_files(infile, args.force)

    check_space(args.part_filenames, args.force)

    print('---', file=sys.stderr)
    print('reading partitioned files:', repr(
        args.part_filenames), file=sys.stderr)
    if args.output_groups:
        print('outputting to files named "%s.groupN.fa"' %
              args.prefix, file=sys.stderr)
        print('min reads to keep a partition:',
              args.min_part_size, file=sys.stderr)
        print('max size of a group file:', args.max_size, file=sys.stderr)
    else:
        print('NOT outputting groups! Beware!', file=sys.stderr)

    if args.output_unassigned:
        print('outputting unassigned reads to "%s.unassigned.fa"' %
              args.prefix, file=sys.stderr)
    print('partition size distribution will go to %s'
          % distfilename, file=sys.stderr)
    print('---', file=sys.stderr)

    #

    suffix = 'fa'
    is_fastq = False

    for index, read, pid in read_partition_file(args.part_filenames[0]):
        if hasattr(read, 'quality'):
            suffix = 'fq'
            is_fastq = True
        break

    for filename in args.part_filenames:
        for index, read, pid in read_partition_file(filename):
            if is_fastq:
                assert hasattr(read, 'quality'), \
                    "all input files must be FASTQ if the first one is"
            else:
                assert not hasattr(read, 'quality'), \
                    "all input files must be FASTA if the first one is"

            break

    if args.output_unassigned:
        ofile = open('%s.unassigned.%s' % (args.prefix, suffix), 'wb')
        unassigned_fp = get_file_writer(ofile, args.gzip, args.bzip)
    count = {}
    for filename in args.part_filenames:
        for index, read, pid in read_partition_file(filename):
            if index % 100000 == 0:
                print('...', index, file=sys.stderr)

            count[pid] = count.get(pid, 0) + 1

            if pid == 0:
                n_unassigned += 1
                if args.output_unassigned:
                    write_record(read, unassigned_fp)

    if args.output_unassigned:
        unassigned_fp.close()

    if 0 in count:                          # eliminate unpartitioned sequences
        del count[0]

    # develop histogram of partition sizes
    dist = {}
    for pid, size in list(count.items()):
        dist[size] = dist.get(size, 0) + 1

    # output histogram
    distfp = open(distfilename, 'w')

    total = 0
    wtotal = 0
    for counter, index in sorted(dist.items()):
        total += index
        wtotal += counter * index
        distfp.write('%d %d %d %d\n' % (counter, index, total, wtotal))
    distfp.close()

    if not args.output_groups:
        sys.exit(0)

    # sort groups by size
    divvy = sorted(list(count.items()), key=lambda y: y[1])
    divvy = [y for y in divvy if y[1] > args.min_part_size]

    # divvy up into different groups, based on having max_size sequences
    # in each group.
    total = 0
    group = set()
    group_n = 0
    group_d = {}
    for partition_id, n_reads in divvy:
        group.add(partition_id)
        total += n_reads

        if total > args.max_size:
            for partition_id in group:
                group_d[partition_id] = group_n
                # print 'group_d', partition_id, group_n

            group_n += 1
            group = set()
            total = 0

    if group:
        for partition_id in group:
            group_d[partition_id] = group_n
            # print 'group_d', partition_id, group_n
        group_n += 1

    print('%d groups' % group_n, file=sys.stderr)
    if group_n == 0:
        print('nothing to output; exiting!', file=sys.stderr)
        return

    # open a bunch of output files for the different groups
    group_fps = {}
    for index in range(group_n):
        fname = '%s.group%04d.%s' % (args.prefix, index, suffix)
        group_fp = get_file_writer(open(fname, 'wb'), args.gzip,
                                   args.bzip)
        group_fps[index] = group_fp

    # write 'em all out!

    total_seqs = 0
    part_seqs = 0
    toosmall_parts = 0
    for filename in args.part_filenames:
        for index, read, partition_id in read_partition_file(filename):
            total_seqs += 1
            if index % 100000 == 0:
                print('...x2', index, file=sys.stderr)

            if partition_id == 0:
                continue

            try:
                group_n = group_d[partition_id]
            except KeyError:
                assert count[partition_id] <= args.min_part_size
                toosmall_parts += 1
                continue

            outfp = group_fps[group_n]

            write_record(read, outfp)
            part_seqs += 1

    print('---', file=sys.stderr)
    print('Of %d total seqs,' % total_seqs, file=sys.stderr)
    print('extracted %d partitioned seqs into group files,' %
          part_seqs, file=sys.stderr)
    print('discarded %d sequences from small partitions (see -m),' %
          toosmall_parts, file=sys.stderr)
    print('and found %d unpartitioned sequences (see -U).' %
          n_unassigned, file=sys.stderr)
    print('', file=sys.stderr)
    print('Created %d group files named %s.groupXXXX.%s' %
          (len(group_fps),
           args.prefix,
           suffix), file=sys.stderr)
示例#17
0
def main():
    parser = sanitize_help(get_parser())
    args = parser.parse_args()
    if not args.quiet:
        info('trim-low-abund.py', ['streaming'])

    configure_logging(args.quiet)

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        log_error("Error: Cannot input the same filename multiple times.")
        sys.exit(1)

    if args.trim_at_coverage != DEFAULT_TRIM_AT_COVERAGE and \
       not args.variable_coverage:
        log_error("Error: --trim-at-coverage/-Z given, but "
                  "--variable-coverage/-V not specified.")
        sys.exit(1)

    if args.diginorm_coverage != DEFAULT_DIGINORM_COVERAGE and \
       not args.diginorm:
        log_error("Error: --diginorm-coverage given, but "
                  "--diginorm not specified.")
        sys.exit(1)

    if args.diginorm and args.single_pass:
        log_error("Error: --diginorm and --single-pass are incompatible!\n"
                  "You probably want to use normalize-by-median.py instead.")
        sys.exit(1)

    ###

    report_on_config(args)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    if ('-' in args.input_filenames or '/dev/stdin' in args.input_filenames) \
       and not args.output:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    if args.loadgraph:
        log_info('loading countgraph from {graph}', graph=args.loadgraph)
        ct = khmer.load_countgraph(args.loadgraph)
    else:
        log_info('making countgraph')
        ct = khmer_args.create_countgraph(args)

    K = ct.ksize()
    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    log_info(
        'created temporary directory {temp};\n'
        'use -T to change location',
        temp=tempdir)

    trimmer = Trimmer(ct, not args.variable_coverage, args.cutoff,
                      args.trim_at_coverage)
    if args.diginorm:
        trimmer.set_diginorm(args.diginorm_coverage)

    # ### FIRST PASS ###

    save_pass2_total = 0

    written_bp = 0
    written_reads = 0

    # only create the file writer once if outfp is specified; otherwise,
    # create it for each file.
    if args.output:
        trimfp = get_file_writer(args.output, args.gzip, args.bzip)

    pass2list = []
    for filename in args.input_filenames:
        # figure out temporary filename for 2nd pass
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        pass2fp = open(pass2filename, 'w')

        # construct output filenames
        if args.output is None:
            # note: this will be saved in trimfp.
            outfp = open(os.path.basename(filename) + '.abundtrim', 'wb')

            # get file handle w/gzip, bzip
            trimfp = get_file_writer(outfp, args.gzip, args.bzip)

        # record all this info
        pass2list.append((filename, pass2filename, trimfp))

        # input file stuff: get a broken_paired reader.
        screed_iter = screed.open(filename)
        paired_iter = broken_paired_reader(screed_iter,
                                           min_length=K,
                                           force_single=args.ignore_pairs)

        # main loop through the file.
        n_start = trimmer.n_reads
        save_start = trimmer.n_saved

        watermark = REPORT_EVERY_N_READS
        for read in trimmer.pass1(paired_iter, pass2fp):
            if (trimmer.n_reads - n_start) > watermark:
                log_info(
                    "... {filename} {n_saved} {n_reads} {n_bp} "
                    "{w_reads} {w_bp}",
                    filename=filename,
                    n_saved=trimmer.n_saved,
                    n_reads=trimmer.n_reads,
                    n_bp=trimmer.n_bp,
                    w_reads=written_reads,
                    w_bp=written_bp)
                watermark += REPORT_EVERY_N_READS

            # write out the trimmed/etc sequences that AREN'T going to be
            # revisited in a 2nd pass.
            write_record(read, trimfp)
            written_bp += len(read)
            written_reads += 1
        pass2fp.close()

        log_info("{filename}: kept aside {kept} of {total} from first pass",
                 filename=filename,
                 kept=trimmer.n_saved - save_start,
                 total=trimmer.n_reads - n_start)

    # first pass goes across all the data, so record relevant stats...
    n_reads = trimmer.n_reads
    n_bp = trimmer.n_bp
    n_skipped = trimmer.n_skipped
    bp_skipped = trimmer.bp_skipped
    save_pass2_total = trimmer.n_saved

    # ### SECOND PASS. ###

    # nothing should have been skipped yet!
    assert trimmer.n_skipped == 0
    assert trimmer.bp_skipped == 0

    if args.single_pass:
        pass2list = []

    # go back through all the files again.
    for _, pass2filename, trimfp in pass2list:
        log_info('second pass: looking at sequences kept aside in {pass2}',
                 pass2=pass2filename)

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in contrast
        # to the first loop.  Hence, force_single=True below.

        screed_iter = screed.open(pass2filename, parse_description=False)
        paired_iter = broken_paired_reader(screed_iter,
                                           min_length=K,
                                           force_single=True)

        watermark = REPORT_EVERY_N_READS
        for read in trimmer.pass2(paired_iter):
            if (trimmer.n_reads - n_start) > watermark:
                log_info('... x 2 {a} {b} {c} {d} {e} {f} {g}',
                         a=trimmer.n_reads - n_start,
                         b=pass2filename,
                         c=trimmer.n_saved,
                         d=trimmer.n_reads,
                         e=trimmer.n_bp,
                         f=written_reads,
                         g=written_bp)
                watermark += REPORT_EVERY_N_READS

            write_record(read, trimfp)
            written_reads += 1
            written_bp += len(read)

        log_info('removing {pass2}', pass2=pass2filename)
        os.unlink(pass2filename)

        # if we created our own trimfps, close 'em.
        if not args.output:
            trimfp.close()

    log_info('removing temp directory & contents ({temp})', temp=tempdir)
    shutil.rmtree(tempdir)

    trimmed_reads = trimmer.trimmed_reads

    n_passes = 1.0 + (float(save_pass2_total) / n_reads)
    percent_reads_trimmed = float(trimmed_reads + (n_reads - written_reads)) /\
        n_reads * 100.0

    log_info('read {read} reads, {bp} bp', read=n_reads, bp=n_bp)
    log_info('wrote {wr} reads, {wbp} bp', wr=written_reads, wbp=written_bp)
    log_info('looked at {st} reads twice ({np:.2f} passes)',
             st=save_pass2_total,
             np=n_passes)
    log_info('removed {r} reads and trimmed {t} reads ({p:.2f}%)',
             r=n_reads - written_reads,
             t=trimmed_reads,
             p=percent_reads_trimmed)
    log_info('trimmed or removed {p:.2f}%% of bases ({bp} total)',
             p=(1 - (written_bp / float(n_bp))) * 100.0,
             bp=n_bp - written_bp)

    if args.variable_coverage:
        percent_reads_hicov = 100.0 * float(n_reads - n_skipped) / n_reads
        log_info('{n} reads were high coverage ({p:.2f}%);',
                 n=n_reads - n_skipped,
                 p=percent_reads_hicov)
        log_info('skipped {r} reads/{bp} bases because of low coverage',
                 r=n_skipped,
                 bp=bp_skipped)

    fp_rate = \
        khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    log_info('output in *.abundtrim')

    if args.savegraph:
        log_info("Saving k-mer countgraph to {graph}", graph=args.savegraph)
        ct.save(args.savegraph)
示例#18
0
def main():
    info('trim-low-abund.py', ['streaming'])
    parser = sanitize_help(get_parser())
    args = parser.parse_args()

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        print("Error: Cannot input the same filename multiple times.",
              file=sys.stderr)
        sys.exit(1)

    ###

    report_on_config(args)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    if ('-' in args.input_filenames or '/dev/stdin' in args.input_filenames) \
       and not args.output:
        print("Accepting input from stdin; output filename must "
              "be provided with -o.", file=sys.stderr)
        sys.exit(1)

    if args.loadgraph:
        print('loading countgraph from', args.loadgraph, file=sys.stderr)
        ct = khmer.load_countgraph(args.loadgraph)
    else:
        print('making countgraph', file=sys.stderr)
        ct = khmer_args.create_countgraph(args)

    K = ct.ksize()
    CUTOFF = args.cutoff
    NORMALIZE_LIMIT = args.normalize_to

    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    print('created temporary directory %s; '
          'use -T to change location' % tempdir, file=sys.stderr)

    # ### FIRST PASS ###

    save_pass2_total = 0

    n_bp = 0
    n_reads = 0
    written_bp = 0
    written_reads = 0
    trimmed_reads = 0

    pass2list = []
    for filename in args.input_filenames:
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        if args.output is None:
            trimfp = get_file_writer(open(os.path.basename(filename) +
                                          '.abundtrim', 'wb'),
                                     args.gzip, args.bzip)
        else:
            trimfp = get_file_writer(args.output, args.gzip, args.bzip)

        pass2list.append((filename, pass2filename, trimfp))

        screed_iter = screed.open(filename)
        pass2fp = open(pass2filename, 'w')

        save_pass2 = 0
        n = 0

        paired_iter = broken_paired_reader(screed_iter, min_length=K,
                                           force_single=args.ignore_pairs)
        for n, is_pair, read1, read2 in paired_iter:
            if n % 10000 == 0:
                print('...', n, filename, save_pass2, n_reads, n_bp,
                      written_reads, written_bp, file=sys.stderr)

            # we want to track paired reads here, to make sure that pairs
            # are not split between first pass and second pass.

            if is_pair:
                n_reads += 2
                n_bp += len(read1.sequence) + len(read2.sequence)

                seq1 = read1.sequence.replace('N', 'A')
                seq2 = read2.sequence.replace('N', 'A')

                med1, _, _ = ct.get_median_count(seq1)
                med2, _, _ = ct.get_median_count(seq2)

                if med1 < NORMALIZE_LIMIT or med2 < NORMALIZE_LIMIT:
                    ct.consume(seq1)
                    ct.consume(seq2)
                    write_record_pair(read1, read2, pass2fp)
                    save_pass2 += 2
                else:
                    _, trim_at1 = ct.trim_on_abundance(seq1, CUTOFF)
                    _, trim_at2 = ct.trim_on_abundance(seq2, CUTOFF)

                    if trim_at1 >= K:
                        read1 = trim_record(read1, trim_at1)

                    if trim_at2 >= K:
                        read2 = trim_record(read2, trim_at2)

                    if trim_at1 != len(seq1):
                        trimmed_reads += 1
                    if trim_at2 != len(seq2):
                        trimmed_reads += 1

                    write_record_pair(read1, read2, trimfp)
                    written_reads += 2
                    written_bp += trim_at1 + trim_at2
            else:
                n_reads += 1
                n_bp += len(read1.sequence)

                seq = read1.sequence.replace('N', 'A')

                med, _, _ = ct.get_median_count(seq)

                # has this portion of the graph saturated? if not,
                # consume & save => pass2.
                if med < NORMALIZE_LIMIT:
                    ct.consume(seq)
                    write_record(read1, pass2fp)
                    save_pass2 += 1
                else:                       # trim!!
                    _, trim_at = ct.trim_on_abundance(seq, CUTOFF)
                    if trim_at >= K:
                        new_read = trim_record(read1, trim_at)
                        write_record(new_read, trimfp)

                        written_reads += 1
                        written_bp += trim_at

                        if trim_at != len(read1.sequence):
                            trimmed_reads += 1

        pass2fp.close()

        print('%s: kept aside %d of %d from first pass, in %s' %
              (filename, save_pass2, n, filename),
              file=sys.stderr)
        save_pass2_total += save_pass2

    # ### SECOND PASS. ###

    skipped_n = 0
    skipped_bp = 0
    for _, pass2filename, trimfp in pass2list:
        print('second pass: looking at sequences kept aside in %s' %
              pass2filename,
              file=sys.stderr)

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in contrast
        # to the first loop.

        for n, read in enumerate(screed.open(pass2filename)):
            if n % 10000 == 0:
                print('... x 2', n, pass2filename,
                      written_reads, written_bp, file=sys.stderr)

            seq = read.sequence.replace('N', 'A')
            med, _, _ = ct.get_median_count(seq)

            # do we retain low-abundance components unchanged?
            if med < NORMALIZE_LIMIT and args.variable_coverage:
                write_record(read, trimfp)

                written_reads += 1
                written_bp += len(read.sequence)
                skipped_n += 1
                skipped_bp += len(read.sequence)

            # otherwise, examine/trim/truncate.
            else:    # med >= NORMALIZE LIMIT or not args.variable_coverage
                _, trim_at = ct.trim_on_abundance(seq, CUTOFF)
                if trim_at >= K:
                    new_read = trim_record(read, trim_at)
                    write_record(new_read, trimfp)

                    written_reads += 1
                    written_bp += trim_at

                    if trim_at != len(read.sequence):
                        trimmed_reads += 1

        print('removing %s' % pass2filename, file=sys.stderr)
        os.unlink(pass2filename)

    print('removing temp directory & contents (%s)' % tempdir, file=sys.stderr)
    shutil.rmtree(tempdir)

    n_passes = 1.0 + (float(save_pass2_total) / n_reads)
    percent_reads_trimmed = float(trimmed_reads + (n_reads - written_reads)) /\
        n_reads * 100.0

    print('read %d reads, %d bp' % (n_reads, n_bp,), file=sys.stderr)
    print('wrote %d reads, %d bp' % (written_reads, written_bp,),
          file=sys.stderr)
    print('looked at %d reads twice (%.2f passes)' % (save_pass2_total,
                                                      n_passes),
          file=sys.stderr)
    print('removed %d reads and trimmed %d reads (%.2f%%)' %
          (n_reads - written_reads, trimmed_reads, percent_reads_trimmed),
          file=sys.stderr)
    print('trimmed or removed %.2f%% of bases (%d total)' %
          ((1 - (written_bp / float(n_bp))) * 100.0, n_bp - written_bp),
          file=sys.stderr)

    if args.variable_coverage:
        percent_reads_hicov = 100.0 * float(n_reads - skipped_n) / n_reads
        print('%d reads were high coverage (%.2f%%);' % (n_reads - skipped_n,
                                                         percent_reads_hicov),
              file=sys.stderr)
        print('skipped %d reads/%d bases because of low coverage' %
              (skipped_n, skipped_bp),
              file=sys.stderr)

    fp_rate = \
        khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate),
          file=sys.stderr)

    print('output in *.abundtrim', file=sys.stderr)

    if args.savegraph:
        print("Saving k-mer countgraph to",
              args.savegraph, file=sys.stderr)
        ct.save(args.savegraph)
示例#19
0
def main():
    args = sanitize_help(get_parser()).parse_args()

    configure_logging(args.quiet)
    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)

    if args.savegraph:
        tablesize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, tablesize, args.force)

    report_on_config(args)

    log_info('making countgraph')
    graph = khmer_args.create_countgraph(args)

    # first, load reads into graph
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    log_info('consuming input, round 1 -- {datafile}', datafile=args.datafile)
    for _ in range(args.threads):
        cur_thread = \
            threading.Thread(
                target=graph.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    log_info('Total number of unique k-mers: {nk}', nk=graph.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(graph, args.force)
    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    # the filtering loop
    log_info('filtering {datafile}', datafile=args.datafile)
    if args.outfile is None:
        outfile = os.path.basename(args.datafile) + '.abundfilt'
    else:
        outfile = args.outfile
    outfp = open(outfile, 'wb')
    outfp = get_file_writer(outfp, args.gzip, args.bzip)

    paired_iter = broken_paired_reader(ReadParser(args.datafile),
                                       min_length=graph.ksize(),
                                       force_single=True)

    for n, is_pair, read1, read2 in paired_iter:
        assert not is_pair
        assert read2 is None

        trimmed_record, _ = trim_record(graph, read1, args.cutoff,
                                        args.variable_coverage,
                                        args.normalize_to)
        if trimmed_record:
            print((trimmed_record,))
            write_record(trimmed_record, outfp)

    log_info('output in {outfile}', outfile=outfile)

    if args.savegraph:
        log_info('Saving k-mer countgraph filename {graph}',
                 graph=args.savegraph)
        graph.save(args.savegraph)
示例#20
0
def main():
    args = sanitize_help(get_parser()).parse_args()

    infile = args.infile
    check_input_files(infile, args.force)
    check_space([infile], args.force)

    # decide where to put output files - specific directory? or just default?
    if infile in ('/dev/stdin', '-'):
        # seqan only treats '-' as "read from stdin"
        infile = '-'
        if not (args.output_paired and args.output_single):
            print("Accepting input from stdin; output filenames must be "
                  "provided.", file=sys.stderr)
            sys.exit(1)
    elif args.output_dir:
        if not os.path.exists(args.output_dir):
            os.makedirs(args.output_dir)
        out1 = args.output_dir + '/' + os.path.basename(infile) + '.se'
        out2 = args.output_dir + '/' + os.path.basename(infile) + '.pe'
    else:
        out1 = os.path.basename(infile) + '.se'
        out2 = os.path.basename(infile) + '.pe'

    # OVERRIDE default output file locations with -p, -s
    if args.output_paired:
        paired_fp = get_file_writer(args.output_paired, args.gzip, args.bzip)
        out2 = paired_fp.name
    else:
        # Don't override, just open the default filename from above
        paired_fp = get_file_writer(open(out2, 'wb'), args.gzip, args.bzip)
    if args.output_single:
        single_fp = get_file_writer(args.output_single, args.gzip, args.bzip)
        out1 = args.output_single.name
    else:
        # Don't override, just open the default filename from above
        single_fp = get_file_writer(open(out1, 'wb'), args.gzip, args.bzip)

    print('reading file "%s"' % infile, file=sys.stderr)
    print('outputting interleaved pairs to "%s"' % out2, file=sys.stderr)
    print('outputting orphans to "%s"' % out1, file=sys.stderr)

    n_pe = 0
    n_se = 0

    screed_iter = ReadParser(infile)
    for index, is_pair, read1, read2 in broken_paired_reader(screed_iter):
        if index % 100000 == 0 and index > 0:
            print('...', index, file=sys.stderr)

        if is_pair:
            write_record_pair(read1, read2, paired_fp)
            n_pe += 1
        else:
            write_record(read1, single_fp)
            n_se += 1

    single_fp.close()
    paired_fp.close()

    if n_pe == 0:
        raise Exception("no paired reads!? check file formats...")

    print('DONE; read %d sequences,'
          ' %d pairs and %d singletons' %
          (n_pe * 2 + n_se, n_pe, n_se), file=sys.stderr)

    print('wrote to: %s and %s' % (out2, out1),
          file=sys.stderr)
示例#21
0
def main():
    args = sanitize_help(get_parser()).parse_args()

    distfilename = args.prefix + '.dist'

    for infile in args.part_filenames:
        check_input_files(infile, args.force)

    check_space(args.part_filenames, args.force)

    print('---', file=sys.stderr)
    print('reading partitioned files:', repr(
        args.part_filenames), file=sys.stderr)
    if args.output_groups:
        print('outputting to files named "%s.groupN.fa"' %
              args.prefix, file=sys.stderr)
        print('min reads to keep a partition:',
              args.min_part_size, file=sys.stderr)
        print('max size of a group file:', args.max_size, file=sys.stderr)
    else:
        print('NOT outputting groups! Beware!', file=sys.stderr)

    if args.output_unassigned:
        print('outputting unassigned reads to "%s.unassigned.fa"' %
              args.prefix, file=sys.stderr)
    print('partition size distribution will go to %s'
          % distfilename, file=sys.stderr)
    print('---', file=sys.stderr)

    #

    suffix = None
    is_fastq = None

    with PartitionedReader(args.part_filenames, True, True) as reader:
        for read, _ in reader:
            if is_fastq is None:
                is_fastq = hasattr(read, 'quality')
            else:
                assert hasattr(read, 'quality') == is_fastq,\
                    "Input files must have consistent format."

    if is_fastq:
        suffix = "fq"
    else:
        suffix = "fa"

    # remember folks, generators exhaust themseleves
    extractor = PartitionExtractor(args.part_filenames,
                                   args.min_part_size,
                                   args.max_size)

    if args.output_unassigned:
        ofile = open('%s.unassigned.%s' % (args.prefix, suffix), 'wb')
        unassigned_fp = get_file_writer(ofile, args.gzip, args.bzip)
        extractor.process_unassigned(unassigned_fp)
        unassigned_fp.close()
    else:
        extractor.process_unassigned()

    extractor.output_histogram(distfilename)

    if not args.output_groups:
        sys.exit(0)

    extractor.develop_groups()

    print('%d groups' % extractor.group_n, file=sys.stderr)
    if extractor.group_n == 0:
        print('nothing to output; exiting!', file=sys.stderr)
        return

    # open a bunch of output files for the different groups
    group_fps = {}
    for index in range(extractor.group_n):
        fname = '%s.group%04d.%s' % (args.prefix, index, suffix)
        group_fp = get_file_writer(open(fname, 'wb'), args.gzip,
                                   args.bzip)
        group_fps[index] = group_fp

    # write 'em all out!
    # refresh the generator
    read_generator = PartitionExtractor.ReadGroupGenerator(extractor)

    with PartitionedReader(args.part_filenames) as reader:
        for read, group_n in read_generator(reader):
            outfp = group_fps[group_n]
            write_record(read, outfp)

    print('---', file=sys.stderr)
    print('Of %d total seqs,' % read_generator.total_seqs, file=sys.stderr)
    print('extracted %d partitioned seqs into group files,' %
          read_generator.part_seqs, file=sys.stderr)
    print('discarded %d sequences from small partitions (see -m),' %
          read_generator.toosmall_parts, file=sys.stderr)
    print('and found %d unpartitioned sequences (see -U).' %
          extractor.n_unassigned, file=sys.stderr)
    print('', file=sys.stderr)
    print('Created %d group files named %s.groupXXXX.%s' %
          (len(group_fps),
           args.prefix,
           suffix), file=sys.stderr)
示例#22
0
def main():  # pylint: disable=too-many-branches,too-many-statements
    start_time = time.time()
    parser = sanitize_help(get_parser())
    args = parser.parse_args()

    configure_logging(args.quiet)
    report_on_config(args)

    report_fp = args.report
    force_single = args.force_single

    # check for similar filenames
    # if we're using a single output file only check for identical filenames
    # otherwise, check for identical BASE names as well.
    filenames = []
    basenames = []
    for pathfilename in args.input_filenames:
        filenames.append(pathfilename)
        if args.single_output_file:
            continue  # nothing more to worry about

        basename = os.path.basename(pathfilename)
        if basename in basenames:
            log_error('ERROR: Duplicate filename--Cannot handle this!')
            log_error('** Exiting!')
            sys.exit(1)

        basenames.append(basename)

    # check that files exist and there is sufficient output disk space.
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph is not None:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    # load or create counting table.
    if args.loadgraph:
        log_info('loading k-mer countgraph from {graph}', graph=args.loadgraph)
        countgraph1 = Countgraph.load(args.loadgraph)

    # load second counting table.
    if args.loadgraph2:
        log_info('loading k-mer countgraph from {graph}',
                 graph=args.loadgraph2)
        countgraph2 = Countgraph.load(args.loadgraph2)

    # make a list of all filenames and if they're paired or not;
    # if we don't know if they're paired, default to allowing but not
    # forcing pairing.
    files = []
    for element in filenames:
        files.append([element, args.paired])
    if args.unpaired_reads:
        files.append([args.unpaired_reads, False])

    #
    # main loop: iterate over all files given, do diginorm.
    #

    for filename, require_paired in files:
        if not args.single_output_file:
            output_name = os.path.basename(filename) + '.keep'
            outfp = open(output_name, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        screed_iter = clean_input_reads(screed.open(filename))
        reader = broken_paired_reader(screed_iter,
                                      min_length=args.ksize,
                                      force_single=force_single,
                                      require_paired=require_paired)

        # actually do diginorm
        for _, is_paired, read0, read1 in reader:
            for record in snarf(is_paired, read0, read1, countgraph1,
                                countgraph2):
                if record is not None:
                    write_record(record, outfp)

    print("--- %s seconds ---" % (time.time() - start_time))
示例#23
0
def main():  # pylint: disable=too-many-branches,too-many-statements
    parser = sanitize_help(get_parser())
    args = parser.parse_args()

    configure_logging(args.quiet)
    report_on_config(args)

    report_fp = args.report
    force_single = args.force_single

    # check for similar filenames
    # if we're using a single output file only check for identical filenames
    # otherwise, check for identical BASE names as well.
    filenames = []
    basenames = []
    for pathfilename in args.input_filenames:
        filenames.append(pathfilename)
        if args.single_output_file:
            continue  # nothing more to worry about

        basename = os.path.basename(pathfilename)
        if basename in basenames:
            log_error('ERROR: Duplicate filename--Cannot handle this!')
            log_error('** Exiting!')
            sys.exit(1)

        basenames.append(basename)

    # check that files exist and there is sufficient output disk space.
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph is not None:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    # load or create counting table.
    if args.loadgraph:
        log_info('loading k-mer countgraph from {graph}', graph=args.loadgraph)
        countgraph = khmer.load_countgraph(args.loadgraph)
    else:
        log_info('making countgraph')
        countgraph = khmer_args.create_countgraph(args)

    # create an object to handle diginorm of all files
    norm = Normalizer(args.cutoff, countgraph)
    with_diagnostics = WithDiagnostics(norm, report_fp, args.report_frequency)

    # make a list of all filenames and if they're paired or not;
    # if we don't know if they're paired, default to allowing but not
    # forcing pairing.
    files = []
    for element in filenames:
        files.append([element, args.paired])
    if args.unpaired_reads:
        files.append([args.unpaired_reads, False])

    corrupt_files = []
    outfp = None
    output_name = None

    if args.single_output_file:
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)
    else:
        if '-' in filenames or '/dev/stdin' in filenames:
            print(
                "Accepting input from stdin; output filename must "
                "be provided with '-o'.",
                file=sys.stderr)
            sys.exit(1)

    #
    # main loop: iterate over all files given, do diginorm.
    #

    for filename, require_paired in files:
        if not args.single_output_file:
            output_name = os.path.basename(filename) + '.keep'
            outfp = open(output_name, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        # failsafe context manager in case an input file breaks
        with catch_io_errors(filename, outfp, args.single_output_file,
                             args.force, corrupt_files):
            screed_iter = clean_input_reads(screed.open(filename))
            reader = broken_paired_reader(screed_iter,
                                          min_length=args.ksize,
                                          force_single=force_single,
                                          require_paired=require_paired)

            # actually do diginorm
            for record in with_diagnostics(reader, filename):
                if record is not None:
                    write_record(record, outfp)

            log_info('output in {name}', name=describe_file_handle(outfp))
            if not args.single_output_file:
                outfp.close()

    # finished - print out some diagnostics.

    log_info('Total number of unique k-mers: {umers}',
             umers=countgraph.n_unique_kmers())

    if args.savegraph is not None:
        log_info('...saving to {name}', name=args.savegraph)
        countgraph.save(args.savegraph)

    fp_rate = \
        khmer.calc_expected_collisions(countgraph, False, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975

    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    if args.force and len(corrupt_files) > 0:
        log_error("** WARNING: Finished with errors!")
        log_error("** I/O Errors occurred in the following files:")
        log_error("\t" + " ".join(corrupt_files))
示例#24
0
def main():
    args = sanitize_help(get_parser()).parse_args()
    if not args.quiet:
        info('filter-abund.py', ['counting'])

    configure_logging(args.quiet)

    infiles = args.input_filename
    if ('-' in infiles or '/dev/stdin' in infiles) and not \
       args.single_output_file:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    for filename in infiles:
        check_input_files(filename, args.force)

    check_space(infiles, args.force)

    log_info('loading countgraph: {graph}', graph=args.input_graph)
    countgraph = khmer.load_countgraph(args.input_graph)
    ksize = countgraph.ksize()

    log_info("K: {ksize}", ksize=ksize)

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
        seqN = seq.replace('N', 'A')

        if args.variable_coverage:  # only trim when sequence has high enough C
            med, _, _ = countgraph.get_median_count(seqN)
            if med < args.normalize_to:
                return name, seq

        _, trim_at = countgraph.trim_on_abundance(seqN, args.cutoff)

        if trim_at >= ksize:
            # be sure to not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]

        return None, None

    if args.single_output_file:
        outfile = args.single_output_file.name
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)

    # the filtering loop
    for infile in infiles:
        log_info('filtering {infile}', infile=infile)
        if not args.single_output_file:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = open(outfile, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        tsp = ThreadedSequenceProcessor(process_fn, n_workers=args.threads,
                                        verbose=not args.quiet)
        tsp.start(verbose_loader(infile), outfp)

        log_info('output in {outfile}', outfile=outfile)
示例#25
0
def main():
    info('filter-abund-single.py', ['counting', 'SeqAn'])
    args = sanitize_help(get_parser()).parse_args()

    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)

    if args.savegraph:
        tablesize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, tablesize, args.force)

    report_on_config(args)

    print('making countgraph', file=sys.stderr)
    graph = khmer_args.create_countgraph(args)

    # first, load reads into graph
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    print('consuming input, round 1 --', args.datafile, file=sys.stderr)
    for _ in range(args.threads):
        cur_thread = \
            threading.Thread(
                target=graph.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    print('Total number of unique k-mers: {0}'.format(graph.n_unique_kmers()),
          file=sys.stderr)

    fp_rate = khmer.calc_expected_collisions(graph, args.force)
    print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
        seqN = seq.replace('N', 'A')

        _, trim_at = graph.trim_on_abundance(seqN, args.cutoff)

        if trim_at >= args.ksize:
            # be sure to not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]

        return None, None

    # the filtering loop
    print('filtering', args.datafile, file=sys.stderr)
    outfile = os.path.basename(args.datafile) + '.abundfilt'
    outfile = open(outfile, 'wb')
    outfp = get_file_writer(outfile, args.gzip, args.bzip)

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(args.datafile), outfp)

    print('output in', outfile.name, file=sys.stderr)

    if args.savegraph:
        print('Saving k-mer countgraph filename',
              args.savegraph,
              file=sys.stderr)
        graph.save(args.savegraph)
示例#26
0
def main():
    sys.argv = parse_humanfriendly_mem(sys.argv)

    info('filter-abund-single.py', ['counting', 'SeqAn'])
    args = sanitize_help(get_parser()).parse_args()

    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)

    if args.savegraph:
        tablesize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, tablesize, args.force)

    report_on_config(args)

    print('making countgraph', file=sys.stderr)
    graph = khmer_args.create_countgraph(args)

    # first, load reads into graph
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    print('consuming input, round 1 --', args.datafile, file=sys.stderr)
    for _ in range(args.threads):
        cur_thread = \
            threading.Thread(
                target=graph.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    print('Total number of unique k-mers: {0}'.format(
        graph.n_unique_kmers()), file=sys.stderr)

    fp_rate = khmer.calc_expected_collisions(graph, args.force)
    print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
        seqN = seq.replace('N', 'A')

        _, trim_at = graph.trim_on_abundance(seqN, args.cutoff)

        if trim_at >= args.ksize:
            # be sure to not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]

        return None, None

    # the filtering loop
    print('filtering', args.datafile, file=sys.stderr)
    outfile = os.path.basename(args.datafile) + '.abundfilt'
    outfile = open(outfile, 'wb')
    outfp = get_file_writer(outfile, args.gzip, args.bzip)

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(args.datafile), outfp)

    print('output in', outfile.name, file=sys.stderr)

    if args.savegraph:
        print('Saving k-mer countgraph filename',
              args.savegraph, file=sys.stderr)
        graph.save(args.savegraph)
def main():  # pylint: disable=too-many-locals,too-many-branches
    info('extract-partitions.py', ['graph'])
    args = sanitize_help(get_parser()).parse_args()

    distfilename = args.prefix + '.dist'

    n_unassigned = 0

    for infile in args.part_filenames:
        check_input_files(infile, args.force)

    check_space(args.part_filenames, args.force)

    print('---', file=sys.stderr)
    print('reading partitioned files:',
          repr(args.part_filenames),
          file=sys.stderr)
    if args.output_groups:
        print('outputting to files named "%s.groupN.fa"' % args.prefix,
              file=sys.stderr)
        print('min reads to keep a partition:',
              args.min_part_size,
              file=sys.stderr)
        print('max size of a group file:', args.max_size, file=sys.stderr)
    else:
        print('NOT outputting groups! Beware!', file=sys.stderr)

    if args.output_unassigned:
        print('outputting unassigned reads to "%s.unassigned.fa"' %
              args.prefix,
              file=sys.stderr)
    print('partition size distribution will go to %s' % distfilename,
          file=sys.stderr)
    print('---', file=sys.stderr)

    #

    suffix = 'fa'
    is_fastq = False

    for index, read, pid in read_partition_file(args.part_filenames[0]):
        if hasattr(read, 'quality'):
            suffix = 'fq'
            is_fastq = True
        break

    for filename in args.part_filenames:
        for index, read, pid in read_partition_file(filename):
            if is_fastq:
                assert hasattr(read, 'quality'), \
                    "all input files must be FASTQ if the first one is"
            else:
                assert not hasattr(read, 'quality'), \
                    "all input files must be FASTA if the first one is"

            break

    if args.output_unassigned:
        ofile = open('%s.unassigned.%s' % (args.prefix, suffix), 'wb')
        unassigned_fp = get_file_writer(ofile, args.gzip, args.bzip)
    count = {}
    for filename in args.part_filenames:
        for index, read, pid in read_partition_file(filename):
            if index % 100000 == 0:
                print('...', index, file=sys.stderr)

            count[pid] = count.get(pid, 0) + 1

            if pid == 0:
                n_unassigned += 1
                if args.output_unassigned:
                    write_record(read, unassigned_fp)

    if args.output_unassigned:
        unassigned_fp.close()

    if 0 in count:  # eliminate unpartitioned sequences
        del count[0]

    # develop histogram of partition sizes
    dist = {}
    for pid, size in list(count.items()):
        dist[size] = dist.get(size, 0) + 1

    # output histogram
    distfp = open(distfilename, 'w')

    total = 0
    wtotal = 0
    for counter, index in sorted(dist.items()):
        total += index
        wtotal += counter * index
        distfp.write('%d %d %d %d\n' % (counter, index, total, wtotal))
    distfp.close()

    if not args.output_groups:
        sys.exit(0)

    # sort groups by size
    divvy = sorted(list(count.items()), key=lambda y: y[1])
    divvy = [y for y in divvy if y[1] > args.min_part_size]

    # divvy up into different groups, based on having max_size sequences
    # in each group.
    total = 0
    group = set()
    group_n = 0
    group_d = {}
    for partition_id, n_reads in divvy:
        group.add(partition_id)
        total += n_reads

        if total > args.max_size:
            for partition_id in group:
                group_d[partition_id] = group_n
                # print 'group_d', partition_id, group_n

            group_n += 1
            group = set()
            total = 0

    if group:
        for partition_id in group:
            group_d[partition_id] = group_n
            # print 'group_d', partition_id, group_n
        group_n += 1

    print('%d groups' % group_n, file=sys.stderr)
    if group_n == 0:
        print('nothing to output; exiting!', file=sys.stderr)
        return

    # open a bunch of output files for the different groups
    group_fps = {}
    for index in range(group_n):
        fname = '%s.group%04d.%s' % (args.prefix, index, suffix)
        group_fp = get_file_writer(open(fname, 'wb'), args.gzip, args.bzip)
        group_fps[index] = group_fp

    # write 'em all out!

    total_seqs = 0
    part_seqs = 0
    toosmall_parts = 0
    for filename in args.part_filenames:
        for index, read, partition_id in read_partition_file(filename):
            total_seqs += 1
            if index % 100000 == 0:
                print('...x2', index, file=sys.stderr)

            if partition_id == 0:
                continue

            try:
                group_n = group_d[partition_id]
            except KeyError:
                assert count[partition_id] <= args.min_part_size
                toosmall_parts += 1
                continue

            outfp = group_fps[group_n]

            write_record(read, outfp)
            part_seqs += 1

    print('---', file=sys.stderr)
    print('Of %d total seqs,' % total_seqs, file=sys.stderr)
    print('extracted %d partitioned seqs into group files,' % part_seqs,
          file=sys.stderr)
    print('discarded %d sequences from small partitions (see -m),' %
          toosmall_parts,
          file=sys.stderr)
    print('and found %d unpartitioned sequences (see -U).' % n_unassigned,
          file=sys.stderr)
    print('', file=sys.stderr)
    print('Created %d group files named %s.groupXXXX.%s' %
          (len(group_fps), args.prefix, suffix),
          file=sys.stderr)
示例#28
0
def main():
    parser = sanitize_help(get_parser())
    args = parser.parse_args()
    if not args.quiet:
        info('trim-low-abund.py', ['streaming'])

    configure_logging(args.quiet)

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        log_error("Error: Cannot input the same filename multiple times.")
        sys.exit(1)

    if args.trim_at_coverage != DEFAULT_TRIM_AT_COVERAGE and \
       not args.variable_coverage:
        log_error("Error: --trim-at-coverage/-Z given, but "
                  "--variable-coverage/-V not specified.")
        sys.exit(1)

    if args.diginorm_coverage != DEFAULT_DIGINORM_COVERAGE and \
       not args.diginorm:
        log_error("Error: --diginorm-coverage given, but "
                  "--diginorm not specified.")
        sys.exit(1)

    if args.diginorm and args.single_pass:
        log_error("Error: --diginorm and --single-pass are incompatible!\n"
                  "You probably want to use normalize-by-median.py instead.")
        sys.exit(1)

    ###

    report_on_config(args)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    if ('-' in args.input_filenames or '/dev/stdin' in args.input_filenames) \
       and not args.output:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    if args.loadgraph:
        log_info('loading countgraph from {graph}', graph=args.loadgraph)
        ct = khmer.load_countgraph(args.loadgraph)
    else:
        log_info('making countgraph')
        ct = khmer_args.create_countgraph(args)

    K = ct.ksize()
    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    log_info('created temporary directory {temp};\n'
             'use -T to change location', temp=tempdir)

    trimmer = Trimmer(ct, not args.variable_coverage, args.cutoff,
                      args.trim_at_coverage)
    if args.diginorm:
        trimmer.set_diginorm(args.diginorm_coverage)

    # ### FIRST PASS ###

    save_pass2_total = 0

    written_bp = 0
    written_reads = 0

    # only create the file writer once if outfp is specified; otherwise,
    # create it for each file.
    if args.output:
        trimfp = get_file_writer(args.output, args.gzip, args.bzip)

    pass2list = []
    for filename in args.input_filenames:
        # figure out temporary filename for 2nd pass
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        pass2fp = open(pass2filename, 'w')

        # construct output filenames
        if args.output is None:
            # note: this will be saved in trimfp.
            outfp = open(os.path.basename(filename) + '.abundtrim', 'wb')

            # get file handle w/gzip, bzip
            trimfp = get_file_writer(outfp, args.gzip, args.bzip)

        # record all this info
        pass2list.append((filename, pass2filename, trimfp))

        # input file stuff: get a broken_paired reader.
        screed_iter = screed.open(filename)
        paired_iter = broken_paired_reader(screed_iter, min_length=K,
                                           force_single=args.ignore_pairs)

        # main loop through the file.
        n_start = trimmer.n_reads
        save_start = trimmer.n_saved

        watermark = REPORT_EVERY_N_READS
        for read in trimmer.pass1(paired_iter, pass2fp):
            if (trimmer.n_reads - n_start) > watermark:
                log_info("... {filename} {n_saved} {n_reads} {n_bp} "
                         "{w_reads} {w_bp}", filename=filename,
                         n_saved=trimmer.n_saved, n_reads=trimmer.n_reads,
                         n_bp=trimmer.n_bp, w_reads=written_reads,
                         w_bp=written_bp)
                watermark += REPORT_EVERY_N_READS

            # write out the trimmed/etc sequences that AREN'T going to be
            # revisited in a 2nd pass.
            write_record(read, trimfp)
            written_bp += len(read)
            written_reads += 1
        pass2fp.close()

        log_info("{filename}: kept aside {kept} of {total} from first pass",
                 filename=filename, kept=trimmer.n_saved - save_start,
                 total=trimmer.n_reads - n_start)

    # first pass goes across all the data, so record relevant stats...
    n_reads = trimmer.n_reads
    n_bp = trimmer.n_bp
    n_skipped = trimmer.n_skipped
    bp_skipped = trimmer.bp_skipped
    save_pass2_total = trimmer.n_saved

    # ### SECOND PASS. ###

    # nothing should have been skipped yet!
    assert trimmer.n_skipped == 0
    assert trimmer.bp_skipped == 0

    if args.single_pass:
        pass2list = []

    # go back through all the files again.
    for _, pass2filename, trimfp in pass2list:
        log_info('second pass: looking at sequences kept aside in {pass2}',
                 pass2=pass2filename)

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in contrast
        # to the first loop.  Hence, force_single=True below.

        screed_iter = screed.open(pass2filename, parse_description=False)
        paired_iter = broken_paired_reader(screed_iter, min_length=K,
                                           force_single=True)

        watermark = REPORT_EVERY_N_READS
        for read in trimmer.pass2(paired_iter):
            if (trimmer.n_reads - n_start) > watermark:
                log_info('... x 2 {a} {b} {c} {d} {e} {f} {g}',
                         a=trimmer.n_reads - n_start,
                         b=pass2filename, c=trimmer.n_saved,
                         d=trimmer.n_reads, e=trimmer.n_bp,
                         f=written_reads, g=written_bp)
                watermark += REPORT_EVERY_N_READS

            write_record(read, trimfp)
            written_reads += 1
            written_bp += len(read)

        log_info('removing {pass2}', pass2=pass2filename)
        os.unlink(pass2filename)

        # if we created our own trimfps, close 'em.
        if not args.output:
            trimfp.close()

    log_info('removing temp directory & contents ({temp})', temp=tempdir)
    shutil.rmtree(tempdir)

    trimmed_reads = trimmer.trimmed_reads

    n_passes = 1.0 + (float(save_pass2_total) / n_reads)
    percent_reads_trimmed = float(trimmed_reads + (n_reads - written_reads)) /\
        n_reads * 100.0

    log_info('read {read} reads, {bp} bp', read=n_reads, bp=n_bp)
    log_info('wrote {wr} reads, {wbp} bp', wr=written_reads, wbp=written_bp)
    log_info('looked at {st} reads twice ({np:.2f} passes)',
             st=save_pass2_total, np=n_passes)
    log_info('removed {r} reads and trimmed {t} reads ({p:.2f}%)',
             r=n_reads - written_reads, t=trimmed_reads,
             p=percent_reads_trimmed)
    log_info('trimmed or removed {p:.2f}%% of bases ({bp} total)',
             p=(1 - (written_bp / float(n_bp))) * 100.0, bp=n_bp - written_bp)

    if args.variable_coverage:
        percent_reads_hicov = 100.0 * float(n_reads - n_skipped) / n_reads
        log_info('{n} reads were high coverage ({p:.2f}%);',
                 n=n_reads - n_skipped, p=percent_reads_hicov)
        log_info('skipped {r} reads/{bp} bases because of low coverage',
                 r=n_skipped, bp=bp_skipped)

    fp_rate = \
        khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    log_info('output in *.abundtrim')

    if args.savegraph:
        log_info("Saving k-mer countgraph to {graph}", graph=args.savegraph)
        ct.save(args.savegraph)
示例#29
0
def main():
    info('split-paired-reads.py')
    args = sanitize_help(get_parser()).parse_args()

    infile = args.infile

    filenames = [infile]
    check_input_files(infile, args.force)
    check_space(filenames, args.force)

    basename = os.path.basename(infile)

    # decide where to put output files - specific directory? or just default?
    if infile in ('/dev/stdin', '-'):
        if not (args.output_first and args.output_second):
            print(
                "Accepting input from stdin; "
                "output filenames must be provided.",
                file=sys.stderr)
            sys.exit(1)
    elif args.output_directory:
        if not os.path.exists(args.output_directory):
            os.makedirs(args.output_directory)
        out1 = os.path.join(args.output_directory, basename + '.1')
        out2 = os.path.join(args.output_directory, basename + '.2')
    else:
        out1 = basename + '.1'
        out2 = basename + '.2'

    # OVERRIDE output file locations with -1, -2
    if args.output_first:
        fp_out1 = get_file_writer(args.output_first, args.gzip, args.bzip)
        out1 = fp_out1.name
    else:
        # Use default filename created above
        fp_out1 = get_file_writer(open(out1, 'wb'), args.gzip, args.bzip)
    if args.output_second:
        fp_out2 = get_file_writer(args.output_second, args.gzip, args.bzip)
        out2 = fp_out2.name
    else:
        # Use default filename created above
        fp_out2 = get_file_writer(open(out2, 'wb'), args.gzip, args.bzip)

    # put orphaned reads here, if -0!
    if args.output_orphaned:
        fp_out0 = get_file_writer(args.output_orphaned, args.gzip, args.bzip)
        out0 = describe_file_handle(args.output_orphaned)

    counter1 = 0
    counter2 = 0
    counter3 = 0
    index = None

    screed_iter = screed.open(infile)

    # walk through all the reads in broken-paired mode.
    paired_iter = broken_paired_reader(screed_iter,
                                       require_paired=not args.output_orphaned)

    try:
        for index, is_pair, record1, record2 in paired_iter:
            if index % 10000 == 0:
                print('...', index, file=sys.stderr)

            if is_pair:
                write_record(record1, fp_out1)
                counter1 += 1
                write_record(record2, fp_out2)
                counter2 += 1
            elif args.output_orphaned:
                write_record(record1, fp_out0)
                counter3 += 1
    except UnpairedReadsError as e:
        print("Unpaired reads found starting at {name}; exiting".format(
            name=e.read1.name),
              file=sys.stderr)
        sys.exit(1)

    print("DONE; split %d sequences (%d left, %d right, %d orphans)" %
          (counter1 + counter2, counter1, counter2, counter3),
          file=sys.stderr)
    print("/1 reads in %s" % out1, file=sys.stderr)
    print("/2 reads in %s" % out2, file=sys.stderr)
    if args.output_orphaned:
        print("orphans in %s" % out0, file=sys.stderr)
示例#30
0
def main():
    info('extract-partitions.py', ['graph'])
    args = sanitize_help(get_parser()).parse_args()

    distfilename = args.prefix + '.dist'

    for infile in args.part_filenames:
        check_input_files(infile, args.force)

    check_space(args.part_filenames, args.force)

    print('---', file=sys.stderr)
    print('reading partitioned files:',
          repr(args.part_filenames),
          file=sys.stderr)
    if args.output_groups:
        print('outputting to files named "%s.groupN.fa"' % args.prefix,
              file=sys.stderr)
        print('min reads to keep a partition:',
              args.min_part_size,
              file=sys.stderr)
        print('max size of a group file:', args.max_size, file=sys.stderr)
    else:
        print('NOT outputting groups! Beware!', file=sys.stderr)

    if args.output_unassigned:
        print('outputting unassigned reads to "%s.unassigned.fa"' %
              args.prefix,
              file=sys.stderr)
    print('partition size distribution will go to %s' % distfilename,
          file=sys.stderr)
    print('---', file=sys.stderr)

    #

    suffix = None
    is_fastq = None

    with PartitionedReader(args.part_filenames, True, True) as reader:
        for read, _ in reader:
            if is_fastq is None:
                is_fastq = hasattr(read, 'quality')
            else:
                assert hasattr(read, 'quality') == is_fastq,\
                    "Input files must have consistent format."

    if is_fastq:
        suffix = "fq"
    else:
        suffix = "fa"

    # remember folks, generators exhaust themseleves
    extractor = PartitionExtractor(args.part_filenames, args.min_part_size,
                                   args.max_size)

    if args.output_unassigned:
        ofile = open('%s.unassigned.%s' % (args.prefix, suffix), 'wb')
        unassigned_fp = get_file_writer(ofile, args.gzip, args.bzip)
        extractor.process_unassigned(unassigned_fp)
        unassigned_fp.close()
    else:
        extractor.process_unassigned()

    extractor.output_histogram(distfilename)

    if not args.output_groups:
        sys.exit(0)

    extractor.develop_groups()

    print('%d groups' % extractor.group_n, file=sys.stderr)
    if extractor.group_n == 0:
        print('nothing to output; exiting!', file=sys.stderr)
        return

    # open a bunch of output files for the different groups
    group_fps = {}
    for index in range(extractor.group_n):
        fname = '%s.group%04d.%s' % (args.prefix, index, suffix)
        group_fp = get_file_writer(open(fname, 'wb'), args.gzip, args.bzip)
        group_fps[index] = group_fp

    # write 'em all out!
    # refresh the generator
    read_generator = PartitionExtractor.ReadGroupGenerator(extractor)

    with PartitionedReader(args.part_filenames) as reader:
        for read, group_n in read_generator(reader):
            outfp = group_fps[group_n]
            write_record(read, outfp)

    print('---', file=sys.stderr)
    print('Of %d total seqs,' % read_generator.total_seqs, file=sys.stderr)
    print('extracted %d partitioned seqs into group files,' %
          read_generator.part_seqs,
          file=sys.stderr)
    print('discarded %d sequences from small partitions (see -m),' %
          read_generator.toosmall_parts,
          file=sys.stderr)
    print('and found %d unpartitioned sequences (see -U).' %
          extractor.n_unassigned,
          file=sys.stderr)
    print('', file=sys.stderr)
    print('Created %d group files named %s.groupXXXX.%s' %
          (len(group_fps), args.prefix, suffix),
          file=sys.stderr)
示例#31
0
def main():
    parser = get_parser()
    parser.epilog = parser.epilog.replace(
        "`reservoir sampling\n"
        "<http://en.wikipedia.org/wiki/Reservoir_sampling>`__ algorithm.",
        "reservoir sampling algorithm. "
        "http://en.wikipedia.org/wiki/Reservoir_sampling")
    args = sanitize_help(parser).parse_args()

    for name in args.filenames:
        check_input_files(name, args.force)

    # seed the random number generator?
    if args.random_seed:
        random.seed(args.random_seed)

    # bound n_samples
    num_samples = max(args.num_samples, 1)

    #
    # Figure out what the output filename is going to be

    if args.output_file:
        output_filename = args.output_file.name
        if num_samples > 1:
            sys.stderr.write(
                "Error: cannot specify -o with more than one sample.")
            if not args.force:
                print(
                    "NOTE: This can be overridden using the --force"
                    " argument",
                    file=sys.stderr)
                sys.exit(1)
    else:
        filename = args.filenames[0]
        if filename in ('/dev/stdin', '-'):
            print(
                "Accepting input from stdin; output filename must "
                "be provided with '-o'.",
                file=sys.stderr)
            sys.exit(1)
        output_filename = os.path.basename(filename) + '.subset'

    filename = args.filenames[0]
    if filename in ('/dev/stdin', '-'):
        # seqan only treats '-' as "read from stdin"
        filename = '-'

    if num_samples == 1:
        print('Subsampling %d reads using reservoir sampling.' %
              args.num_reads,
              file=sys.stderr)
        print('Subsampled reads will be placed in %s' % output_filename,
              file=sys.stderr)
        print('', file=sys.stderr)
    else:  # > 1
        print('Subsampling %d reads, %d times,' %
              (args.num_reads, num_samples),
              ' using reservoir sampling.',
              file=sys.stderr)
        print('Subsampled reads will be placed in %s.N' % output_filename,
              file=sys.stderr)
        print('', file=sys.stderr)

    reads = []
    for _ in range(num_samples):
        reads.append([])

    # read through all the sequences and load/resample the reservoir
    for filename in args.filenames:
        print('opening', filename, 'for reading', file=sys.stderr)

        for count, (_, _, rcrd1, rcrd2) in enumerate(
                broken_paired_reader(ReadParser(filename),
                                     force_single=args.force_single)):
            if count % 10000 == 0:
                print('...', count, 'reads scanned', file=sys.stderr)
                if count >= args.max_reads:
                    print('reached upper limit of %d reads' % args.max_reads,
                          '(see -M); exiting',
                          file=sys.stderr)
                    break

            # collect first N reads
            if count < args.num_reads:
                for sample in range(num_samples):
                    reads[sample].append((rcrd1, rcrd2))
            else:
                for sample in range(num_samples):
                    assert len(reads[sample]) <= count

                # use reservoir sampling to replace reads at random
                # see http://en.wikipedia.org/wiki/Reservoir_sampling

                for n in range(num_samples):
                    guess = random.randint(1, count)
                    if guess <= args.num_reads:
                        reads[n][guess - 1] = (rcrd1, rcrd2)

    # output all the subsampled reads:
    if len(reads) == 1:
        print('Writing %d sequences to %s' % (len(reads[0]), output_filename),
              file=sys.stderr)

        output_file = args.output_file
        if not output_file:
            output_file = open(output_filename, 'wb')

        output_file = get_file_writer(output_file, args.gzip, args.bzip)

        for records in reads[0]:
            write_record(records[0], output_file)
            if records[1] is not None:
                write_record(records[1], output_file)
    else:
        for n in range(num_samples):
            n_filename = output_filename + '.%d' % n
            print('Writing %d sequences to %s' % (len(reads[n]), n_filename),
                  file=sys.stderr)
            output_file = get_file_writer(open(n_filename, 'wb'), args.gzip,
                                          args.bzip)
            for records in reads[n]:
                write_record(records[0], output_file)
                if records[1] is not None:
                    write_record(records[1], output_file)
示例#32
0
def main():
    args = sanitize_help(get_parser()).parse_args()
    if not args.quiet:
        info('filter-abund-single.py', ['counting', 'SeqAn'])

    configure_logging(args.quiet)
    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)

    if args.savegraph:
        tablesize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, tablesize, args.force)

    report_on_config(args)

    log_info('making countgraph')
    graph = khmer_args.create_countgraph(args)

    # first, load reads into graph
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    log_info('consuming input, round 1 -- {datafile}', datafile=args.datafile)
    for _ in range(args.threads):
        cur_thread = \
            threading.Thread(
                target=graph.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    log_info('Total number of unique k-mers: {nk}', nk=graph.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(graph, args.force)
    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
        seqN = seq.replace('N', 'A')

        if args.variable_coverage:  # only trim when sequence has high enough C
            med, _, _ = graph.get_median_count(seqN)
            if med < args.normalize_to:
                return name, seq

        _, trim_at = graph.trim_on_abundance(seqN, args.cutoff)

        if trim_at >= args.ksize:
            # be sure to not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]

        return None, None

    # the filtering loop
    log_info('filtering {datafile}', datafile=args.datafile)
    if args.outfile is None:
        outfile = os.path.basename(args.datafile) + '.abundfilt'
    else:
        outfile = args.outfile
    outfp = open(outfile, 'wb')
    outfp = get_file_writer(outfp, args.gzip, args.bzip)

    tsp = ThreadedSequenceProcessor(process_fn, verbose=not args.quiet)
    tsp.start(verbose_loader(args.datafile), outfp)

    log_info('output in {outfile}', outfile=outfile)

    if args.savegraph:
        log_info('Saving k-mer countgraph filename {graph}',
                 graph=args.savegraph)
        graph.save(args.savegraph)
def main():
    info('sample-reads-randomly.py')
    args = get_parser().parse_args()

    for _ in args.filenames:
        check_input_files(_, args.force)

    # seed the random number generator?
    if args.random_seed:
        random.seed(args.random_seed)

    # bound n_samples
    num_samples = max(args.num_samples, 1)

    #
    # Figure out what the output filename is going to be

    if args.output_file:
        output_filename = args.output_file.name
        if num_samples > 1:
            sys.stderr.write(
                "Error: cannot specify -o with more than one sample.")
            if not args.force:
                print("NOTE: This can be overridden using the --force"
                      " argument", file=sys.stderr)
                sys.exit(1)
    else:
        filename = args.filenames[0]
        if filename in ('/dev/stdin', '-'):
            print("Accepting input from stdin; output filename must "
                  "be provided with '-o'.", file=sys.stderr)
            sys.exit(1)
        output_filename = os.path.basename(filename) + '.subset'

    if num_samples == 1:
        print('Subsampling %d reads using reservoir sampling.' %
              args.num_reads, file=sys.stderr)
        print('Subsampled reads will be placed in %s' %
              output_filename, file=sys.stderr)
        print('', file=sys.stderr)
    else:  # > 1
        print('Subsampling %d reads, %d times,'
              % (args.num_reads, num_samples), ' using reservoir sampling.',
              file=sys.stderr)
        print('Subsampled reads will be placed in %s.N'
              % output_filename, file=sys.stderr)
        print('', file=sys.stderr)

    reads = []
    for n in range(num_samples):
        reads.append([])

    # read through all the sequences and load/resample the reservoir
    for filename in args.filenames:
        print('opening', filename, 'for reading', file=sys.stderr)
        screed_iter = screed.open(filename)

        for count, (_, ispair, rcrd1, rcrd2) in enumerate(broken_paired_reader(
                screed_iter,
                force_single=args.force_single)):
            if count % 10000 == 0:
                print('...', count, 'reads scanned', file=sys.stderr)
                if count >= args.max_reads:
                    print('reached upper limit of %d reads' %
                          args.max_reads, '(see -M); exiting', file=sys.stderr)
                    break

            # collect first N reads
            if count < args.num_reads:
                for n in range(num_samples):
                    reads[n].append((rcrd1, rcrd2))
            else:
                assert len(reads[n]) <= count

                # use reservoir sampling to replace reads at random
                # see http://en.wikipedia.org/wiki/Reservoir_sampling

                for n in range(num_samples):
                    guess = random.randint(1, count)
                    if guess <= args.num_reads:
                        reads[n][guess - 1] = (rcrd1, rcrd2)

    # output all the subsampled reads:
    if len(reads) == 1:
        print('Writing %d sequences to %s' %
              (len(reads[0]), output_filename), file=sys.stderr)

        output_file = args.output_file
        if not output_file:
            output_file = open(output_filename, 'wb')

        output_file = get_file_writer(output_file, args.gzip, args.bzip)

        for records in reads[0]:
            write_record(records[0], output_file)
            if records[1] is not None:
                write_record(records[1], output_file)
    else:
        for n in range(num_samples):
            n_filename = output_filename + '.%d' % n
            print('Writing %d sequences to %s' %
                  (len(reads[n]), n_filename), file=sys.stderr)
            output_file = get_file_writer(open(n_filename, 'wb'), args.gzip,
                                          args.bzip)
            for records in reads[n]:
                write_record(records[0], output_file)
                if records[1] is not None:
                    write_record(records[1], output_file)
def main():
    info('trim-low-abund.py', ['streaming'])
    parser = sanitize_help(get_parser())
    args = parser.parse_args()

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        print("Error: Cannot input the same filename multiple times.",
              file=sys.stderr)
        sys.exit(1)

    ###

    report_on_config(args)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    if ('-' in args.input_filenames or '/dev/stdin' in args.input_filenames) \
       and not args.output:
        print(
            "Accepting input from stdin; output filename must "
            "be provided with -o.",
            file=sys.stderr)
        sys.exit(1)

    if args.loadgraph:
        print('loading countgraph from', args.loadgraph, file=sys.stderr)
        ct = khmer.load_countgraph(args.loadgraph)
    else:
        print('making countgraph', file=sys.stderr)
        ct = khmer_args.create_countgraph(args)

    K = ct.ksize()
    CUTOFF = args.cutoff
    NORMALIZE_LIMIT = args.normalize_to

    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    print('created temporary directory %s; '
          'use -T to change location' % tempdir,
          file=sys.stderr)

    # ### FIRST PASS ###

    save_pass2_total = 0

    n_bp = 0
    n_reads = 0
    written_bp = 0
    written_reads = 0
    trimmed_reads = 0

    pass2list = []
    for filename in args.input_filenames:
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        if args.output is None:
            trimfp = get_file_writer(
                open(os.path.basename(filename) + '.abundtrim', 'wb'),
                args.gzip, args.bzip)
        else:
            trimfp = get_file_writer(args.output, args.gzip, args.bzip)

        pass2list.append((filename, pass2filename, trimfp))

        screed_iter = screed.open(filename)
        pass2fp = open(pass2filename, 'w')

        save_pass2 = 0
        n = 0

        paired_iter = broken_paired_reader(screed_iter,
                                           min_length=K,
                                           force_single=args.ignore_pairs)
        for n, is_pair, read1, read2 in paired_iter:
            if n % 10000 == 0:
                print('...',
                      n,
                      filename,
                      save_pass2,
                      n_reads,
                      n_bp,
                      written_reads,
                      written_bp,
                      file=sys.stderr)

            # we want to track paired reads here, to make sure that pairs
            # are not split between first pass and second pass.

            if is_pair:
                n_reads += 2
                n_bp += len(read1.sequence) + len(read2.sequence)

                seq1 = read1.sequence.replace('N', 'A')
                seq2 = read2.sequence.replace('N', 'A')

                med1, _, _ = ct.get_median_count(seq1)
                med2, _, _ = ct.get_median_count(seq2)

                if med1 < NORMALIZE_LIMIT or med2 < NORMALIZE_LIMIT:
                    ct.consume(seq1)
                    ct.consume(seq2)
                    write_record_pair(read1, read2, pass2fp)
                    save_pass2 += 2
                else:
                    _, trim_at1 = ct.trim_on_abundance(seq1, CUTOFF)
                    _, trim_at2 = ct.trim_on_abundance(seq2, CUTOFF)

                    if trim_at1 >= K:
                        read1 = trim_record(read1, trim_at1)

                    if trim_at2 >= K:
                        read2 = trim_record(read2, trim_at2)

                    if trim_at1 != len(seq1):
                        trimmed_reads += 1
                    if trim_at2 != len(seq2):
                        trimmed_reads += 1

                    write_record_pair(read1, read2, trimfp)
                    written_reads += 2
                    written_bp += trim_at1 + trim_at2
            else:
                n_reads += 1
                n_bp += len(read1.sequence)

                seq = read1.sequence.replace('N', 'A')

                med, _, _ = ct.get_median_count(seq)

                # has this portion of the graph saturated? if not,
                # consume & save => pass2.
                if med < NORMALIZE_LIMIT:
                    ct.consume(seq)
                    write_record(read1, pass2fp)
                    save_pass2 += 1
                else:  # trim!!
                    _, trim_at = ct.trim_on_abundance(seq, CUTOFF)
                    if trim_at >= K:
                        new_read = trim_record(read1, trim_at)
                        write_record(new_read, trimfp)

                        written_reads += 1
                        written_bp += trim_at

                        if trim_at != len(read1.sequence):
                            trimmed_reads += 1

        pass2fp.close()

        print('%s: kept aside %d of %d from first pass, in %s' %
              (filename, save_pass2, n, filename),
              file=sys.stderr)
        save_pass2_total += save_pass2

    # ### SECOND PASS. ###

    skipped_n = 0
    skipped_bp = 0
    for _, pass2filename, trimfp in pass2list:
        print('second pass: looking at sequences kept aside in %s' %
              pass2filename,
              file=sys.stderr)

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in contrast
        # to the first loop.

        for n, read in enumerate(screed.open(pass2filename)):
            if n % 10000 == 0:
                print('... x 2',
                      n,
                      pass2filename,
                      written_reads,
                      written_bp,
                      file=sys.stderr)

            seq = read.sequence.replace('N', 'A')
            med, _, _ = ct.get_median_count(seq)

            # do we retain low-abundance components unchanged?
            if med < NORMALIZE_LIMIT and args.variable_coverage:
                write_record(read, trimfp)

                written_reads += 1
                written_bp += len(read.sequence)
                skipped_n += 1
                skipped_bp += len(read.sequence)

            # otherwise, examine/trim/truncate.
            else:  # med >= NORMALIZE LIMIT or not args.variable_coverage
                _, trim_at = ct.trim_on_abundance(seq, CUTOFF)
                if trim_at >= K:
                    new_read = trim_record(read, trim_at)
                    write_record(new_read, trimfp)

                    written_reads += 1
                    written_bp += trim_at

                    if trim_at != len(read.sequence):
                        trimmed_reads += 1

        print('removing %s' % pass2filename, file=sys.stderr)
        os.unlink(pass2filename)

    print('removing temp directory & contents (%s)' % tempdir, file=sys.stderr)
    shutil.rmtree(tempdir)

    n_passes = 1.0 + (float(save_pass2_total) / n_reads)
    percent_reads_trimmed = float(trimmed_reads + (n_reads - written_reads)) /\
        n_reads * 100.0

    print('read %d reads, %d bp' % (
        n_reads,
        n_bp,
    ), file=sys.stderr)
    print('wrote %d reads, %d bp' % (
        written_reads,
        written_bp,
    ),
          file=sys.stderr)
    print('looked at %d reads twice (%.2f passes)' %
          (save_pass2_total, n_passes),
          file=sys.stderr)
    print('removed %d reads and trimmed %d reads (%.2f%%)' %
          (n_reads - written_reads, trimmed_reads, percent_reads_trimmed),
          file=sys.stderr)
    print('trimmed or removed %.2f%% of bases (%d total)' %
          ((1 - (written_bp / float(n_bp))) * 100.0, n_bp - written_bp),
          file=sys.stderr)

    if args.variable_coverage:
        percent_reads_hicov = 100.0 * float(n_reads - skipped_n) / n_reads
        print('%d reads were high coverage (%.2f%%);' %
              (n_reads - skipped_n, percent_reads_hicov),
              file=sys.stderr)
        print('skipped %d reads/%d bases because of low coverage' %
              (skipped_n, skipped_bp),
              file=sys.stderr)

    fp_rate = \
        khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate),
          file=sys.stderr)

    print('output in *.abundtrim', file=sys.stderr)

    if args.savegraph:
        print("Saving k-mer countgraph to", args.savegraph, file=sys.stderr)
        ct.save(args.savegraph)