def main():
    args = sanitize_help(get_parser()).parse_args()
    configure_logging(args.quiet)

    infiles = args.input_filename
    if ('-' in infiles or '/dev/stdin' in infiles) and not \
       args.single_output_file:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    for filename in infiles:
        check_input_files(filename, args.force)

    check_space(infiles, args.force)

    log_info('loading countgraph: {graph}', graph=args.input_graph)
    countgraph = khmer.load_countgraph(args.input_graph)
    ksize = countgraph.ksize()

    log_info("K: {ksize}", ksize=ksize)

    if args.single_output_file:
        outfile = args.single_output_file.name
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)

    # the filtering loop
    for infile in infiles:
        log_info('filtering {infile}', infile=infile)
        if not args.single_output_file:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = open(outfile, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        paired_iter = broken_paired_reader(ReadParser(infile),
                                           min_length=ksize,
                                           force_single=True)

        for n, is_pair, read1, read2 in paired_iter:
            assert not is_pair
            assert read2 is None

            trimmed_record, _ = trim_record(countgraph, read1, args.cutoff,
                                            args.variable_coverage,
                                            args.normalize_to)
            if trimmed_record:
                write_record(trimmed_record, outfp)

        log_info('output in {outfile}', outfile=outfile)
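
# `trim_record` bundles the variable-coverage check and abundance trimming
# that an older version of this script (the ThreadedSequenceProcessor
# variant further below) spells out inline.  A rough sketch of the
# behaviour the loop above relies on -- illustrative only, not khmer's
# implementation:
def trim_record_sketch(countgraph, read, cutoff, variable_coverage,
                       normalize_to):
    seq = read.sequence.replace('N', 'A')   # the graph holds no 'N' k-mers
    if variable_coverage:
        med, _, _ = countgraph.get_median_count(seq)
        if med < normalize_to:
            return read, False   # coverage too low to trim confidently
    _, trim_at = countgraph.trim_on_abundance(seq, cutoff)
    if trim_at < countgraph.ksize():
        return None, True        # no full k-mer survives; drop the read
    read.sequence = read.sequence[:trim_at]  # keep the original 'N's
    return read, trim_at < len(seq)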
@contextmanager
def catch_io_errors(ifile, out1, out2, out3, out4, out5, out6, force,
                    corrupt_files):
    """Context manager to do boilerplate handling of IOErrors."""
    # requires `from contextlib import contextmanager`; without the
    # decorator the bare `yield` below cannot back a `with` block.
    try:
        yield
    except (IOError, OSError, ValueError) as error:
        log_error('** ERROR: {error}', error=str(error))
        log_error('** Failed on {name}: ', name=ifile)
        if not force:
            log_error('** Exiting!')
            sys.exit(1)
        else:
            log_error('*** Skipping error file, moving on...')
            corrupt_files.append(ifile)
@contextmanager
def catch_io_errors(ifile, out, single_out, force, corrupt_files):
    """Context manager to do boilerplate handling of IOErrors."""
    try:
        yield
    except (IOError, OSError, ValueError) as error:
        log_error('** ERROR: {error}', error=str(error))
        log_error('** Failed on {name}: ', name=ifile)
        if not single_out:
            os.remove(out.name)
        if not force:
            log_error('** Exiting!')
            sys.exit(1)
        else:
            log_error('*** Skipping error file, moving on...')
            corrupt_files.append(ifile)
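
# Both catch_io_errors variants depend on contextlib.contextmanager: a
# decorated generator becomes usable in a `with` block, and an exception
# raised inside the block surfaces at the `yield`.  A self-contained
# example of the pattern (names are illustrative):
from contextlib import contextmanager


@contextmanager
def record_errors(name, errors):
    try:
        yield
    except (IOError, OSError) as error:
        print('failed on %s: %s' % (name, error))
        errors.append(name)


bad_files = []
with record_errors('reads.fq', bad_files):
    raise IOError('truncated file')   # caught and recorded, not fatal
print(bad_files)                      # -> ['reads.fq']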
def main():  # pylint: disable=too-many-branches,too-many-statements
    start_time = time.time()
    parser = sanitize_help(get_parser())
    args = parser.parse_args()
    configure_logging(args.quiet)

    report_on_config(args)

    report_fp = args.report
    force_single = args.force_single

    # check for similar filenames
    # if we're using a single output file only check for identical filenames
    # otherwise, check for identical BASE names as well.
    filenames = []
    basenames = []
    for pathfilename in args.input_filenames:
        filenames.append(pathfilename)
        if args.single_output_file:
            continue  # nothing more to worry about

        basename = os.path.basename(pathfilename)
        if basename in basenames:
            log_error('ERROR: Duplicate filename--Cannot handle this!')
            log_error('** Exiting!')
            sys.exit(1)

        basenames.append(basename)

    # check that files exist and there is sufficient output disk space.
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph is not None:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    # load the first counting table; note there is no 'else' branch, so
    # countgraph1 is only defined when --loadgraph is given.
    if args.loadgraph:
        log_info('loading k-mer countgraph from {graph}',
                 graph=args.loadgraph)
        countgraph1 = Countgraph.load(args.loadgraph)

    # load second counting table.
    if args.loadgraph2:
        log_info('loading k-mer countgraph from {graph}',
                 graph=args.loadgraph2)
        countgraph2 = Countgraph.load(args.loadgraph2)

    # make a list of all filenames and if they're paired or not;
    # if we don't know if they're paired, default to allowing but not
    # forcing pairing.
    files = []
    for element in filenames:
        files.append([element, args.paired])
    if args.unpaired_reads:
        files.append([args.unpaired_reads, False])

    #
    # main loop: iterate over all files given, do diginorm.
    #

    for filename, require_paired in files:
        if not args.single_output_file:
            output_name = os.path.basename(filename) + '.keep'
            outfp = open(output_name, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        screed_iter = clean_input_reads(screed.open(filename))
        reader = broken_paired_reader(screed_iter, min_length=args.ksize,
                                      force_single=force_single,
                                      require_paired=require_paired)

        # actually do diginorm; `snarf` is assumed to be defined elsewhere
        # in this module.
        for _, is_paired, read0, read1 in reader:
            for record in snarf(is_paired, read0, read1,
                                countgraph1, countgraph2):
                if record is not None:
                    write_record(record, outfp)

    print("--- %s seconds ---" % (time.time() - start_time))
def main():  # pylint: disable=too-many-branches,too-many-statements
    parser = sanitize_help(get_parser())
    args = parser.parse_args()
    configure_logging(args.quiet)

    report_on_config(args)

    report_fp = args.report
    force_single = args.force_single

    # check for similar filenames
    # if we're using a single output file only check for identical filenames
    # otherwise, check for identical BASE names as well.
    filenames = []
    basenames = []
    for pathfilename in args.input_filenames:
        filenames.append(pathfilename)
        if args.single_output_file:
            continue  # nothing more to worry about

        basename = os.path.basename(pathfilename)
        if basename in basenames:
            log_error('ERROR: Duplicate filename--Cannot handle this!')
            log_error('** Exiting!')
            sys.exit(1)

        basenames.append(basename)

    # check that files exist and there is sufficient output disk space.
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph is not None:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    # load or create counting table.
    if args.loadgraph:
        log_info('loading k-mer countgraph from {graph}',
                 graph=args.loadgraph)
        countgraph = khmer.load_countgraph(args.loadgraph)
    else:
        log_info('making countgraph')
        countgraph = khmer_args.create_countgraph(args)

    # create an object to handle diginorm of all files
    norm = Normalizer(args.cutoff, countgraph)
    with_diagnostics = WithDiagnostics(norm, report_fp, args.report_frequency)

    # make a list of all filenames and if they're paired or not;
    # if we don't know if they're paired, default to allowing but not
    # forcing pairing.
    files = []
    for element in filenames:
        files.append([element, args.paired])
    if args.unpaired_reads:
        files.append([args.unpaired_reads, False])

    corrupt_files = []
    outfp = None
    output_name = None

    if args.single_output_file:
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)
    else:
        if '-' in filenames or '/dev/stdin' in filenames:
            print("Accepting input from stdin; output filename must "
                  "be provided with '-o'.", file=sys.stderr)
            sys.exit(1)

    #
    # main loop: iterate over all files given, do diginorm.
    #

    for filename, require_paired in files:
        if not args.single_output_file:
            output_name = os.path.basename(filename) + '.keep'
            outfp = open(output_name, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        # failsafe context manager in case an input file breaks
        with catch_io_errors(filename, outfp, args.single_output_file,
                             args.force, corrupt_files):
            screed_iter = clean_input_reads(screed.open(filename))
            reader = broken_paired_reader(screed_iter, min_length=args.ksize,
                                          force_single=force_single,
                                          require_paired=require_paired)

            # actually do diginorm
            for record in with_diagnostics(reader, filename):
                if record is not None:
                    write_record(record, outfp)

            log_info('output in {name}', name=describe_file_handle(outfp))
            if not args.single_output_file:
                outfp.close()

    # finished - print out some diagnostics.
    log_info('Total number of unique k-mers: {umers}',
             umers=countgraph.n_unique_kmers())

    if args.savegraph is not None:
        log_info('...saving to {name}', name=args.savegraph)
        countgraph.save(args.savegraph)

    fp_rate = \
        khmer.calc_expected_collisions(countgraph, False, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    if args.force and len(corrupt_files) > 0:
        log_error("** WARNING: Finished with errors!")
        log_error("** I/O Errors occurred in the following files:")
        log_error("\t" + " ".join(corrupt_files))
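
# The Normalizer wrapped by WithDiagnostics applies the digital
# normalization rule: keep a read only while the estimated coverage of its
# locus -- taken as the median count of its k-mers -- is still below the
# cutoff.  A minimal sketch of that rule, using countgraph methods that
# appear elsewhere in these scripts:
def keep_read(countgraph, cutoff, read):
    med, _, _ = countgraph.get_median_count(read.sequence)
    if med < cutoff:
        countgraph.consume(read.sequence)  # count it, then keep it
        return True
    return False  # locus already at target coverage; discard the read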
def main():  # pylint: disable=too-many-locals,too-many-branches
    args = sanitize_help(get_parser()).parse_args()
    graph_type = 'smallcountgraph' if args.small_count else 'countgraph'
    configure_logging(args.quiet)
    report_on_config(args, graph_type)

    check_input_files(args.input_sequence_filename, args.force)
    if args.savegraph is not None:
        graphsize = calculate_graphsize(args, graph_type)
        check_space_for_graph(args.savegraph, graphsize, args.force)

    if (not args.squash_output and
            os.path.exists(args.output_histogram_filename)):
        log_error('ERROR: {output} exists; not squashing.',
                  output=args.output_histogram_filename)
        sys.exit(1)
    else:
        hist_fp = open(args.output_histogram_filename, 'w')
        hist_fp_csv = csv.writer(hist_fp)
        # write headers:
        hist_fp_csv.writerow(
            ['abundance', 'count', 'cumulative', 'cumulative_fraction'])

    log_info('making countgraph')
    # In case the user specified a maximum memory usage, use 8/(9+eps) of
    # that for the countgraph and 1/(9+eps) for the tracking nodegraph.
    # `eps` is used to account for the memory used by the python interpreter.
    countgraph = khmer_args.create_countgraph(args, multiplier=8 / (9. + 0.3))
    countgraph.set_use_bigcount(args.bigcount)

    log_info('building k-mer tracking graph')
    tracking = khmer_args.create_matching_nodegraph(countgraph)

    log_info('kmer_size: {ksize}', ksize=countgraph.ksize())
    log_info('k-mer countgraph sizes: {sizes}', sizes=countgraph.hashsizes())
    log_info('outputting to {output}', output=args.output_histogram_filename)

    # start loading
    rparser = khmer.ReadParser(args.input_sequence_filename)
    threads = []
    log_info('consuming input, round 1 -- {input}',
             input=args.input_sequence_filename)
    for _ in range(args.threads):
        thread = \
            threading.Thread(
                target=countgraph.consume_seqfile_with_reads_parser,
                args=(rparser, )
            )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    log_info('Total number of unique k-mers: {nk}',
             nk=countgraph.n_unique_kmers())

    abundance_lists = []

    def __do_abundance_dist__(read_parser):
        abundances = countgraph.abundance_distribution_with_reads_parser(
            read_parser, tracking)
        abundance_lists.append(abundances)

    log_info('preparing hist from {seqfile}...',
             seqfile=args.input_sequence_filename)
    rparser = khmer.ReadParser(args.input_sequence_filename)
    threads = []
    log_info('consuming input, round 2 -- {filename}',
             filename=args.input_sequence_filename)
    for _ in range(args.threads):
        thread = \
            threading.Thread(
                target=__do_abundance_dist__,
                args=(rparser, )
            )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    assert len(abundance_lists) == args.threads, len(abundance_lists)
    abundance = {}
    for abundance_list in abundance_lists:
        for i, count in enumerate(abundance_list):
            abundance[i] = abundance.get(i, 0) + count

    total = sum(abundance.values())

    if 0 == total:
        log_error("ERROR: abundance distribution is uniformly zero; "
                  "nothing to report.")
        log_error("\tPlease verify that the input files are valid.")
        sys.exit(1)

    sofar = 0
    for _, i in sorted(abundance.items()):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        hist_fp_csv.writerow([_, i, sofar, round(frac, 3)])

        if sofar == total:
            break

    if args.savegraph is not None:
        log_info('Saving k-mer countgraph to {savegraph}',
                 savegraph=args.savegraph)
        countgraph.save(args.savegraph)

    log_info('wrote to: {output}', output=args.output_histogram_filename)
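
# Both consuming rounds above use the same fan-out pattern: several worker
# threads share one khmer.ReadParser and pull reads from it until the file
# is exhausted.  A minimal sketch of that pattern -- `run_threaded` is
# illustrative, not part of khmer:
import threading

import khmer


def run_threaded(target, filename, n_threads):
    """Fan one shared ReadParser out to n_threads worker threads."""
    rparser = khmer.ReadParser(filename)
    threads = [threading.Thread(target=target, args=(rparser,))
               for _ in range(n_threads)]
    for thread in threads:
        thread.start()
    for thread in threads:   # wait until all input has been consumed
        thread.join()

# e.g. run_threaded(countgraph.consume_seqfile_with_reads_parser,
#                   'reads.fq', 4)   # hypothetical filename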
def main():
    parser = sanitize_help(get_parser())
    args = parser.parse_args()
    if not args.quiet:
        info('trim-low-abund.py', ['streaming'])

    configure_logging(args.quiet)

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        log_error("Error: Cannot input the same filename multiple times.")
        sys.exit(1)

    if args.trim_at_coverage != DEFAULT_TRIM_AT_COVERAGE and \
       not args.variable_coverage:
        log_error("Error: --trim-at-coverage/-Z given, but "
                  "--variable-coverage/-V not specified.")
        sys.exit(1)

    if args.diginorm_coverage != DEFAULT_DIGINORM_COVERAGE and \
       not args.diginorm:
        log_error("Error: --diginorm-coverage given, but "
                  "--diginorm not specified.")
        sys.exit(1)

    if args.diginorm and args.single_pass:
        log_error("Error: --diginorm and --single-pass are incompatible!\n"
                  "You probably want to use normalize-by-median.py instead.")
        sys.exit(1)

    ###

    report_on_config(args)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    if ('-' in args.input_filenames or '/dev/stdin' in args.input_filenames) \
       and not args.output:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    if args.loadgraph:
        log_info('loading countgraph from {graph}', graph=args.loadgraph)
        ct = khmer.load_countgraph(args.loadgraph)
    else:
        log_info('making countgraph')
        ct = khmer_args.create_countgraph(args)

    K = ct.ksize()

    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    log_info('created temporary directory {temp};\n'
             'use -T to change location', temp=tempdir)

    trimmer = Trimmer(ct, not args.variable_coverage, args.cutoff,
                      args.trim_at_coverage)
    if args.diginorm:
        trimmer.set_diginorm(args.diginorm_coverage)

    # ### FIRST PASS ###

    save_pass2_total = 0

    written_bp = 0
    written_reads = 0

    # only create the file writer once if outfp is specified; otherwise,
    # create it for each file.
    if args.output:
        trimfp = get_file_writer(args.output, args.gzip, args.bzip)

    pass2list = []
    for filename in args.input_filenames:
        # figure out temporary filename for 2nd pass
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        pass2fp = open(pass2filename, 'w')

        # construct output filenames
        if args.output is None:
            # note: this will be saved in trimfp.
            outfp = open(os.path.basename(filename) + '.abundtrim', 'wb')

            # get file handle w/gzip, bzip
            trimfp = get_file_writer(outfp, args.gzip, args.bzip)

        # record all this info
        pass2list.append((filename, pass2filename, trimfp))

        # input file stuff: get a broken_paired reader.
        screed_iter = screed.open(filename)
        paired_iter = broken_paired_reader(screed_iter, min_length=K,
                                           force_single=args.ignore_pairs)

        # main loop through the file.
        n_start = trimmer.n_reads
        save_start = trimmer.n_saved

        watermark = REPORT_EVERY_N_READS
        for read in trimmer.pass1(paired_iter, pass2fp):
            if (trimmer.n_reads - n_start) > watermark:
                log_info("... {filename} {n_saved} {n_reads} {n_bp} "
                         "{w_reads} {w_bp}", filename=filename,
                         n_saved=trimmer.n_saved, n_reads=trimmer.n_reads,
                         n_bp=trimmer.n_bp, w_reads=written_reads,
                         w_bp=written_bp)
                watermark += REPORT_EVERY_N_READS

            # write out the trimmed/etc sequences that AREN'T going to be
            # revisited in a 2nd pass.
            write_record(read, trimfp)
            written_bp += len(read)
            written_reads += 1
        pass2fp.close()

        log_info("{filename}: kept aside {kept} of {total} from first pass",
                 filename=filename, kept=trimmer.n_saved - save_start,
                 total=trimmer.n_reads - n_start)

    # first pass goes across all the data, so record relevant stats...
    n_reads = trimmer.n_reads
    n_bp = trimmer.n_bp
    n_skipped = trimmer.n_skipped
    bp_skipped = trimmer.bp_skipped
    save_pass2_total = trimmer.n_saved

    # ### SECOND PASS. ###

    # nothing should have been skipped yet!
    assert trimmer.n_skipped == 0
    assert trimmer.bp_skipped == 0

    if args.single_pass:
        pass2list = []

    # go back through all the files again.
    for _, pass2filename, trimfp in pass2list:
        log_info('second pass: looking at sequences kept aside in {pass2}',
                 pass2=pass2filename)

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in contrast
        # to the first loop.  Hence, force_single=True below.

        screed_iter = screed.open(pass2filename, parse_description=False)
        paired_iter = broken_paired_reader(screed_iter, min_length=K,
                                           force_single=True)

        watermark = REPORT_EVERY_N_READS
        for read in trimmer.pass2(paired_iter):
            if (trimmer.n_reads - n_start) > watermark:
                log_info('... x 2 {a} {b} {c} {d} {e} {f} {g}',
                         a=trimmer.n_reads - n_start,
                         b=pass2filename, c=trimmer.n_saved,
                         d=trimmer.n_reads, e=trimmer.n_bp,
                         f=written_reads, g=written_bp)
                watermark += REPORT_EVERY_N_READS

            write_record(read, trimfp)
            written_reads += 1
            written_bp += len(read)

        log_info('removing {pass2}', pass2=pass2filename)
        os.unlink(pass2filename)

        # if we created our own trimfps, close 'em.
        if not args.output:
            trimfp.close()

    log_info('removing temp directory & contents ({temp})', temp=tempdir)
    shutil.rmtree(tempdir)

    trimmed_reads = trimmer.trimmed_reads

    n_passes = 1.0 + (float(save_pass2_total) / n_reads)
    percent_reads_trimmed = float(trimmed_reads + (n_reads - written_reads)) /\
        n_reads * 100.0

    log_info('read {read} reads, {bp} bp', read=n_reads, bp=n_bp)
    log_info('wrote {wr} reads, {wbp} bp', wr=written_reads, wbp=written_bp)
    log_info('looked at {st} reads twice ({np:.2f} passes)',
             st=save_pass2_total, np=n_passes)
    log_info('removed {r} reads and trimmed {t} reads ({p:.2f}%)',
             r=n_reads - written_reads, t=trimmed_reads,
             p=percent_reads_trimmed)
    log_info('trimmed or removed {p:.2f}% of bases ({bp} total)',
             p=(1 - (written_bp / float(n_bp))) * 100.0, bp=n_bp - written_bp)

    if args.variable_coverage:
        percent_reads_hicov = 100.0 * float(n_reads - n_skipped) / n_reads
        log_info('{n} reads were high coverage ({p:.2f}%);',
                 n=n_reads - n_skipped, p=percent_reads_hicov)
        log_info('skipped {r} reads/{bp} bases because of low coverage',
                 r=n_skipped, bp=bp_skipped)

    fp_rate = \
        khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    log_info('output in *.abundtrim')

    if args.savegraph:
        log_info("Saving k-mer countgraph to {graph}", graph=args.savegraph)
        ct.save(args.savegraph)
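
# The first/second pass structure above is a semi-streaming rule: reads
# whose coverage estimate is already at or above the trim threshold can be
# trimmed immediately, while low-coverage reads are set aside (later reads
# may still raise their k-mer counts) and re-examined once all input has
# been counted.  A conceptual sketch -- `coverage_of` and
# `trim_low_abund_kmers` are hypothetical helpers, not khmer's Trimmer API:
def pass1_sketch(reads, countgraph, trim_at_coverage, pass2fp):
    """Trim reads that are confidently high-coverage; defer the rest."""
    for read in reads:
        countgraph.consume(read.sequence)
        if coverage_of(countgraph, read) >= trim_at_coverage:
            yield trim_low_abund_kmers(countgraph, read)
        else:
            write_record(read, pass2fp)  # counts may still grow; revisit


def pass2_sketch(saved_reads, countgraph):
    """All counts are final now, so every deferred read can be trimmed."""
    for read in saved_reads:
        yield trim_low_abund_kmers(countgraph, read)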
def main():
    args = sanitize_help(get_parser()).parse_args()
    configure_logging(args.quiet)

    infiles = [args.input_count_graph_filename,
               args.input_sequence_filename]
    for infile in infiles:
        check_input_files(infile, False)

    log_info('Loading counting graph from {graph}',
             graph=args.input_count_graph_filename)
    countgraph = khmer.load_countgraph(args.input_count_graph_filename)

    if not countgraph.get_use_bigcount() and args.bigcount:
        log_warn("WARNING: The loaded graph has bigcount DISABLED while "
                 "bigcount reporting is ENABLED--counts higher than 255 will "
                 "not be reported.")

    countgraph.set_use_bigcount(args.bigcount)

    kmer_size = countgraph.ksize()
    hashsizes = countgraph.hashsizes()
    tracking = khmer._Nodegraph(  # pylint: disable=protected-access
        kmer_size, hashsizes)

    log_info('K: {ksize}', ksize=kmer_size)
    log_info('outputting to {output}', output=args.output_histogram_filename)

    if args.output_histogram_filename in ('-', '/dev/stdout'):
        pass
    elif os.path.exists(args.output_histogram_filename):
        if not args.squash_output:
            log_error('ERROR: {output} exists; not squashing.',
                      output=args.output_histogram_filename)
            sys.exit(1)

        log_info('** squashing existing file {output}',
                 output=args.output_histogram_filename)

    log_info('preparing hist...')
    abundances = countgraph.abundance_distribution(
        args.input_sequence_filename, tracking)
    total = sum(abundances)

    if 0 == total:
        log_error("ERROR: abundance distribution is uniformly zero; "
                  "nothing to report.")
        log_error("\tPlease verify that the input files are valid.")
        sys.exit(1)

    if args.output_histogram_filename in ('-', '/dev/stdout'):
        countgraph_fp = sys.stdout
    else:
        countgraph_fp = open(args.output_histogram_filename, 'w')
    countgraph_fp_csv = csv.writer(countgraph_fp)
    # write headers:
    countgraph_fp_csv.writerow(
        ['abundance', 'count', 'cumulative', 'cumulative_fraction'])

    sofar = 0
    for _, i in enumerate(abundances):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        countgraph_fp_csv.writerow([_, i, sofar, round(frac, 3)])

        if sofar == total:
            break
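
# The histogram loop above emits one CSV row per abundance value.  A
# self-contained sketch of the same cumulative bookkeeping, using a
# made-up abundance list (index = abundance, value = number of distinct
# k-mers seen at that abundance):
import csv
import sys

abundances = [0, 120, 45, 10, 0, 1]   # hypothetical distribution
total = sum(abundances)

writer = csv.writer(sys.stdout)
writer.writerow(['abundance', 'count', 'cumulative', 'cumulative_fraction'])
sofar = 0
for abund, count in enumerate(abundances):
    if count == 0:
        continue  # mirrors the default behaviour without --output-zero
    sofar += count
    writer.writerow([abund, count, sofar, round(sofar / float(total), 3)])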
def main():
    args = sanitize_help(get_parser()).parse_args()
    if not args.quiet:
        info('filter-abund.py', ['counting'])
    configure_logging(args.quiet)

    infiles = args.input_filename
    if ('-' in infiles or '/dev/stdin' in infiles) and not \
       args.single_output_file:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    for filename in infiles:
        check_input_files(filename, args.force)

    check_space(infiles, args.force)

    log_info('loading countgraph: {graph}', graph=args.input_graph)
    countgraph = khmer.load_countgraph(args.input_graph)
    ksize = countgraph.ksize()

    log_info("K: {ksize}", ksize=ksize)

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
        seqN = seq.replace('N', 'A')

        if args.variable_coverage:  # only trim when sequence has high enough C
            med, _, _ = countgraph.get_median_count(seqN)
            if med < args.normalize_to:
                return name, seq

        _, trim_at = countgraph.trim_on_abundance(seqN, args.cutoff)

        if trim_at >= ksize:
            # be sure to not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]

        return None, None

    if args.single_output_file:
        outfile = args.single_output_file.name
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)

    # the filtering loop
    for infile in infiles:
        log_info('filtering {infile}', infile=infile)
        if not args.single_output_file:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = open(outfile, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        tsp = ThreadedSequenceProcessor(process_fn, n_workers=args.threads,
                                        verbose=not args.quiet)
        tsp.start(verbose_loader(infile), outfp)

        log_info('output in {outfile}', outfile=outfile)
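
# `trim_on_abundance` returns the trimmed sequence and the trim position;
# the caller above keeps `seq[:trim_at]` only when at least one full k-mer
# survives (`trim_at >= ksize`).  A rough pure-Python sketch of the
# semantics, under the assumption that trimming keeps the longest prefix
# whose every complete k-mer meets the cutoff (`get_count` is a
# hypothetical count lookup):
def trim_on_abundance_sketch(seq, ksize, cutoff, get_count):
    for start in range(len(seq) - ksize + 1):
        if get_count(seq[start:start + ksize]) < cutoff:
            # drop the low-abundance k-mer; keep the clean prefix
            return seq[:start + ksize - 1], start + ksize - 1
    return seq, len(seq)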