Пример #1
0
def main():
    """Filter reads from each input file against a saved countgraph.

    Command line (read directly from ``sys.argv``): the first argument is
    the countgraph file to load, every following argument is a sequence
    file.  For each input file ``X`` an output file named
    ``basename(X) + '.below'`` is written to the current directory.

    Reads containing an 'N' are skipped.  Every other read is passed to
    ``ht.trim_below_abundance(seq, CUTOFF)`` and the trimmed sequence is
    written out only when the reported trim position is at least the
    graph's k-mer size.
    """
    counting_ht = sys.argv[1]
    infiles = sys.argv[2:]

    print('file with ht: %s' % counting_ht)

    print('making hashtable')
    ht = Countgraph.load(counting_ht)
    K = ht.ksize()

    for infile in infiles:
        print('filtering', infile)
        outfile = os.path.basename(infile) + '.below'

        # The context manager guarantees the output is flushed and closed,
        # even if reading or trimming raises part-way through a file.
        with open(outfile, 'w') as outfp:
            paired_iter = broken_paired_reader(ReadParser(infile),
                                               min_length=K,
                                               force_single=True)
            for _, _, read1, _ in paired_iter:
                seq = read1.sequence
                if 'N' in seq:
                    # Skip only this read; returning here would abandon the
                    # rest of this file and every remaining input file.
                    continue

                trim_seq, trim_at = ht.trim_below_abundance(seq, CUTOFF)

                if trim_at >= K:
                    record = screed.Record(name=read1.name,
                                           sequence=trim_seq)
                    write_record(record, outfp)
Пример #2
0
def paired_reader(readstream):
    """Yield ``(index, read, mate)`` for every read in *readstream*.

    The stream is consumed through ``broken_paired_reader``.  Each item it
    produces is yielded once as ``(index, read1, read2)``; when the item is
    flagged as a pair it is yielded a second time with the two reads
    swapped, so each mate appears once in the "read" position.  ``index``
    is a running 1-based counter over everything yielded.
    """
    counter = 0
    for _, has_mate, first, second in broken_paired_reader(readstream):
        counter += 1
        yield counter, first, second

        if not has_mate:
            continue

        # Emit the pair again from the mate's point of view.
        counter += 1
        yield counter, second, first
Пример #3
0
def main():
    """Build a countgraph from one data file, then abundance-trim that file.

    Round 1 loads ``args.datafile`` into a new countgraph using
    ``args.threads`` worker threads.  Round 2 re-reads the same file, runs
    every read through ``trim_record`` and writes the records that come back
    truthy to ``args.outfile`` (default: ``basename(datafile) +
    '.abundfilt'``).  The countgraph is saved afterwards when
    ``args.savegraph`` is set.
    """
    args = sanitize_help(get_parser()).parse_args()

    configure_logging(args.quiet)
    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)

    if args.savegraph:
        tablesize = calculate_graphsize(args, "countgraph")
        check_space_for_graph(args.savegraph, tablesize, args.force)

    report_on_config(args)

    log_info("making countgraph")
    graph = khmer_args.create_countgraph(args)

    # first, load reads into graph; every worker is handed the same parser
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    log_info("consuming input, round 1 -- {datafile}", datafile=args.datafile)
    for _ in range(args.threads):
        cur_thread = threading.Thread(
            target=graph.consume_fasta_with_reads_parser, args=(rparser,))
        threads.append(cur_thread)
        cur_thread.start()

    for thread in threads:
        thread.join()

    log_info("Total number of unique k-mers: {nk}", nk=graph.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(graph, args.force)
    log_info("fp rate estimated to be {fpr:1.3f}", fpr=fp_rate)

    # the filtering loop
    log_info("filtering {datafile}", datafile=args.datafile)
    if args.outfile is None:
        outfile = os.path.basename(args.datafile) + ".abundfilt"
    else:
        outfile = args.outfile

    raw_fp = open(outfile, "wb")
    outfp = get_file_writer(raw_fp, args.gzip, args.bzip)
    try:
        paired_iter = broken_paired_reader(ReadParser(args.datafile),
                                           min_length=graph.ksize(),
                                           force_single=True)

        for n, is_pair, read1, read2 in paired_iter:
            # force_single=True was requested, so pairs are not expected.
            assert not is_pair
            assert read2 is None

            trimmed_record, _ = trim_record(graph, read1, args.cutoff,
                                            args.variable_coverage,
                                            args.normalize_to)
            if trimmed_record:
                write_record(trimmed_record, outfp)
    finally:
        # The writer may wrap raw_fp (gzip/bzip options), so close the
        # writer first and the underlying handle afterwards.
        if outfp is not raw_fp:
            outfp.close()
        raw_fp.close()

    log_info("output in {outfile}", outfile=outfile)

    if args.savegraph:
        log_info("Saving k-mer countgraph filename {graph}", graph=args.savegraph)
        graph.save(args.savegraph)
Пример #4
0
def main():
    """Trim reads in one or more files against a previously saved countgraph.

    Each read is passed to ``trim_record`` together with the loaded
    countgraph and the ``cutoff`` / ``variable_coverage`` / ``normalize_to``
    settings; records that come back truthy are written out.  Output goes
    to ``args.single_output_file`` when given, otherwise to
    ``basename(input) + '.abundfilt'`` for each input file.

    Exits with status 1 when input comes from stdin and no single output
    file was supplied.
    """
    args = sanitize_help(get_parser()).parse_args()
    if not args.quiet:
        info('filter-abund.py', ['counting'])

    configure_logging(args.quiet)

    infiles = args.input_filename
    if ('-' in infiles or '/dev/stdin' in infiles) and not \
       args.single_output_file:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    for filename in infiles:
        check_input_files(filename, args.force)

    check_space(infiles, args.force)

    log_info('loading countgraph: {graph}', graph=args.input_graph)
    countgraph = khmer.load_countgraph(args.input_graph)
    ksize = countgraph.ksize()

    log_info("K: {ksize}", ksize=ksize)

    if args.single_output_file:
        outfile = args.single_output_file.name
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)

    # the filtering loop
    for infile in infiles:
        log_info('filtering {infile}', infile=infile)

        # raw_fp is only set for handles opened here; the shared single
        # output file is supplied by the caller and is left open.
        raw_fp = None
        if not args.single_output_file:
            outfile = os.path.basename(infile) + '.abundfilt'
            raw_fp = open(outfile, 'wb')
            outfp = get_file_writer(raw_fp, args.gzip, args.bzip)

        try:
            paired_iter = broken_paired_reader(ReadParser(infile),
                                               min_length=ksize,
                                               force_single=True)

            for n, is_pair, read1, read2 in paired_iter:
                # force_single=True was requested, so pairs are not expected.
                assert not is_pair
                assert read2 is None

                trimmed_record, _ = trim_record(countgraph, read1,
                                                args.cutoff,
                                                args.variable_coverage,
                                                args.normalize_to)
                if trimmed_record:
                    write_record(trimmed_record, outfp)
        finally:
            if raw_fp is not None:
                # The writer may wrap raw_fp (gzip/bzip options): close the
                # writer first, then the underlying handle.
                if outfp is not raw_fp:
                    outfp.close()
                raw_fp.close()

        log_info('output in {outfile}', outfile=outfile)
Пример #5
0
    def gather(self, **kw):
        """Run ``broken_paired_reader`` over ``self.stream`` and collect names.

        Keyword arguments are forwarded unchanged to
        ``broken_paired_reader``.

        Returns a 3-tuple ``(names, n, count)``: ``names`` is a list of
        ``(read1.name, read2.name)`` tuples (the second entry is ``None``
        for items not flagged as pairs), ``n`` is the first field of the
        last item the reader produced, and ``count`` is the number of items
        produced.

        If the reader produces nothing, ``n`` is never bound and the return
        statement raises ``UnboundLocalError``.
        """
        reader = broken_paired_reader(self.stream, **kw)

        names = []
        for n, is_pair, read1, read2 in reader:
            first_name = read1.name
            mate_name = read2.name if is_pair else None
            names.append((first_name, mate_name))

        # One entry is appended per item, so the list length is the count.
        return names, n, len(names)
Пример #6
0
    def gather(self, **kw):
        """Collect read names produced by ``broken_paired_reader``.

        ``self.stream`` is read through ``broken_paired_reader`` with any
        keyword arguments passed straight through.

        Returns ``(names, n, count)`` where ``names`` holds one
        ``(read1.name, read2.name)`` tuple per item (``None`` in the second
        slot when the item is not a pair), ``n`` is the first field of the
        final item, and ``count`` is how many items were seen.

        An empty reader leaves ``n`` unbound, so returning raises
        ``UnboundLocalError``.
        """
        reader = broken_paired_reader(self.stream, **kw)

        names = []
        for n, is_pair, read1, read2 in reader:
            first_name = read1.name
            mate_name = read2.name if is_pair else None
            names.append((first_name, mate_name))

        # Exactly one tuple is stored per item, so len() equals the count.
        return names, n, len(names)
Пример #7
0
def main():
    """Trim reads in one or more files against a previously saved countgraph.

    Each read is passed to ``trim_record`` together with the loaded
    countgraph and the ``cutoff`` / ``variable_coverage`` / ``normalize_to``
    settings; records that come back truthy are written out.  Output goes
    to ``args.single_output_file`` when given, otherwise to
    ``basename(input) + '.abundfilt'`` for each input file.

    Exits with status 1 when input comes from stdin and no single output
    file was supplied.
    """
    args = sanitize_help(get_parser()).parse_args()

    configure_logging(args.quiet)

    infiles = args.input_filename
    if ('-' in infiles or '/dev/stdin' in infiles) and not \
       args.single_output_file:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    for filename in infiles:
        check_input_files(filename, args.force)

    check_space(infiles, args.force)

    log_info('loading countgraph: {graph}', graph=args.input_graph)
    countgraph = khmer.load_countgraph(args.input_graph)
    ksize = countgraph.ksize()

    log_info("K: {ksize}", ksize=ksize)

    if args.single_output_file:
        outfile = args.single_output_file.name
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)

    # the filtering loop
    for infile in infiles:
        log_info('filtering {infile}', infile=infile)

        # raw_fp is only set for handles opened here; the shared single
        # output file is supplied by the caller and is left open.
        raw_fp = None
        if not args.single_output_file:
            outfile = os.path.basename(infile) + '.abundfilt'
            raw_fp = open(outfile, 'wb')
            outfp = get_file_writer(raw_fp, args.gzip, args.bzip)

        try:
            paired_iter = broken_paired_reader(ReadParser(infile),
                                               min_length=ksize,
                                               force_single=True)

            for n, is_pair, read1, read2 in paired_iter:
                # force_single=True was requested, so pairs are not expected.
                assert not is_pair
                assert read2 is None

                trimmed_record, _ = trim_record(countgraph, read1,
                                                args.cutoff,
                                                args.variable_coverage,
                                                args.normalize_to)
                if trimmed_record:
                    write_record(trimmed_record, outfp)
        finally:
            if raw_fp is not None:
                # The writer may wrap raw_fp (gzip/bzip options): close the
                # writer first, then the underlying handle.
                if outfp is not raw_fp:
                    outfp.close()
                raw_fp.close()

        log_info('output in {outfile}', outfile=outfile)
Пример #8
0
def test_diginorm():
    """Compare the generator-based diginorm pipeline with stored output.

    The reads in ``test_files/simple-metagenome-reads.fa`` are pushed
    through ``broken_paired_reader`` -> ``clean_reads`` -> ``diginorm`` and
    the resulting single reads are compared, one by one, with the records
    in ``test_files/simple-metagenome-reads.fa.keep.k20.C20``.
    """
    filename = 'test_files/simple-metagenome-reads.fa'

    graph = khmer.new_counting_hash(20, 1e7, 4)
    # NOTE(review): nothing is ever written to out_fp; it is still opened
    # so the '.keep' file keeps being created -- confirm it is needed.
    out_fp = open(os.path.basename(filename) + '.keep', 'w')
    try:
        ## khmer scripts/normalize-by-median.py, using generators
        input_iter = screed.open(filename)
        input_iter = broken_paired_reader(input_iter)
        input_iter = clean_reads(input_iter)
        input_iter = diginorm(input_iter, graph, 20)

        script_result = screed.open('test_files/'
                                    'simple-metagenome-reads.fa.keep.k20.C20')
        # NOTE(review): zip() stops at the shorter stream, so a difference
        # in the number of reads is not detected by this loop.
        for read_a, read_b in zip(broken_paired_to_single(input_iter),
                                  script_result):
            # Single-argument call form prints the same text on Python 2
            # and Python 3.
            print(read_a.name)
            assert read_a == read_b, (read_a, read_b)
    finally:
        out_fp.close()
Пример #9
0
def main():
    """Split a read file into properly paired reads and orphans.

    The input (``args.infile``) is read through ``broken_paired_reader``.
    Pairs are written interleaved to ``<prefix>.pe`` and unpaired reads to
    ``<prefix>.se``, where ``<prefix>`` is the input's basename unless a
    second command-line argument is present.  Progress and a final summary
    go to stderr.

    Raises:
        Exception: if no pairs at all were found in the input.
    """
    def report(message):
        # Same output as the Python 2 statement
        # ``print >>sys.stderr, message`` but also valid on Python 3.
        sys.stderr.write('%s\n' % message)

    info('extract-paired-reads.py')
    args = get_parser().parse_args()

    check_input_files(args.infile, args.force)
    infiles = [args.infile]
    check_space(infiles, args.force)

    outfile = os.path.basename(args.infile)
    # NOTE(review): the prefix is taken from raw sys.argv rather than from
    # the parsed arguments, so whatever appears second on the command line
    # is used -- confirm this is intended.
    if len(sys.argv) > 2:
        outfile = sys.argv[2]

    n_pe = 0
    n_se = 0

    # Both outputs are closed on every exit path, including errors raised
    # while reading or writing records.
    with open(outfile + '.se', 'w') as single_fp, \
            open(outfile + '.pe', 'w') as paired_fp:
        report('reading file "%s"' % args.infile)
        report('outputting interleaved pairs to "%s.pe"' % outfile)
        report('outputting orphans to "%s.se"' % outfile)

        screed_iter = screed.open(args.infile, parse_description=False)
        for index, is_pair, read1, read2 in broken_paired_reader(screed_iter):
            if index % 100000 == 0 and index > 0:
                report('... %s' % index)

            if is_pair:
                write_record_pair(read1, read2, paired_fp)
                n_pe += 1
            else:
                write_record(read1, single_fp)
                n_se += 1

    if n_pe == 0:
        raise Exception("no paired reads!? check file formats...")

    report('DONE; read %d sequences,'
           ' %d pairs and %d singletons' %
           (n_pe * 2 + n_se, n_pe, n_se))

    report('wrote to: ' + outfile
           + '.se' + ' and ' + outfile + '.pe')
Пример #10
0
def main():
    """Split a read file into properly paired reads and orphans.

    The input (``args.infile``) is read through ``broken_paired_reader``.
    Pairs are written interleaved to ``<prefix>.pe`` and unpaired reads to
    ``<prefix>.se``, where ``<prefix>`` is the input's basename unless a
    second command-line argument is present.  Progress and a final summary
    go to stderr.

    Raises:
        Exception: if no pairs at all were found in the input.
    """
    def report(message):
        # Same output as the Python 2 statement
        # ``print >>sys.stderr, message`` but also valid on Python 3.
        sys.stderr.write('%s\n' % message)

    info('extract-paired-reads.py')
    args = get_parser().parse_args()

    check_file_status(args.infile, args.force)
    infiles = [args.infile]
    check_space(infiles, args.force)

    outfile = os.path.basename(args.infile)
    # NOTE(review): the prefix is taken from raw sys.argv rather than from
    # the parsed arguments, so whatever appears second on the command line
    # is used -- confirm this is intended.
    if len(sys.argv) > 2:
        outfile = sys.argv[2]

    n_pe = 0
    n_se = 0

    # Both outputs are closed on every exit path, including errors raised
    # while reading or writing records.
    with open(outfile + '.se', 'w') as single_fp, \
            open(outfile + '.pe', 'w') as paired_fp:
        report('reading file "%s"' % args.infile)
        report('outputting interleaved pairs to "%s.pe"' % outfile)
        report('outputting orphans to "%s.se"' % outfile)

        screed_iter = screed.open(args.infile, parse_description=False)
        for index, is_pair, read1, read2 in broken_paired_reader(screed_iter):
            if index % 100000 == 0 and index > 0:
                report('... %s' % index)

            if is_pair:
                write_record_pair(read1, read2, paired_fp)
                n_pe += 1
            else:
                write_record(read1, single_fp)
                n_se += 1

    if n_pe == 0:
        raise Exception("no paired reads!? check file formats...")

    report('DONE; read %d sequences,'
           ' %d pairs and %d singletons' %
           (n_pe * 2 + n_se, n_pe, n_se))

    report('wrote to: ' + outfile
           + '.se' + ' and ' + outfile + '.pe')
Пример #11
0
def main():
    """Keep the reads whose median k-mer count in a reference set is non-zero.

    A countgraph is built from ``input_filename``; every read (or read
    pair) in ``read_filename`` is then looked up with
    ``ht.get_median_count``.  A single read is written when its median
    count is non-zero; a pair is written, both mates together, when either
    mate's median count is non-zero.  Output goes to ``--outfile`` or, by
    default, to ``basename(read_filename) + '.sweep2'``.
    """
    parser = build_nodegraph_args()
    parser.add_argument('-o',
                        '--outfile',
                        help='output file; default is "infile".sweep2')
    # NOTE(review): '--quiet' is declared without an action, so it expects
    # a value on the command line -- confirm that is intended.
    parser.add_argument('-q', '--quiet')
    parser.add_argument('input_filename')
    parser.add_argument('read_filename')

    args = parser.parse_args()

    inp = args.input_filename
    readsfile = args.read_filename

    outfile = os.path.basename(readsfile) + '.sweep2'
    if args.outfile:
        outfile = args.outfile

    # The output is opened before the graph is built (as before) and is
    # now closed on every exit path.
    with open(outfile, 'w') as outfp:
        # create the countgraph used for the abundance lookups
        ht = khmer_args.create_countgraph(args)

        # count the k-mers of the input sequences
        print('loading input reads from', inp)
        ht.consume_seqfile(inp)

        print('starting sweep.')

        m = 0  # number of reads/pairs written so far
        instream = screed.open(readsfile)
        for n, is_pair, read1, read2 in broken_paired_reader(instream):
            if n % 10000 == 0:
                print('...', n, m)

            if is_pair:
                count1 = ht.get_median_count(read1.sequence)[0]
                count2 = ht.get_median_count(read2.sequence)[0]
                if count1 or count2:
                    m += 1
                    write_record_pair(read1, read2, outfp)
            else:
                count = ht.get_median_count(read1.sequence)[0]
                if count:
                    m += 1
                    write_record(read1, outfp)
Пример #12
0
def main():
    """Keep the reads whose median k-mer count in a reference set is non-zero.

    A countgraph is built from ``input_filename``; every read (or read
    pair) in ``read_filename`` is then looked up with
    ``ht.get_median_count``.  A single read is written when its median
    count is non-zero; a pair is written, both mates together, when either
    mate's median count is non-zero.  Output goes to ``--outfile`` or, by
    default, to ``basename(read_filename) + '.sweep2'``.
    """
    parser = build_nodegraph_args()
    parser.add_argument('-o', '--outfile',
                        help='output file; default is "infile".sweep2')
    # NOTE(review): '--quiet' is declared without an action, so it expects
    # a value on the command line -- confirm that is intended.
    parser.add_argument('-q', '--quiet')
    parser.add_argument('input_filename')
    parser.add_argument('read_filename')

    args = parser.parse_args()

    inp = args.input_filename
    readsfile = args.read_filename

    outfile = os.path.basename(readsfile) + '.sweep2'
    if args.outfile:
        outfile = args.outfile

    # The output is opened before the graph is built (as before) and is
    # now closed on every exit path.
    with open(outfile, 'w') as outfp:
        # create the countgraph used for the abundance lookups
        ht = khmer_args.create_countgraph(args)

        # count the k-mers of the input sequences
        print('loading input reads from', inp)
        ht.consume_fasta(inp)

        print('starting sweep.')

        m = 0  # number of reads/pairs written so far
        instream = screed.open(readsfile)
        for n, is_pair, read1, read2 in broken_paired_reader(instream):
            if n % 10000 == 0:
                print('...', n, m)

            if is_pair:
                count1 = ht.get_median_count(read1.sequence)[0]
                count2 = ht.get_median_count(read2.sequence)[0]
                if count1 or count2:
                    m += 1
                    write_record_pair(read1, read2, outfp)
            else:
                count = ht.get_median_count(read1.sequence)[0]
                if count:
                    m += 1
                    write_record(read1, outfp)
Пример #13
0
def test_BrokenPairedReader_lowercase():
    """Mixed-case reads keep ``sequence`` and gain an upper-case ``cleaned_seq``.

    Three ``screed.Record`` objects are fed to ``broken_paired_reader``: a
    '/1' + '/2' pair followed by a second '/2' record.  The first result
    must contain both mates, the second must be the leftover read with no
    mate.  In every case ``sequence`` is unchanged and ``cleaned_seq`` is
    'ACGTA'.
    """
    records = [
        screed.Record(name='seq1/1', sequence='acgtn'),
        screed.Record(name='seq1/2', sequence='AcGtN'),
        screed.Record(name='seq1/2', sequence='aCgTn'),
    ]

    pairs = [(first, second)
             for _, _, first, second in broken_paired_reader(records)]

    first, mate = pairs[0]
    for read, raw in ((first, 'acgtn'), (mate, 'AcGtN')):
        assert read.sequence == raw
        assert read.cleaned_seq == 'ACGTA'

    orphan, missing = pairs[1]
    assert orphan.sequence == 'aCgTn'
    assert orphan.cleaned_seq == 'ACGTA'
    assert missing is None
Пример #14
0
def test_BrokenPairedReader_lowercase():
    """Mixed-case reads keep ``sequence`` and gain an upper-case ``cleaned_seq``.

    Three ``screed.Record`` objects are fed to ``broken_paired_reader``: a
    '/1' + '/2' pair followed by a second '/2' record.  The first result
    must contain both mates, the second must be the leftover read with no
    mate.  In every case ``sequence`` is unchanged and ``cleaned_seq`` is
    'ACGTA'.
    """
    records = [
        screed.Record(name='seq1/1', sequence='acgtn'),
        screed.Record(name='seq1/2', sequence='AcGtN'),
        screed.Record(name='seq1/2', sequence='aCgTn'),
    ]

    pairs = [(first, second)
             for _, _, first, second in broken_paired_reader(records)]

    first, mate = pairs[0]
    for read, raw in ((first, 'acgtn'), (mate, 'AcGtN')):
        assert read.sequence == raw
        assert read.cleaned_seq == 'ACGTA'

    orphan, missing = pairs[1]
    assert orphan.sequence == 'aCgTn'
    assert orphan.cleaned_seq == 'ACGTA'
    assert missing is None
Пример #15
0
def test_BrokenPairedReader_lowercase_khmer_Read():
    """Same lowercase check as the screed variant, using ``khmer.Read``.

    ``khmer.Read`` objects are expected to provide ``cleaned_seq``
    themselves.  A '/1' + '/2' pair followed by a second '/2' read is fed
    to ``broken_paired_reader``; the first result must hold both mates and
    the second the leftover read with no mate, each with an unchanged
    ``sequence`` and a ``cleaned_seq`` of 'ACGTA'.
    """
    reads = [
        khmer.Read(name='seq1/1', sequence='acgtn'),
        khmer.Read(name='seq1/2', sequence='AcGtN'),
        khmer.Read(name='seq1/2', sequence='aCgTn'),
    ]

    pairs = [(first, second)
             for _, _, first, second in broken_paired_reader(reads)]

    first, mate = pairs[0]
    for read, raw in ((first, 'acgtn'), (mate, 'AcGtN')):
        assert read.sequence == raw
        assert read.cleaned_seq == 'ACGTA'

    orphan, missing = pairs[1]
    assert orphan.sequence == 'aCgTn'
    assert orphan.cleaned_seq == 'ACGTA'
    assert missing is None
Пример #16
0
def main():
    """Trim low-abundance k-mers from reads using two streaming passes.

    First pass, per input file: a read (or pair) whose median k-mer count
    is below ``args.normalize_to`` is consumed into the counting table and
    set aside in a temporary ``.pass2`` file; a pair is set aside when
    either mate is below the limit so that pairs are not split between
    passes.  All other reads are trimmed with
    ``ct.trim_on_abundance(seq, args.cutoff)`` and written straight to the
    output.

    Second pass: the set-aside reads are looked up again.  With
    ``args.variable_coverage`` set, reads still below the limit are written
    unchanged; everything else is trimmed and written when at least
    ``args.ksize`` bases remain.

    Output goes to ``args.out`` when given, otherwise to
    ``basename(input) + '.abundtrim'`` per input file.  Progress and
    summary statistics are written to stderr; the counting table is saved
    afterwards when ``args.savetable`` is set.
    """
    def report(*parts):
        # Portable replacement for the Python 2 statement
        # ``print >>sys.stderr, a, b``: parts joined by single spaces and
        # terminated by a newline.
        sys.stderr.write(' '.join(str(part) for part in parts) + '\n')

    info('trim-low-abund.py', ['streaming'])
    parser = get_parser()
    args = parser.parse_args()

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        report("Error: Cannot input the same filename multiple times.")
        sys.exit(1)

    ###

    report_on_config(args)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize,
                                  args.force)

    K = args.ksize

    CUTOFF = args.cutoff
    NORMALIZE_LIMIT = args.normalize_to

    if args.loadtable:
        report('loading k-mer counting table from', args.loadtable)
        ct = khmer.load_counting_hash(args.loadtable)
    else:
        report('making k-mer counting table')
        ct = khmer.new_counting_hash(K, args.min_tablesize, args.n_tables)

    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    report('created temporary directory %s; '
           'use -T to change location' % tempdir)

    # ### FIRST PASS ###

    save_pass2_total = 0

    n_bp = 0
    n_reads = 0
    written_bp = 0
    written_reads = 0
    trimmed_reads = 0

    pass2list = []
    for filename in args.input_filenames:
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        if args.out is None:
            trimfp = open(os.path.basename(filename) + '.abundtrim', 'w')
        else:
            trimfp = args.out

        pass2list.append((filename, pass2filename, trimfp))

        screed_iter = screed.open(filename, parse_description=False)
        pass2fp = open(pass2filename, 'w')

        save_pass2 = 0
        n = 0

        paired_iter = broken_paired_reader(screed_iter,
                                           min_length=K,
                                           force_single=args.ignore_pairs)
        for n, is_pair, read1, read2 in paired_iter:
            if n % 10000 == 0:
                report('...', n, filename, save_pass2,
                       n_reads, n_bp, written_reads, written_bp)

            # we want to track paired reads here, to make sure that pairs
            # are not split between first pass and second pass.

            if is_pair:
                n_reads += 2
                n_bp += len(read1.sequence) + len(read2.sequence)

                seq1 = read1.sequence.replace('N', 'A')
                seq2 = read2.sequence.replace('N', 'A')

                med1, _, _ = ct.get_median_count(seq1)
                med2, _, _ = ct.get_median_count(seq2)

                if med1 < NORMALIZE_LIMIT or med2 < NORMALIZE_LIMIT:
                    ct.consume(seq1)
                    ct.consume(seq2)
                    write_record_pair(read1, read2, pass2fp)
                    save_pass2 += 2
                else:
                    _, trim_at1 = ct.trim_on_abundance(seq1, CUTOFF)
                    _, trim_at2 = ct.trim_on_abundance(seq2, CUTOFF)

                    # NOTE(review): a mate with trim_at < K is written
                    # untrimmed below, yet written_bp still adds trim_at
                    # for it -- confirm this accounting is intended.
                    if trim_at1 >= K:
                        read1 = trim_record(read1, trim_at1)

                    if trim_at2 >= K:
                        read2 = trim_record(read2, trim_at2)

                    if trim_at1 != len(seq1):
                        trimmed_reads += 1
                    if trim_at2 != len(seq2):
                        trimmed_reads += 1

                    write_record_pair(read1, read2, trimfp)
                    written_reads += 2
                    written_bp += trim_at1 + trim_at2
            else:
                n_reads += 1
                n_bp += len(read1.sequence)

                seq = read1.sequence.replace('N', 'A')

                med, _, _ = ct.get_median_count(seq)

                # has this portion of the graph saturated? if not,
                # consume & save => pass2.
                if med < NORMALIZE_LIMIT:
                    ct.consume(seq)
                    write_record(read1, pass2fp)
                    save_pass2 += 1
                else:  # trim!!
                    _, trim_at = ct.trim_on_abundance(seq, CUTOFF)
                    if trim_at >= K:
                        new_read = trim_record(read1, trim_at)
                        write_record(new_read, trimfp)

                        written_reads += 1
                        written_bp += trim_at

                        if trim_at != len(read1.sequence):
                            trimmed_reads += 1

        pass2fp.close()

        report('%s: kept aside %d of %d from first pass, in %s'
               % (filename, save_pass2, n, filename))
        save_pass2_total += save_pass2

    # ### SECOND PASS. ###

    skipped_n = 0
    skipped_bp = 0
    for _, pass2filename, trimfp in pass2list:
        report('second pass: looking at sequences kept aside '
               'in %s' % pass2filename)

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in contrast
        # to the first loop.

        for n, read in enumerate(
                screed.open(pass2filename, parse_description=False)):
            if n % 10000 == 0:
                report('... x 2', n, pass2filename,
                       written_reads, written_bp)

            seq = read.sequence.replace('N', 'A')
            med, _, _ = ct.get_median_count(seq)

            # do we retain low-abundance components unchanged?
            if med < NORMALIZE_LIMIT and args.variable_coverage:
                write_record(read, trimfp)

                written_reads += 1
                written_bp += len(read.sequence)
                skipped_n += 1
                skipped_bp += len(read.sequence)

            # otherwise, examine/trim/truncate.
            else:  # med >= NORMALIZE LIMIT or not args.variable_coverage
                _, trim_at = ct.trim_on_abundance(seq, CUTOFF)
                if trim_at >= K:
                    new_read = trim_record(read, trim_at)
                    write_record(new_read, trimfp)

                    written_reads += 1
                    written_bp += trim_at

                    if trim_at != len(read.sequence):
                        trimmed_reads += 1

        # Nothing more is written for this input.  Only close handles that
        # were opened here; a shared args.out belongs to the caller.
        if args.out is None:
            trimfp.close()

        report('removing %s' % pass2filename)
        os.unlink(pass2filename)

    report('removing temp directory & contents (%s)' % tempdir)
    shutil.rmtree(tempdir)

    # Guard the ratios so that empty input produces a summary instead of a
    # ZeroDivisionError.
    if n_reads:
        n_passes = 1.0 + (float(save_pass2_total) / n_reads)
        percent_reads_trimmed = \
            float(trimmed_reads + (n_reads - written_reads)) / \
            n_reads * 100.0
    else:
        n_passes = 1.0
        percent_reads_trimmed = 0.0

    if n_bp:
        percent_bp_trimmed = (1 - (written_bp / float(n_bp))) * 100.0
    else:
        percent_bp_trimmed = 0.0

    report('read %d reads, %d bp' % (
        n_reads,
        n_bp,
    ))
    report('wrote %d reads, %d bp' % (
        written_reads,
        written_bp,
    ))
    report('looked at %d reads twice (%.2f passes)' %
           (save_pass2_total, n_passes))
    report('removed %d reads and trimmed %d reads (%.2f%%)' %
           (n_reads - written_reads, trimmed_reads, percent_reads_trimmed))
    report('trimmed or removed %.2f%% of bases (%d total)' %
           (percent_bp_trimmed, n_bp - written_bp))

    if args.variable_coverage:
        if n_reads:
            percent_reads_hicov = 100.0 * float(n_reads - skipped_n) / n_reads
        else:
            percent_reads_hicov = 0.0
        report('%d reads were high coverage (%.2f%%);' %
               (n_reads - skipped_n, percent_reads_hicov))
        # The original literal pieces were joined without a space and
        # printed "lowcoverage".
        report('skipped %d reads/%d bases because of low '
               'coverage' % (skipped_n, skipped_bp))

    fp_rate = \
        khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    report('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate))

    report('output in *.abundtrim')

    if args.savetable:
        report("Saving k-mer counting table to", args.savetable)
        ct.save(args.savetable)
Пример #17
0
def main():  # pylint: disable=too-many-branches,too-many-statements
    """Run digital normalization over every input file.

    Reads from each input are passed through ``clean_input_reads`` and
    ``broken_paired_reader`` and then through a ``Normalizer`` wrapped in
    ``WithDiagnostics``; every record that comes back (not ``None``) is
    written out.  Output goes to ``args.single_output_file`` when given,
    otherwise to ``basename(input) + '.keep'`` per input file.

    Exits with status 1 when two inputs share a basename while writing
    per-file outputs, or when input comes from stdin without a single
    output file.
    """
    parser = sanitize_help(get_parser())
    args = parser.parse_args()

    configure_logging(args.quiet)
    report_on_config(args)

    report_fp = args.report
    force_single = args.force_single

    # Per-file outputs are named after the input's basename, so two inputs
    # with the same basename would collide.  With a single output file no
    # such check is made.
    filenames = []
    seen_basenames = set()
    for input_path in args.input_filenames:
        filenames.append(input_path)
        if not args.single_output_file:
            base = os.path.basename(input_path)
            if base in seen_basenames:
                log_error('ERROR: Duplicate filename--Cannot handle this!')
                log_error('** Exiting!')
                sys.exit(1)
            seen_basenames.add(base)

    # check that files exist and there is sufficient output disk space.
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph is not None:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    # load or create counting table.
    if not args.loadgraph:
        log_info('making countgraph')
        countgraph = khmer_args.create_countgraph(args)
    else:
        log_info('loading k-mer countgraph from {graph}',
                 graph=args.loadgraph)
        countgraph = Countgraph.load(args.loadgraph)

    # one normalizer is shared by all input files
    norm = Normalizer(args.cutoff, countgraph)
    with_diagnostics = WithDiagnostics(norm, report_fp, args.report_frequency)

    # Pair each filename with its "require paired" setting; the optional
    # unpaired-reads file is never required to be paired.
    files = [[name, args.paired] for name in filenames]
    if args.unpaired_reads:
        files.append([args.unpaired_reads, False])

    corrupt_files = []
    outfp = None
    output_name = None

    if args.single_output_file:
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)
    elif '-' in filenames or '/dev/stdin' in filenames:
        print("Accepting input from stdin; output filename must "
              "be provided with '-o'.", file=sys.stderr)
        sys.exit(1)

    #
    # main loop: iterate over all files given, do diginorm.
    #

    for filename, require_paired in files:
        if not args.single_output_file:
            output_name = os.path.basename(filename) + '.keep'
            outfp = open(output_name, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        # failsafe context manager in case an input file breaks
        with catch_io_errors(filename, outfp, args.single_output_file,
                             args.force, corrupt_files):
            cleaned = clean_input_reads(screed.open(filename))
            reader = broken_paired_reader(cleaned, min_length=args.ksize,
                                          force_single=force_single,
                                          require_paired=require_paired)

            # actually do diginorm
            for record in with_diagnostics(reader, filename):
                if record is None:
                    continue
                write_record(record, outfp)

            log_info('output in {name}', name=describe_file_handle(outfp))
            if not args.single_output_file:
                outfp.close()

    # finished - print out some diagnostics.

    log_info('Total number of unique k-mers: {umers}',
             umers=countgraph.n_unique_kmers())

    if args.savegraph is not None:
        log_info('...saving to {name}', name=args.savegraph)
        countgraph.save(args.savegraph)

    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    fp_rate = khmer.calc_expected_collisions(countgraph, False,
                                             max_false_pos=.8)

    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    if args.force and corrupt_files:
        log_error("** WARNING: Finished with errors!")
        log_error("** I/O Errors occurred in the following files:")
        log_error("\t" + " ".join(corrupt_files))
Пример #18
0
def main():
    """Correct reads against a k-mer countgraph in two streaming passes.

    First pass: every read (or read pair) whose median k-mer count is below
    ``args.normalize_to`` is consumed into the countgraph and set aside in a
    temporary ``.pass2`` file; all other reads are run through the aligner
    and written to ``<input>.corr`` (or to ``args.out`` when it is given).

    Second pass: the reads that were set aside are re-examined against the
    countgraph as it stands after the first pass.  They are either written
    unchanged (median still below the limit and ``args.variable_coverage``
    set) or corrected and written.

    Progress and summary statistics go to stderr.  Exits with status 1 when
    the same input filename is supplied more than once.
    """
    info('correct-reads.py', ['streaming'])
    parser = get_parser()
    args = parser.parse_args()

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        print >>sys.stderr, \
            "Error: Cannot input the same filename multiple times."
        sys.exit(1)

    ###

    report_on_config(args)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph:
        # NOTE(review): other scripts call check_space_for_graph with the
        # output filename as first argument -- confirm this call matches the
        # helper's signature in the khmer version this script targets.
        check_space_for_graph(
            args.n_tables * args.min_tablesize, args.force)

    K = args.ksize

    CUTOFF = args.cutoff
    NORMALIZE_LIMIT = args.normalize_to

    if args.loadgraph:
        print >>sys.stderr, 'loading k-mer countgraph from', args.loadgraph
        ct = khmer.load_countgraph(args.loadgraph)
    else:
        print >>sys.stderr, 'making k-mer countgraph'
        ct = khmer.new_countgraph(K, args.min_tablesize, args.n_tables)

    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    print >>sys.stderr, 'created temporary directory %s; ' \
                        'use -T to change location' % tempdir

    aligner = khmer.ReadAligner(ct, args.cutoff, args.bits_theta)

    # ### FIRST PASS ###

    save_pass2_total = 0

    n_bp = 0
    n_reads = 0
    written_bp = 0
    written_reads = 0
    corrected_reads = 0

    # one (input, pass2 temp file, output handle) triple per input file
    pass2list = []
    for filename in args.input_filenames:
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        if args.out is None:
            corrfp = open(os.path.basename(filename) + '.corr', 'w')
        else:
            corrfp = args.out

        pass2list.append((filename, pass2filename, corrfp))

        screed_iter = screed.open(filename, parse_description=False)
        pass2fp = open(pass2filename, 'w')

        save_pass2 = 0
        n = 0

        paired_iter = broken_paired_reader(screed_iter, min_length=K,
                                           force_single=args.ignore_pairs)
        for n, is_pair, read1, read2 in paired_iter:
            if n % 10000 == 0:
                print >>sys.stderr, '...', n, filename, save_pass2, \
                    n_reads, n_bp, written_reads, written_bp

            # we want to track paired reads here, to make sure that pairs
            # are not split between first pass and second pass.

            if is_pair:
                n_reads += 2
                n_bp += len(read1.sequence) + len(read2.sequence)

                # Ns are replaced with As before counting/aligning
                seq1 = read1.sequence.replace('N', 'A')
                seq2 = read2.sequence.replace('N', 'A')

                med1, _, _ = ct.get_median_count(seq1)
                med2, _, _ = ct.get_median_count(seq2)

                # if either mate is below the limit, defer the whole pair
                if med1 < NORMALIZE_LIMIT or med2 < NORMALIZE_LIMIT:
                    ct.consume(seq1)
                    ct.consume(seq2)
                    write_record_pair(read1, read2, pass2fp)
                    save_pass2 += 2
                else:
                    is_aligned, new_seq1 = correct_sequence(aligner, seq1)
                    if is_aligned:
                        if new_seq1 != read1.sequence:
                            corrected_reads += 1
                        read1.sequence = new_seq1
                        if hasattr(read1, 'quality'):
                            fix_quality(read1)

                    is_aligned, new_seq2 = correct_sequence(aligner, seq2)
                    if is_aligned:
                        if new_seq2 != read2.sequence:
                            corrected_reads += 1
                        read2.sequence = new_seq2
                        if hasattr(read2, 'quality'):
                            fix_quality(read2)

                    # pairs are written whether or not each mate aligned
                    write_record_pair(read1, read2, corrfp)
                    written_reads += 2
                    written_bp += len(read1)
                    written_bp += len(read2)
            else:
                n_reads += 1
                n_bp += len(read1.sequence)

                seq = read1.sequence.replace('N', 'A')

                med, _, _ = ct.get_median_count(seq)

                # has this portion of the graph saturated? if not,
                # consume & save => pass2.
                if med < NORMALIZE_LIMIT:
                    ct.consume(seq)
                    write_record(read1, pass2fp)
                    save_pass2 += 1
                else:                       # trim!!
                    is_aligned, new_seq = correct_sequence(aligner, seq)
                    # single reads that fail to align are not written
                    if is_aligned:
                        if new_seq != read1.sequence:
                            corrected_reads += 1
                        read1.sequence = new_seq
                        if hasattr(read1, 'quality'):
                            fix_quality(read1)

                        write_record(read1, corrfp)

                        written_reads += 1
                        written_bp += len(new_seq)

        pass2fp.close()

        print >>sys.stderr, '%s: kept aside %d of %d from first pass, in %s' \
            % (filename, save_pass2, n, filename)
        save_pass2_total += save_pass2

    # ### SECOND PASS. ###

    skipped_n = 0
    skipped_bp = 0
    for _, pass2filename, corrfp in pass2list:
        print >>sys.stderr, ('second pass: looking at sequences kept aside '
                             'in %s') % pass2filename

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in contrast
        # to the first loop.

        for n, read in enumerate(screed.open(pass2filename,
                                             parse_description=False)):
            if n % 10000 == 0:
                print >>sys.stderr, '... x 2', n, pass2filename, \
                    written_reads, written_bp

            seq = read.sequence.replace('N', 'A')
            med, _, _ = ct.get_median_count(seq)

            # do we retain low-abundance components unchanged?
            if med < NORMALIZE_LIMIT and args.variable_coverage:
                write_record(read, corrfp)

                written_reads += 1
                written_bp += len(read.sequence)
                skipped_n += 1
                skipped_bp += len(read.sequence)

            # otherwise, examine/correct.
            else:    # med >= NORMALIZE LIMIT or not args.variable_coverage
                is_aligned, new_seq = correct_sequence(aligner, seq)
                if is_aligned:
                    if new_seq != read.sequence:
                        corrected_reads += 1
                    read.sequence = new_seq
                    if hasattr(read, 'quality'):
                        fix_quality(read)
                    write_record(read, corrfp)

                    written_reads += 1
                    written_bp += len(new_seq)

        print >>sys.stderr, 'removing %s' % pass2filename
        os.unlink(pass2filename)

        # close the per-input output opened in the first pass; a shared
        # args.out handle is left open for whoever supplied it.
        if args.out is None:
            corrfp.close()

    print >>sys.stderr, 'removing temp directory & contents (%s)' % tempdir
    shutil.rmtree(tempdir)

    # Guard every ratio so that empty input reports zeros instead of raising
    # ZeroDivisionError after all the work has been done.
    if n_reads:
        n_passes = 1.0 + (float(save_pass2_total) / n_reads)
        percent_reads_corrected = float(corrected_reads +
                                        (n_reads - written_reads)) /\
            n_reads * 100.0
    else:
        n_passes = 1.0
        percent_reads_corrected = 0.0

    if n_bp:
        percent_bp_removed = (1 - (written_bp / float(n_bp))) * 100.0
    else:
        percent_bp_removed = 0.0

    print >>sys.stderr, 'read %d reads, %d bp' % (n_reads, n_bp,)
    print >>sys.stderr, 'wrote %d reads, %d bp' % (written_reads, written_bp,)
    print >>sys.stderr, 'looked at %d reads twice (%.2f passes)' % \
        (save_pass2_total, n_passes)
    print >>sys.stderr, 'removed %d reads and corrected %d reads (%.2f%%)' % \
        (n_reads - written_reads, corrected_reads, percent_reads_corrected)
    print >>sys.stderr, 'removed %.2f%% of bases (%d total)' % \
        (percent_bp_removed, n_bp - written_bp)

    if args.variable_coverage:
        if n_reads:
            percent_reads_hicov = 100.0 * float(n_reads - skipped_n) / n_reads
        else:
            percent_reads_hicov = 0.0
        print >>sys.stderr, '%d reads were high coverage (%.2f%%);' % \
            (n_reads - skipped_n, percent_reads_hicov)
        # the trailing space keeps the two adjacent literals from fusing
        # into "lowcoverage"
        print >>sys.stderr, ('skipped %d reads/%d bases because of low '
                             'coverage') % (skipped_n, skipped_bp)

    fp_rate = \
        khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    print >>sys.stderr, \
        'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate)

    print >>sys.stderr, 'output in *.corr'

    if args.savegraph:
        print >>sys.stderr, "Saving k-mer countgraph to", args.savegraph
        ct.save(args.savegraph)
Пример #19
0
def main():
    """Trim low-abundance k-mers from reads in two streaming passes.

    First pass: each input is read through ``broken_paired_reader`` and fed
    to ``Trimmer.pass1``.  Reads it yields are written straight to the
    output; the rest are set aside by the trimmer in a temporary ``.pass2``
    file.

    Second pass (skipped with ``args.single_pass``): the reads that were set
    aside are run through ``Trimmer.pass2`` and whatever it yields is
    written to the same output.

    Output goes to ``args.output`` when given, otherwise to
    ``<input>.abundtrim`` per input file.  Summary statistics are logged at
    the end.  Exits with status 1 on inconsistent command-line options.
    """
    parser = sanitize_help(get_parser())
    args = parser.parse_args()
    if not args.quiet:
        info('trim-low-abund.py', ['streaming'])

    configure_logging(args.quiet)

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        log_error("Error: Cannot input the same filename multiple times.")
        sys.exit(1)

    if args.trim_at_coverage != DEFAULT_TRIM_AT_COVERAGE and \
       not args.variable_coverage:
        log_error("Error: --trim-at-coverage/-Z given, but "
                  "--variable-coverage/-V not specified.")
        sys.exit(1)

    if args.diginorm_coverage != DEFAULT_DIGINORM_COVERAGE and \
       not args.diginorm:
        log_error("Error: --diginorm-coverage given, but "
                  "--diginorm not specified.")
        sys.exit(1)

    if args.diginorm and args.single_pass:
        log_error("Error: --diginorm and --single-pass are incompatible!\n"
                  "You probably want to use normalize-by-median.py instead.")
        sys.exit(1)

    ###

    report_on_config(args)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    # default output names are derived from input names, which stdin lacks
    if ('-' in args.input_filenames or '/dev/stdin' in args.input_filenames) \
       and not args.output:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    if args.loadgraph:
        log_info('loading countgraph from {graph}', graph=args.loadgraph)
        ct = khmer.load_countgraph(args.loadgraph)
    else:
        log_info('making countgraph')
        ct = khmer_args.create_countgraph(args)

    K = ct.ksize()
    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    log_info(
        'created temporary directory {temp};\n'
        'use -T to change location',
        temp=tempdir)

    trimmer = Trimmer(ct, not args.variable_coverage, args.cutoff,
                      args.trim_at_coverage)
    if args.diginorm:
        trimmer.set_diginorm(args.diginorm_coverage)

    # ### FIRST PASS ###

    save_pass2_total = 0

    written_bp = 0
    written_reads = 0

    # only create the file writer once if outfp is specified; otherwise,
    # create it for each file.
    if args.output:
        trimfp = get_file_writer(args.output, args.gzip, args.bzip)

    # one (input, pass2 temp file, output writer) triple per input file
    pass2list = []
    for filename in args.input_filenames:
        # figure out temporary filename for 2nd pass
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        pass2fp = open(pass2filename, 'w')

        # construct output filenames
        if args.output is None:
            # note: this will be saved in trimfp.
            outfp = open(os.path.basename(filename) + '.abundtrim', 'wb')

            # get file handle w/gzip, bzip
            trimfp = get_file_writer(outfp, args.gzip, args.bzip)

        # record all this info
        pass2list.append((filename, pass2filename, trimfp))

        # input file stuff: get a broken_paired reader.
        screed_iter = screed.open(filename)
        paired_iter = broken_paired_reader(screed_iter,
                                           min_length=K,
                                           force_single=args.ignore_pairs)

        # main loop through the file.
        # the trimmer's counters are cumulative across files, so remember
        # where this file started to report per-file numbers.
        n_start = trimmer.n_reads
        save_start = trimmer.n_saved

        watermark = REPORT_EVERY_N_READS
        for read in trimmer.pass1(paired_iter, pass2fp):
            if (trimmer.n_reads - n_start) > watermark:
                log_info(
                    "... {filename} {n_saved} {n_reads} {n_bp} "
                    "{w_reads} {w_bp}",
                    filename=filename,
                    n_saved=trimmer.n_saved,
                    n_reads=trimmer.n_reads,
                    n_bp=trimmer.n_bp,
                    w_reads=written_reads,
                    w_bp=written_bp)
                watermark += REPORT_EVERY_N_READS

            # write out the trimmed/etc sequences that AREN'T going to be
            # revisited in a 2nd pass.
            write_record(read, trimfp)
            written_bp += len(read)
            written_reads += 1
        pass2fp.close()

        log_info("{filename}: kept aside {kept} of {total} from first pass",
                 filename=filename,
                 kept=trimmer.n_saved - save_start,
                 total=trimmer.n_reads - n_start)

    # first pass goes across all the data, so record relevant stats...
    n_reads = trimmer.n_reads
    n_bp = trimmer.n_bp
    n_skipped = trimmer.n_skipped
    bp_skipped = trimmer.bp_skipped
    save_pass2_total = trimmer.n_saved

    # ### SECOND PASS. ###

    # nothing should have been skipped yet!
    assert trimmer.n_skipped == 0
    assert trimmer.bp_skipped == 0

    if args.single_pass:
        # There is no second pass, and the loop below is the only other
        # place per-file writers get closed -- so close them here rather
        # than leaving them to interpreter shutdown.
        if not args.output:
            for _, _, pass1_writer in pass2list:
                pass1_writer.close()
        pass2list = []

    # go back through all the files again.
    for _, pass2filename, trimfp in pass2list:
        log_info('second pass: looking at sequences kept aside in {pass2}',
                 pass2=pass2filename)

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in contrast
        # to the first loop.  Hence, force_single=True below.

        screed_iter = screed.open(pass2filename, parse_description=False)
        paired_iter = broken_paired_reader(screed_iter,
                                           min_length=K,
                                           force_single=True)

        watermark = REPORT_EVERY_N_READS
        for read in trimmer.pass2(paired_iter):
            # NOTE(review): n_start still holds the value captured for the
            # last file of the first pass -- confirm that is the intended
            # baseline for second-pass progress reporting.
            if (trimmer.n_reads - n_start) > watermark:
                log_info('... x 2 {a} {b} {c} {d} {e} {f} {g}',
                         a=trimmer.n_reads - n_start,
                         b=pass2filename,
                         c=trimmer.n_saved,
                         d=trimmer.n_reads,
                         e=trimmer.n_bp,
                         f=written_reads,
                         g=written_bp)
                watermark += REPORT_EVERY_N_READS

            write_record(read, trimfp)
            written_reads += 1
            written_bp += len(read)

        log_info('removing {pass2}', pass2=pass2filename)
        os.unlink(pass2filename)

        # if we created our own trimfps, close 'em.
        if not args.output:
            trimfp.close()

    log_info('removing temp directory & contents ({temp})', temp=tempdir)
    shutil.rmtree(tempdir)

    trimmed_reads = trimmer.trimmed_reads

    # Guard every ratio so that empty input reports zeros instead of raising
    # ZeroDivisionError after all the work has been done.
    if n_reads:
        n_passes = 1.0 + (float(save_pass2_total) / n_reads)
        percent_reads_trimmed = float(trimmed_reads +
                                      (n_reads - written_reads)) /\
            n_reads * 100.0
    else:
        n_passes = 1.0
        percent_reads_trimmed = 0.0

    if n_bp:
        percent_bp_trimmed = (1 - (written_bp / float(n_bp))) * 100.0
    else:
        percent_bp_trimmed = 0.0

    log_info('read {read} reads, {bp} bp', read=n_reads, bp=n_bp)
    log_info('wrote {wr} reads, {wbp} bp', wr=written_reads, wbp=written_bp)
    log_info('looked at {st} reads twice ({np:.2f} passes)',
             st=save_pass2_total,
             np=n_passes)
    log_info('removed {r} reads and trimmed {t} reads ({p:.2f}%)',
             r=n_reads - written_reads,
             t=trimmed_reads,
             p=percent_reads_trimmed)
    # single '%': these messages use {}-style fields, where '%' needs no
    # escaping (a doubled '%%' would be printed verbatim)
    log_info('trimmed or removed {p:.2f}% of bases ({bp} total)',
             p=percent_bp_trimmed,
             bp=n_bp - written_bp)

    if args.variable_coverage:
        if n_reads:
            percent_reads_hicov = 100.0 * float(n_reads - n_skipped) / n_reads
        else:
            percent_reads_hicov = 0.0
        log_info('{n} reads were high coverage ({p:.2f}%);',
                 n=n_reads - n_skipped,
                 p=percent_reads_hicov)
        log_info('skipped {r} reads/{bp} bases because of low coverage',
                 r=n_skipped,
                 bp=bp_skipped)

    fp_rate = \
        khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    log_info('output in *.abundtrim')

    if args.savegraph:
        log_info("Saving k-mer countgraph to {graph}", graph=args.savegraph)
        ct.save(args.savegraph)
Пример #20
0
def main():
    """Split an interleaved read file into separate left and right files.

    Reads ``args.infile`` in broken-paired mode.  Both mates of a proper
    pair go to the first and second output respectively; an unpaired read is
    routed by ``check_is_left`` / ``check_is_right`` on its name.

    Output locations, in order of precedence: ``args.output_first`` /
    ``args.output_second`` (already-open file objects), then
    ``args.output_directory``, then the current directory; default names are
    ``<basename>.1`` and ``<basename>.2``.

    Exits with status 1 when reading from stdin without both explicit
    outputs, when ``args.force_paired`` is set and an unpaired read is seen,
    or when an unpaired read's name identifies neither side.
    """
    info('split-paired-reads.py')
    args = get_parser().parse_args()

    infile = args.infile

    filenames = [infile]
    check_input_files(infile, args.force)
    check_space(filenames, args.force)

    # decide where to put output files - specific directory? or just default?
    if infile == '/dev/stdin' or infile == '-':
        if not (args.output_first and args.output_second):
            # print-function form, matching every other message in this
            # function; the Python 2 ``print >> stream`` form raises
            # TypeError once print is a function.
            print("Accepting input from stdin; "
                  "output filenames must be provided.", file=sys.stderr)
            sys.exit(1)
    elif args.output_directory:
        if not os.path.exists(args.output_directory):
            os.makedirs(args.output_directory)
        out1 = args.output_directory + '/' + os.path.basename(infile) + '.1'
        out2 = args.output_directory + '/' + os.path.basename(infile) + '.2'
    else:
        out1 = os.path.basename(infile) + '.1'
        out2 = os.path.basename(infile) + '.2'

    # handles opened here (as opposed to ones handed in via the arguments)
    # are ours to close once all records are written
    owned_handles = []

    # OVERRIDE output file locations with -1, -2
    if args.output_first:
        fp_out1 = args.output_first
        out1 = fp_out1.name
    else:
        # Use default filename created above
        fp_out1 = open(out1, 'w')
        owned_handles.append(fp_out1)
    if args.output_second:
        fp_out2 = args.output_second
        out2 = fp_out2.name
    else:
        # Use default filename created above
        fp_out2 = open(out2, 'w')
        owned_handles.append(fp_out2)

    counter1 = 0
    counter2 = 0

    screed_iter = screed.open(infile, parse_description=False)

    # walk through all the reads in broken-paired mode.
    paired_iter = broken_paired_reader(screed_iter)
    for index, is_pair, record1, record2 in paired_iter:
        if index % 10000 == 0:
            print('...', index, file=sys.stderr)

        # are we requiring pairs?
        if args.force_paired and not is_pair:
            print('ERROR, %s is not part of a pair' % record1.name,
                  file=sys.stderr)
            sys.exit(1)

        if is_pair:
            write_record(record1, fp_out1)
            counter1 += 1
            write_record(record2, fp_out2)
            counter2 += 1
        else:
            # orphaned read: route it by what its name says about its side
            name = record1.name
            if check_is_left(name):
                write_record(record1, fp_out1)
                counter1 += 1
            elif check_is_right(name):
                write_record(record1, fp_out2)
                counter2 += 1
            else:
                print("Unrecognized format for read pair information: %s" %
                      name,
                      file=sys.stderr)
                print("Exiting.", file=sys.stderr)
                sys.exit(1)

    for handle in owned_handles:
        handle.close()

    print("DONE; split %d sequences (%d left, %d right)" %
          (counter1 + counter2, counter1, counter2),
          file=sys.stderr)
    print("/1 reads in %s" % out1, file=sys.stderr)
    print("/2 reads in %s" % out2, file=sys.stderr)
Пример #21
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('database')
    parser.add_argument('input_filenames',
                        metavar='input_sequence_filename',
                        help='Input FAST[AQ] sequence filename.',
                        nargs='+')
    parser.add_argument('-k', '--ksize', type=int, default=31)
    parser.add_argument('-p',
                        '--paired',
                        action='store_true',
                        help='require that all sequences be properly paired')
    parser.add_argument('--force_single',
                        dest='force_single',
                        action='store_true',
                        help='treat all sequences as single-ended/unpaired')
    parser.add_argument('-u',
                        '--unpaired-reads',
                        metavar="unpaired_reads_filename",
                        help='include a file of unpaired reads to which '
                        '-p/--paired does not apply.')
    parser.add_argument('-f',
                        '--force',
                        dest='force',
                        help='continue past file reading errors',
                        action='store_true')
    args = parser.parse_args()

    force_single = args.force_single

    #if args.reads == '-':
    #    args.reads = sys.stdin

    # check that input files exist
    check_valid_file_exists(args.input_filenames)

    filenames = []
    for pathfilename in args.input_filenames:
        filenames.append(pathfilename)

    # make a list of all filenames and if they're paired or not;
    # if we don't know if they're paired, default to allowing but not
    # forcing pairing.
    files = []
    for element in filenames:
        files.append([element, args.paired])
    if args.unpaired_reads:
        files.append([args.unpaired_reads, False])

    # create object of Nodetable in Khmer to use its
    kh = khmer.Nodetable(args.ksize, 1, 1)

    # load database
    mphf_filename = args.database + '.mphf'
    array_filename = args.database + '.arr'
    print('loading database {}'.format(args.database))

    with open(array_filename, 'rb') as fp:
        mphf_to_kmer, mphf_to_cdbg, family_ids, cdbg_to_family_id = pickle.load(
            fp)
    mphf = bbhash.load_mphf(mphf_filename)

    print('done!')

    def get_kmer_to_family_ids(hashval):
        mphf_hash = mphf.lookup(hashval)
        if mphf_hash is None:
            return set()

        kmer_hash = mphf_to_kmer[mphf_hash]
        if kmer_hash != hashval:
            return set()

        cdbg_id = mphf_to_cdbg[mphf_hash]
        id_list = cdbg_to_family_id[cdbg_id]
        return id_list

    def readFusion(read):
        global n_unmatched, n_same, n_amb_same, n_clear_fusion, n_ambig_fusion, n_mutli_fusion
        flag = None
        lf_ids = set()
        rt_ids = set()
        families = []
        shared_kmers = []
        gaps = []

        hashvals = kh.get_kmer_hashes(read.sequence)

        # find a matching k-mer at the beginning of the read
        lf = hashvals[0]
        lf_ids = get_kmer_to_family_ids(lf)
        idx = 1
        while idx < len(hashvals) and len(lf_ids) == 0:
            lf = hashvals[idx]
            lf_ids = get_kmer_to_family_ids(lf)
            idx += 1

        if len(lf_ids) == 0:
            #print('no single match')
            n_unmatched += 1
            flag = "unmatched"
        elif idx == len(hashvals):
            #print('same, only last kmer matched')
            families.append(lf_ids)
            if len(lf_ids) == 1:
                n_same += 1
                flag = "unique"
            else:
                n_amb_same += 1
                flag = "ambiguous"
        else:  # len(lf_ids) > 0 & idx < len(hashvals)
            # find a matching k-mer at the end of the read
            rt = hashvals[-1]
            rt_ids = get_kmer_to_family_ids(rt)
            idy = len(hashvals) - 2
            while idy >= idx and len(rt_ids) == 0:
                rt = hashvals[idy]
                rt_ids = get_kmer_to_family_ids(rt)
                idy -= 1

            if len(rt_ids) == 0:
                #print('same, only one non-last kmer matched ')
                families.append(lf_ids)
                if len(lf_ids) == 1:
                    n_same += 1
                    flag = "unique"
                else:
                    n_amb_same += 1
                    flag = "ambiguous"
            else:
                intersect_ids = lf_ids.intersection(rt_ids)
                if len(intersect_ids) > 0:
                    families.append(intersect_ids)
                    if len(intersect_ids) == 1:
                        n_same += 1
                        flag = "unique"
                    else:
                        n_amb_same += 1
                        flag = "ambiguous"
                else:  # fusion to be resolved
                    shared_kmer = 1
                    gap_size = 0
                    gap = False
                    while idx <= idy + 1:
                        temp = hashvals[idx]
                        temp_ids = get_kmer_to_family_ids(temp)
                        if len(temp_ids) > 0:
                            intersect_ids = lf_ids.intersection(temp_ids)
                            if len(intersect_ids) > 0:
                                lf_ids = intersect_ids
                                shared_kmer += 1
                                gap_size = 0
                            else:  # len(intersect_ids) == 0
                                families.append(lf_ids)
                                shared_kmers.append(shared_kmer)
                                lf_ids = temp_ids
                                shared_kmer = 1
                                gaps.append(gap_size)
                                gap_size = 0
                        else:
                            gap_size += 1
                        idx += 1

                    families.append(lf_ids)
                    shared_kmers.append(shared_kmer)

                    assert len(families) > 1
                    if len(families) == 2:
                        if len(families[0]) == 1 and len(families[1]) == 1:
                            n_clear_fusion += 1
                            flag = "clear_fusion"
                        else:
                            n_ambig_fusion += 1
                            flag = "ambig_fusion"
                    else:  # len(families) > 2
                        n_mutli_fusion += 1
                        flag = "multi_fusion"

        #if len(families) == 0:
        #    families = "-"

        #if len(shared_kmers) == 0:
        #    shared_kmers = "-"

        return flag, families, shared_kmers, gaps

    fusion_filename = args.database + '_fusion.fa'
    fusion_fp = open(fusion_filename, 'w')
    fusionInfo_filename = args.database + '_fusion.info'
    fusionInfo_fp = open(fusionInfo_filename, 'w')
    print("fileName",
          "recordIndex",
          "whichInPair",
          "align_class",
          "gene_families",
          "shared_kmers",
          "gaps",
          file=fusionInfo_fp,
          sep='\t')
    fusionCalc_filename = args.database + '_fusion.calc'
    fusionCalc_fp = open(fusionCalc_filename, 'w')
    print("fileName",
          "recordIndex",
          "whichInPair",
          "align_class",
          "familiy_A",
          "familiy_B",
          "no_families",
          "len_families",
          "shared_kmers",
          "gaps",
          "sorted_keys",
          file=fusionCalc_fp,
          sep='\t')

    fusionPairs_filename = args.database + '_fusionPairs.fa'
    fusPair_fp = open(fusionPairs_filename, 'w')
    fusionPairsInfo_filename = args.database + '_fusionPairs.info'
    fusPairInfo_fp = open(fusionPairsInfo_filename, 'w')
    print("fileName",
          "recordIndex",
          "fusion_class",
          "R1_family",
          "R2_family",
          file=fusPairInfo_fp,
          sep='\t')
    fusionPairsCalc_filename = args.database + '_fusionPairs.calc'
    fusPairCalc_fp = open(fusionPairsCalc_filename, 'w')
    print("fileName",
          "recordIndex",
          "fusion_class",
          "familiy_A",
          "familiy_B",
          "len_families",
          "sorted_keys",
          file=fusPairCalc_fp,
          sep='\t')

    corrupt_files = []
    family_names = dict(zip(family_ids.values(), family_ids.keys()))
    n = 0
    n_paired_fusion = 0
    sameRef = ("unique", "ambiguous")
    fusion = ("clear_fusion", "ambig_fusion", "multi_fusion")
    for filename, require_paired in files:
        with catch_io_errors(filename, fusion_fp, fusionInfo_fp, fusionCalc_fp,
                             fusPair_fp, fusPairInfo_fp, fusPairCalc_fp,
                             args.force, corrupt_files):
            screed_iter = clean_input_reads(screed.open(filename))
            reader = broken_paired_reader(screed_iter,
                                          min_length=args.ksize,
                                          force_single=force_single,
                                          require_paired=require_paired)

            for r_index, is_paired, read0, read1 in reader:
                n += 1
                if n % 10000 == 0:
                    print('...', n)
                    #if n > 5000:
                    #    break

                flag0, families0, shared_kmers0, gaps0 = readFusion(read0)

                if not is_paired and flag0 in fusion:
                    #families_names0 = []
                    #for gp in families0:
                    #    gp_names = []
                    #    for family_id in gp:
                    #        family_name = family_names[family_id]
                    #        gp_names.append(family_name)
                    #    families_names0.append(gp_names)

                    print(filename,
                          r_index,
                          "single",
                          flag0,
                          families0,
                          shared_kmers0,
                          gaps0,
                          file=fusionInfo_fp,
                          sep='\t')
                    write_record(read0, fusion_fp)

                    #i = 1
                    #while i < len(families0):
                    #    for g1 in families0[i-1]:
                    #        for g2 in families0[i]:
                    #            print(filename, r_index, "single", flag0, sorted([g1,g2]), len(families0), len(families0[i-1]), len(families0[i]),
                    #                  shared_kmers0, gaps0, file=fusionCalc_fp, sep='\t')
                    #    i += 1

                    i = len(families0) - 1
                    for g1 in families0[0]:
                        g1_name = family_names[g1]
                        for g2 in families0[i]:
                            g2_name = family_names[g2]
                            print(filename,
                                  r_index,
                                  "single",
                                  flag0,
                                  '{}:{}'.format(g1, g1_name),
                                  '{}:{}'.format(g2, g2_name),
                                  len(families0), [len(f) for f in families0],
                                  shared_kmers0,
                                  gaps0,
                                  sorted([g1, g2]),
                                  file=fusionCalc_fp,
                                  sep='\t')

                if is_paired:
                    flag1, families1, shared_kmers1, gaps1 = readFusion(read1)

                    if flag0 in fusion or flag1 in fusion:
                        print(filename,
                              r_index,
                              "Read_1",
                              flag0,
                              families0,
                              shared_kmers0,
                              gaps0,
                              file=fusionInfo_fp,
                              sep='\t')
                        write_record(read0, fusion_fp)
                        print(filename,
                              r_index,
                              "Read_2",
                              flag1,
                              families1,
                              shared_kmers1,
                              gaps1,
                              file=fusionInfo_fp,
                              sep='\t')
                        write_record(read1, fusion_fp)

                        if flag0 in fusion:
                            i = len(families0) - 1
                            for g1 in families0[0]:
                                g1_name = family_names[g1]
                                for g2 in families0[i]:
                                    g2_name = family_names[g2]
                                    print(filename,
                                          r_index,
                                          "Read_1",
                                          flag0,
                                          '{}:{}'.format(g1, g1_name),
                                          '{}:{}'.format(g2, g2_name),
                                          len(families0),
                                          [len(f) for f in families0],
                                          shared_kmers0,
                                          gaps0,
                                          sorted([g1, g2]),
                                          file=fusionCalc_fp,
                                          sep='\t')

                        if flag1 in fusion:
                            i = len(families1) - 1
                            for g1 in families1[0]:
                                g1_name = family_names[g1]
                                for g2 in families1[i]:
                                    g2_name = family_names[g2]
                                    print(filename,
                                          r_index,
                                          "Read_2",
                                          flag1,
                                          '{}:{}'.format(g1, g1_name),
                                          '{}:{}'.format(g2, g2_name),
                                          len(families1),
                                          [len(f) for f in families1],
                                          shared_kmers1,
                                          gaps1,
                                          sorted([g1, g2]),
                                          file=fusionCalc_fp,
                                          sep='\t')

                    elif flag0 in sameRef and flag1 in sameRef:
                        if len(families0[0].intersection(families1[0])) == 0:
                            n_paired_fusion += 1

                            if flag0 == "unique" and flag1 == "unique":
                                fusion_class = "clear_fusion"
                            else:
                                fusion_class = "ambig_fusion"

                            print(filename,
                                  r_index,
                                  fusion_class,
                                  families0,
                                  families1,
                                  file=fusPairInfo_fp,
                                  sep='\t')
                            write_record(read0, fusPair_fp)
                            write_record(read1, fusPair_fp)

                            for g1 in families0[0]:
                                g1_name = family_names[g1]
                                for g2 in families1[0]:
                                    g2_name = family_names[g2]
                                    print(filename,
                                          r_index,
                                          fusion_class,
                                          '{}:{}'.format(g1, g1_name),
                                          '{}:{}'.format(g2, g2_name), [
                                              len(f) for f in (families0[0],
                                                               families1[0])
                                          ],
                                          sorted([g1, g2]),
                                          file=fusPairCalc_fp,
                                          sep='\t')

    print('No of input fragments: ', n)
    print('unmatched:', n_unmatched)
    print('Unique:', n_same)
    print('Ambiguous:', n_amb_same)
    print('Single read clear fusion:', n_clear_fusion)
    print('Single read ambiguous fusion:', n_ambig_fusion)
    print('Single read multi fusion:', n_mutli_fusion)
    print('paired read fusion:', n_paired_fusion)
Пример #22
0
def main():
    """Subsample reads from the input files using reservoir sampling.

    Parses the command line, then draws ``num_samples`` independent samples
    of up to ``args.num_reads`` fragments each.  A fragment is whatever
    ``broken_paired_reader`` yields: a record plus an optional second
    record.  With a single sample the output goes to the ``-o`` handle or
    to ``<basename of first input>.subset``; with several samples it goes
    to ``<output filename>.N`` for each N in ``range(num_samples)``.

    Exits with status 1 when ``-o`` is combined with more than one sample
    (unless ``--force`` is set), or when the first input is stdin and no
    ``-o`` was given.

    NOTE: the reservoir is maintained across all input files and every
    fragment is accepted with the textbook probability, so seeded runs may
    select different reads than earlier versions of this function did.
    """
    info('sample-reads-randomly.py')
    args = get_parser().parse_args()

    for _ in args.filenames:
        check_input_files(_, args.force)

    # seed the random number generator?
    if args.random_seed:
        random.seed(args.random_seed)

    # bound n_samples
    num_samples = max(args.num_samples, 1)

    #
    # Figure out what the output filename is going to be
    #

    if args.output_file:
        output_filename = args.output_file.name
        if num_samples > 1:
            # print() rather than sys.stderr.write() so the message is
            # newline-terminated and the NOTE below starts on its own line.
            print("Error: cannot specify -o with more than one sample.",
                  file=sys.stderr)
            if not args.force:
                print("NOTE: This can be overridden using the --force"
                      " argument", file=sys.stderr)
                sys.exit(1)
    else:
        filename = args.filenames[0]
        if filename in ('/dev/stdin', '-'):
            print("Accepting input from stdin; output filename must "
                  "be provided with '-o'.", file=sys.stderr)
            sys.exit(1)
        output_filename = os.path.basename(filename) + '.subset'

    if num_samples == 1:
        print('Subsampling %d reads using reservoir sampling.' %
              args.num_reads, file=sys.stderr)
        print('Subsampled reads will be placed in %s' %
              output_filename, file=sys.stderr)
        print('', file=sys.stderr)
    else:  # > 1
        print('Subsampling %d reads, %d times,'
              % (args.num_reads, num_samples), ' using reservoir sampling.',
              file=sys.stderr)
        print('Subsampled reads will be placed in %s.N'
              % output_filename, file=sys.stderr)
        print('', file=sys.stderr)

    def write_sample(records, raw_handle, close_raw):
        """Write one sample to raw_handle, releasing what we acquired."""
        writer = raw_handle
        try:
            # NOTE(review): get_file_writer presumably returns either the
            # handle itself or a gzip/bzip2 wrapper around it -- confirm.
            writer = get_file_writer(raw_handle, args.gzip, args.bzip)
            for first, second in records:
                write_record(first, writer)
                if second is not None:
                    write_record(second, writer)
        finally:
            # Close a wrapper so a compressed stream is finished properly,
            # but only close the underlying handle if we opened it here.
            if writer is not raw_handle:
                writer.close()
            if close_raw:
                raw_handle.close()

    # one reservoir per requested sample
    reads = [[] for _ in range(num_samples)]

    # Fragments offered to the reservoirs so far, counted across ALL input
    # files.  The per-file ``count`` below restarts at 0 for every file and
    # is therefore only used for progress reporting and the -M limit.
    n_seen = 0

    # read through all the sequences and load/resample the reservoir
    for filename in args.filenames:
        print('opening', filename, 'for reading', file=sys.stderr)
        screed_iter = screed.open(filename)

        for count, (_, _, rcrd1, rcrd2) in enumerate(broken_paired_reader(
                screed_iter,
                force_single=args.force_single)):
            if count % 10000 == 0:
                print('...', count, 'reads scanned', file=sys.stderr)
                if count >= args.max_reads:
                    print('reached upper limit of %d reads' %
                          args.max_reads, '(see -M); exiting', file=sys.stderr)
                    break

            n_seen += 1
            fragment = (rcrd1, rcrd2)

            if n_seen <= args.num_reads:
                # collect first N reads
                for sample in reads:
                    sample.append(fragment)
            else:
                # use reservoir sampling to replace reads at random
                # see http://en.wikipedia.org/wiki/Reservoir_sampling
                #
                # The i-th fragment (1-based) must be accepted with
                # probability N/i, hence the draw from [1, n_seen].
                for sample in reads:
                    guess = random.randint(1, n_seen)
                    if guess <= args.num_reads:
                        sample[guess - 1] = fragment

    # output all the subsampled reads:
    if len(reads) == 1:
        print('Writing %d sequences to %s' %
              (len(reads[0]), output_filename), file=sys.stderr)

        if args.output_file:
            write_sample(reads[0], args.output_file, False)
        else:
            write_sample(reads[0], open(output_filename, 'wb'), True)
    else:
        for n in range(num_samples):
            n_filename = output_filename + '.%d' % n
            print('Writing %d sequences to %s' %
                  (len(reads[n]), n_filename), file=sys.stderr)
            write_sample(reads[n], open(n_filename, 'wb'), True)
Пример #23
0
def main():
    """Subsample reads from the input files using reservoir sampling.

    Parses the command line, checks inputs and disk space, then draws
    ``num_samples`` independent samples of up to ``args.num_reads``
    fragments each.  A fragment is whatever ``broken_paired_reader``
    yields: a record plus an optional second record.  With a single sample
    the output goes to the ``-o`` handle or to
    ``<basename of first input>.subset``; with several samples it goes to
    ``<output filename>.N`` for each N in ``range(num_samples)``.

    Exits with status 1 when ``-o`` is combined with more than one sample
    and ``--force`` is not set.

    NOTE: the reservoir is maintained across all input files and every
    fragment is accepted with the textbook probability, so seeded runs may
    select different reads than earlier versions of this function did.
    """
    info('sample-reads-randomly.py')
    args = get_parser().parse_args()

    for _ in args.filenames:
        check_input_files(_, args.force)

    check_space(args.filenames, args.force)

    # seed the random number generator?
    if args.random_seed:
        random.seed(args.random_seed)

    # bound n_samples
    num_samples = max(args.num_samples, 1)

    #
    # Figure out what the output filename is going to be
    #

    output_file = args.output_file
    if output_file:
        if num_samples > 1:
            # print() rather than sys.stderr.write() so the message is
            # newline-terminated.
            print("Error: cannot specify -o with more than one sample.",
                  file=sys.stderr)
            if not args.force:
                sys.exit(1)
        output_filename = output_file.name
    else:
        filename = args.filenames[0]
        output_filename = os.path.basename(filename) + '.subset'

    if num_samples == 1:
        print('Subsampling %d reads using reservoir sampling.' %
              args.num_reads, file=sys.stderr)
        print('Subsampled reads will be placed in %s' %
              output_filename, file=sys.stderr)
        print('', file=sys.stderr)
    else:  # > 1
        print('Subsampling %d reads, %d times,'
              % (args.num_reads, num_samples), ' using reservoir sampling.',
              file=sys.stderr)
        print('Subsampled reads will be placed in %s.N'
              % output_filename, file=sys.stderr)
        print('', file=sys.stderr)

    def write_sample(records, handle):
        """Write every fragment of one sample to an open handle."""
        for first, second in records:
            write_record(first, handle)
            if second is not None:
                write_record(second, handle)

    # one reservoir per requested sample
    reads = [[] for _ in range(num_samples)]

    # Fragments offered to the reservoirs so far, counted across ALL input
    # files.  The per-file ``count`` below restarts at 0 for every file and
    # is therefore only used for progress reporting and the -M limit.
    n_seen = 0

    # read through all the sequences and load/resample the reservoir
    for filename in args.filenames:
        print('opening', filename, 'for reading', file=sys.stderr)
        screed_iter = screed.open(filename, parse_description=False)

        for count, (_, _, rcrd1, rcrd2) in enumerate(broken_paired_reader(
                screed_iter,
                force_single=args.force_single)):
            if count % 10000 == 0:
                print('...', count, 'reads scanned', file=sys.stderr)
                if count >= args.max_reads:
                    print('reached upper limit of %d reads' %
                          args.max_reads, '(see -M); exiting', file=sys.stderr)
                    break

            n_seen += 1
            fragment = (rcrd1, rcrd2)

            if n_seen <= args.num_reads:
                # collect first N reads
                for sample in reads:
                    sample.append(fragment)
            else:
                # use reservoir sampling to replace reads at random
                # see http://en.wikipedia.org/wiki/Reservoir_sampling
                #
                # The i-th fragment (1-based) must be accepted with
                # probability N/i, hence the draw from [1, n_seen].
                for sample in reads:
                    guess = random.randint(1, n_seen)
                    if guess <= args.num_reads:
                        sample[guess - 1] = fragment

    # output all the subsampled reads:
    if len(reads) == 1:
        print('Writing %d sequences to %s' %
              (len(reads[0]), output_filename), file=sys.stderr)
        if output_file:
            # caller-supplied handle: leave it open for the caller
            write_sample(reads[0], output_file)
        else:
            with open(output_filename, 'w') as handle:
                write_sample(reads[0], handle)
    else:
        for n in range(num_samples):
            n_filename = output_filename + '.%d' % n
            print('Writing %d sequences to %s' %
                  (len(reads[n]), n_filename), file=sys.stderr)
            # 'with' closes each per-sample file before the next is opened
            with open(n_filename, 'w') as handle:
                write_sample(reads[n], handle)
Пример #24
0
def main():  # pylint: disable=too-many-branches,too-many-statements
    """Run digital normalization over every input file.

    Parses the command line, validates the inputs, loads or creates a
    countgraph, then streams each input through a ``Normalizer`` wrapped in
    ``WithDiagnostics`` and writes every non-None record it yields.  Output
    goes either to the single ``-o`` file or to ``<basename>.keep`` per
    input.  Afterwards the unique k-mer count and estimated false-positive
    rate are logged and the countgraph is saved if requested.

    Exits with status 1 on duplicate input basenames (when writing one
    output per input) or when stdin is an input and no single output file
    was given.
    """
    args = sanitize_help(get_parser()).parse_args()

    configure_logging(args.quiet)
    report_on_config(args)

    # With one output per input, two inputs sharing a basename would map to
    # the same '<basename>.keep' file; with a single output file there is
    # nothing to check.
    input_paths = []
    seen_basenames = set()
    for path in args.input_filenames:
        input_paths.append(path)
        if not args.single_output_file:
            base = os.path.basename(path)
            if base in seen_basenames:
                log_error('ERROR: Duplicate filename--Cannot handle this!')
                log_error('** Exiting!')
                sys.exit(1)
            seen_basenames.add(base)

    # inputs must exist and there must be room for the outputs
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph is not None:
        check_space_for_graph(args.savegraph,
                              calculate_graphsize(args, 'countgraph'),
                              args.force)

    # obtain the counting table, either from disk or freshly built
    if not args.loadgraph:
        log_info('making countgraph')
        countgraph = khmer_args.create_countgraph(args)
    else:
        log_info('loading k-mer countgraph from {graph}', graph=args.loadgraph)
        countgraph = khmer.load_countgraph(args.loadgraph)

    # a single normalizer is shared by all input files
    with_diagnostics = WithDiagnostics(Normalizer(args.cutoff, countgraph),
                                       args.report, args.report_frequency)

    # pair every input with its pairing requirement; the optional
    # unpaired-reads file is never required to be paired
    files = [[path, args.paired] for path in input_paths]
    if args.unpaired_reads:
        files.append([args.unpaired_reads, False])

    corrupt_files = []
    outfp = None

    if args.single_output_file:
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)
    elif '-' in input_paths or '/dev/stdin' in input_paths:
        print(
            "Accepting input from stdin; output filename must "
            "be provided with '-o'.",
            file=sys.stderr)
        sys.exit(1)

    #
    # main loop: normalize each file in turn
    #

    for filename, require_paired in files:
        if not args.single_output_file:
            keep_handle = open(os.path.basename(filename) + '.keep', 'wb')
            outfp = get_file_writer(keep_handle, args.gzip, args.bzip)

        # guard against a broken input file
        with catch_io_errors(filename, outfp, args.single_output_file,
                             args.force, corrupt_files):
            reader = broken_paired_reader(
                clean_input_reads(screed.open(filename)),
                min_length=args.ksize,
                force_single=args.force_single,
                require_paired=require_paired)

            # the normalizer yields None for records that are not kept
            for record in with_diagnostics(reader, filename):
                if record is None:
                    continue
                write_record(record, outfp)

            log_info('output in {name}', name=describe_file_handle(outfp))
            if not args.single_output_file:
                outfp.close()

    # all files processed - report

    log_info('Total number of unique k-mers: {umers}',
             umers=countgraph.n_unique_kmers())

    if args.savegraph is not None:
        log_info('...saving to {name}', name=args.savegraph)
        countgraph.save(args.savegraph)

    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    fp_rate = khmer.calc_expected_collisions(countgraph, False,
                                             max_false_pos=.8)

    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    if args.force and corrupt_files:
        log_error("** WARNING: Finished with errors!")
        log_error("** I/O Errors occurred in the following files:")
        log_error("\t" + " ".join(corrupt_files))
Пример #25
0
def main():
    """Correct reads in two streaming passes over the input files.

    First pass: for every read (or read pair) the median k-mer count is
    looked up in the countgraph.  Reads below ``args.normalize_to`` are
    consumed into the graph and set aside in a temporary ``.pass2`` file;
    the others are corrected with ``correct_sequence`` and written to the
    corrected output (``args.out`` or ``<basename>.corr``).

    Second pass: the set-aside reads are re-examined against the now fuller
    graph and either written unchanged (low coverage with
    ``--variable-coverage``) or corrected and written.

    Summary statistics go to stderr; the countgraph is saved if
    ``args.savegraph`` is set.  Exits with status 1 if the same input
    filename is given more than once.
    """
    info('correct-reads.py', ['streaming'])
    args = sanitize_help(get_parser()).parse_args()

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        print("Error: Cannot input the same filename multiple times.",
              file=sys.stderr)
        sys.exit(1)

    ###

    report_on_config(args)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    tablesize = calculate_graphsize(args, 'countgraph')

    if args.savegraph:
        check_space_for_graph(args.savegraph, tablesize, args.force)

    K = args.ksize

    CUTOFF = args.cutoff
    NORMALIZE_LIMIT = args.normalize_to

    if args.loadgraph:
        print('loading k-mer countgraph from', args.loadgraph, file=sys.stderr)
        ct = Countgraph.load(args.loadgraph)
    else:
        print('making k-mer countgraph', file=sys.stderr)
        ct = create_countgraph(args, multiplier=8 / (9. + 0.3))
    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    print('created temporary directory %s; use -T to change location' %
          tempdir,
          file=sys.stderr)

    aligner = khmer.ReadAligner(ct, args.cutoff, args.bits_theta)

    # ### FIRST PASS ###

    save_pass2_total = 0

    n_bp = 0
    n_reads = 0
    written_bp = 0
    written_reads = 0
    corrected_reads = 0

    # (input filename, pass-2 temp filename, corrected-output handle)
    pass2list = []
    for filename in args.input_filenames:
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        if args.out is None:
            corrfp = open(os.path.basename(filename) + '.corr', 'w')
        else:
            corrfp = args.out

        pass2list.append((filename, pass2filename, corrfp))

        screed_iter = screed.open(filename, parse_description=False)
        pass2fp = open(pass2filename, 'w')

        save_pass2 = 0
        n = 0

        paired_iter = broken_paired_reader(screed_iter,
                                           min_length=K,
                                           force_single=args.ignore_pairs)
        for n, is_pair, read1, read2 in paired_iter:
            if n % 10000 == 0:
                print('...',
                      n,
                      filename,
                      save_pass2,
                      n_reads,
                      n_bp,
                      written_reads,
                      written_bp,
                      file=sys.stderr)

            # we want to track paired reads here, to make sure that pairs
            # are not split between first pass and second pass.

            if is_pair:
                n_reads += 2
                n_bp += len(read1.sequence) + len(read2.sequence)

                seq1 = read1.sequence.replace('N', 'A')
                seq2 = read2.sequence.replace('N', 'A')

                med1, _, _ = ct.get_median_count(seq1)
                med2, _, _ = ct.get_median_count(seq2)

                # if either mate is low coverage, set the whole pair aside
                if med1 < NORMALIZE_LIMIT or med2 < NORMALIZE_LIMIT:
                    ct.consume(seq1)
                    ct.consume(seq2)
                    write_record_pair(read1, read2, pass2fp)
                    save_pass2 += 2
                else:
                    is_aligned, new_seq1 = correct_sequence(aligner, seq1)
                    if is_aligned:
                        if new_seq1 != read1.sequence:
                            corrected_reads += 1
                        read1.sequence = new_seq1
                        if hasattr(read1, 'quality'):
                            fix_quality(read1)

                    is_aligned, new_seq2 = correct_sequence(aligner, seq2)
                    if is_aligned:
                        if new_seq2 != read2.sequence:
                            corrected_reads += 1
                        read2.sequence = new_seq2
                        if hasattr(read2, 'quality'):
                            fix_quality(read2)

                    # pairs are written whether or not they aligned
                    write_record_pair(read1, read2, corrfp)
                    written_reads += 2
                    written_bp += len(read1)
                    written_bp += len(read2)
            else:
                n_reads += 1
                n_bp += len(read1.sequence)

                seq = read1.sequence.replace('N', 'A')

                med, _, _ = ct.get_median_count(seq)

                # has this portion of the graph saturated? if not,
                # consume & save => pass2.
                if med < NORMALIZE_LIMIT:
                    ct.consume(seq)
                    write_record(read1, pass2fp)
                    save_pass2 += 1
                else:  # trim!!
                    is_aligned, new_seq = correct_sequence(aligner, seq)
                    # single reads that do not align are dropped
                    if is_aligned:
                        if new_seq != read1.sequence:
                            corrected_reads += 1
                        read1.sequence = new_seq
                        if hasattr(read1, 'quality'):
                            fix_quality(read1)

                        write_record(read1, corrfp)

                        written_reads += 1
                        written_bp += len(new_seq)

        pass2fp.close()

        print('%s: kept aside %d of %d from first pass, in %s' %
              (filename, save_pass2, n, filename),
              file=sys.stderr)
        save_pass2_total += save_pass2

    # ### SECOND PASS. ###

    skipped_n = 0
    skipped_bp = 0
    for _, pass2filename, corrfp in pass2list:
        print(('second pass: looking at sequences kept aside in %s') %
              pass2filename,
              file=sys.stderr)

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in contrast
        # to the first loop.

        for n, read in enumerate(
                screed.open(pass2filename, parse_description=False)):
            if n % 10000 == 0:
                print('... x 2',
                      n,
                      pass2filename,
                      written_reads,
                      written_bp,
                      file=sys.stderr)

            seq = read.sequence.replace('N', 'A')
            med, _, _ = ct.get_median_count(seq)

            # do we retain low-abundance components unchanged?
            if med < NORMALIZE_LIMIT and args.variable_coverage:
                write_record(read, corrfp)

                written_reads += 1
                written_bp += len(read.sequence)
                skipped_n += 1
                skipped_bp += len(read.sequence)

            # otherwise, examine/correct.
            else:  # med >= NORMALIZE LIMIT or not args.variable_coverage
                is_aligned, new_seq = correct_sequence(aligner, seq)
                if is_aligned:
                    if new_seq != read.sequence:
                        corrected_reads += 1
                    read.sequence = new_seq
                    if hasattr(read, 'quality'):
                        fix_quality(read)
                    write_record(read, corrfp)

                    written_reads += 1
                    written_bp += len(new_seq)

        print('removing %s' % pass2filename, file=sys.stderr)
        os.unlink(pass2filename)

        # Per-input '.corr' files were opened by this function and are
        # complete once their second pass is done; a caller-supplied
        # args.out is shared by all inputs and is left open.
        if args.out is None:
            corrfp.close()

    print('removing temp directory & contents (%s)' % tempdir, file=sys.stderr)
    shutil.rmtree(tempdir)

    # Guard the ratios so an empty input (no reads / no bases) produces a
    # zeroed report instead of a ZeroDivisionError.
    if n_reads:
        n_passes = 1.0 + (float(save_pass2_total) / n_reads)
        percent_reads_corrected = float(corrected_reads +
                                        (n_reads - written_reads)) /\
            n_reads * 100.0
    else:
        n_passes = 1.0
        percent_reads_corrected = 0.0

    if n_bp:
        percent_bp_removed = (1 - (written_bp / float(n_bp))) * 100.0
    else:
        percent_bp_removed = 0.0

    print('read %d reads, %d bp' % (
        n_reads,
        n_bp,
    ), file=sys.stderr)
    print('wrote %d reads, %d bp' % (
        written_reads,
        written_bp,
    ),
          file=sys.stderr)
    print('looked at %d reads twice (%.2f passes)' %
          (save_pass2_total, n_passes),
          file=sys.stderr)
    print('removed %d reads and corrected %d reads (%.2f%%)' %
          (n_reads - written_reads, corrected_reads, percent_reads_corrected),
          file=sys.stderr)
    print('removed %.2f%% of bases (%d total)' %
          (percent_bp_removed, n_bp - written_bp),
          file=sys.stderr)

    if args.variable_coverage:
        if n_reads:
            percent_reads_hicov = 100.0 * float(n_reads - skipped_n) / n_reads
        else:
            percent_reads_hicov = 0.0
        print('%d reads were high coverage (%.2f%%);' %
              (n_reads - skipped_n, percent_reads_hicov),
              file=sys.stderr)
        print(('skipped %d reads/%d bases because of low coverage') %
              (skipped_n, skipped_bp),
              file=sys.stderr)

    fp_rate = \
        khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate),
          file=sys.stderr)

    print('output in *.corr', file=sys.stderr)

    if args.savegraph:
        print("Saving k-mer countgraph to", args.savegraph, file=sys.stderr)
        ct.save(args.savegraph)
Пример #26
0
def main():
    """Split an interleaved read file into left and right read files.

    Walks the input with ``broken_paired_reader``; for each pair the first
    record goes to the ``.1`` output and the second to the ``.2`` output.
    Unpaired records go to the orphan output when one was requested;
    otherwise pairing is required and an ``UnpairedReadsError`` ends the
    run with exit status 1.  Reading from stdin requires both output
    filenames to be given explicitly (exit status 1 otherwise).
    """
    info('split-paired-reads.py')
    args = sanitize_help(get_parser()).parse_args()

    infile = args.infile

    check_input_files(infile, args.force)
    check_space([infile], args.force)

    basename = os.path.basename(infile)

    def open_writer(handle, default_path):
        """Return (writer, display name) for one side of the split."""
        if handle:
            # explicit -1/-2 handle overrides the default location
            writer = get_file_writer(handle, args.gzip, args.bzip)
            return writer, writer.name
        writer = get_file_writer(open(default_path, 'wb'), args.gzip,
                                 args.bzip)
        return writer, default_path

    # Default output locations.  They stay None for stdin input, where both
    # explicit output handles are mandatory and supply the names instead.
    out1 = None
    out2 = None
    if infile in ('/dev/stdin', '-'):
        if not args.output_first or not args.output_second:
            print(
                "Accepting input from stdin; "
                "output filenames must be provided.",
                file=sys.stderr)
            sys.exit(1)
    elif args.output_directory:
        if not os.path.exists(args.output_directory):
            os.makedirs(args.output_directory)
        out1 = os.path.join(args.output_directory, basename + '.1')
        out2 = os.path.join(args.output_directory, basename + '.2')
    else:
        out1 = basename + '.1'
        out2 = basename + '.2'

    fp_out1, out1 = open_writer(args.output_first, out1)
    fp_out2, out2 = open_writer(args.output_second, out2)

    # orphaned reads are only kept when -0 was given
    if args.output_orphaned:
        fp_out0 = get_file_writer(args.output_orphaned, args.gzip, args.bzip)
        out0 = describe_file_handle(args.output_orphaned)

    n_left = 0
    n_right = 0
    n_orphans = 0

    # without an orphan output every read must be part of a pair
    paired_iter = broken_paired_reader(screed.open(infile),
                                       require_paired=not args.output_orphaned)

    try:
        for index, is_pair, record1, record2 in paired_iter:
            if index % 10000 == 0:
                print('...', index, file=sys.stderr)

            if is_pair:
                write_record(record1, fp_out1)
                n_left += 1
                write_record(record2, fp_out2)
                n_right += 1
            elif args.output_orphaned:
                write_record(record1, fp_out0)
                n_orphans += 1
    except UnpairedReadsError as e:
        print("Unpaired reads found starting at {name}; exiting".format(
            name=e.read1.name),
              file=sys.stderr)
        sys.exit(1)

    print("DONE; split %d sequences (%d left, %d right, %d orphans)" %
          (n_left + n_right, n_left, n_right, n_orphans),
          file=sys.stderr)
    print("/1 reads in %s" % out1, file=sys.stderr)
    print("/2 reads in %s" % out2, file=sys.stderr)
    if args.output_orphaned:
        print("orphans in %s" % out0, file=sys.stderr)
Пример #27
0
def main():
    """Trim low-abundance k-mers from the reads of a single data file.

    Builds a countgraph from ``args.datafile`` using ``args.threads``
    consumer threads, logs the unique k-mer count and estimated
    false-positive rate, then re-reads the file and passes every read
    through ``trim_record``.  Reads for which a trimmed record is returned
    are written to ``args.outfile`` or, by default, to
    ``<basename>.abundfilt``.  The countgraph is saved afterwards when
    ``args.savegraph`` is set.
    """
    args = sanitize_help(get_parser()).parse_args()

    configure_logging(args.quiet)
    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)

    if args.savegraph:
        tablesize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, tablesize, args.force)

    report_on_config(args)

    log_info('making countgraph')
    graph = khmer_args.create_countgraph(args)

    # first, load reads into graph; all threads share one parser
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    log_info('consuming input, round 1 -- {datafile}', datafile=args.datafile)
    for _ in range(args.threads):
        cur_thread = \
            threading.Thread(
                target=graph.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(cur_thread)
        cur_thread.start()

    for cur_thread in threads:
        cur_thread.join()

    log_info('Total number of unique k-mers: {nk}', nk=graph.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(graph, args.force)
    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    # the filtering loop
    log_info('filtering {datafile}', datafile=args.datafile)
    if args.outfile is None:
        outfile = os.path.basename(args.datafile) + '.abundfilt'
    else:
        outfile = args.outfile

    raw_out = open(outfile, 'wb')
    outfp = raw_out
    try:
        # NOTE(review): get_file_writer presumably returns either the
        # handle itself or a gzip/bzip2 wrapper around it -- confirm.
        outfp = get_file_writer(raw_out, args.gzip, args.bzip)

        paired_iter = broken_paired_reader(ReadParser(args.datafile),
                                           min_length=graph.ksize(),
                                           force_single=True)

        for n, is_pair, read1, read2 in paired_iter:
            # force_single=True: the reader must never hand back a pair
            assert not is_pair
            assert read2 is None

            trimmed_record, _ = trim_record(graph, read1, args.cutoff,
                                            args.variable_coverage,
                                            args.normalize_to)
            if trimmed_record:
                write_record(trimmed_record, outfp)
    finally:
        # Close the writer first so any compressed stream is finished, then
        # the underlying file; closing an already-closed file is a no-op.
        outfp.close()
        raw_out.close()

    log_info('output in {outfile}', outfile=outfile)

    if args.savegraph:
        log_info('Saving k-mer countgraph filename {graph}',
                 graph=args.savegraph)
        graph.save(args.savegraph)
Пример #28
0
def main():
    """Split an interleaved read file into separate left and right files.

    The input (``args.infile``) is walked in broken-paired mode.  For a
    detected pair, the first read goes to the left output and the second to
    the right output.  An unpaired read is routed by its name using
    ``check_is_left`` / ``check_is_right``.

    Output names default to ``<basename>.1`` and ``<basename>.2``, placed in
    ``args.output_directory`` when given (created if missing), and can be
    overridden with ``args.output_first`` / ``args.output_second``.

    Exits with status 1 if ``args.force_paired`` is set and an unpaired read
    is seen, or if an unpaired read's name matches neither side.
    """
    info('split-paired-reads.py')
    args = get_parser().parse_args()

    infile = args.infile

    check_input_files(infile, args.force)
    check_space([infile], args.force)

    basename = os.path.basename(infile)

    # decide where to put output files - specific directory? or just default?
    if args.output_directory:
        if not os.path.exists(args.output_directory):
            os.makedirs(args.output_directory)
        out1 = os.path.join(args.output_directory, basename + '.1')
        out2 = os.path.join(args.output_directory, basename + '.2')
    else:
        out1 = basename + '.1'
        out2 = basename + '.2'

    # OVERRIDE output file locations with -1, -2
    if args.output_first:
        out1 = args.output_first
    if args.output_second:
        out2 = args.output_second

    counter1 = 0
    counter2 = 0

    # The context manager closes (and flushes) both outputs on every exit
    # path, including the sys.exit() calls inside the loop.
    with open(out1, 'w') as fp_out1, open(out2, 'w') as fp_out2:
        screed_iter = screed.open(infile, parse_description=False)

        # walk through all the reads in broken-paired mode.
        for index, is_pair, record1, record2 in \
                broken_paired_reader(screed_iter):
            # progress report; skipped for the very first record
            if index % 100000 == 0 and index:
                sys.stderr.write('... %s\n' % index)

            # are we requiring pairs?
            if args.force_paired and not is_pair:
                sys.stderr.write('ERROR, %s is not part of a pair\n' %
                                 record1.name)
                sys.exit(1)

            if is_pair:
                write_record(record1, fp_out1)
                counter1 += 1
                write_record(record2, fp_out2)
                counter2 += 1
            else:
                # orphan: decide the side from the read name alone
                name = record1.name
                if check_is_left(name):
                    write_record(record1, fp_out1)
                    counter1 += 1
                elif check_is_right(name):
                    write_record(record1, fp_out2)
                    counter2 += 1
                else:
                    sys.stderr.write(
                        "Unrecognized format for read pair information: "
                        "%s\n" % name)
                    sys.stderr.write("Exiting.\n")
                    sys.exit(1)

    sys.stderr.write("DONE; split %d sequences (%d left, %d right)\n" %
                     (counter1 + counter2, counter1, counter2))
    sys.stderr.write("/1 reads in %s\n" % out1)
    sys.stderr.write("/2 reads in %s\n" % out2)
Пример #29
0
    script_result = screed.open('test_files/'
                                'simple-metagenome-reads.fa.keep.k20.C20')
    for read_a, read_b in zip(broken_paired_to_single(input_iter), script_result):
        print read_a.name
        assert read_a == read_b, (read_a, read_b)


if __name__ == '__main__':
    # Demo driver: run two generator pipelines over the reads file named on
    # the command line, writing '<basename>.abundtrim' and '<basename>.keep'
    # into the current directory.
    filename = sys.argv[1]
    basename = os.path.basename(filename)

    ## khmer scripts/trim-low-abund.py -V, using generators
    # NOTE(review): the meaning of the trailing streamtrim arguments (20, 2)
    # is not visible here -- confirm against streamtrim's definition.
    graph = khmer.new_counting_hash(20, 1e7, 4)
    # 'with' closes the output as soon as the pipeline has been drained,
    # instead of leaving the handle open until it is rebound or the
    # interpreter exits.
    with open(basename + '.abundtrim', 'w') as out_fp:
        input_iter = screed.open(filename)
        input_iter = broken_paired_reader(input_iter)
        input_iter = clean_reads(input_iter)
        input_iter = streamtrim(input_iter, graph, 20, 2)
        output_reads(input_iter, out_fp)

    ## khmer scripts/normalize-by-median.py, using generators
    # a new counting table is created for the second pipeline
    graph = khmer.new_counting_hash(20, 1e7, 4)
    with open(basename + '.keep', 'w') as out_fp:
        input_iter = screed.open(filename)
        input_iter = broken_paired_reader(input_iter)
        input_iter = clean_reads(input_iter)
        input_iter = diginorm(input_iter, graph, 20)
        output_reads(input_iter, out_fp)
Пример #30
0
def main():
    """Split an interleaved read file into left, right and orphan outputs.

    Reads ``args.infile`` (or stdin) in broken-paired mode.  For each pair,
    the first read is written to the left output and the second to the right
    output.  Unpaired reads are written to ``args.output_orphaned`` when it
    is given; otherwise the reader is created with ``require_paired=True``
    and an ``UnpairedReadsError`` makes the script exit with status 1.

    Default output names are ``<basename>.1`` / ``<basename>.2`` (inside
    ``args.output_directory`` when given); ``args.output_first`` and
    ``args.output_second`` override them and are mandatory for stdin input.
    All writers are closed before the summary is printed.
    """
    args = sanitize_help(get_parser()).parse_args()

    infile = args.infile

    filenames = [infile]
    check_input_files(infile, args.force)
    check_space(filenames, args.force)

    basename = os.path.basename(infile)

    # decide where to put output files - specific directory? or just default?
    if infile in ('/dev/stdin', '-'):
        # seqan only treats '-' as "read from stdin"
        infile = '-'
        if not (args.output_first and args.output_second):
            print("Accepting input from stdin; "
                  "output filenames must be provided.", file=sys.stderr)
            sys.exit(1)
    elif args.output_directory:
        if not os.path.exists(args.output_directory):
            os.makedirs(args.output_directory)
        out1 = os.path.join(args.output_directory, basename + '.1')
        out2 = os.path.join(args.output_directory, basename + '.2')
    else:
        out1 = basename + '.1'
        out2 = basename + '.2'

    # OVERRIDE output file locations with -1, -2
    if args.output_first:
        fp_out1 = get_file_writer(args.output_first, args.gzip, args.bzip)
        out1 = fp_out1.name
    else:
        # Use default filename created above
        fp_out1 = get_file_writer(open(out1, 'wb'), args.gzip, args.bzip)
    if args.output_second:
        fp_out2 = get_file_writer(args.output_second, args.gzip, args.bzip)
        out2 = fp_out2.name
    else:
        # Use default filename created above
        fp_out2 = get_file_writer(open(out2, 'wb'), args.gzip, args.bzip)

    # put orphaned reads here, if -0!
    fp_out0 = None
    if args.output_orphaned:
        fp_out0 = get_file_writer(args.output_orphaned, args.gzip, args.bzip)
        out0 = describe_file_handle(args.output_orphaned)

    counter1 = 0
    counter2 = 0
    counter3 = 0

    # walk through all the reads in broken-paired mode.
    paired_iter = broken_paired_reader(ReadParser(infile),
                                       require_paired=not args.output_orphaned)

    try:
        for index, is_pair, record1, record2 in paired_iter:
            if index % 10000 == 0:
                print('...', index, file=sys.stderr)

            if is_pair:
                write_record(record1, fp_out1)
                counter1 += 1
                write_record(record2, fp_out2)
                counter2 += 1
            elif args.output_orphaned:
                write_record(record1, fp_out0)
                counter3 += 1
    except UnpairedReadsError as e:
        print("Unpaired reads found starting at {name}; exiting".format(
            name=e.read1.name), file=sys.stderr)
        sys.exit(1)
    finally:
        # Close explicitly so buffered output (and any gzip/bzip2 stream
        # produced by get_file_writer) is finished on every exit path.
        for handle in (fp_out1, fp_out2, fp_out0):
            if handle is not None:
                handle.close()

    print("DONE; split %d sequences (%d left, %d right, %d orphans)" %
          (counter1 + counter2, counter1, counter2, counter3), file=sys.stderr)
    print("/1 reads in %s" % out1, file=sys.stderr)
    print("/2 reads in %s" % out2, file=sys.stderr)
    if args.output_orphaned:
        print("orphans in %s" % out0, file=sys.stderr)
Пример #31
0
def main():
    """Subsample reads from the input files using reservoir sampling.

    ``max(args.num_samples, 1)`` independent reservoirs are kept.  A record
    whose reader index is below ``args.num_reads`` is appended to every
    reservoir; a later record replaces slot ``guess - 1`` of a reservoir
    whenever ``random.randint(1, count) <= args.num_reads``.

    With a single sample the result goes to ``args.output_file`` or to
    ``<basename of first input>.subset``; with several samples it goes to
    ``<output name>.<sample index>``.  Files opened here are closed once
    written.
    """
    info('sample-reads-randomly.py')
    args = get_parser().parse_args()

    for _ in args.filenames:
        check_input_files(_, args.force)

    check_space(args.filenames, args.force)

    # seed the random number generator?
    if args.random_seed:
        random.seed(args.random_seed)

    # bound n_samples
    num_samples = max(args.num_samples, 1)

    #
    # Figure out what the output filename is going to be
    #

    output_file = args.output_file
    if output_file:
        if num_samples > 1:
            sys.stderr.write(
                "Error: cannot specify -o with more than one sample.\n")
            if not args.force:
                sys.exit(1)
        output_filename = output_file.name
    else:
        filename = args.filenames[0]
        output_filename = os.path.basename(filename) + '.subset'

    if num_samples == 1:
        sys.stderr.write('Subsampling %d reads using reservoir sampling.\n' %
                         args.num_reads)
        sys.stderr.write('Subsampled reads will be placed in %s\n' %
                         output_filename)
        sys.stderr.write('\n')
    else:  # > 1
        # (double space kept so the message matches the previous output)
        sys.stderr.write('Subsampling %d reads, %d times,'
                         '  using reservoir sampling.\n' %
                         (args.num_reads, num_samples))
        sys.stderr.write('Subsampled reads will be placed in %s.N\n' %
                         output_filename)
        sys.stderr.write('\n')

    # one reservoir (list of (read1, read2) tuples) per requested sample
    reads = [[] for _ in range(num_samples)]

    # read through all the sequences and load/resample the reservoir
    # NOTE(review): 'count' is the index yielded by broken_paired_reader and
    # presumably restarts for every input file -- confirm how multiple
    # inputs are meant to share one reservoir.
    for filename in args.filenames:
        sys.stderr.write('opening %s for reading\n' % filename)
        screed_iter = screed.open(filename, parse_description=False)
        for count, ispair, rcrd1, rcrd2 in broken_paired_reader(
                screed_iter,
                force_single=args.force_single):
            if count % 10000 == 0:
                sys.stderr.write('... %s reads scanned\n' % count)
                # the -M limit is only checked at progress checkpoints
                if count >= args.max_reads:
                    sys.stderr.write('reached upper limit of %d reads '
                                     '(see -M); exiting\n' % args.max_reads)
                    break

            # collect first N reads
            if count < args.num_reads:
                for n in range(num_samples):
                    reads[n].append((rcrd1, rcrd2))
            else:
                # use reservoir sampling to replace reads at random
                # see http://en.wikipedia.org/wiki/Reservoir_sampling

                for n in range(num_samples):
                    guess = random.randint(1, count)
                    if guess <= args.num_reads:
                        reads[n][guess - 1] = (rcrd1, rcrd2)

    # output all the subsampled reads:
    if len(reads) == 1:
        sys.stderr.write('Writing %d sequences to %s\n' %
                         (len(reads[0]), output_filename))
        opened_here = not output_file
        if opened_here:
            output_file = open(output_filename, 'w')

        try:
            for records in reads[0]:
                write_record(records[0], output_file)
                if records[1] is not None:
                    write_record(records[1], output_file)
        finally:
            # only close what this function opened; a handle supplied via
            # args.output_file is left alone
            if opened_here:
                output_file.close()
    else:
        for n in range(num_samples):
            n_filename = output_filename + '.%d' % n
            sys.stderr.write('Writing %d sequences to %s\n' %
                             (len(reads[n]), n_filename))
            # close each per-sample file before opening the next one
            with open(n_filename, 'w') as sample_fp:
                for records in reads[n]:
                    write_record(records[0], sample_fp)
                    if records[1] is not None:
                        write_record(records[1], sample_fp)
Пример #32
0
def main():
    """Split an interleaved read file into separate left and right files.

    The input (``args.infile``) is walked in broken-paired mode.  For a
    detected pair, the first read goes to the left output and the second to
    the right output.  An unpaired read is routed by its name using
    ``check_is_left`` / ``check_is_right``.

    Output names default to ``<basename>.1`` and ``<basename>.2``, placed in
    ``args.output_directory`` when given (created if missing), and can be
    overridden with ``args.output_first`` / ``args.output_second``.

    Exits with status 1 if ``args.force_paired`` is set and an unpaired read
    is seen, or if an unpaired read's name matches neither side.
    """
    info('split-paired-reads.py')
    args = get_parser().parse_args()

    infile = args.infile

    check_input_files(infile, args.force)
    check_space([infile], args.force)

    basename = os.path.basename(infile)

    # decide where to put output files - specific directory? or just default?
    if args.output_directory:
        if not os.path.exists(args.output_directory):
            os.makedirs(args.output_directory)
        out1 = os.path.join(args.output_directory, basename + '.1')
        out2 = os.path.join(args.output_directory, basename + '.2')
    else:
        out1 = basename + '.1'
        out2 = basename + '.2'

    # OVERRIDE output file locations with -1, -2
    if args.output_first:
        out1 = args.output_first
    if args.output_second:
        out2 = args.output_second

    counter1 = 0
    counter2 = 0

    # The context manager closes (and flushes) both outputs on every exit
    # path, including the sys.exit() calls inside the loop.
    with open(out1, 'w') as fp_out1, open(out2, 'w') as fp_out2:
        screed_iter = screed.open(infile, parse_description=False)

        # walk through all the reads in broken-paired mode.
        for index, is_pair, record1, record2 in \
                broken_paired_reader(screed_iter):
            # progress report; skipped for the very first record
            if index % 100000 == 0 and index:
                sys.stderr.write('... %s\n' % index)

            # are we requiring pairs?
            if args.force_paired and not is_pair:
                sys.stderr.write('ERROR, %s is not part of a pair\n' %
                                 record1.name)
                sys.exit(1)

            if is_pair:
                write_record(record1, fp_out1)
                counter1 += 1
                write_record(record2, fp_out2)
                counter2 += 1
            else:
                # orphan: decide the side from the read name alone
                name = record1.name
                if check_is_left(name):
                    write_record(record1, fp_out1)
                    counter1 += 1
                elif check_is_right(name):
                    write_record(record1, fp_out2)
                    counter2 += 1
                else:
                    sys.stderr.write(
                        "Unrecognized format for read pair information: "
                        "%s\n" % name)
                    sys.stderr.write("Exiting.\n")
                    sys.exit(1)

    sys.stderr.write("DONE; split %d sequences (%d left, %d right)\n" %
                     (counter1 + counter2, counter1, counter2))
    sys.stderr.write("/1 reads in %s\n" % out1)
    sys.stderr.write("/2 reads in %s\n" % out2)
Пример #33
0
def main():
    """Trim reads at low-abundance k-mers, streaming, in up to two passes.

    After validating option combinations, a countgraph is loaded from
    ``args.loadgraph`` or created from the arguments.  In the first pass
    each input file is read through ``trimmer.pass1``: reads it yields are
    written to the output straight away, while reads kept aside are stored
    in a ``.pass2`` file inside a temporary directory.  Unless
    ``args.single_pass`` is set, the second pass re-reads those files through
    ``trimmer.pass2`` and writes the results to the same output handle.

    Output goes to ``args.output`` when given, otherwise to
    ``<basename>.abundtrim`` per input file.  Summary statistics are logged
    at the end, and the countgraph is saved when ``args.savegraph`` is set.
    Exits with status 1 on invalid option combinations.
    """
    parser = sanitize_help(get_parser())
    args = parser.parse_args()
    if not args.quiet:
        info('trim-low-abund.py', ['streaming'])

    configure_logging(args.quiet)

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        log_error("Error: Cannot input the same filename multiple times.")
        sys.exit(1)

    if args.trim_at_coverage != DEFAULT_TRIM_AT_COVERAGE and \
       not args.variable_coverage:
        log_error("Error: --trim-at-coverage/-Z given, but "
                  "--variable-coverage/-V not specified.")
        sys.exit(1)

    if args.diginorm_coverage != DEFAULT_DIGINORM_COVERAGE and \
       not args.diginorm:
        log_error("Error: --diginorm-coverage given, but "
                  "--diginorm not specified.")
        sys.exit(1)

    if args.diginorm and args.single_pass:
        log_error("Error: --diginorm and --single-pass are incompatible!\n"
                  "You probably want to use normalize-by-median.py instead.")
        sys.exit(1)

    ###

    report_on_config(args)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    if ('-' in args.input_filenames or '/dev/stdin' in args.input_filenames) \
       and not args.output:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    if args.loadgraph:
        log_info('loading countgraph from {graph}', graph=args.loadgraph)
        ct = khmer.load_countgraph(args.loadgraph)
    else:
        log_info('making countgraph')
        ct = khmer_args.create_countgraph(args)

    K = ct.ksize()
    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    log_info('created temporary directory {temp};\n'
             'use -T to change location', temp=tempdir)

    trimmer = Trimmer(ct, not args.variable_coverage, args.cutoff,
                      args.trim_at_coverage)
    if args.diginorm:
        trimmer.set_diginorm(args.diginorm_coverage)

    # ### FIRST PASS ###

    save_pass2_total = 0

    written_bp = 0
    written_reads = 0

    # only create the file writer once if outfp is specified; otherwise,
    # create it for each file.
    if args.output:
        trimfp = get_file_writer(args.output, args.gzip, args.bzip)

    # (input filename, pass-2 temp filename, output writer) per input file
    pass2list = []
    for filename in args.input_filenames:
        # figure out temporary filename for 2nd pass
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        pass2fp = open(pass2filename, 'w')

        # construct output filenames
        if args.output is None:
            # note: this will be saved in trimfp.
            outfp = open(os.path.basename(filename) + '.abundtrim', 'wb')

            # get file handle w/gzip, bzip
            trimfp = get_file_writer(outfp, args.gzip, args.bzip)

        # record all this info
        pass2list.append((filename, pass2filename, trimfp))

        # input file stuff: get a broken_paired reader.
        screed_iter = screed.open(filename)
        paired_iter = broken_paired_reader(screed_iter, min_length=K,
                                           force_single=args.ignore_pairs)

        # main loop through the file.
        # trimmer counters are cumulative, so remember where this file began
        n_start = trimmer.n_reads
        save_start = trimmer.n_saved

        watermark = REPORT_EVERY_N_READS
        for read in trimmer.pass1(paired_iter, pass2fp):
            if (trimmer.n_reads - n_start) > watermark:
                log_info("... {filename} {n_saved} {n_reads} {n_bp} "
                         "{w_reads} {w_bp}", filename=filename,
                         n_saved=trimmer.n_saved, n_reads=trimmer.n_reads,
                         n_bp=trimmer.n_bp, w_reads=written_reads,
                         w_bp=written_bp)
                watermark += REPORT_EVERY_N_READS

            # write out the trimmed/etc sequences that AREN'T going to be
            # revisited in a 2nd pass.
            write_record(read, trimfp)
            written_bp += len(read)
            written_reads += 1
        pass2fp.close()

        log_info("{filename}: kept aside {kept} of {total} from first pass",
                 filename=filename, kept=trimmer.n_saved - save_start,
                 total=trimmer.n_reads - n_start)

    # first pass goes across all the data, so record relevant stats...
    n_reads = trimmer.n_reads
    n_bp = trimmer.n_bp
    n_skipped = trimmer.n_skipped
    bp_skipped = trimmer.bp_skipped
    save_pass2_total = trimmer.n_saved

    # ### SECOND PASS. ###

    # nothing should have been skipped yet!
    assert trimmer.n_skipped == 0
    assert trimmer.bp_skipped == 0

    if args.single_pass:
        pass2list = []

    # go back through all the files again.
    for _, pass2filename, trimfp in pass2list:
        log_info('second pass: looking at sequences kept aside in {pass2}',
                 pass2=pass2filename)

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in contrast
        # to the first loop.  Hence, force_single=True below.

        screed_iter = screed.open(pass2filename, parse_description=False)
        paired_iter = broken_paired_reader(screed_iter, min_length=K,
                                           force_single=True)

        # NOTE(review): n_start still holds the value set for the last file
        # of the first pass; it only affects this progress logging.
        watermark = REPORT_EVERY_N_READS
        for read in trimmer.pass2(paired_iter):
            if (trimmer.n_reads - n_start) > watermark:
                log_info('... x 2 {a} {b} {c} {d} {e} {f} {g}',
                         a=trimmer.n_reads - n_start,
                         b=pass2filename, c=trimmer.n_saved,
                         d=trimmer.n_reads, e=trimmer.n_bp,
                         f=written_reads, g=written_bp)
                watermark += REPORT_EVERY_N_READS

            write_record(read, trimfp)
            written_reads += 1
            written_bp += len(read)

        log_info('removing {pass2}', pass2=pass2filename)
        os.unlink(pass2filename)

        # if we created our own trimfps, close 'em.
        if not args.output:
            trimfp.close()

    log_info('removing temp directory & contents ({temp})', temp=tempdir)
    shutil.rmtree(tempdir)

    trimmed_reads = trimmer.trimmed_reads

    # Empty input leaves n_reads / n_bp at zero; report 0% instead of dying
    # with ZeroDivisionError after all the work has been done.
    if n_reads:
        n_passes = 1.0 + (float(save_pass2_total) / n_reads)
        percent_reads_trimmed = \
            float(trimmed_reads + (n_reads - written_reads)) / n_reads * 100.0
    else:
        n_passes = 1.0
        percent_reads_trimmed = 0.0
    if n_bp:
        percent_bp_trimmed = (1 - (written_bp / float(n_bp))) * 100.0
    else:
        percent_bp_trimmed = 0.0

    log_info('read {read} reads, {bp} bp', read=n_reads, bp=n_bp)
    log_info('wrote {wr} reads, {wbp} bp', wr=written_reads, wbp=written_bp)
    log_info('looked at {st} reads twice ({np:.2f} passes)',
             st=save_pass2_total, np=n_passes)
    log_info('removed {r} reads and trimmed {t} reads ({p:.2f}%)',
             r=n_reads - written_reads, t=trimmed_reads,
             p=percent_reads_trimmed)
    # single '%' to match the message above: these are {}-style templates,
    # so '%%' would be emitted verbatim.
    log_info('trimmed or removed {p:.2f}% of bases ({bp} total)',
             p=percent_bp_trimmed, bp=n_bp - written_bp)

    if args.variable_coverage:
        if n_reads:
            percent_reads_hicov = 100.0 * float(n_reads - n_skipped) / n_reads
        else:
            percent_reads_hicov = 0.0
        log_info('{n} reads were high coverage ({p:.2f}%);',
                 n=n_reads - n_skipped, p=percent_reads_hicov)
        log_info('skipped {r} reads/{bp} bases because of low coverage',
                 r=n_skipped, bp=bp_skipped)

    fp_rate = \
        khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    log_info('output in *.abundtrim')

    if args.savegraph:
        log_info("Saving k-mer countgraph to {graph}", graph=args.savegraph)
        ct.save(args.savegraph)
Пример #34
0
def main():
    """Separate an interleaved read file into paired and orphaned reads.

    Pairs reported by ``broken_paired_reader`` are written with
    ``write_record_pair`` to the paired output ('.pe' by default); every
    other read is written to the single output ('.se' by default).  When
    reading from stdin both outputs must be supplied explicitly, otherwise
    the script exits with status 1.

    Raises ``Exception`` after writing if no pairs were found at all.
    """
    args = sanitize_help(get_parser()).parse_args()

    infile = args.infile
    check_input_files(infile, args.force)
    check_space([infile], args.force)

    # Work out the default output names; stdin input has no usable basename,
    # so there both outputs have to be named by the caller.
    if infile in ('/dev/stdin', '-'):
        # seqan only treats '-' as "read from stdin"
        infile = '-'
        if not args.output_paired or not args.output_single:
            print("Accepting input from stdin; output filenames must be "
                  "provided.", file=sys.stderr)
            sys.exit(1)
    else:
        prefix = ''
        if args.output_dir:
            if not os.path.exists(args.output_dir):
                os.makedirs(args.output_dir)
            prefix = args.output_dir + '/'
        stem = prefix + os.path.basename(infile)
        single_name = stem + '.se'
        paired_name = stem + '.pe'

    # Explicit -p / -s handles take precedence over the default names.
    if not args.output_paired:
        paired_fp = get_file_writer(open(paired_name, 'wb'),
                                    args.gzip, args.bzip)
    else:
        paired_fp = get_file_writer(args.output_paired, args.gzip, args.bzip)
        paired_name = paired_fp.name

    if not args.output_single:
        single_fp = get_file_writer(open(single_name, 'wb'),
                                    args.gzip, args.bzip)
    else:
        single_fp = get_file_writer(args.output_single, args.gzip, args.bzip)
        single_name = args.output_single.name

    print('reading file "%s"' % infile, file=sys.stderr)
    print('outputting interleaved pairs to "%s"' % paired_name,
          file=sys.stderr)
    print('outputting orphans to "%s"' % single_name, file=sys.stderr)

    pair_count = 0
    orphan_count = 0

    for index, is_pair, read1, read2 in \
            broken_paired_reader(ReadParser(infile)):
        # periodic progress marker (never for index 0)
        if index > 0 and index % 100000 == 0:
            print('...', index, file=sys.stderr)

        if not is_pair:
            write_record(read1, single_fp)
            orphan_count += 1
            continue

        write_record_pair(read1, read2, paired_fp)
        pair_count += 1

    single_fp.close()
    paired_fp.close()

    if pair_count == 0:
        raise Exception("no paired reads!? check file formats...")

    total_sequences = pair_count * 2 + orphan_count
    print('DONE; read %d sequences, %d pairs and %d singletons' %
          (total_sequences, pair_count, orphan_count), file=sys.stderr)

    print('wrote to: %s and %s' % (paired_name, single_name),
          file=sys.stderr)
Пример #35
0
def main():  # pylint: disable=too-many-branches,too-many-statements
    """Run digital normalization over the given input files.

    A counting table is loaded from ``args.loadtable`` or created from the
    command-line arguments, and every input file is streamed through a
    ``Normalizer``.  Records it yields that are not ``None`` are written to
    ``<basename>.keep`` per input, or to ``args.single_output_file`` when
    that is given.  ``args.unpaired_reads`` is processed as one more input
    with pairing not required.

    Exits with status 1 when two inputs share a basename and no single
    output file was requested.  Files collected in ``corrupt_files`` are
    reported at the end when ``args.force`` is set.
    """
    info('normalize-by-median.py', ['diginorm'])
    args = get_parser().parse_args()

    report_on_config(args)

    report_fp = args.report
    force_single = args.force_single

    # check for similar filenames
    # if we're using a single output file only check for identical filenames
    # otherwise, check for identical BASE names as well.
    filenames = []
    basenames = []
    for pathfilename in args.input_filenames:
        filenames.append(pathfilename)
        if args.single_output_file:
            continue  # nothing more to worry about

        basename = os.path.basename(pathfilename)
        if basename in basenames:
            print('ERROR: Duplicate filename--Cannot handle this!',
                  file=sys.stderr)
            print('** Exiting!', file=sys.stderr)
            sys.exit(1)

        basenames.append(basename)

    # check that files exist and there is sufficient output disk space.
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize,
                                  args.force)

    # load or create counting table.
    if args.loadtable:
        print('loading k-mer counting table from ' + args.loadtable,
              file=sys.stderr)
        htable = khmer.load_counting_hash(args.loadtable)
    else:
        print('making k-mer counting table', file=sys.stderr)
        htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                         args.n_tables)

    # create an object to handle diginorm of all files
    norm = Normalizer(args.cutoff, htable)

    # make a list of all filenames and if they're paired or not;
    # if we don't know if they're paired, default to allowing but not
    # forcing pairing.
    files = [[name, args.paired] for name in filenames]
    if args.unpaired_reads:
        files.append([args.unpaired_reads, False])

    corrupt_files = []
    outfp = None
    output_name = None

    if args.single_output_file:
        if args.single_output_file is sys.stdout:
            output_name = '/dev/stdout'
        else:
            output_name = args.single_output_file.name
        outfp = args.single_output_file

    #
    # main loop: iterate over all files given, do diginorm.
    #

    for filename, require_paired in files:
        if not args.single_output_file:
            output_name = os.path.basename(filename) + '.keep'
            outfp = open(output_name, 'w')

        # failsafe context manager in case an input file breaks
        with CatchIOErrors(filename, outfp, args.single_output_file,
                           args.force, corrupt_files):

            screed_iter = screed.open(filename, parse_description=False)
            reader = broken_paired_reader(screed_iter,
                                          min_length=args.ksize,
                                          force_single=force_single,
                                          require_paired=require_paired)

            # actually do diginorm
            for record in WithDiagnostics(filename, norm, reader, report_fp):
                if record is not None:
                    write_record(record, outfp)

            print('output in ' + output_name, file=sys.stderr)
            # Only per-input outputs are finished at this point; a shared
            # output must stay open for the remaining input files.
            if not args.single_output_file:
                outfp.close()

    # The shared output is closed once, after the last input, and never when
    # it is stdout.  Compared by value: identity tests on strings are
    # unreliable.
    if args.single_output_file and output_name != '/dev/stdout':
        outfp.close()

    # finished - print out some diagnostics.

    print('Total number of unique k-mers: {0}'.format(htable.n_unique_kmers()),
          file=sys.stderr)

    if args.savetable:
        print('...saving to ' + args.savetable, file=sys.stderr)
        htable.save(args.savetable)

    fp_rate = \
        khmer.calc_expected_collisions(htable, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975

    print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate),
          file=sys.stderr)

    if args.force and len(corrupt_files) > 0:
        print("** WARNING: Finished with errors!", file=sys.stderr)
        print("** IOErrors occurred in the following files:", file=sys.stderr)
        print("\t", " ".join(corrupt_files), file=sys.stderr)
Пример #36
0
def main():
    """Subsample reads from the input files using reservoir sampling.

    ``max(args.num_samples, 1)`` independent reservoirs are kept.  A record
    whose per-file position is below ``args.num_reads`` is appended to every
    reservoir; a later record replaces slot ``guess - 1`` of a reservoir
    whenever ``random.randint(1, count) <= args.num_reads``.

    With a single sample the result goes to ``args.output_file`` or to
    ``<basename of first input>.subset``; with several samples it goes to
    ``<output name>.<sample index>``.  Output is passed through
    ``get_file_writer`` so the gzip/bzip options apply.  Reading from stdin
    requires ``-o``; otherwise the script exits with status 1.
    """
    parser = get_parser()
    parser.epilog = parser.epilog.replace(
        "`reservoir sampling\n"
        "<http://en.wikipedia.org/wiki/Reservoir_sampling>`__ algorithm.",
        "reservoir sampling algorithm. "
        "http://en.wikipedia.org/wiki/Reservoir_sampling")
    args = sanitize_help(parser).parse_args()

    for name in args.filenames:
        check_input_files(name, args.force)

    # seed the random number generator?
    if args.random_seed:
        random.seed(args.random_seed)

    # bound n_samples
    num_samples = max(args.num_samples, 1)

    #
    # Figure out what the output filename is going to be

    if args.output_file:
        output_filename = args.output_file.name
        if num_samples > 1:
            # trailing newline keeps the following NOTE on its own line
            sys.stderr.write(
                "Error: cannot specify -o with more than one sample.\n")
            if not args.force:
                print(
                    "NOTE: This can be overridden using the --force"
                    " argument",
                    file=sys.stderr)
                sys.exit(1)
    else:
        filename = args.filenames[0]
        if filename in ('/dev/stdin', '-'):
            print(
                "Accepting input from stdin; output filename must "
                "be provided with '-o'.",
                file=sys.stderr)
            sys.exit(1)
        output_filename = os.path.basename(filename) + '.subset'

    if num_samples == 1:
        print('Subsampling %d reads using reservoir sampling.' %
              args.num_reads,
              file=sys.stderr)
        print('Subsampled reads will be placed in %s' % output_filename,
              file=sys.stderr)
        print('', file=sys.stderr)
    else:  # > 1
        print('Subsampling %d reads, %d times,' %
              (args.num_reads, num_samples),
              ' using reservoir sampling.',
              file=sys.stderr)
        print('Subsampled reads will be placed in %s.N' % output_filename,
              file=sys.stderr)
        print('', file=sys.stderr)

    # one reservoir (list of (read1, read2) tuples) per requested sample
    reads = [[] for _ in range(num_samples)]

    # read through all the sequences and load/resample the reservoir
    # NOTE(review): 'count' comes from enumerate() and restarts for every
    # input file -- confirm how multiple inputs are meant to share one
    # reservoir.
    for filename in args.filenames:
        print('opening', filename, 'for reading', file=sys.stderr)

        # seqan only treats '-' as "read from stdin"; normalise here, per
        # file, because the loop variable is what actually reaches the parser
        parser_name = '-' if filename in ('/dev/stdin', '-') else filename

        for count, (_, _, rcrd1, rcrd2) in enumerate(
                broken_paired_reader(ReadParser(parser_name),
                                     force_single=args.force_single)):
            if count % 10000 == 0:
                print('...', count, 'reads scanned', file=sys.stderr)
                # the -M limit is only checked at progress checkpoints
                if count >= args.max_reads:
                    print('reached upper limit of %d reads' % args.max_reads,
                          '(see -M); exiting',
                          file=sys.stderr)
                    break

            # collect first N reads
            if count < args.num_reads:
                for sample in range(num_samples):
                    reads[sample].append((rcrd1, rcrd2))
            else:
                for sample in range(num_samples):
                    assert len(reads[sample]) <= count

                # use reservoir sampling to replace reads at random
                # see http://en.wikipedia.org/wiki/Reservoir_sampling

                for n in range(num_samples):
                    guess = random.randint(1, count)
                    if guess <= args.num_reads:
                        reads[n][guess - 1] = (rcrd1, rcrd2)

    # output all the subsampled reads:
    if len(reads) == 1:
        print('Writing %d sequences to %s' % (len(reads[0]), output_filename),
              file=sys.stderr)

        raw_fp = args.output_file
        opened_here = not raw_fp
        if opened_here:
            raw_fp = open(output_filename, 'wb')

        output_file = get_file_writer(raw_fp, args.gzip, args.bzip)
        try:
            for records in reads[0]:
                write_record(records[0], output_file)
                if records[1] is not None:
                    write_record(records[1], output_file)
        finally:
            # finish any compression wrapper, then close only what this
            # function opened; a caller-supplied handle is left alone
            if output_file is not raw_fp:
                output_file.close()
            if opened_here:
                raw_fp.close()
    else:
        for n in range(num_samples):
            n_filename = output_filename + '.%d' % n
            print('Writing %d sequences to %s' % (len(reads[n]), n_filename),
                  file=sys.stderr)
            raw_fp = open(n_filename, 'wb')
            output_file = get_file_writer(raw_fp, args.gzip, args.bzip)
            try:
                for records in reads[n]:
                    write_record(records[0], output_file)
                    if records[1] is not None:
                        write_record(records[1], output_file)
            finally:
                # close the writer first, then the underlying file (a no-op
                # if they are the same object), before the next sample
                output_file.close()
                raw_fp.close()
Пример #37
0
def main():  # pylint: disable=too-many-branches,too-many-statements
    """Run digital normalization over the input files given on the CLI.

    Parses the command line, loads or creates the k-mer counting table,
    and feeds each input through a ``Normalizer``.  Records that come back
    from the normalization iterator are written either to one
    ``<basename>.keep`` file per input or, when ``args.single_output_file``
    is set, to that single shared output.  Progress and diagnostics are
    printed to stderr.

    Exits with status 1 when per-input outputs are in use and two inputs
    share the same basename (their ``.keep`` files would collide).
    """
    info('normalize-by-median.py', ['diginorm'])
    args = get_parser().parse_args()

    report_on_config(args)

    report_fp = args.report
    force_single = args.force_single

    # Per-input outputs are named after the input's basename, so two inputs
    # with the same basename would overwrite each other.  With a single
    # shared output file there is nothing to check.
    filenames = []
    basenames = []
    for pathfilename in args.input_filenames:
        filenames.append(pathfilename)
        if args.single_output_file:
            continue  # nothing more to worry about

        basename = os.path.basename(pathfilename)
        if basename in basenames:
            print('ERROR: Duplicate filename--Cannot handle this!',
                  file=sys.stderr)
            print('** Exiting!', file=sys.stderr)
            sys.exit(1)

        basenames.append(basename)

    # check that files exist and there is sufficient output disk space.
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savetable:
        check_space_for_hashtable(args, 'countgraph', args.force)

    # load or create counting table.
    if args.loadtable:
        print('loading k-mer counting table from ' + args.loadtable,
              file=sys.stderr)
        htable = khmer.load_counting_hash(args.loadtable)
    else:
        print('making countgraph', file=sys.stderr)
        htable = khmer_args.create_countgraph(args)

    # create an object to handle diginorm of all files
    norm = Normalizer(args.cutoff, htable)

    # make a list of all filenames and if they're paired or not;
    # if we don't know if they're paired, default to allowing but not
    # forcing pairing.
    files = []
    for e in filenames:
        files.append([e, args.paired])
    if args.unpaired_reads:
        files.append([args.unpaired_reads, False])

    corrupt_files = []
    outfp = None
    output_name = None

    if args.single_output_file:
        if args.single_output_file is sys.stdout:
            output_name = '/dev/stdout'
        else:
            output_name = args.single_output_file.name
        outfp = args.single_output_file

    #
    # main loop: iterate over all files given, do diginorm.
    #

    for filename, require_paired in files:
        if not args.single_output_file:
            output_name = os.path.basename(filename) + '.keep'
            outfp = open(output_name, 'w')

        # failsafe context manager in case an input file breaks
        with CatchIOErrors(filename, outfp, args.single_output_file,
                           args.force, corrupt_files):

            screed_iter = screed.open(filename, parse_description=False)
            reader = broken_paired_reader(screed_iter, min_length=args.ksize,
                                          force_single=force_single,
                                          require_paired=require_paired)

            # actually do diginorm
            for record in WithDiagnostics(filename, norm, reader, report_fp):
                if record is not None:
                    write_record(record, outfp)

            print('output in ' + output_name, file=sys.stderr)
            # A per-input output is complete once its input is consumed.
            # The shared output must stay open for the remaining inputs,
            # so it is closed only after the loop.
            if not args.single_output_file:
                outfp.close()

    # Close the shared output once every input has been written to it;
    # stdout is left open.  Compare by value, not identity.
    if args.single_output_file and output_name != '/dev/stdout':
        outfp.close()

    # finished - print out some diagnostics.

    print('Total number of unique k-mers: {0}'
          .format(htable.n_unique_kmers()),
          file=sys.stderr)

    if args.savetable:
        print('...saving to ' + args.savetable, file=sys.stderr)
        htable.save(args.savetable)

    fp_rate = \
        khmer.calc_expected_collisions(htable, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975

    print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate),
          file=sys.stderr)

    if args.force and len(corrupt_files) > 0:
        print("** WARNING: Finished with errors!", file=sys.stderr)
        print("** I/O Errors occurred in the following files:",
              file=sys.stderr)
        print("\t", " ".join(corrupt_files), file=sys.stderr)
Пример #38
0
def main():  # pylint: disable=too-many-branches,too-many-statements
    """Filter reads against two pre-built countgraphs and write the keepers.

    Parses the command line, loads the two countgraphs named by
    ``args.loadgraph`` and ``args.loadgraph2``, and passes every read (or
    read pair) of each input through ``snarf``.  Non-``None`` records that
    ``snarf`` yields are written to ``<basename>.keep`` per input, or to
    ``args.single_output_file`` when that is set.  The elapsed wall-clock
    time is printed to stdout at the end.

    Exits with status 1 when per-input outputs are in use and two inputs
    share a basename, or when either countgraph was not supplied.
    """
    start_time = time.time()
    parser = sanitize_help(get_parser())
    args = parser.parse_args()

    configure_logging(args.quiet)
    report_on_config(args)

    force_single = args.force_single

    # Per-input outputs are named after the input's basename, so two inputs
    # with the same basename would overwrite each other.  With a single
    # shared output file there is nothing to check.
    filenames = []
    basenames = []
    for pathfilename in args.input_filenames:
        filenames.append(pathfilename)
        if args.single_output_file:
            continue  # nothing more to worry about

        basename = os.path.basename(pathfilename)
        if basename in basenames:
            log_error('ERROR: Duplicate filename--Cannot handle this!')
            log_error('** Exiting!')
            sys.exit(1)

        basenames.append(basename)

    # check that files exist and there is sufficient output disk space.
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph is not None:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    # Both graphs are handed to snarf() for every read, so refuse to start
    # without them instead of failing later on an unbound name.
    if not args.loadgraph or not args.loadgraph2:
        log_error('ERROR: both countgraphs (loadgraph and loadgraph2) '
                  'must be provided')
        log_error('** Exiting!')
        sys.exit(1)

    # load first counting table.
    log_info('loading k-mer countgraph from {graph}', graph=args.loadgraph)
    countgraph1 = Countgraph.load(args.loadgraph)

    # load second counting table.
    log_info('loading k-mer countgraph from {graph}',
             graph=args.loadgraph2)
    countgraph2 = Countgraph.load(args.loadgraph2)

    # make a list of all filenames and if they're paired or not;
    # if we don't know if they're paired, default to allowing but not
    # forcing pairing.
    files = []
    for element in filenames:
        files.append([element, args.paired])
    if args.unpaired_reads:
        files.append([args.unpaired_reads, False])

    # With a single shared output the writer is created once, up front.
    # NOTE(review): assumes args.single_output_file is an already-open
    # binary file handle, like the per-input handles below -- confirm
    # against get_parser().
    outfp = None
    if args.single_output_file:
        outfp = get_file_writer(args.single_output_file, args.gzip,
                                args.bzip)

    #
    # main loop: iterate over all files given, do diginorm.
    #

    for filename, require_paired in files:
        if not args.single_output_file:
            output_name = os.path.basename(filename) + '.keep'
            outfp = open(output_name, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        screed_iter = clean_input_reads(screed.open(filename))
        reader = broken_paired_reader(screed_iter,
                                      min_length=args.ksize,
                                      force_single=force_single,
                                      require_paired=require_paired)

        # actually do diginorm
        for _, is_paired, read0, read1 in reader:
            for record in snarf(is_paired, read0, read1, countgraph1,
                                countgraph2):
                if record is not None:
                    write_record(record, outfp)

        # A per-input output is complete once its input is consumed; close
        # it so buffered (and compressed) data is flushed.  The shared
        # output stays open for the remaining inputs.
        if not args.single_output_file:
            outfp.close()

    print("--- %s seconds ---" % (time.time() - start_time))
Пример #39
0
def main():
    info('trim-low-abund.py', ['streaming'])
    parser = get_parser()
    args = parser.parse_args()

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        print >>sys.stderr, \
            "Error: Cannot input the same filename multiple times."
        sys.exit(1)

    ###

    report_on_config(args)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savetable:
        check_space_for_hashtable(
            args.n_tables * args.min_tablesize, args.force)

    K = args.ksize

    CUTOFF = args.cutoff
    NORMALIZE_LIMIT = args.normalize_to

    if args.loadtable:
        print >>sys.stderr, 'loading k-mer counting table from', args.loadtable
        ct = khmer.load_counting_hash(args.loadtable)
    else:
        print >>sys.stderr, 'making k-mer counting table'
        ct = khmer.new_counting_hash(K, args.min_tablesize, args.n_tables)

    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    print >>sys.stderr, 'created temporary directory %s; ' \
                        'use -T to change location' % tempdir

    # ### FIRST PASS ###

    save_pass2_total = 0

    n_bp = 0
    n_reads = 0
    written_bp = 0
    written_reads = 0
    trimmed_reads = 0

    pass2list = []
    for filename in args.input_filenames:
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        trimfilename = os.path.basename(filename) + '.abundtrim'

        pass2list.append((filename, pass2filename, trimfilename))

        screed_iter = screed.open(filename, parse_description=False)
        pass2fp = open(pass2filename, 'w')
        trimfp = open(trimfilename, 'w')

        save_pass2 = 0
        n = 0

        paired_iter = broken_paired_reader(screed_iter, min_length=K,
                                           force_single=args.ignore_pairs)
        for n, is_pair, read1, read2 in paired_iter:
            if n % 10000 == 0:
                print >>sys.stderr, '...', n, filename, save_pass2, \
                    n_reads, n_bp, written_reads, written_bp

            # we want to track paired reads here, to make sure that pairs
            # are not split between first pass and second pass.

            if is_pair:
                n_reads += 2
                n_bp += len(read1.sequence) + len(read2.sequence)

                seq1 = read1.sequence.replace('N', 'A')
                seq2 = read2.sequence.replace('N', 'A')

                med1, _, _ = ct.get_median_count(seq1)
                med2, _, _ = ct.get_median_count(seq2)

                if med1 < NORMALIZE_LIMIT or med2 < NORMALIZE_LIMIT:
                    ct.consume(seq1)
                    ct.consume(seq2)
                    write_record_pair(read1, read2, pass2fp)
                    save_pass2 += 2
                else:
                    _, trim_at1 = ct.trim_on_abundance(seq1, CUTOFF)
                    _, trim_at2 = ct.trim_on_abundance(seq2, CUTOFF)

                    if trim_at1 >= K:
                        read1 = trim_record(read1, trim_at1)

                    if trim_at2 >= K:
                        read2 = trim_record(read2, trim_at2)

                    if trim_at1 != len(seq1):
                        trimmed_reads += 1
                    if trim_at2 != len(seq2):
                        trimmed_reads += 1

                    write_record_pair(read1, read2, trimfp)
                    written_reads += 2
                    written_bp += trim_at1 + trim_at2
            else:
                n_reads += 1
                n_bp += len(read1.sequence)

                seq = read1.sequence.replace('N', 'A')

                med, _, _ = ct.get_median_count(seq)

                # has this portion of the graph saturated? if not,
                # consume & save => pass2.
                if med < NORMALIZE_LIMIT:
                    ct.consume(seq)
                    write_record(read1, pass2fp)
                    save_pass2 += 1
                else:                       # trim!!
                    _, trim_at = ct.trim_on_abundance(seq, CUTOFF)
                    if trim_at >= K:
                        new_read = trim_record(read1, trim_at)
                        write_record(new_read, trimfp)

                        written_reads += 1
                        written_bp += trim_at

                        if trim_at != len(read1.sequence):
                            trimmed_reads += 1

        pass2fp.close()
        trimfp.close()

        print '%s: kept aside %d of %d from first pass, in %s' % \
              (filename, save_pass2, n, filename)
        save_pass2_total += save_pass2

    # ### SECOND PASS. ###

    skipped_n = 0
    skipped_bp = 0
    for _, pass2filename, trimfilename in pass2list:
        print 'second pass: looking at sequences kept aside in %s' % \
              pass2filename

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in contrast
        # to the first loop.

        trimfp = open(trimfilename, 'a')
        for n, read in enumerate(screed.open(pass2filename,
                                             parse_description=False)):
            if n % 10000 == 0:
                print >>sys.stderr, '... x 2', n, pass2filename, \
                    written_reads, written_bp

            seq = read.sequence.replace('N', 'A')
            med, _, _ = ct.get_median_count(seq)

            # do we retain low-abundance components unchanged?
            if med < NORMALIZE_LIMIT and args.variable_coverage:
                write_record(read, trimfp)

                written_reads += 1
                written_bp += len(read.sequence)
                skipped_n += 1
                skipped_bp += len(read.sequence)

            # otherwise, examine/trim/truncate.
            else:    # med >= NORMALIZE LIMIT or not args.variable_coverage
                _, trim_at = ct.trim_on_abundance(seq, CUTOFF)
                if trim_at >= K:
                    new_read = trim_record(read, trim_at)
                    write_record(new_read, trimfp)

                    written_reads += 1
                    written_bp += trim_at

                    if trim_at != len(read.sequence):
                        trimmed_reads += 1

        print >>sys.stderr, 'removing %s' % pass2filename
        os.unlink(pass2filename)

    print >>sys.stderr, 'removing temp directory & contents (%s)' % tempdir
    shutil.rmtree(tempdir)

    n_passes = 1.0 + (float(save_pass2_total) / n_reads)
    percent_reads_trimmed = float(trimmed_reads + (n_reads - written_reads)) /\
        n_reads * 100.0

    print 'read %d reads, %d bp' % (n_reads, n_bp,)
    print 'wrote %d reads, %d bp' % (written_reads, written_bp,)
    print 'looked at %d reads twice (%.2f passes)' % (save_pass2_total,
                                                      n_passes)
    print 'removed %d reads and trimmed %d reads (%.2f%%)' % \
        (n_reads - written_reads, trimmed_reads, percent_reads_trimmed)
    print 'trimmed or removed %.2f%% of bases (%d total)' % \
        ((1 - (written_bp / float(n_bp))) * 100.0, n_bp - written_bp)

    if args.variable_coverage:
        percent_reads_hicov = 100.0 * float(n_reads - skipped_n) / n_reads
        print '%d reads were high coverage (%.2f%%);' % (n_reads - skipped_n,
                                                         percent_reads_hicov)
        print 'skipped %d reads/%d bases because of low coverage' % \
              (skipped_n, skipped_bp)

    fp_rate = khmer.calc_expected_collisions(ct)
    print >>sys.stderr, \
        'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate)

    if fp_rate > MAX_FALSE_POSITIVE_RATE:
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the k-mer counting table is too small"
                              " for this data set. Increase tablesize/# "
                              "tables.")
        print >> sys.stderr, "**"
        print >> sys.stderr, "** Do not use these results!!"
        sys.exit(1)

    print 'output in *.abundtrim'

    if args.savetable:
        print >>sys.stderr, "Saving k-mer counting table to", args.savetable
        ct.save(args.savetable)
Пример #40
0
def main():
    """Two-pass, streaming abundance trimming of the input read files.

    First pass, per input file: a read (or read pair) whose median k-mer
    count is below ``args.normalize_to`` is consumed into the countgraph
    and set aside in a temporary ``.pass2`` file; any other read is
    trimmed with ``trim_on_abundance`` at ``args.cutoff`` and written to
    the trimmed output (``<basename>.abundtrim``, or ``args.output`` when
    given).

    Second pass: the set-aside reads are re-examined against the now
    fuller countgraph and written to the same trimmed output, either
    unchanged (low coverage with ``args.variable_coverage``) or trimmed.

    All progress and statistics go to stderr.  Exits with status 1 when an
    input filename is repeated, or when input comes from stdin without an
    explicit output.
    """
    info('trim-low-abund.py', ['streaming'])
    parser = sanitize_help(get_parser())
    args = parser.parse_args()

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        print("Error: Cannot input the same filename multiple times.",
              file=sys.stderr)
        sys.exit(1)

    ###

    report_on_config(args)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    if ('-' in args.input_filenames or '/dev/stdin' in args.input_filenames) \
       and not args.output:
        print("Accepting input from stdin; output filename must "
              "be provided with -o.", file=sys.stderr)
        sys.exit(1)

    if args.loadgraph:
        print('loading countgraph from', args.loadgraph, file=sys.stderr)
        ct = khmer.load_countgraph(args.loadgraph)
    else:
        print('making countgraph', file=sys.stderr)
        ct = khmer_args.create_countgraph(args)

    K = ct.ksize()
    CUTOFF = args.cutoff
    NORMALIZE_LIMIT = args.normalize_to

    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    print('created temporary directory %s; '
          'use -T to change location' % tempdir, file=sys.stderr)

    # ### FIRST PASS ###

    save_pass2_total = 0

    n_bp = 0
    n_reads = 0
    written_bp = 0
    written_reads = 0
    trimmed_reads = 0

    pass2list = []
    for filename in args.input_filenames:
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        if args.output is None:
            trimfp = get_file_writer(open(os.path.basename(filename) +
                                          '.abundtrim', 'wb'),
                                     args.gzip, args.bzip)
        else:
            trimfp = get_file_writer(args.output, args.gzip, args.bzip)

        # The writer is kept open: the second pass writes to it as well.
        pass2list.append((filename, pass2filename, trimfp))

        screed_iter = screed.open(filename)
        pass2fp = open(pass2filename, 'w')

        save_pass2 = 0
        n = 0

        paired_iter = broken_paired_reader(screed_iter, min_length=K,
                                           force_single=args.ignore_pairs)
        for n, is_pair, read1, read2 in paired_iter:
            if n % 10000 == 0:
                print('...', n, filename, save_pass2, n_reads, n_bp,
                      written_reads, written_bp, file=sys.stderr)

            # we want to track paired reads here, to make sure that pairs
            # are not split between first pass and second pass.

            if is_pair:
                n_reads += 2
                n_bp += len(read1.sequence) + len(read2.sequence)

                seq1 = read1.sequence.replace('N', 'A')
                seq2 = read2.sequence.replace('N', 'A')

                med1, _, _ = ct.get_median_count(seq1)
                med2, _, _ = ct.get_median_count(seq2)

                if med1 < NORMALIZE_LIMIT or med2 < NORMALIZE_LIMIT:
                    ct.consume(seq1)
                    ct.consume(seq2)
                    write_record_pair(read1, read2, pass2fp)
                    save_pass2 += 2
                else:
                    _, trim_at1 = ct.trim_on_abundance(seq1, CUTOFF)
                    _, trim_at2 = ct.trim_on_abundance(seq2, CUTOFF)

                    # A mate whose trim point falls below K is left
                    # untrimmed so that the pair is still written together.
                    if trim_at1 >= K:
                        read1 = trim_record(read1, trim_at1)

                    if trim_at2 >= K:
                        read2 = trim_record(read2, trim_at2)

                    if trim_at1 != len(seq1):
                        trimmed_reads += 1
                    if trim_at2 != len(seq2):
                        trimmed_reads += 1

                    write_record_pair(read1, read2, trimfp)
                    written_reads += 2
                    written_bp += trim_at1 + trim_at2
            else:
                n_reads += 1
                n_bp += len(read1.sequence)

                seq = read1.sequence.replace('N', 'A')

                med, _, _ = ct.get_median_count(seq)

                # has this portion of the graph saturated? if not,
                # consume & save => pass2.
                if med < NORMALIZE_LIMIT:
                    ct.consume(seq)
                    write_record(read1, pass2fp)
                    save_pass2 += 1
                else:                       # trim!!
                    _, trim_at = ct.trim_on_abundance(seq, CUTOFF)
                    if trim_at >= K:
                        new_read = trim_record(read1, trim_at)
                        write_record(new_read, trimfp)

                        written_reads += 1
                        written_bp += trim_at

                        if trim_at != len(read1.sequence):
                            trimmed_reads += 1

        pass2fp.close()

        print('%s: kept aside %d of %d from first pass, in %s' %
              (filename, save_pass2, n, filename),
              file=sys.stderr)
        save_pass2_total += save_pass2

    # ### SECOND PASS. ###

    skipped_n = 0
    skipped_bp = 0
    for _, pass2filename, trimfp in pass2list:
        print('second pass: looking at sequences kept aside in %s' %
              pass2filename,
              file=sys.stderr)

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in contrast
        # to the first loop.

        for n, read in enumerate(screed.open(pass2filename)):
            if n % 10000 == 0:
                print('... x 2', n, pass2filename,
                      written_reads, written_bp, file=sys.stderr)

            seq = read.sequence.replace('N', 'A')
            med, _, _ = ct.get_median_count(seq)

            # do we retain low-abundance components unchanged?
            if med < NORMALIZE_LIMIT and args.variable_coverage:
                write_record(read, trimfp)

                written_reads += 1
                written_bp += len(read.sequence)
                skipped_n += 1
                skipped_bp += len(read.sequence)

            # otherwise, examine/trim/truncate.
            else:    # med >= NORMALIZE LIMIT or not args.variable_coverage
                _, trim_at = ct.trim_on_abundance(seq, CUTOFF)
                if trim_at >= K:
                    new_read = trim_record(read, trim_at)
                    write_record(new_read, trimfp)

                    written_reads += 1
                    written_bp += trim_at

                    if trim_at != len(read.sequence):
                        trimmed_reads += 1

        print('removing %s' % pass2filename, file=sys.stderr)
        os.unlink(pass2filename)

        # Writers this function opened itself are finished now; close them
        # so buffered (and compressed) data reaches disk.  A caller-supplied
        # args.output is shared by every input and is left open.
        if args.output is None:
            trimfp.close()

    print('removing temp directory & contents (%s)' % tempdir, file=sys.stderr)
    shutil.rmtree(tempdir)

    # Guard every ratio: when no reads (or bases) were processed the
    # denominators are zero and the statistics are reported as neutral
    # values instead of raising ZeroDivisionError.
    if n_reads:
        n_passes = 1.0 + (float(save_pass2_total) / n_reads)
        percent_reads_trimmed = \
            float(trimmed_reads + (n_reads - written_reads)) / \
            n_reads * 100.0
    else:
        n_passes = 1.0
        percent_reads_trimmed = 0.0

    if n_bp:
        percent_bp_removed = (1 - (written_bp / float(n_bp))) * 100.0
    else:
        percent_bp_removed = 0.0

    print('read %d reads, %d bp' % (n_reads, n_bp,), file=sys.stderr)
    print('wrote %d reads, %d bp' % (written_reads, written_bp,),
          file=sys.stderr)
    print('looked at %d reads twice (%.2f passes)' % (save_pass2_total,
                                                      n_passes),
          file=sys.stderr)
    print('removed %d reads and trimmed %d reads (%.2f%%)' %
          (n_reads - written_reads, trimmed_reads, percent_reads_trimmed),
          file=sys.stderr)
    print('trimmed or removed %.2f%% of bases (%d total)' %
          (percent_bp_removed, n_bp - written_bp),
          file=sys.stderr)

    if args.variable_coverage:
        if n_reads:
            percent_reads_hicov = 100.0 * float(n_reads - skipped_n) / n_reads
        else:
            percent_reads_hicov = 0.0
        print('%d reads were high coverage (%.2f%%);' % (n_reads - skipped_n,
                                                         percent_reads_hicov),
              file=sys.stderr)
        print('skipped %d reads/%d bases because of low coverage' %
              (skipped_n, skipped_bp),
              file=sys.stderr)

    fp_rate = \
        khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate),
          file=sys.stderr)

    print('output in *.abundtrim', file=sys.stderr)

    if args.savegraph:
        print("Saving k-mer countgraph to",
              args.savegraph, file=sys.stderr)
        ct.save(args.savegraph)
Пример #41
0
def _split_name(full_name):
    """Split a read name into ``(identifier, description)``.

    The name is split on its first run of whitespace; the description is
    the empty string when the name carries none.
    """
    fields = full_name.split(None, 1)
    identifier = fields[0] if fields else ''
    description = fields[1] if len(fields) > 1 else ''
    return identifier, description


def main():
    """Report which mates of each query pair share a k-mer with the refs.

    Loads every reference file into a ``khmer.Nodetable``, then walks the
    query as read pairs.  Each mate is flagged 1 when at least one of its
    k-mers is present in the table and 0 otherwise.  Pairs with at least
    one flagged mate are printed to stdout in FASTA form with the flag
    appended to the header; summary counts go to stderr.

    Exits with status -1 when both the query and the only reference are
    stdin, or when the table's estimated false-positive rate is above 0.01.
    """
    parser = build_nodegraph_args("find uniq kmer in query compard to refs")
    parser.add_argument('query',
                        help=('fasta readfile to query against'
                              'hashtable, use "-" if from stdin'))
    parser.add_argument('ref',
                        nargs='+',
                        help='fasta sequence file to be loaded in hashtable')
    parser.add_argument('--x2',
                        default='1e8',
                        help='max_table size for readfile2')
    parser.add_argument('--N2',
                        default='4',
                        help='# of table (N) for readfile2')

    args = parser.parse_args()

    K = args.ksize
    HT_SIZE = args.max_tablesize
    N_HT = args.n_tables
    # --x2 / --N2 are converted (so malformed values still fail early) but
    # are not otherwise used in this function.
    HT_SIZE2 = int(float(args.x2))
    N_HT2 = int(args.N2)

    # positional
    query = args.query
    refs = args.ref
    print('{} refs to be loaded'.format(len(refs)), file=sys.stderr)
    if query == '-' and refs == ['-']:
        # stdin cannot feed both sides, so stop here.
        print('*** query and ref can not both be "-" (read from stdin)',
              file=sys.stderr)
        sys.exit(-1)

    # create the table that holds the reference k-mers
    start_time = time.time()
    ht = khmer.Nodetable(K, HT_SIZE, N_HT)
    end_time = time.time()
    secs = end_time - start_time
    mes = 'initiation of bloom filter took {:.2f} hours..'
    print(mes.format(secs / 3600.0), file=sys.stderr)
    for index, filename in enumerate(refs):
        if index != 0 and index % 100 == 0:
            end_time = time.time()
            secs = end_time - start_time
            mes = '{} refs have been loaded with in {:.2f} hours ..'
            print(mes.format(index, secs / 3600.0), file=sys.stderr)
        try:
            ht.consume_seqfile(filename)
        except OSError as e:
            # A reference that cannot be read is skipped, not fatal.
            mes = ('*** Skipping due to OSError (machine or system problem):'
                   ' {}\n'
                   '*** Detailed error message:\n'
                   '*** {}')
            print(mes.format(os.path.basename(filename), str(e)),
                  file=sys.stderr)
            continue

    fp_rate = khmer.calc_expected_collisions(ht)
    mes = 'fp rate estimated to be {:1.3f}'
    print(mes.format(fp_rate), file=sys.stderr)

    # Refuse to produce results from a table that is too full.
    if fp_rate > 0.01:
        mes = ('**\n'
               '** ERROR: the counting hash is too small for\n'
               '** refs.  Increase hashsize/num ht.\n'
               '**\n'
               '** Do not use these results!!')
        print(mes, file=sys.stderr)
        sys.exit(-1)

    n_unique1 = ht.n_unique_kmers()

    pair = 0
    forward = 0
    reverse = 0
    other = 0
    total_pair = 0
    # NOTE(review): require_paired is passed to khmer.ReadParser here, not
    # to broken_paired_reader -- confirm that this is the intended callee.
    for _, _, r1, r2 in broken_paired_reader(
            khmer.ReadParser(query, require_paired=True)):
        total_pair += 1

        # One flag per mate: 1 if any of its k-mers is in the table.
        # A mate shorter than K contributes no flag, which sends the pair
        # to the "other" bucket below.
        share_list = []
        for record in (r1, r2):
            sequence = record.sequence.replace('N', 'A')
            seq_len = len(sequence)
            if seq_len < K:
                print('*** {} is shorter than {}..'.format(record.name, K),
                      file=sys.stderr)
                continue
            shared = any(ht.get(sequence[i:i + K])
                         for i in range(seq_len + 1 - K))
            share_list.append(1 if shared else 0)

        if share_list == [1, 1]:
            pair += 1
        elif share_list == [1, 0]:
            forward += 1
        elif share_list == [0, 1]:
            reverse += 1
        else:  # [0, 0], or a mate was too short to evaluate
            other += 1
            # do not print
            continue

        mes = ('>{}  {}||uniq_{}\n{}\n' '>{}  {}||uniq_{}\n{}')
        name1, desc1 = _split_name(r1.name)
        name2, desc2 = _split_name(r2.name)
        print(
            mes.format(name1, desc1, share_list[0], r1.sequence, name2, desc2,
                       share_list[1], r2.sequence))

    mes = ('Unique kmer in ref:\t{}\n'
           'Total pair:\t{}\n'
           'Both primers uniq:\t{}\n'
           'Pair with forward uniq:\t{}\n'
           'Pair with reverse uniq:\t{}')

    print(mes.format(n_unique1, total_pair, pair, forward, reverse),
          file=sys.stderr)