Exemplo n.º 1
0
def main():
    info("filter-abund-single.py", ["counting"])
    args = get_parser().parse_args()
    check_file_status(args.datafile)
    check_space([args.datafile])
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize)
    report_on_config(args)

    config = khmer.get_config()
    config.set_reads_input_buffer_size(args.threads * 64 * 1024)

    print "making k-mer counting table"
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize, args.n_tables, args.threads)

    # first, load reads into hash table
    rparser = khmer.ReadParser(args.datafile, args.threads)
    threads = []
    print "consuming input, round 1 --", args.datafile
    for _ in xrange(args.threads):
        cur_thread = threading.Thread(target=htable.consume_fasta_with_reads_parser, args=(rparser,))
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    fp_rate = khmer.calc_expected_collisions(htable)
    print "fp rate estimated to be %1.3f" % fp_rate

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record["name"]
        seq = record["sequence"]
        if "N" in seq:
            return None, None

        trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff)

        if trim_at >= args.ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    print "filtering", args.datafile
    outfile = os.path.basename(args.datafile) + ".abundfilt"
    outfp = open(outfile, "w")

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(args.datafile), outfp)

    print "output in", outfile

    if args.savetable:
        print "Saving k-mer counting table filename", args.savetable
        print "...saving to", args.savetable
        htable.save(args.savetable)
Exemplo n.º 2
0
def main():
    info('filter-abund.py', ['counting'])
    args = sanitize_help(get_parser()).parse_args()

    check_input_files(args.input_graph, args.force)
    infiles = args.input_filename
    if ('-' in infiles or '/dev/stdin' in infiles) and not \
       args.single_output_file:
        print("Accepting input from stdin; output filename must "
              "be provided with -o.", file=sys.stderr)
        sys.exit(1)

    for filename in infiles:
        check_input_files(filename, args.force)

    check_space(infiles, args.force)

    print('loading countgraph:', args.input_graph,
          file=sys.stderr)
    countgraph = khmer.load_countgraph(args.input_graph)
    ksize = countgraph.ksize()

    print("K:", ksize, file=sys.stderr)

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
        seqN = seq.replace('N', 'A')

        if args.variable_coverage:  # only trim when sequence has high enough C
            med, _, _ = countgraph.get_median_count(seqN)
            if med < args.normalize_to:
                return name, seq

        _, trim_at = countgraph.trim_on_abundance(seqN, args.cutoff)

        if trim_at >= ksize:
            # be sure to not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]

        return None, None

    if args.single_output_file:
        outfile = args.single_output_file.name
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)

    # the filtering loop
    for infile in infiles:
        print('filtering', infile, file=sys.stderr)
        if not args.single_output_file:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = open(outfile, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        tsp = ThreadedSequenceProcessor(process_fn, n_workers=args.threads)
        tsp.start(verbose_loader(infile), outfp)

        print('output in', outfile, file=sys.stderr)
Exemplo n.º 3
0
def main():
    parser = build_counting_multifile_args()
    parser.add_argument('--cutoff',
                        '-C',
                        dest='coverage',
                        default=DEFAULT_COVERAGE,
                        type=int,
                        help="Diginorm coverage.")
    parser.add_argument('--max-error-region',
                        '-M',
                        dest='max_error_region',
                        default=DEFAULT_MAX_ERROR_REGION,
                        type=int,
                        help="Max length of error region allowed")
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print 'file with ht: %s' % counting_ht

    print 'loading hashtable'
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()
    C = args.coverage
    max_error_region = args.max_error_region

    print "K:", K
    print "C:", C
    print "max error region:", max_error_region

    # the filtering function.
    def process_fn(record):
        # read_aligner is probably not threadsafe?
        aligner = khmer.new_readaligner(ht, 1, C, max_error_region)

        name = record['name']
        seq = record['sequence']

        seq = seq.replace('N', 'A')

        grXreAlign, reXgrAlign = aligner.align(seq)

        if len(reXgrAlign) > 0:
            graph_seq = grXreAlign.replace('-', '')
            seq = graph_seq

        return name, seq

    # the filtering loop
    for infile in infiles:
        print 'filtering', infile
        outfile = os.path.basename(infile) + '.corr'
        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print 'output in', outfile
Exemplo n.º 4
0
def main():
    parser = build_counting_multifile_args()
    parser.add_argument(
        "--cutoff", "-C", dest="coverage", default=DEFAULT_COVERAGE, type=int, help="Diginorm coverage."
    )
    parser.add_argument(
        "--max-error-region",
        "-M",
        dest="max_error_region",
        default=DEFAULT_MAX_ERROR_REGION,
        type=int,
        help="Max length of error region allowed",
    )
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print "file with ht: %s" % counting_ht

    print "loading hashtable"
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()
    C = args.coverage
    max_error_region = args.max_error_region

    print "K:", K
    print "C:", C
    print "max error region:", max_error_region

    # the filtering function.
    def process_fn(record):
        # read_aligner is probably not threadsafe?
        aligner = khmer.new_readaligner(ht, 1, C, max_error_region)

        name = record["name"]
        seq = record["sequence"]

        seq = seq.replace("N", "A")

        grXreAlign, reXgrAlign = aligner.align(seq)

        if len(reXgrAlign) > 0:
            graph_seq = grXreAlign.replace("-", "")
            seq = graph_seq

        return name, seq

    # the filtering loop
    for infile in infiles:
        print "filtering", infile
        outfile = os.path.basename(infile) + ".corr"
        outfp = open(outfile, "w")

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print "output in", outfile
Exemplo n.º 5
0
def main():
    parser = build_counting_multifile_args()
    parser.add_argument(
        "--cutoff", "-C", dest="cutoff", default=DEFAULT_CUTOFF, type=int, help="Trim at k-mers below this abundance."
    )

    parser.add_argument("-V", "--variable-coverage", action="store_true", dest="variable_coverage", default=False)
    parser.add_argument(
        "--normalize-to",
        "-Z",
        type=int,
        dest="normalize_to",
        help="base variable-coverage cutoff on this median k-mer abundance",
        default=DEFAULT_NORMALIZE_LIMIT,
    )

    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print "file with ht: %s" % counting_ht

    print "loading hashtable"
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()

    print "K:", K

    ### the filtering function.
    def process_fn(record):
        name = record["name"]
        seq = record["sequence"]
        if "N" in seq:
            return None, None

        if args.variable_coverage:  # only trim when sequence has high enough C
            med, _, _ = ht.get_median_count(seq)
            if med < args.normalize_to:
                return name, seq

        trim_seq, trim_at = ht.trim_on_abundance(seq, args.cutoff)

        if trim_at >= K:
            return name, trim_seq

        return None, None

    ### the filtering loop
    for infile in infiles:
        print "filtering", infile
        outfile = os.path.basename(infile) + ".abundfilt"
        outfp = open(outfile, "w")

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print "output in", outfile
Exemplo n.º 6
0
def main():
    parser = build_counting_multifile_args()
    parser.add_argument('--cutoff', '-C', dest='coverage',
                        default=DEFAULT_COVERAGE, type=int,
                        help="Diginorm coverage.")
    parser.add_argument('--max-error-region', '-M', dest='max_error_region',
                        default=DEFAULT_MAX_ERROR_REGION, type=int,
                        help="Max length of error region allowed")
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print 'file with ht: %s' % counting_ht

    print 'loading hashtable'
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()
    C = args.coverage
    max_error_region = args.max_error_region

    print "K:", K
    print "C:", C
    print "max error region:", max_error_region

    # the filtering function.
    def process_fn(record):
        # read_aligner is probably not threadsafe?
        aligner = khmer.new_readaligner(ht, 1, C, max_error_region)

        name = record['name']
        seq = record['sequence']

        seq = seq.replace('N', 'A')

        grXreAlign, reXgrAlign = aligner.align(seq)

        if len(reXgrAlign) > 0:
            graph_seq = grXreAlign.replace('-', '')
            seq = graph_seq

        return name, seq

    # the filtering loop
    for infile in infiles:
        print 'filtering', infile
        outfile = os.path.basename(infile) + '.corr'
        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print 'output in', outfile
Exemplo n.º 7
0
def main():
    info('filter-abund.py', ['counting'])
    args = get_parser().parse_args()

    check_input_files(args.input_table, args.force)
    infiles = args.input_filename
    for filename in infiles:
        check_input_files(filename, args.force)

    check_space(infiles, args.force)

    print('loading counting table:', args.input_table,
          file=sys.stderr)
    htable = khmer.load_counting_hash(args.input_table)
    ksize = htable.ksize()

    print("K:", ksize, file=sys.stderr)

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
        seqN = seq.replace('N', 'A')

        if args.variable_coverage:  # only trim when sequence has high enough C
            med, _, _ = htable.get_median_count(seqN)
            if med < args.normalize_to:
                return name, seq

        _, trim_at = htable.trim_on_abundance(seqN, args.cutoff)

        if trim_at >= ksize:
            # be sure to not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]

        return None, None

    # the filtering loop
    for infile in infiles:
        print('filtering', infile, file=sys.stderr)
        if args.single_output_filename != '':
            outfile = args.single_output_filename
            outfp = open(outfile, 'a')
        else:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn, n_workers=args.threads)
        tsp.start(verbose_loader(infile), outfp)

        print('output in', outfile, file=sys.stderr)
Exemplo n.º 8
0
def main():
    info('filter-abund.py', ['counting'])
    args = get_parser().parse_args()

    counting_ht = args.input_table
    infiles = args.input_filename

    for _ in infiles:
        check_file_status(_)

    check_space(infiles)

    print 'loading hashtable'
    htable = khmer.load_counting_hash(counting_ht)
    ksize = htable.ksize()

    print "K:", ksize

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        if args.variable_coverage:  # only trim when sequence has high enough C
            med, _, _ = htable.get_median_count(seq)
            if med < args.normalize_to:
                return name, seq

        trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff)

        if trim_at >= ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    for infile in infiles:
        print 'filtering', infile
        if args.single_output_filename != '':
            outfile = args.single_output_filename
            outfp = open(outfile, 'a')
        else:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn, n_workers=args.threads)
        tsp.start(verbose_loader(infile), outfp)

        print 'output in', outfile
Exemplo n.º 9
0
def main():
    info('filter-abund.py', ['counting'])
    args = get_parser().parse_args()

    counting_ht = args.input_table
    infiles = args.input_filename

    for _ in infiles:
        check_file_status(_, args.force)

    check_space(infiles, args.force)

    print >>sys.stderr, 'loading hashtable'
    htable = khmer.load_counting_hash(counting_ht)
    ksize = htable.ksize()

    print >>sys.stderr, "K:", ksize

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        if args.variable_coverage:  # only trim when sequence has high enough C
            med, _, _ = htable.get_median_count(seq)
            if med < args.normalize_to:
                return name, seq

        trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff)

        if trim_at >= ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    for infile in infiles:
        print >>sys.stderr, 'filtering', infile
        if args.single_output_filename != '':
            outfile = args.single_output_filename
            outfp = open(outfile, 'a')
        else:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn, n_workers=args.threads)
        tsp.start(verbose_loader(infile), outfp)

        print >>sys.stderr, 'output in', outfile
Exemplo n.º 10
0
def main():
    parser = build_counting_multifile_args()
    parser.add_argument('--cutoff',
                        '-C',
                        dest='cutoff',
                        default=DEFAULT_CUTOFF,
                        type=int,
                        help="Trim at k-mers below this abundance.")
    parser.add_argument('-o', '--outputpath', dest='outputpath', default='.')
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames
    outpath = args.outputpath

    print 'file with ht: %s' % counting_ht

    print 'loading hashtable'
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()

    print "K:", K

    ### the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = ht.trim_on_abundance(seq, args.cutoff)

        if trim_at >= K:
            return name, trim_seq

        return None, None

    ### the filtering loop
    for infile in infiles:
        print 'filtering', infile
        outfile = outpath + '/' + os.path.basename(infile) + '.abundfilt'
        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print 'output in', outfile
Exemplo n.º 11
0
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('-k',
                        default=DEFAULT_K,
                        type=int,
                        help='k-mer size',
                        dest='ksize')
    parser.add_argument('stoptags_file')
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()
    K = args.ksize

    stoptags = args.stoptags_file
    infiles = args.input_filenames

    print 'loading stop tags, with K', K
    ht = khmer.new_hashbits(K, 1, 1)
    ht.load_stop_tags(stoptags)

    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = ht.trim_on_stoptags(seq)

        if trim_at >= K:
            return name, trim_seq

        return None, None

    ### the filtering loop
    for infile in infiles:
        print 'filtering', infile
        outfile = os.path.dirname(infile) + '/' + os.path.basename(
            infile) + '.stopfilt'

        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print 'output in', outfile
Exemplo n.º 12
0
def main():
    parser = build_counting_multifile_args()
    parser.add_argument('--cutoff', '-C', dest='cutoff',
                        default=DEFAULT_CUTOFF, type=int,
                        help="Trim at k-mers below this abundance.")
    parser.add_argument('-o', '--outputpath', dest='outputpath', default='.')
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames
    outpath = args.outputpath    

    print 'file with ht: %s' % counting_ht

    print 'loading hashtable'
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()

    print "K:", K

    ### the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = ht.trim_on_abundance(seq, args.cutoff)

        if trim_at >= K:
            return name, trim_seq

        return None, None


    ### the filtering loop
    for infile in infiles:
       print 'filtering', infile
       outfile = outpath + '/' + os.path.basename(infile) + '.abundfilt'
       outfp = open(outfile, 'w')

       tsp = ThreadedSequenceProcessor(process_fn)
       tsp.start(verbose_loader(infile), outfp)

       print 'output in', outfile
Exemplo n.º 13
0
def main():
    parser = build_counting_multifile_args()
    parser.add_argument('--cutoff',
                        '-C',
                        dest='cutoff',
                        default=DEFAULT_CUTOFF,
                        type=int,
                        help="Trim at reads above this median abundance.")
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print 'file with ht: %s' % counting_ht

    print 'loading hashtable'
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()

    print "K:", K

    ### the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']

        med, _, _ = ht.get_median_count(seq)

        if med >= args.cutoff:
            return name, seq

        return None, None

    ### the filtering loop
    for infile in infiles:
        print 'filtering', infile
        outfile = os.path.basename(infile) + '.himed'
        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print 'output in', outfile
Exemplo n.º 14
0
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('-k', default=DEFAULT_K, type=int, help='k-mer size',
                        dest='ksize')
    parser.add_argument('stoptags_file')
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()
    K = args.ksize

    stoptags = args.stoptags_file
    infiles = args.input_filenames

    print 'loading stop tags, with K', K
    ht = khmer.new_hashbits(K, 1, 1)
    ht.load_stop_tags(stoptags)

    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = ht.trim_on_stoptags(seq)

        if trim_at >= K:
            return name, trim_seq

        return None, None

    ### the filtering loop
    for infile in infiles:
       print 'filtering', infile
       outfile = os.path.dirname(infile) + '/' + os.path.basename(infile) + '.stopfilt'

       outfp = open(outfile, 'w')

       tsp = ThreadedSequenceProcessor(process_fn)
       tsp.start(verbose_loader(infile), outfp)

       print 'output in', outfile
Exemplo n.º 15
0
def main():
    parser = build_counting_args()
    parser.add_argument('--coverage',
                        '-C',
                        dest='coverage',
                        default=DEFAULT_COVERAGE,
                        type=int)
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print('file with ht: %s' % counting_ht)

    print('loading hashtable')
    ht = khmer.load_countgraph(counting_ht)
    K = ht.ksize()

    print("K:", K)

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']

        med, avg, dev = ht.get_median_count(seq)

        if random.randint(1, med) > args.coverage:
            return None, None

        return name, seq

    # the filtering loop
    for infile in infiles:
        print('filtering', infile)
        outfile = os.path.basename(infile) + '.medfilt'
        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print('output in', outfile)
Exemplo n.º 16
0
def main():
    parser = build_counting_multifile_args()
    parser.add_argument('--cutoff', '-C', dest='cutoff',
                        default=DEFAULT_CUTOFF, type=int,
                        help="Trim at reads above this median abundance.")
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print 'file with ht: %s' % counting_ht

    print 'loading hashtable'
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()

    print "K:", K

    ### the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']

        med, _, _ = ht.get_median_count(seq)

        if med >= args.cutoff:
            return name, seq

        return None, None

    ### the filtering loop
    for infile in infiles:
        print 'filtering', infile
        outfile = os.path.basename(infile) + '.himed'
        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print 'output in', outfile
Exemplo n.º 17
0
def main():
    info("filter-stoptags.py", ["graph"])
    args = get_parser().parse_args()
    stoptags = args.stoptags_file
    infiles = args.input_filenames

    for _ in infiles:
        check_file_status(_)

    check_space(infiles)

    print "loading stop tags, with K", args.ksize
    htable = khmer.new_hashbits(args.ksize, 1, 1)
    htable.load_stop_tags(stoptags)

    def process_fn(record):
        name = record["name"]
        seq = record["sequence"]
        if "N" in seq:
            return None, None

        trim_seq, trim_at = htable.trim_on_stoptags(seq)

        if trim_at >= args.ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    for infile in infiles:
        print "filtering", infile
        outfile = os.path.basename(infile) + ".stopfilt"

        outfp = open(outfile, "w")

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print "output in", outfile
Exemplo n.º 18
0
def main():
    info('filter-stoptags.py', ['graph'])
    args = get_parser().parse_args()
    stoptags = args.stoptags_file
    infiles = args.input_filenames

    for _ in infiles:
        check_input_files(_, args.force)

    check_space(infiles, args.force)

    print >>sys.stderr, 'loading stop tags, with K', args.ksize
    htable = khmer.new_hashbits(args.ksize, 1, 1)
    htable.load_stop_tags(stoptags)

    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = htable.trim_on_stoptags(seq)

        if trim_at >= args.ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    for infile in infiles:
        print >>sys.stderr, 'filtering', infile
        outfile = os.path.basename(infile) + '.stopfilt'

        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print >>sys.stderr, 'output in', outfile
Exemplo n.º 19
0
def main():
    parser = build_counting_args()
    parser.add_argument('--coverage', '-C', dest='coverage',
                        default=DEFAULT_COVERAGE, type=int)
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print 'file with ht: %s' % counting_ht

    print 'loading hashtable'
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()

    print "K:", K

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']

        med, avg, dev = ht.get_median_count(seq)

        if random.randint(1, med) > args.coverage:
            return None, None

        return name, seq

    # the filtering loop
    for infile in infiles:
        print 'filtering', infile
        outfile = os.path.basename(infile) + '.medfilt'
        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print 'output in', outfile
Exemplo n.º 20
0
def main():
    args = sanitize_help(get_parser()).parse_args()
    stoptags = args.stoptags_file
    infiles = args.input_filenames

    for _ in infiles:
        check_input_files(_, args.force)

    check_space(infiles, args.force)

    print('loading stop tags, with K', args.ksize, file=sys.stderr)
    nodegraph = Nodegraph(args.ksize, 1, 1)
    nodegraph.load_stop_tags(stoptags)

    def process_fn(record):
        name = record.name
        seq = record.sequence
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = nodegraph.trim_on_stoptags(seq)

        if trim_at >= args.ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    for infile in infiles:
        print('filtering', infile, file=sys.stderr)
        outfile = os.path.basename(infile) + '.stopfilt'

        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print('output in', outfile, file=sys.stderr)
Exemplo n.º 21
0
def main():
    args = sanitize_help(get_parser()).parse_args()
    stoptags = args.stoptags_file
    infiles = args.input_filenames

    for _ in infiles:
        check_input_files(_, args.force)

    check_space(infiles, args.force)

    print('loading stop tags, with K', args.ksize, file=sys.stderr)
    nodegraph = Nodegraph(args.ksize, 1, 1)
    nodegraph.load_stop_tags(stoptags)

    def process_fn(record):
        name = record.name
        seq = record.sequence
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = nodegraph.trim_on_stoptags(seq)

        if trim_at >= args.ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    for infile in infiles:
        print('filtering', infile, file=sys.stderr)
        outfile = os.path.basename(infile) + '.stopfilt'

        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print('output in', outfile, file=sys.stderr)
Exemplo n.º 22
0
def main():
    parser = build_counting_multifile_args()
    parser.add_argument("--coverage", "-C", dest="coverage", default=DEFAULT_COVERAGE, type=int)
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print "file with ht: %s" % counting_ht

    print "loading hashtable"
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()

    print "K:", K

    ### the filtering function.
    def process_fn(record):
        name = record["name"]
        seq = record["sequence"]

        med, avg, dev = ht.get_median_count(seq)

        if random.randint(1, med) > args.coverage:
            return None, None

        return name, seq

    ### the filtering loop
    for infile in infiles:
        print "filtering", infile
        outfile = os.path.basename(infile) + ".medfilt"
        outfp = open(outfile, "w")

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print "output in", outfile
Exemplo n.º 23
0
def main():
    info('filter-abund-single.py', ['counting', 'SeqAn'])
    args = get_parser().parse_args()
    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)
    if args.savetable:
        check_space_for_hashtable(
            args.n_tables * args.min_tablesize, args.force)
    report_on_config(args)

    print >>sys.stderr, 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables)

    # first, load reads into hash table
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    print >>sys.stderr, 'consuming input, round 1 --', args.datafile
    for _ in xrange(args.threads):
        cur_thread = \
            threading.Thread(
                target=htable.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
            htable.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(htable, args.force)
    print >>sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff)

        if trim_at >= args.ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    print >>sys.stderr, 'filtering', args.datafile
    outfile = os.path.basename(args.datafile) + '.abundfilt'
    outfp = open(outfile, 'w')

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(args.datafile), outfp)

    print >>sys.stderr, 'output in', outfile

    if args.savetable:
        print >>sys.stderr, 'Saving k-mer counting table filename', \
            args.savetable
        print >>sys.stderr, '...saving to', args.savetable
        htable.save(args.savetable)
    print >>sys.stderr, 'wrote to: ', outfile
Exemplo n.º 24
0
def main():
    parser = build_construct_args(
        "Filter k-mers at the given abundance (inmem version).")
    add_threading_args(parser)

    parser.add_argument('--cutoff',
                        '-C',
                        dest='cutoff',
                        default=DEFAULT_CUTOFF,
                        type=int,
                        help="Trim at k-mers below this abundance.")
    parser.add_argument('--savehash', dest='savehash', default='')
    parser.add_argument('datafile')

    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    n_threads = int(args.n_threads)

    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads)

    filename = args.datafile

    # first, load reads into hash table
    rparser = khmer.ReadParser(filename, n_threads)
    threads = []
    print 'consuming input, round 1 --', filename
    for tnum in xrange(n_threads):
        t = \
            threading.Thread(
                target=ht.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = ht.trim_on_abundance(seq, args.cutoff)

        if trim_at >= K:
            return name, trim_seq

        return None, None

    # the filtering loop
    print 'filtering', filename
    outfile = os.path.basename(filename) + '.abundfilt'
    outfp = open(outfile, 'w')

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(filename), outfp)

    print 'output in', outfile

    if args.savehash:
        print 'Saving hashfile', args.savehash
        print '...saving to', args.savehash
        ht.save(args.savehash)
Exemplo n.º 25
0
def main():
    info('filter-abund-single.py', ['counting', 'SeqAn'])
    args = get_parser().parse_args()
    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize,
                                  args.force)
    report_on_config(args)

    print >> sys.stderr, 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables)

    # first, load reads into hash table
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    print >> sys.stderr, 'consuming input, round 1 --', args.datafile
    for _ in xrange(args.threads):
        cur_thread = \
            threading.Thread(
                target=htable.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
            htable.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(htable, args.force)
    print >> sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
        seqN = seq.replace('N', 'A')

        _, trim_at = htable.trim_on_abundance(seqN, args.cutoff)

        if trim_at >= args.ksize:
            # be sure to not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]

        return None, None

    # the filtering loop
    print >> sys.stderr, 'filtering', args.datafile
    outfile = os.path.basename(args.datafile) + '.abundfilt'
    outfp = open(outfile, 'w')

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(args.datafile), outfp)

    print >> sys.stderr, 'output in', outfile

    if args.savetable:
        print >>sys.stderr, 'Saving k-mer counting table filename', \
            args.savetable
        print >> sys.stderr, '...saving to', args.savetable
        htable.save(args.savetable)
    print >> sys.stderr, 'wrote to: ', outfile
Exemplo n.º 26
0
def main():
    args = sanitize_help(get_parser()).parse_args()
    if not args.quiet:
        info('filter-abund-single.py', ['counting', 'SeqAn'])

    configure_logging(args.quiet)
    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)

    if args.savegraph:
        tablesize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, tablesize, args.force)

    report_on_config(args)

    log_info('making countgraph')
    graph = khmer_args.create_countgraph(args)

    # first, load reads into graph
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    log_info('consuming input, round 1 -- {datafile}', datafile=args.datafile)
    for _ in range(args.threads):
        cur_thread = \
            threading.Thread(
                target=graph.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    log_info('Total number of unique k-mers: {nk}', nk=graph.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(graph, args.force)
    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
        seqN = seq.replace('N', 'A')

        _, trim_at = graph.trim_on_abundance(seqN, args.cutoff)

        if trim_at >= args.ksize:
            # be sure to not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]

        return None, None

    # the filtering loop
    log_info('filtering {datafile}', datafile=args.datafile)
    if args.outfile is None:
        outfile = os.path.basename(args.datafile) + '.abundfilt'
    else:
        outfile = args.outfile
    outfp = open(outfile, 'wb')
    outfp = get_file_writer(outfp, args.gzip, args.bzip)

    tsp = ThreadedSequenceProcessor(process_fn, verbose=not args.quiet)
    tsp.start(verbose_loader(args.datafile), outfp)

    log_info('output in {outfile}', outfile=outfile)

    if args.savegraph:
        log_info('Saving k-mer countgraph filename {graph}',
                 graph=args.savegraph)
        graph.save(args.savegraph)
Exemplo n.º 27
0
def main():
    info('filter-abund-single.py', ['counting', 'SeqAn'])
    args = sanitize_help(get_parser()).parse_args()

    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)

    if args.savegraph:
        tablesize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, tablesize, args.force)

    report_on_config(args)

    print('making countgraph', file=sys.stderr)
    graph = khmer_args.create_countgraph(args)

    # first, load reads into graph
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    print('consuming input, round 1 --', args.datafile, file=sys.stderr)
    for _ in range(args.threads):
        cur_thread = \
            threading.Thread(
                target=graph.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    print('Total number of unique k-mers: {0}'.format(graph.n_unique_kmers()),
          file=sys.stderr)

    fp_rate = khmer.calc_expected_collisions(graph, args.force)
    print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
        seqN = seq.replace('N', 'A')

        _, trim_at = graph.trim_on_abundance(seqN, args.cutoff)

        if trim_at >= args.ksize:
            # be sure to not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]

        return None, None

    # the filtering loop
    print('filtering', args.datafile, file=sys.stderr)
    outfile = os.path.basename(args.datafile) + '.abundfilt'
    outfile = open(outfile, 'wb')
    outfp = get_file_writer(outfile, args.gzip, args.bzip)

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(args.datafile), outfp)

    print('output in', outfile.name, file=sys.stderr)

    if args.savegraph:
        print('Saving k-mer countgraph filename',
              args.savegraph,
              file=sys.stderr)
        graph.save(args.savegraph)
Exemplo n.º 28
0
def main():
    info('filter-abund-single.py', ['counting', 'SeqAn'])
    args = get_parser().parse_args()
    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)
    if args.savetable:
        check_space_for_hashtable(args, 'countgraph', args.force)
    report_on_config(args)

    print('making countgraph', file=sys.stderr)
    htable = khmer_args.create_countgraph(args)

    # first, load reads into hash table
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    print('consuming input, round 1 --', args.datafile, file=sys.stderr)
    for _ in range(args.threads):
        cur_thread = \
            threading.Thread(
                target=htable.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    if args.report_total_kmers:
        print('Total number of unique k-mers: {0}'.format(
            htable.n_unique_kmers()), file=sys.stderr)

    fp_rate = khmer.calc_expected_collisions(htable, args.force)
    print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
        seqN = seq.replace('N', 'A')

        _, trim_at = htable.trim_on_abundance(seqN, args.cutoff)

        if trim_at >= args.ksize:
            # be sure to not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]

        return None, None

    # the filtering loop
    print('filtering', args.datafile, file=sys.stderr)
    outfile = os.path.basename(args.datafile) + '.abundfilt'
    outfp = open(outfile, 'w')

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(args.datafile), outfp)

    print('output in', outfile, file=sys.stderr)

    if args.savetable:
        print('Saving k-mer counting table filename',
              args.savetable, file=sys.stderr)
        print('...saving to', args.savetable, file=sys.stderr)
        htable.save(args.savetable)
    print('wrote to: ', outfile, file=sys.stderr)
Exemplo n.º 29
0
def main():
    parser = build_counting_multifile_args()
    parser.add_argument('--cutoff',
                        '-C',
                        dest='cutoff',
                        default=DEFAULT_CUTOFF,
                        type=int,
                        help="Trim at k-mers below this abundance.")

    parser.add_argument('-V',
                        '--variable-coverage',
                        action='store_true',
                        dest='variable_coverage',
                        default=False)
    parser.add_argument('--normalize-to',
                        '-Z',
                        type=int,
                        dest='normalize_to',
                        help='base variable-coverage cutoff on this median'
                        ' k-mer abundance',
                        default=DEFAULT_NORMALIZE_LIMIT)

    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print 'file with ht: %s' % counting_ht

    print 'loading hashtable'
    ht = khmer.load_counting_hash(counting_ht)
    K = ht.ksize()

    print "K:", K

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        if args.variable_coverage:  # only trim when sequence has high enough C
            med, _, _ = ht.get_median_count(seq)
            if med < args.normalize_to:
                return name, seq

        trim_seq, trim_at = ht.trim_on_abundance(seq, args.cutoff)

        if trim_at >= K:
            return name, trim_seq

        return None, None

    # the filtering loop
    for infile in infiles:
        print 'filtering', infile
        outfile = os.path.basename(infile) + '.abundfilt'
        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print 'output in', outfile
Exemplo n.º 30
0
def main():
    args = sanitize_help(get_parser()).parse_args()
    if not args.quiet:
        info('filter-abund.py', ['counting'])

    configure_logging(args.quiet)

    infiles = args.input_filename
    if ('-' in infiles or '/dev/stdin' in infiles) and not \
       args.single_output_file:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    for filename in infiles:
        check_input_files(filename, args.force)

    check_space(infiles, args.force)

    log_info('loading countgraph: {graph}', graph=args.input_graph)
    countgraph = khmer.load_countgraph(args.input_graph)
    ksize = countgraph.ksize()

    log_info("K: {ksize}", ksize=ksize)

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
        seqN = seq.replace('N', 'A')

        if args.variable_coverage:  # only trim when sequence has high enough C
            med, _, _ = countgraph.get_median_count(seqN)
            if med < args.normalize_to:
                return name, seq

        _, trim_at = countgraph.trim_on_abundance(seqN, args.cutoff)

        if trim_at >= ksize:
            # be sure to not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]

        return None, None

    if args.single_output_file:
        outfile = args.single_output_file.name
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)

    # the filtering loop
    for infile in infiles:
        log_info('filtering {infile}', infile=infile)
        if not args.single_output_file:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = open(outfile, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        tsp = ThreadedSequenceProcessor(process_fn, n_workers=args.threads,
                                        verbose=not args.quiet)
        tsp.start(verbose_loader(infile), outfp)

        log_info('output in {outfile}', outfile=outfile)
Exemplo n.º 31
0
def main():
    args = sanitize_help(get_parser()).parse_args()
    if not args.quiet:
        info('filter-abund-single.py', ['counting', 'SeqAn'])

    configure_logging(args.quiet)
    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)

    if args.savegraph:
        tablesize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, tablesize, args.force)

    report_on_config(args)

    log_info('making countgraph')
    graph = khmer_args.create_countgraph(args)

    # first, load reads into graph
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    log_info('consuming input, round 1 -- {datafile}', datafile=args.datafile)
    for _ in range(args.threads):
        cur_thread = \
            threading.Thread(
                target=graph.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    log_info('Total number of unique k-mers: {nk}', nk=graph.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(graph, args.force)
    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
        seqN = seq.replace('N', 'A')

        if args.variable_coverage:  # only trim when sequence has high enough C
            med, _, _ = graph.get_median_count(seqN)
            if med < args.normalize_to:
                return name, seq

        _, trim_at = graph.trim_on_abundance(seqN, args.cutoff)

        if trim_at >= args.ksize:
            # be sure to not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]

        return None, None

    # the filtering loop
    log_info('filtering {datafile}', datafile=args.datafile)
    if args.outfile is None:
        outfile = os.path.basename(args.datafile) + '.abundfilt'
    else:
        outfile = args.outfile
    outfp = open(outfile, 'wb')
    outfp = get_file_writer(outfp, args.gzip, args.bzip)

    tsp = ThreadedSequenceProcessor(process_fn, verbose=not args.quiet)
    tsp.start(verbose_loader(args.datafile), outfp)

    log_info('output in {outfile}', outfile=outfile)

    if args.savegraph:
        log_info('Saving k-mer countgraph filename {graph}',
                 graph=args.savegraph)
        graph.save(args.savegraph)
Exemplo n.º 32
0
def main():
    parser = build_construct_args(
        "Filter k-mers at the given abundance (inmem version).")
    add_threading_args(parser)

    parser.add_argument('--cutoff', '-C', dest='cutoff',
                        default=DEFAULT_CUTOFF, type=int,
                        help="Trim at k-mers below this abundance.")
    parser.add_argument('--savehash', dest='savehash', default='')
    parser.add_argument('datafile')
    
    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    n_threads = int(args.n_threads)

    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads)

    filename = args.datafile

    ### first, load reads into hash table
    rparser = khmer.ReadParser(filename, n_threads)
    threads = []
    print 'consuming input, round 1 --', filename
    for tnum in xrange(n_threads):
        t = \
            threading.Thread(
                target=ht.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    ### now, trim.

    ### the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = ht.trim_on_abundance(seq, args.cutoff)

        if trim_at >= K:
            return name, trim_seq

        return None, None

    ### the filtering loop
    print 'filtering', filename
    outfile = os.path.basename(filename) + '.abundfilt'
    outfp = open(outfile, 'w')

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(filename), outfp)

    print 'output in', outfile

    if args.savehash:
        print 'Saving hashfile', args.savehash
        print '...saving to', args.savehash
        ht.save(args.savehash)
Exemplo n.º 33
0
def main():
    info('filter-abund-single.py', ['counting'])
    args = get_parser().parse_args()
    check_file_status(args.datafile)
    check_space([args.datafile])
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize)
    report_on_config(args)

    config = khmer.get_config()
    config.set_reads_input_buffer_size(args.threads * 64 * 1024)

    print 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables,
                                     args.threads)

    # first, load reads into hash table
    rparser = khmer.ReadParser(args.datafile, args.threads)
    threads = []
    print 'consuming input, round 1 --', args.datafile
    for _ in xrange(args.threads):
        cur_thread = \
            threading.Thread(
                target=htable.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of k-mers: {0}'.format(
            htable.n_occupied())

    fp_rate = khmer.calc_expected_collisions(htable)
    print 'fp rate estimated to be %1.3f' % fp_rate

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff)

        if trim_at >= args.ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    print 'filtering', args.datafile
    outfile = os.path.basename(args.datafile) + '.abundfilt'
    outfp = open(outfile, 'w')

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(args.datafile), outfp)

    print 'output in', outfile

    if args.savetable:
        print 'Saving k-mer counting table filename', args.savetable
        print '...saving to', args.savetable
        htable.save(args.savetable)