Example #1
def test_banding_to_disk(ksize, memory, numbands):
    """
    Test accuracy of banding in terms of the data structure contents.

    Stronger than the functional in-memory test, this function tests whether
    computing k-mer abundances in banding mode produces the same data
    structure as counting k-mer abundances in the normal fashion.
    """
    infile = utils.get_test_data('banding-reads.fq')
    path1 = utils.get_temp_filename('normal.ct')
    path2 = utils.get_temp_filename('banding.ct')

    ct = khmer.Counttable(ksize, memory / 4, 4)
    ct.consume_seqfile(infile)
    ct.save(path1)
    fpr = khmer.calc_expected_collisions(ct)
    print('FPR', fpr)

    ct = khmer.Counttable(ksize, memory / 4, 4)
    for band in range(numbands):
        ct.consume_seqfile_banding(infile, numbands, band)
    ct.save(path2)
    fpr = khmer.calc_expected_collisions(ct)
    print('FPR', fpr)

    with open(path1, 'rb') as f1, open(path2, 'rb') as f2:
        assert f1.read() == f2.read()
Example #3
def normalizeByMedian(cf):
	"""Wrapper for the normalizeByMedian_impl function."""
	kmersize = cf.get_parameter('kmersize', 'int')
	minhashsize = cf.get_parameter('minhashsize', 'float')
	nhashes = cf.get_parameter('nhashes', 'int')
	cutoff = cf.get_parameter('cutoff', 'int')
	
	inputfile = cf.get_input('inputfile')
	outputfile = cf.get_output('outputfile')
	infp = screed.open(inputfile)
	outfp = open(outputfile, 'w')
	
	discarded, total, ht, n = normalizeByMedian_impl(cf, infp, outfp, kmersize, \
		minhashsize, nhashes, cutoff)
		
	outfp.close()
	infp.close()

	if -1 < n:
		percent_kept = int(100. - discarded / float(total) * 100.)
		cf.write_log("DONE with %s; kept %s of %s or %s%%" % (inputfile, \
			total - discarded, total, percent_kept))
		cf.write_log("Output in %s" % outputfile)

	# Change 0.2 only if you really grok it.  HINT: You don't.
	fp_rate = khmer.calc_expected_collisions(ht)
	cf.write_log("fp rate estimated to be %1.3f" % fp_rate)

	if fp_rate > 0.20:
		cf.write_error("ERROR: the counting hash is too small.")
		cf.write_error("Increase the hashsize/num ht.")
		return constants.GENERIC_ERROR
	return constants.OK
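Several of the examples on this page (including the wrapper above) abort when the estimated false-positive rate exceeds a threshold such as 0.15 or 0.20, on the grounds that the counting structure is too small for the data. The sketch below is a rough orientation aid rather than one of the collected examples: it assumes the value returned by calc_expected_collisions is approximately the table fill fraction raised to the number of tables (an assumption about khmer internals, not a documented formula), and compares that manual approximation against the library's number.

# Orientation sketch only -- the fill-fraction formula is an assumption about
# khmer internals, and the toy parameters below are arbitrary.
import khmer

ksize, tablesize, n_tables = 21, 1e5, 4
ct = khmer.Counttable(ksize, tablesize, n_tables)
ct.consume('ACCTAGGAGCGGATTTACGATTACCGATAGGGACCTAGG')

fill = ct.n_occupied() / tablesize      # fraction of occupied bins (assumed meaning)
manual = fill ** n_tables               # hypothetical approximation of the fp rate
library = khmer.calc_expected_collisions(ct)
print('manual ~ %1.3g, khmer reports %1.3g' % (manual, library))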
Example #4
 def __str__(self):
     return ('**Leaf:{name} [occupied: {nb}, fpr: {fpr:.2}]'
             '-> {metadata}'.format(name=self.name,
                                    metadata=self.metadata,
                                    nb=self.data.n_occupied(),
                                    fpr=khmer.calc_expected_collisions(
                                        self.data, True, 1.1)))
Example #5
 def __str__(self):
     return "**Leaf:{name} [occupied: {nb}, fpr: {fpr:.2}] -> {metadata}".format(
         name=self.name,
         metadata=self.metadata,
         nb=self.graph.n_occupied(),
         fpr=khmer.calc_expected_collisions(self.graph, True, 1.1),
     )
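The two __str__ methods above pass extra positional arguments (True, 1.1) to calc_expected_collisions, while the script-style examples further down pass a force flag and a max_false_pos keyword. The snippet below simply lines up the three call shapes that occur on this page, using a hypothetical toy Nodegraph so it can run standalone; the argument names and the abort-unless-forced behaviour are inferred from these examples rather than quoted from the khmer documentation.

import khmer

# Hypothetical toy graph, only so the calls below have something to operate on.
graph = khmer.Nodegraph(21, 1e5, 4)
graph.consume('ACCTAGGAGCGGATTTACGATTACCGATA')

# 1) Plain estimate, as in most of the examples below.
fpr = khmer.calc_expected_collisions(graph)

# 2) force=True with a ceiling of 1.1, as in the __str__ methods above,
#    which effectively disables any abort on a high rate.
fpr = khmer.calc_expected_collisions(graph, True, 1.1)

# 3) A hard ceiling: the build-graph examples pass max_false_pos=.15 and
#    appear to rely on the call itself bailing out when the estimate
#    exceeds that ceiling, unless the force flag is set.
fpr = khmer.calc_expected_collisions(graph, False, max_false_pos=0.15)
print('estimated fp rate: %1.3f' % fpr)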
Example #6
def main():
    info("filter-abund-single.py", ["counting"])
    args = get_parser().parse_args()
    check_file_status(args.datafile)
    check_space([args.datafile])
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize)
    report_on_config(args)

    config = khmer.get_config()
    config.set_reads_input_buffer_size(args.threads * 64 * 1024)

    print "making k-mer counting table"
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize, args.n_tables, args.threads)

    # first, load reads into hash table
    rparser = khmer.ReadParser(args.datafile, args.threads)
    threads = []
    print "consuming input, round 1 --", args.datafile
    for _ in xrange(args.threads):
        cur_thread = threading.Thread(target=htable.consume_fasta_with_reads_parser, args=(rparser,))
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    fp_rate = khmer.calc_expected_collisions(htable)
    print "fp rate estimated to be %1.3f" % fp_rate

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record["name"]
        seq = record["sequence"]
        if "N" in seq:
            return None, None

        trim_seq, trim_at = htable.trim_on_abundance(seq, args.cutoff)

        if trim_at >= args.ksize:
            return name, trim_seq

        return None, None

    # the filtering loop
    print "filtering", args.datafile
    outfile = os.path.basename(args.datafile) + ".abundfilt"
    outfp = open(outfile, "w")

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(args.datafile), outfp)

    print "output in", outfile

    if args.savetable:
        print "Saving k-mer counting table filename", args.savetable
        print "...saving to", args.savetable
        htable.save(args.savetable)
Example #7
def main():
    args = sanitize_help(get_parser()).parse_args()

    configure_logging(args.quiet)
    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)

    if args.savegraph:
        tablesize = calculate_graphsize(args, "countgraph")
        check_space_for_graph(args.savegraph, tablesize, args.force)

    report_on_config(args)

    log_info("making countgraph")
    graph = khmer_args.create_countgraph(args)

    # first, load reads into graph
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    log_info("consuming input, round 1 -- {datafile}", datafile=args.datafile)
    for _ in range(args.threads):
        cur_thread = threading.Thread(target=graph.consume_fasta_with_reads_parser, args=(rparser,))
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    log_info("Total number of unique k-mers: {nk}", nk=graph.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(graph, args.force)
    log_info("fp rate estimated to be {fpr:1.3f}", fpr=fp_rate)

    # the filtering loop
    log_info("filtering {datafile}", datafile=args.datafile)
    if args.outfile is None:
        outfile = os.path.basename(args.datafile) + ".abundfilt"
    else:
        outfile = args.outfile
    outfp = open(outfile, "wb")
    outfp = get_file_writer(outfp, args.gzip, args.bzip)

    paired_iter = broken_paired_reader(ReadParser(args.datafile), min_length=graph.ksize(), force_single=True)

    for n, is_pair, read1, read2 in paired_iter:
        assert not is_pair
        assert read2 is None

        trimmed_record, _ = trim_record(graph, read1, args.cutoff, args.variable_coverage, args.normalize_to)
        if trimmed_record:
            print((trimmed_record,))
            write_record(trimmed_record, outfp)

    log_info("output in {outfile}", outfile=outfile)

    if args.savegraph:
        log_info("Saving k-mer countgraph filename {graph}", graph=args.savegraph)
        graph.save(args.savegraph)
Example #8
def main(args):
    info('build-graph.py', ['graph', 'SeqAn'])

    report_on_config(args, hashtype='nodegraph')
    base = args.output_filename
    filenames = args.input_filenames

    for fname in args.input_filenames:
        check_input_files(fname, args.force)

    # if optimization args are given, do optimization
    args = functions.do_sanity_checking(args, 0.01)

    check_space(args.input_filenames, args.force)
    check_space_for_hashtable(args, 'nodegraph', args.force)

    print('Saving k-mer presence table to %s' % base, file=sys.stderr)
    print('Loading kmers from sequences in %s' %
          repr(filenames), file=sys.stderr)
    if args.no_build_tagset:
        print('We WILL NOT build the tagset.', file=sys.stderr)
    else:
        print('We WILL build the tagset (for partitioning/traversal).',
              file=sys.stderr)

    print('making nodegraph', file=sys.stderr)
    htable = khmer_args.create_nodegraph(args)

    functions.build_graph(filenames, htable, args.threads,
                          not args.no_build_tagset)

    print('Total number of unique k-mers: {0}'.format(htable.n_unique_kmers()),
          file=sys.stderr)

    print('saving k-mer presence table in', base + '.pt', file=sys.stderr)
    htable.save(base + '.pt')

    if not args.no_build_tagset:
        print('saving tagset in', base + '.tagset', file=sys.stderr)
        htable.save_tagset(base + '.tagset')

    info_fp = open(base + '.info', 'w')
    info_fp.write('%d unique k-mers' % htable.n_unique_kmers())

    fp_rate = \
        khmer.calc_expected_collisions(htable, args.force, max_false_pos=.15)
    # 0.18 is ACTUAL MAX. Do not change.

    print('false positive rate estimated to be %1.3f' % fp_rate,
          file=sys.stderr)
    print('\nfalse positive rate estimated to be %1.3f' % fp_rate,
          file=info_fp)

    print('wrote to', base + '.info and', base + '.pt', file=sys.stderr)
    if not args.no_build_tagset:
        print('and ' + base + '.tagset', file=sys.stderr)

    sys.exit(0)
Example #9
def main(args):
    graph_type = 'nodegraph'
    report_on_config(args, graphtype=graph_type)
    base = args.output_filename
    filenames = args.input_filenames

    for fname in args.input_filenames:
        check_input_files(fname, args.force)

    graphsize = calculate_graphsize(args, graph_type)
    space_needed = (args.n_tables * graphsize /
                    khmer._buckets_per_byte[graph_type])
    check_space_for_graph(args.output_filename, space_needed, args.force)

    print('Saving k-mer nodegraph to %s' % base, file=sys.stderr)
    print('Loading kmers from sequences in %s' % repr(filenames),
          file=sys.stderr)
    if args.no_build_tagset:
        print('We WILL NOT build the tagset.', file=sys.stderr)
    else:
        print('We WILL build the tagset (for partitioning/traversal).',
              file=sys.stderr)

    print('making nodegraph', file=sys.stderr)
    nodegraph = khmer_args.create_nodegraph(args)

    oxfuncs.build_graph(filenames, nodegraph, args.threads,
                        not args.no_build_tagset)

    print('Total number of unique k-mers: {0}'.format(
        nodegraph.n_unique_kmers()),
          file=sys.stderr)

    print('saving k-mer nodegraph in', base, file=sys.stderr)
    nodegraph.save(base)

    if not args.no_build_tagset:
        print('saving tagset in', base + '.tagset', file=sys.stderr)
        nodegraph.save_tagset(base + '.tagset')

    info_fp = open(base + '.info', 'w')
    info_fp.write('%d unique k-mers' % nodegraph.n_unique_kmers())

    fp_rate = \
        khmer.calc_expected_collisions(
            nodegraph, args.force, max_false_pos=.15)
    # 0.18 is ACTUAL MAX. Do not change.

    print('false positive rate estimated to be %1.3f' % fp_rate,
          file=sys.stderr)
    print('\nfalse positive rate estimated to be %1.3f' % fp_rate,
          file=info_fp)

    print('wrote to ' + base + '.info and ' + base, file=sys.stderr)
    if not args.no_build_tagset:
        print('and ' + base + '.tagset', file=sys.stderr)

    sys.exit(0)
Example #10
def main(args):
    graph_type = 'nodegraph'
    report_on_config(args, graphtype=graph_type)
    base = args.output_filename
    filenames = args.input_filenames

    for fname in args.input_filenames:
        check_input_files(fname, args.force)

    graphsize = calculate_graphsize(args, graph_type)
    space_needed = (args.n_tables * graphsize /
                    khmer._buckets_per_byte[graph_type])
    check_space_for_graph(args.output_filename, space_needed, args.force)

    print('Saving k-mer nodegraph to %s' % base, file=sys.stderr)
    print('Loading kmers from sequences in %s' %
          repr(filenames), file=sys.stderr)
    if args.no_build_tagset:
        print('We WILL NOT build the tagset.', file=sys.stderr)
    else:
        print('We WILL build the tagset (for partitioning/traversal).',
              file=sys.stderr)

    print('making nodegraph', file=sys.stderr)
    nodegraph = khmer_args.create_nodegraph(args)

    oxfuncs.build_graph(filenames, nodegraph, args.threads,
                        not args.no_build_tagset)

    print('Total number of unique k-mers: {0}'.format(
        nodegraph.n_unique_kmers()), file=sys.stderr)

    print('saving k-mer nodegraph in', base, file=sys.stderr)
    nodegraph.save(base)

    if not args.no_build_tagset:
        print('saving tagset in', base + '.tagset', file=sys.stderr)
        nodegraph.save_tagset(base + '.tagset')

    info_fp = open(base + '.info', 'w')
    info_fp.write('%d unique k-mers' % nodegraph.n_unique_kmers())

    fp_rate = \
        khmer.calc_expected_collisions(
            nodegraph, args.force, max_false_pos=.15)
    # 0.18 is ACTUAL MAX. Do not change.

    print('false positive rate estimated to be %1.3f' % fp_rate,
          file=sys.stderr)
    print('\nfalse positive rate estimated to be %1.3f' % fp_rate,
          file=info_fp)

    print('wrote to ' + base + '.info and ' + base, file=sys.stderr)
    if not args.no_build_tagset:
        print('and ' + base + '.tagset', file=sys.stderr)

    sys.exit(0)
Example #11
def main():
    parser = build_construct_args()
    parser.add_argument('--build-tagset',
                        '-t',
                        default=True,
                        action='store_false',
                        help='Construct tagset while loading sequences')
    parser.add_argument('output_filename')
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MIN_HASHSIZE:
            print >> sys.stderr, "** WARNING: hashsize is default!  You absodefly want to increase this!\n** Please read the docs!"

        print >> sys.stderr, '\nPARAMETERS:'
        print >> sys.stderr, ' - kmer size =    %d \t\t(-k)' % args.ksize
        print >> sys.stderr, ' - n hashes =     %d \t\t(-N)' % args.n_hashes
        print >> sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % args.min_hashsize
        print >> sys.stderr, ''
        print >> sys.stderr, 'Estimated memory usage is %.2g bytes (n_hashes x min_hashsize / 8)' % (
            args.n_hashes * args.min_hashsize / 8.)
        print >> sys.stderr, '-' * 8

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    base = args.output_filename
    filenames = args.input_filenames

    print 'Saving hashtable to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)

    ###

    print 'making hashtable'
    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)

    for n, filename in enumerate(filenames):
        print 'consuming input', filename
        ht.consume_fasta_and_tag(filename)

    print 'saving hashtable in', base + '.ht'
    ht.save(base + '.ht')
    print 'saving tagset in', base + '.tagset'
    ht.save_tagset(base + '.tagset')

    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate
    if fp_rate > 0.15:  # 0.18 is ACTUAL MAX. Do not change.
        print >> sys.stderr, "**"
        print >> sys.stderr, "** ERROR: the graph structure is too small for"
        print >> sys.stderr, "** this data set.  Increase hashsize/num ht."
        print >> sys.stderr, "**"
        sys.exit(-1)
Example #12
def main():
    info('optimal_args_hashbits.py', ['graph', 'SeqAn'])
    args = get_parser().parse_args()
    report_on_config(args, hashtype='hashbits')

    filenames = args.input_filenames
    base = filenames[0]
    for _ in args.input_filenames:
        check_input_files(_, False)

    check_space(args.input_filenames, False)

    print('Counting kmers from sequences in %s' % repr(filenames),
          file=sys.stderr)

    htable = khmer.new_hashbits(args.ksize, args.max_tablesize, args.n_tables)
    target_method = htable.consume_fasta_with_reads_parser

    for _, filename in enumerate(filenames):
        rparser = khmer.ReadParser(filename)
        threads = []
        print('consuming input', filename, file=sys.stderr)
        for num in xrange(args.threads):
            cur_thread = threading.Thread(target=target_method,
                                          args=(rparser, ))
            threads.append(cur_thread)
            cur_thread.start()

        for thread in threads:
            thread.join()
    unique_kmers = htable.n_unique_kmers()
    print('Total number of unique k-mers: {0}'.format(unique_kmers),
          file=sys.stderr)

    info_optimal = open(base + '.optimal_args', 'w')

    fp_rate = khmer.calc_expected_collisions(htable)
    print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)

    if fp_rate > 0.15:  # 0.18 is ACTUAL MAX. Do not change.
        print("**", file=sys.stderr)
        print(
            "** ERROR: the graph structure is too small for this data set."
            "Increase table size/# tables.",
            file=sys.stderr)
        print("**", file=sys.stderr)
        if not False:
            sys.exit(1)

    to_print = output_gen(unique_kmers, fp_rate)

    print(to_print, file=info_optimal)

    print('optimal arguments were written to',
          base + '.optimal_args',
          file=sys.stderr)
Example #13
def main():
    parser = build_construct_args()
    add_threading_args(parser)

    parser.add_argument('datafile')
    
    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    n_threads = int(args.n_threads)

    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads)

    filename = args.datafile
    
    ### first, load reads into hash table
    rparser = khmer.ReadParser(filename, n_threads)
    threads = []
    print 'consuming input, round 1 --', filename
    for tnum in xrange(n_threads):
        t = \
            threading.Thread(
                target=ht.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    ### now, count.
    total = 0
    total_unique = 0
    for n, record in enumerate(screed.open(filename)):
        total += 1
        last_kmer = record.sequence[-K:]
        count = ht.get(last_kmer)
        if count == 1:
            total_unique += 1

    print 'singletons: %d unique; of %d total; %.3f' % \
        (total_unique, total, total_unique/float(total))
Example #14
def main():
    parser = build_construct_args()
    parser.add_argument('--build-tagset', '-t', default=True,
                        action='store_false',
                        help='Construct tagset while loading sequences')
    parser.add_argument('output_filename')
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MIN_HASHSIZE:
            print>>sys.stderr, "** WARNING: hashsize is default!  You absodefly want to increase this!\n** Please read the docs!"

        print>>sys.stderr, '\nPARAMETERS:'
        print>>sys.stderr, ' - kmer size =    %d \t\t(-k)' % args.ksize
        print>>sys.stderr, ' - n hashes =     %d \t\t(-N)' % args.n_hashes
        print>>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % args.min_hashsize
        print>>sys.stderr, ''
        print>>sys.stderr, 'Estimated memory usage is %.2g bytes (n_hashes x min_hashsize / 8)' % (args.n_hashes * args.min_hashsize / 8.)
        print>>sys.stderr, '-'*8

    K=args.ksize
    HT_SIZE=args.min_hashsize
    N_HT=args.n_hashes

    base = args.output_filename
    filenames = args.input_filenames

    print 'Saving hashtable to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)

    ###
    
    print 'making hashtable'
    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)

    for n, filename in enumerate(filenames):
        print 'consuming input', filename
        ht.consume_fasta_and_tag(filename)

    print 'saving hashtable in', base + '.ht'
    ht.save(base + '.ht')
    print 'saving tagset in', base + '.tagset'
    ht.save_tagset(base + '.tagset')

    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate
    if fp_rate > 0.15:          # 0.18 is ACTUAL MAX. Do not change.
        print >>sys.stderr, "**"
        print >>sys.stderr, "** ERROR: the graph structure is too small for"
        print >>sys.stderr, "** this data set.  Increase hashsize/num ht."
        print >>sys.stderr, "**"
        sys.exit(-1)
Example #15
def main():
    info('optimal_args_nodegraph.py', ['graph', 'SeqAn'])
    args = get_parser().parse_args()
    report_on_config(args, graphtype='nodegraph')


    filenames = args.input_filenames
    base = filenames[0]
    for _ in args.input_filenames:
        check_input_files(_, False)

    check_space(args.input_filenames, False)

    print('Counting kmers from sequences in %s' % repr(filenames),
          file=sys.stderr)

    htable = khmer.new_nodegraph(args.ksize, args.max_tablesize, args.n_tables)
    target_method = htable.consume_fasta_with_reads_parser

    for _, filename in enumerate(filenames):
        rparser = khmer.ReadParser(filename)
        threads = []
        print('consuming input', filename, file=sys.stderr)
        for num in xrange(args.threads):
            cur_thread = threading.Thread(
                target=target_method, args=(rparser,))
            threads.append(cur_thread)
            cur_thread.start()

        for thread in threads:
            thread.join()
    unique_kmers = htable.n_unique_kmers()
    print('Total number of unique k-mers: {0}'.format(unique_kmers),
          file=sys.stderr)

    info_optimal = open(base + '.optimal_args', 'w')

    fp_rate = khmer.calc_expected_collisions(htable)
    print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)

    if fp_rate > 0.15:          # 0.18 is ACTUAL MAX. Do not change.
        print("**", file=sys.stderr)
        print("** ERROR: the graph structure is too small for this data set."
              "Increase table size/# tables.", file=sys.stderr)
        print("**", file=sys.stderr)
        if not False:
            sys.exit(1)

    to_print = graphsize_args_report(unique_kmers, fp_rate)
    
    print(to_print, file=info_optimal)
    
    print('optimal arguments were written to', base + '.optimal_args',
          file=sys.stderr)
Example #16
def main():
    parser = build_construct_args()
    add_threading_args(parser)

    parser.add_argument('datafile')

    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    n_threads = int(args.n_threads)

    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads)

    filename = args.datafile

    ### first, load reads into hash table
    rparser = khmer.ReadParser(filename, n_threads)
    threads = []
    print 'consuming input, round 1 --', filename
    for tnum in xrange(n_threads):
        t = \
            threading.Thread(
                target=ht.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    ### now, count.
    total = 0
    total_unique = 0
    for n, record in enumerate(screed.open(filename)):
        total += 1
        last_kmer = record.sequence[-K:]
        count = ht.get(last_kmer)
        if count == 1:
            total_unique += 1

    print 'singletons: %d unique; of %d total; %.3f' % \
        (total_unique, total, total_unique/float(total))
Example #17
def main(args):
    info('build-graph.py', ['graph', 'SeqAn'])

    report_on_config(args, hashtype='hashbits')
    base = args.output_filename
    filenames = args.input_filenames

    for fname in args.input_filenames:
        check_input_files(fname, args.force)

    check_space(args.input_filenames, args.force)
    check_space_for_hashtable(
        (float(args.n_tables * args.min_tablesize) / 8.), args.force)

    print >>sys.stderr, 'Saving k-mer presence table to %s' % base
    print >>sys.stderr, 'Loading kmers from sequences in %s' % repr(filenames)
    if args.no_build_tagset:
        print >>sys.stderr, 'We WILL NOT build the tagset.'
    else:
        print >>sys.stderr, 'We WILL build the tagset', \
                            ' (for partitioning/traversal).'

    print >>sys.stderr, 'making k-mer presence table'
    htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables)

    functions.build_graph(filenames, htable, args.threads,
                          not args.no_build_tagset)

    print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
        htable.n_unique_kmers())

    print >>sys.stderr, 'saving k-mer presence table in', base + '.pt'
    htable.save(base + '.pt')

    if not args.no_build_tagset:
        print >>sys.stderr, 'saving tagset in', base + '.tagset'
        htable.save_tagset(base + '.tagset')

    info_fp = open(base + '.info', 'w')
    info_fp.write('%d unique k-mers' % htable.n_unique_kmers())

    fp_rate = \
        khmer.calc_expected_collisions(htable, args.force, max_false_pos=.15)
    # 0.18 is ACTUAL MAX. Do not change.

    print >>sys.stderr, 'false positive rate estimated to be %1.3f' % fp_rate
    print >>info_fp, '\nfalse positive rate estimated to be %1.3f' % fp_rate

    print >> sys.stderr, 'wrote to', base + '.info and', base + '.pt'
    if not args.no_build_tagset:
        print >> sys.stderr, 'and ' + base + '.tagset'

    sys.exit(0)
Example #18
def main(args):
    info("build-graph.py", ["graph", "SeqAn"])

    report_on_config(args, hashtype="hashbits")
    base = args.output_filename
    filenames = args.input_filenames

    for fname in args.input_filenames:
        check_input_files(fname, args.force)

    check_space(args.input_filenames, args.force)
    check_space_for_hashtable((float(args.n_tables * args.min_tablesize) / 8.0), args.force)

    print("Saving k-mer presence table to %s" % base, file=sys.stderr)
    print("Loading kmers from sequences in %s" % repr(filenames), file=sys.stderr)
    if args.no_build_tagset:
        print("We WILL NOT build the tagset.", file=sys.stderr)
    else:
        print("We WILL build the tagset (for partitioning/traversal).", file=sys.stderr)

    print("making k-mer presence table", file=sys.stderr)
    htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables)

    functions.build_graph(filenames, htable, args.threads, not args.no_build_tagset)

    print("Total number of unique k-mers: {0}".format(htable.n_unique_kmers()), file=sys.stderr)

    print("saving k-mer presence table in", base + ".pt", file=sys.stderr)
    htable.save(base + ".pt")

    if not args.no_build_tagset:
        print("saving tagset in", base + ".tagset", file=sys.stderr)
        htable.save_tagset(base + ".tagset")

    info_fp = open(base + ".info", "w")
    info_fp.write("%d unique k-mers" % htable.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(htable, args.force, max_false_pos=0.15)
    # 0.18 is ACTUAL MAX. Do not change.

    print("false positive rate estimated to be %1.3f" % fp_rate, file=sys.stderr)
    print("\nfalse positive rate estimated to be %1.3f" % fp_rate, file=info_fp)

    print("wrote to", base + ".info and", base + ".pt", file=sys.stderr)
    if not args.no_build_tagset:
        print("and " + base + ".tagset", file=sys.stderr)

    sys.exit(0)
Example #19
def main(args):
    info("build-graph.py", ["graph", "SeqAn"])

    report_on_config(args, graphtype="nodegraph")
    base = args.output_filename
    filenames = args.input_filenames

    for fname in args.input_filenames:
        check_input_files(fname, args.force)

    graphsize = calculate_graphsize(args, "nodegraph")
    check_space_for_graph(args.output_filename, graphsize, args.force)

    print("Saving k-mer nodegraph to %s" % base, file=sys.stderr)
    print("Loading kmers from sequences in %s" % repr(filenames), file=sys.stderr)
    if args.no_build_tagset:
        print("We WILL NOT build the tagset.", file=sys.stderr)
    else:
        print("We WILL build the tagset (for partitioning/traversal).", file=sys.stderr)

    print("making nodegraph", file=sys.stderr)
    nodegraph = khmer_args.create_nodegraph(args)

    oxfuncs.build_graph(filenames, nodegraph, args.threads, not args.no_build_tagset)

    print("Total number of unique k-mers: {0}".format(nodegraph.n_unique_kmers()), file=sys.stderr)

    print("saving k-mer nodegraph in", base, file=sys.stderr)
    nodegraph.save(base)

    if not args.no_build_tagset:
        print("saving tagset in", base + ".tagset", file=sys.stderr)
        nodegraph.save_tagset(base + ".tagset")

    info_fp = open(base + ".info", "w")
    info_fp.write("%d unique k-mers" % nodegraph.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(nodegraph, args.force, max_false_pos=0.15)
    # 0.18 is ACTUAL MAX. Do not change.

    print("false positive rate estimated to be %1.3f" % fp_rate, file=sys.stderr)
    print("\nfalse positive rate estimated to be %1.3f" % fp_rate, file=info_fp)

    print("wrote to " + base + ".info and " + base, file=sys.stderr)
    if not args.no_build_tagset:
        print("and " + base + ".tagset", file=sys.stderr)

    sys.exit(0)
Example #20
def main():  # pylint: disable=too-many-branches,too-many-statements
    info('saturate-by-median.py', ['diginorm'])
    args = get_parser().parse_args()

    report_on_config(args)

    report_fp = args.report
    report_frequency = args.report_frequency

    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames)
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize)

    # list to save error files along with throwing exceptions
    if args.force:
        corrupt_files = []

    if args.loadtable:
        print 'loading k-mer counting table from', args.loadtable
        htable = khmer.load_counting_hash(args.loadtable)
    else:
        print 'making k-mer counting table'
        htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                         args.n_tables)

    total = 0
    discarded = 0

    for index, input_filename in enumerate(args.input_filenames):
        total_acc = 0
        discarded_acc = 0

        try:
            total_acc, discarded_acc = normalize_by_median(
                input_filename, htable, args, report_fp, report_frequency)
        except IOError as err:
            handle_error(err, input_filename)
            if not args.force:
                print >> sys.stderr, '** Exiting!'
                sys.exit(1)
            else:
                print >> sys.stderr, '*** Skipping error file, moving on...'
                corrupt_files.append(input_filename)
        else:
            if total_acc == 0 and discarded_acc == 0:
                print 'SKIPPED empty file', input_filename
            else:
                total += total_acc
                discarded += discarded_acc
                print 'DONE with {inp}; kept {kept} of {total} or {perc:2}%'\
                    .format(inp=input_filename,
                            kept=total - discarded, total=total,
                            perc=int(100. - discarded / float(total) * 100.))

    if args.savetable:
        print 'Saving k-mer counting table through', input_filename
        print '...saving to', args.savetable
        htable.save(args.savetable)

    fp_rate = khmer.calc_expected_collisions(htable)
    print 'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate)

    if args.force and len(corrupt_files) > 0:
        print >> sys.stderr, "** WARNING: Finished with errors!"
        print >> sys.stderr, "** IOErrors occurred in the following files:"
        print >> sys.stderr, "\t", " ".join(corrupt_files)

    if fp_rate > MAX_FALSE_POSITIVE_RATE:
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the k-mer counting table is too small"
                              " for this data set.  Increase tablesize/# "
                              "tables.")
        print >> sys.stderr, "**"
        print >> sys.stderr, "** Do not use these results!!"
        sys.exit(1)
Example #21
def main():

    info('load-into-counting.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args)

    base = args.output_countingtable_filename
    filenames = args.input_sequence_filename

    for name in args.input_sequence_filename:
        check_file_status(name)

    check_space(args.input_sequence_filename)
    check_space_for_hashtable(args.n_tables * args.min_tablesize)

    print 'Saving k-mer counting table to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)

    print 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables, args.n_threads)
    htable.set_use_bigcount(args.bigcount)

    config = khmer.get_config()
    config.set_reads_input_buffer_size(args.n_threads * 64 * 1024)

    for index, filename in enumerate(filenames):

        rparser = khmer.ReadParser(filename, args.n_threads)
        threads = []
        print 'consuming input', filename
        for _ in xrange(args.n_threads):
            cur_thrd = \
                threading.Thread(
                    target=htable.consume_fasta_with_reads_parser,
                    args=(rparser, )
                )
            threads.append(cur_thrd)
            cur_thrd.start()

        for _ in threads:
            _.join()

        if index > 0 and index % 10 == 0:
            check_space_for_hashtable(args.n_tables * args.min_tablesize)
            print 'mid-save', base
            htable.save(base)
            open(base + '.info', 'w').write('through %s' % filename)

    print 'saving', base
    htable.save(base)

    info_fp = open(base + '.info', 'w')
    info_fp.write('through end: %s\n' % filename)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(htable)
    print 'fp rate estimated to be %1.3f' % fp_rate
    print >> info_fp, 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the k-mer counting table is too small"
                              " this data set.  Increase tablesize/# tables.")
        print >> sys.stderr, "**"
        sys.exit(1)

    print 'DONE.'
Example #22
def main():
    parser = build_construct_args()
    parser.add_argument('-C',
                        '--cutoff',
                        type=int,
                        dest='cutoff',
                        default=DEFAULT_DESIRED_COVERAGE)
    parser.add_argument('-s', '--savehash', dest='savehash', default='')
    parser.add_argument('-l', '--loadhash', dest='loadhash', default='')
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MIN_HASHSIZE:
            print >> sys.stderr, "** WARNING: hashsize is default!  You absodefly want to increase this!\n** Please read the docs!"

        print >> sys.stderr, '\nPARAMETERS:'
        print >> sys.stderr, ' - kmer size =    %d \t\t(-k)' % args.ksize
        print >> sys.stderr, ' - n hashes =     %d \t\t(-N)' % args.n_hashes
        print >> sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % args.min_hashsize
        print >> sys.stderr, ''
        print >> sys.stderr, 'Estimated memory usage is %.2g bytes (n_hashes x min_hashsize)' % (
            args.n_hashes * args.min_hashsize)
        print >> sys.stderr, '-' * 8

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    DESIRED_COVERAGE = args.cutoff

    filenames = args.input_filenames

    if args.loadhash:
        print 'loading hashtable from', args.loadhash
        ht = khmer.load_counting_hash(args.loadhash)
    else:
        print 'making hashtable'
        ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    total = 0
    discarded = 0
    for input_filename in filenames:
        output_name = os.path.basename(input_filename) + '.keepkad'
        outfp = open(output_name, 'w')

        for n, record in enumerate(screed.open(input_filename)):
            if n > 0 and n % 10000 == 0:
                print '... kept', total - discarded, 'of', total, ', or', \
                    int(100. - discarded / float(total) * 100.), '%'
                print '... in file', input_filename

            total += 1

            if len(record.sequence) < K:
                continue

            seq = record.sequence.replace('N', 'A')
            kad = ht.get_kadian_count(seq)

            if kad < DESIRED_COVERAGE:
                ht.consume(seq)
                outfp.write('>%s\n%s\n' % (record.name, record.sequence))
            else:
                discarded += 1

        print 'DONE with', input_filename, '; kept', total - discarded, 'of',\
            total, 'or', int(100. - discarded / float(total) * 100.), '%'
        print 'output in', output_name

    if args.savehash:
        print 'Saving hashfile through', input_filename
        print '...saving to', args.savehash
        ht.save(args.savehash)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >> sys.stderr, "**"
        print >> sys.stderr, "** ERROR: the counting hash is too small for"
        print >> sys.stderr, "** this data set.  Increase hashsize/num ht."
        print >> sys.stderr, "**"
        print >> sys.stderr, "** Do not use these results!!"
        sys.exit(-1)
Example #23
def main():  # pylint: disable=too-many-branches,too-many-statements
    parser = sanitize_help(get_parser())
    args = parser.parse_args()

    configure_logging(args.quiet)
    report_on_config(args)

    report_fp = args.report
    force_single = args.force_single

    # check for similar filenames
    # if we're using a single output file only check for identical filenames
    # otherwise, check for identical BASE names as well.
    filenames = []
    basenames = []
    for pathfilename in args.input_filenames:
        filenames.append(pathfilename)
        if args.single_output_file:
            continue  # nothing more to worry about

        basename = os.path.basename(pathfilename)
        if basename in basenames:
            log_error('ERROR: Duplicate filename--Cannot handle this!')
            log_error('** Exiting!')
            sys.exit(1)

        basenames.append(basename)

    # check that files exist and there is sufficient output disk space.
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph is not None:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    # load or create counting table.
    if args.loadgraph:
        log_info('loading k-mer countgraph from {graph}', graph=args.loadgraph)
        countgraph = khmer.load_countgraph(args.loadgraph)
    else:
        log_info('making countgraph')
        countgraph = khmer_args.create_countgraph(args)

    # create an object to handle diginorm of all files
    norm = Normalizer(args.cutoff, countgraph)
    with_diagnostics = WithDiagnostics(norm, report_fp, args.report_frequency)

    # make a list of all filenames and if they're paired or not;
    # if we don't know if they're paired, default to allowing but not
    # forcing pairing.
    files = []
    for element in filenames:
        files.append([element, args.paired])
    if args.unpaired_reads:
        files.append([args.unpaired_reads, False])

    corrupt_files = []
    outfp = None
    output_name = None

    if args.single_output_file:
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)
    else:
        if '-' in filenames or '/dev/stdin' in filenames:
            print(
                "Accepting input from stdin; output filename must "
                "be provided with '-o'.",
                file=sys.stderr)
            sys.exit(1)

    #
    # main loop: iterate over all files given, do diginorm.
    #

    for filename, require_paired in files:
        if not args.single_output_file:
            output_name = os.path.basename(filename) + '.keep'
            outfp = open(output_name, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        # failsafe context manager in case an input file breaks
        with catch_io_errors(filename, outfp, args.single_output_file,
                             args.force, corrupt_files):
            screed_iter = clean_input_reads(screed.open(filename))
            reader = broken_paired_reader(screed_iter,
                                          min_length=args.ksize,
                                          force_single=force_single,
                                          require_paired=require_paired)

            # actually do diginorm
            for record in with_diagnostics(reader, filename):
                if record is not None:
                    write_record(record, outfp)

            log_info('output in {name}', name=describe_file_handle(outfp))
            if not args.single_output_file:
                outfp.close()

    # finished - print out some diagnostics.

    log_info('Total number of unique k-mers: {umers}',
             umers=countgraph.n_unique_kmers())

    if args.savegraph is not None:
        log_info('...saving to {name}', name=args.savegraph)
        countgraph.save(args.savegraph)

    fp_rate = \
        khmer.calc_expected_collisions(countgraph, False, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
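    # (Zhang et al. report that median-count decisions of the kind diginorm
    # makes remain largely unchanged even at high counting false-positive
    # rates, which is presumably why this ceiling is far looser than the
    # 0.15-0.2 used by the graph-building examples above.)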

    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    if args.force and len(corrupt_files) > 0:
        log_error("** WARNING: Finished with errors!")
        log_error("** I/O Errors occurred in the following files:")
        log_error("\t" + " ".join(corrupt_files))
Example #24
def main():

    info('load-into-counting.py', ['counting', 'SeqAn'])

    args = sanitize_help(get_parser()).parse_args()
    report_on_config(args)

    base = args.output_countgraph_filename
    filenames = args.input_sequence_filename

    for name in args.input_sequence_filename:
        check_input_files(name, args.force)

    tablesize = calculate_graphsize(args, 'countgraph')
    check_space_for_graph(args.output_countgraph_filename, tablesize,
                          args.force)

    check_file_writable(base)
    check_file_writable(base + ".info")

    print('Saving k-mer countgraph to %s' % base, file=sys.stderr)
    print('Loading kmers from sequences in %s' % repr(filenames),
          file=sys.stderr)

    # clobber the '.info' file now, as we always open in append mode below
    if os.path.exists(base + '.info'):
        os.remove(base + '.info')

    print('making countgraph', file=sys.stderr)
    countgraph = khmer_args.create_countgraph(args)
    countgraph.set_use_bigcount(args.bigcount)

    filename = None

    total_num_reads = 0

    for index, filename in enumerate(filenames):

        rparser = khmer.ReadParser(filename)
        threads = []
        print('consuming input', filename, file=sys.stderr)
        for _ in range(args.threads):
            cur_thrd = \
                threading.Thread(
                    target=countgraph.consume_fasta_with_reads_parser,
                    args=(rparser, )
                )
            threads.append(cur_thrd)
            cur_thrd.start()

        for thread in threads:
            thread.join()

        if index > 0 and index % 10 == 0:
            tablesize = calculate_graphsize(args, 'countgraph')
            check_space_for_graph(base, tablesize, args.force)
            print('mid-save', base, file=sys.stderr)

            countgraph.save(base)
        with open(base + '.info', 'a') as info_fh:
            print('through', filename, file=info_fh)
        total_num_reads += rparser.num_reads

    n_kmers = countgraph.n_unique_kmers()
    print('Total number of unique k-mers:', n_kmers, file=sys.stderr)
    with open(base + '.info', 'a') as info_fp:
        print('Total number of unique k-mers:', n_kmers, file=info_fp)

    print('saving', base, file=sys.stderr)
    countgraph.save(base)

    # Change max_false_pos=0.2 only if you really grok it. HINT: You don't
    fp_rate = \
        khmer.calc_expected_collisions(
            countgraph, args.force, max_false_pos=.2)

    with open(base + '.info', 'a') as info_fp:
        print('fp rate estimated to be %1.3f\n' % fp_rate, file=info_fp)

    if args.summary_info:
        mr_fmt = args.summary_info.lower()
        mr_file = base + '.info.' + mr_fmt
        print("Writing summmary info to", mr_file, file=sys.stderr)
        with open(mr_file, 'w') as mr_fh:
            if mr_fmt == 'json':
                mr_data = {
                    "ht_name": os.path.basename(base),
                    "fpr": fp_rate,
                    "num_kmers": n_kmers,
                    "files": filenames,
                    "mrinfo_version": "0.2.0",
                    "num_reads": total_num_reads,
                }
                json.dump(mr_data, mr_fh)
                mr_fh.write('\n')
            elif mr_fmt == 'tsv':
                mr_fh.write("ht_name\tfpr\tnum_kmers\tnum_reads\tfiles\n")
                vals = [
                    os.path.basename(base),
                    "{:1.3f}".format(fp_rate),
                    str(n_kmers),
                    str(total_num_reads),
                    ";".join(filenames),
                ]
                mr_fh.write("\t".join(vals) + "\n")

    print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)

    print('DONE.', file=sys.stderr)
    print('wrote to:', base + '.info', file=sys.stderr)
Example #25
def main():
    parser = argparse.ArgumentParser(description='XXX')

    env_ksize = os.environ.get('KHMER_KSIZE', DEFAULT_K)
    env_n_hashes = os.environ.get('KHMER_N_HASHES', DEFAULT_N_HT)
    env_hashsize = os.environ.get('KHMER_MIN_HASHSIZE', DEFAULT_MIN_HASHSIZE)

    parser.add_argument('--ksize', '-k', type=int, dest='ksize',
                        default=env_ksize,
                        help='k-mer size to use')
    parser.add_argument('--n_hashes', '-N', type=int, dest='n_hashes',
                        default=env_n_hashes,
                        help='number of hash tables to use')
    parser.add_argument('--hashsize', '-x', type=float, dest='min_hashsize',
                        default=env_hashsize,
                        help='lower bound on hashsize to use')

    parser.add_argument('--cutoff', '-C', type=int, dest='abund_cutoff',
                        help='remove k-mers below this abundance',
                        default=DEFAULT_CUTOFF)

    parser.add_argument('--normalize-to', '-Z', type=int, dest='normalize_to',
                        help='base cutoff on median k-mer abundance of this',
                        default=DEFAULT_NORMALIZE_LIMIT)

    parser.add_argument('--variable-coverage', '-V', action='store_true',
                        dest='variable_coverage', default=False,
                        help='Only trim low-abundance k-mers from sequences '
                        'that have high coverage.')
    parser.add_argument('--tempdir', '-T', type=str, dest='tempdir',
                        default='./')

    parser.add_argument('input_filenames', nargs='+')
    args = parser.parse_args()

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    CUTOFF = args.abund_cutoff
    NORMALIZE_LIMIT = args.normalize_to

    print >>sys.stderr, "K:", K
    print >>sys.stderr, "HT SIZE:", HT_SIZE
    print >>sys.stderr, "N HT:", N_HT
    print >>sys.stderr, "CUTOFF:", CUTOFF
    print >>sys.stderr, "NORMALIZE_LIMIT:", NORMALIZE_LIMIT

    print >>sys.stderr, 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    print >>sys.stderr, 'created temporary directory %s; ' % tempdir + \
          'use -T to change location'

    ###

    save_pass2 = 0

    read_bp = 0
    read_reads = 0

    pass2list = []
    for filename in args.input_filenames:
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)

        pass2list.append((filename, pass2filename))

        pass2fp = open(pass2filename, 'w')

        for n, read in enumerate(screed.open(filename)):
            if n % 100000 == 0:
                print >>sys.stderr, '...', n, filename, save_pass2, \
                      read_reads, read_bp
                
            read_reads += 1
            read_bp += len(read.sequence)

            seq = read.sequence.replace('N', 'A')
            med, _, _ = ht.get_median_count(seq)

            # has this portion of the graph saturated? if not,
            # consume & save => pass2.
            if med < NORMALIZE_LIMIT:
                ht.consume(seq)
                pass2fp.write(output_single(read))
                save_pass2 += 1
            else:
                posns = ht.find_spectral_error_positions(seq, CUTOFF)
                posns = add_n_posns(posns, read.sequence)
                print read.name, ",".join(map(str, posns))
                
        pass2fp.close()

        print >>sys.stderr, '%s: kept aside %d of %d from first pass, in %s' %\
              (filename, save_pass2, n + 1, filename)

    n_omitted = 0
    for orig_filename, pass2filename in pass2list:
        print >>sys.stderr,'second pass: looking at ' + \
              'sequences kept aside in %s' % pass2filename
        for n, read in enumerate(screed.open(pass2filename)):
            if n % 100000 == 0:
                print >>sys.stderr, '... x 2', n, pass2filename, read_reads, \
                      read_bp

            seq = read.sequence.replace('N', 'A')
            med, _, _ = ht.get_median_count(seq)

            if med >= NORMALIZE_LIMIT or not args.variable_coverage:
                posns = ht.find_spectral_error_positions(seq, CUTOFF)
                posns = add_n_posns(posns, read.sequence)
                print read.name, ",".join(map(str, posns))

            if args.variable_coverage and med < NORMALIZE_LIMIT:
                print read.name, 'V'
                n_omitted += 1

        print >>sys.stderr, 'removing %s' % pass2filename
        os.unlink(pass2filename)

    print >>sys.stderr, 'removing temp directory & contents (%s)' % tempdir
    shutil.rmtree(tempdir)

    print >>sys.stderr, 'read %d reads, %d bp' % (read_reads, read_bp,)
    if args.variable_coverage:
        print >>sys.stderr, 'omitted %d reads for -V' % (n_omitted)

    fp_rate = khmer.calc_expected_collisions(ht)
    print >>sys.stderr, \
          'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate)

    if fp_rate > MAX_FALSE_POSITIVE_RATE:
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the k-mer counting table is too small"
                              " for this data set. Increase tablesize/# "
                              "tables.")
        print >> sys.stderr, "**"
        print >> sys.stderr, "** Do not use these results!!"
        sys.exit(1)
Example #26
def main():
    parser = build_construct_args()
    parser.add_argument('-C', '--cutoff', type=int, dest='cutoff',
                        default=DEFAULT_DESIRED_COVERAGE)
    parser.add_argument('-s', '--savehash', dest='savehash', default='')
    parser.add_argument('-l', '--loadhash', dest='loadhash',
                        default='')
    parser.add_argument('-R', '--report-to-file', dest='report_file',
                        type=argparse.FileType('w'))
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MIN_HASHSIZE:
            print>>sys.stderr, "** WARNING: hashsize is default!  You absodefly want to increase this!\n** Please read the docs!"

        print>>sys.stderr, '\nPARAMETERS:'
        print>>sys.stderr, ' - kmer size =    %d \t\t(-k)' % args.ksize
        print>>sys.stderr, ' - n hashes =     %d \t\t(-N)' % args.n_hashes
        print>>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % args.min_hashsize
        print>>sys.stderr, ''
        print>>sys.stderr, 'Estimated memory usage is %.2g bytes (n_hashes x min_hashsize)' % (args.n_hashes * args.min_hashsize)
        print>>sys.stderr, '-'*8

    K=args.ksize
    HT_SIZE=args.min_hashsize
    N_HT=args.n_hashes
    DESIRED_COVERAGE=args.cutoff
    report_fp = args.report_file

    filenames = args.input_filenames

    if args.loadhash:
        print 'loading hashtable from', args.loadhash
        ht = khmer.load_counting_hash(args.loadhash)
    else:
        print 'making hashtable'
        ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    total = 0
    discarded = 0
    for input_filename in filenames:
        output_name = os.path.basename(input_filename) + '.keep'
        outfp = open(output_name, 'w')

        n = -1
        for n, record in enumerate(screed.open(input_filename)):
            if n > 0 and n % 100000 == 0:
                print '... kept', total - discarded, 'of', total, ', or', \
                    int(100. - discarded / float(total) * 100.), '%'
                print '... in file', input_filename

                if report_fp:
                    print>>report_fp, total, total - discarded, \
                        1. - (discarded / float(total))
                    report_fp.flush()

            total += 1

            if len(record.sequence) < K:
                continue

            seq = record.sequence.replace('N', 'A')
            med, _, _ = ht.get_median_count(seq)

            if med < DESIRED_COVERAGE:
                ht.consume(seq)
                outfp.write('>%s\n%s\n' % (record.name, record.sequence))
            else:
                discarded += 1

        if -1 < n:
            print 'DONE with', input_filename, '; kept', total - discarded, 'of',\
                total, 'or', int(100. - discarded / float(total) * 100.), '%'
            print 'output in', output_name
        else:
            print 'SKIPPED empty file', input_filename

    if args.savehash:
        print 'Saving hashfile through', input_filename
        print '...saving to', args.savehash
        ht.save(args.savehash)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >>sys.stderr, "**"
        print >>sys.stderr, "** ERROR: the counting hash is too small for"
        print >>sys.stderr, "** this data set.  Increase hashsize/num ht."
        print >>sys.stderr, "**"
        print >>sys.stderr, "** Do not use these results!!"
        sys.exit(-1)
Example #27
def main():
    parser = build_construct_args()
    parser.add_argument('-C', '--cutoff', type=int, dest='cutoff',
                        default=DEFAULT_DESIRED_COVERAGE)
    parser.add_argument('-p', '--paired', action='store_true')
    parser.add_argument('-s', '--savehash', dest='savehash', default='')
    parser.add_argument('-l', '--loadhash', dest='loadhash',
                        default='')
    parser.add_argument('-R', '--report-to-file', dest='report_file',
                        type=argparse.FileType('w'))
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MIN_HASHSIZE:
            print >>sys.stderr, \
                "** WARNING: hashsize is default!  " \
                "You absodefly want to increase this!\n** " \
                "Please read the docs!"

        print >>sys.stderr, '\nPARAMETERS:'
        print >>sys.stderr, ' - kmer size =    %d \t\t(-k)' % args.ksize
        print >>sys.stderr, ' - n hashes =     %d \t\t(-N)' % args.n_hashes
        print >>sys.stderr, \
            ' - min hashsize = %-5.2g \t(-x)' % args.min_hashsize
        print >>sys.stderr, ' - paired =	      %s \t\t(-p)' % args.paired
        print >>sys.stderr, ''
        print >>sys.stderr, \
            'Estimated memory usage is %.2g bytes (n_hashes x min_hashsize)' \
            % (args.n_hashes * args.min_hashsize)
        print >>sys.stderr, '-' * 8

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    DESIRED_COVERAGE = args.cutoff
    report_fp = args.report_file
    filenames = args.input_filenames

    # In paired mode we read two records at a time
    batch_size = 1
    if args.paired:
        batch_size = 2

    if args.loadhash:
        print 'loading hashtable from', args.loadhash
        ht = khmer.load_counting_hash(args.loadhash)
    else:
        print 'making hashtable'
        ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    total = 0
    discarded = 0

    for input_filename in filenames:
        output_name = os.path.basename(input_filename) + '.keep'
        outfp = open(output_name, 'w')

        n = -1
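        # batchwise() (a helper defined elsewhere in this script) groups
        # records into tuples of batch_size; with --paired each batch is
        # expected to be one read pair.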
        for n, batch in enumerate(batchwise(screed.open(
                input_filename), batch_size)):
            if n > 0 and n % 10000 == 0:
                print '... kept', total - discarded, 'of', total, ', or', \
                    int(100. - discarded / float(total) * 100.), '%'
                print '... in file', input_filename

                if report_fp:
                    print>>report_fp, total, total - discarded, \
                        1. - (discarded / float(total))
                    report_fp.flush()

            total += batch_size

            # If in paired mode, check that the reads are properly interleaved
            if args.paired:
                if not validpair(batch[0], batch[1]):
                    print >>sys.stderr, \
                        'Error: Improperly interleaved pairs %s %s' \
                        % (batch[0].name, batch[1].name)
                    sys.exit(-1)

            # Emit the batch of reads if any read passes the filter
            # and all reads are longer than K
            passed_filter = False
            passed_length = True
            for record in batch:
                if len(record.sequence) < K:
                    passed_length = False
                    continue

                seq = record.sequence.replace('N', 'A')
                med, _, _ = ht.get_median_count(seq)

                if med < DESIRED_COVERAGE:
                    ht.consume(seq)
                    passed_filter = True

            # Emit records if any passed
            if passed_length and passed_filter:
                for record in batch:
                    if hasattr(record, 'accuracy'):
                        outfp.write('@%s\n%s\n+\n%s\n' % (record.name,
                                                          record.sequence,
                                                          record.accuracy))
                    else:
                        outfp.write(
                            '>%s\n%s\n' % (record.name, record.sequence))
            else:
                discarded += batch_size

        if -1 < n:
            print \
                'DONE with', input_filename, '; kept', total - discarded, \
                'of', total, 'or', \
                int(100. - discarded / float(total) * 100.), '%'
            print 'output in', output_name
        else:
            print 'SKIPPED empty file', input_filename

    if args.savehash:
        print 'Saving hashfile through', input_filename
        print '...saving to', args.savehash
        ht.save(args.savehash)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >>sys.stderr, "**"
        print >>sys.stderr, "** ERROR: the counting hash is too small for"
        print >>sys.stderr, "** this data set.  Increase hashsize/num ht."
        print >>sys.stderr, "**"
        print >>sys.stderr, "** Do not use these results!!"
        sys.exit(-1)
예제 #28
0
def main():

    info('load-into-counting.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args)

    base = args.output_countingtable_filename
    filenames = args.input_sequence_filename

    for name in args.input_sequence_filename:
        check_file_status(name)

    check_space(args.input_sequence_filename)
    check_space_for_hashtable(args.n_tables * args.min_tablesize)

    print 'Saving k-mer counting table to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)

    print 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables, args.n_threads)
    htable.set_use_bigcount(args.bigcount)

    config = khmer.get_config()
    config.set_reads_input_buffer_size(args.n_threads * 64 * 1024)

    for index, filename in enumerate(filenames):

        rparser = khmer.ReadParser(filename, args.n_threads)
        threads = []
        print 'consuming input', filename
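        # each thread pulls reads from the shared ReadParser, so the counting
        # table is filled concurrently.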
        for _ in xrange(args.n_threads):
            cur_thrd = \
                threading.Thread(
                    target=htable.consume_fasta_with_reads_parser,
                    args=(rparser, )
                )
            threads.append(cur_thrd)
            cur_thrd.start()

        for _ in threads:
            _.join()

        if index > 0 and index % 10 == 0:
            check_space_for_hashtable(args.n_tables * args.min_tablesize)
            print 'mid-save', base
            htable.save(base)
            open(base + '.info', 'w').write('through %s' % filename)

    print 'saving', base
    htable.save(base)

    info_fp = open(base + '.info', 'w')
    info_fp.write('through end: %s\n' % filename)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(htable)
    print 'fp rate estimated to be %1.3f' % fp_rate
    print >> info_fp, 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the k-mer counting table is too small"
                              " this data set.  Increase tablesize/# tables.")
        print >> sys.stderr, "**"
        sys.exit(1)

    print 'DONE.'
예제 #29
0
def main():  # pylint: disable=too-many-branches,too-many-statements
    info('saturate-by-median.py', ['diginorm'])
    parser = sanitize_help(get_parser())
    args = parser.parse_args()

    report_on_config(args)

    report_fp = args.report
    report_frequency = args.report_frequency

    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, False)
    if args.savegraph:
        check_space_for_graph(args, 'countgraph', False)

    # list to save error files along with throwing exceptions
    if args.force:
        corrupt_files = []

    if args.loadgraph:
        print('loading k-mer countgraph from', args.loadgraph)
        htable = khmer.load_countgraph(args.loadgraph)
    else:
        print('making countgraph')
        htable = create_countgraph(args)

    total = 0
    discarded = 0

    for index, input_filename in enumerate(args.input_filenames):
        total_acc = 0
        discarded_acc = 0

        try:
            total_acc, discarded_acc = normalize_by_median(input_filename,
                                                           htable, args,
                                                           report_fp,
                                                           report_frequency)
        except IOError as err:
            handle_error(err, input_filename)
            if not args.force:
                print("NOTE: This can be overridden using the --force"
                      " argument", file=sys.stderr)
                print('** Exiting!', file=sys.stderr)
                sys.exit(1)
            else:
                print('*** Skipping error file, moving on...', file=sys.stderr)
                corrupt_files.append(input_filename)
        else:
            if total_acc == 0 and discarded_acc == 0:
                print('SKIPPED empty file', input_filename)
            else:
                total += total_acc
                discarded += discarded_acc
                print('DONE with {inp}; kept {kept} of {total} or {perc:2}%'\
                    .format(inp=input_filename,
                            kept=total - discarded, total=total,
                            perc=int(100. - discarded / float(total) * 100.)))

    if args.savegraph:
        print('Saving k-mer countgraph through', input_filename)
        print('...saving to', args.savegraph)
        htable.save(args.savegraph)

    # re: threshold, see Zhang et al.,
    # http://arxiv.org/abs/1309.2975
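    # calc_expected_collisions will abort (unless --force is given) when the
    # estimated false-positive rate exceeds max_false_pos.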
    fp_rate = khmer.calc_expected_collisions(htable, args.force, max_false_pos=.8)
    print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate))

    if args.force and len(corrupt_files) > 0:
        print("** WARNING: Finished with errors!", file=sys.stderr)
        print("** I/O Errors occurred in the following files:", file=sys.stderr)
        print("\t", " ".join(corrupt_files), file=sys.stderr)
예제 #30
0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('seqfiles', nargs='+')
    parser.add_argument('-o', '--output', default=None)
    parser.add_argument('-k', '--ksize', default=DEFAULT_KSIZE, type=int)
    parser.add_argument('-x',
                        '--tablesize',
                        default=NODEGRAPH_SIZE,
                        type=float)
    parser.add_argument('--force', action='store_true')
    args = parser.parse_args()

    assert args.ksize % 2, "ksize must be odd"
    assert args.output, "you probably want an output file"

    print('building graphs and loading files')

    # Create graph, and two stop bloom filters - one for loading, one for
    # traversing. Create them all here so that we can error out quickly
    # if memory is a problem.

    graph = khmer.Nodegraph(args.ksize, args.tablesize, 2)
    print(graph.ksize(), graph.hashsizes())
    stop_bf = khmer.Nodegraph(args.ksize, args.tablesize, 2)
    stop_bf2 = khmer.Nodegraph(args.ksize, args.tablesize, 2)
    n = 0

    # load in all of the input sequences, one file at a time.
    for seqfile in args.seqfiles:
        for record in screed.open(seqfile):
            n += 1
            if n % 10000 == 0:
                print('...', seqfile, n)
            graph.consume(record.sequence)

    # complain if too small set of graphs was used.
    fp_rate = khmer.calc_expected_collisions(graph,
                                             args.force,
                                             max_false_pos=.05)

    # initialize the object that will track information for us.
    pathy = Pathfinder(args.ksize)

    print('finding high degree nodes')
    degree_nodes = khmer.HashSet(args.ksize)
    n = 0
    for seqfile in args.seqfiles:
        for record in screed.open(seqfile):
            n += 1
            if n % 10000 == 0:
                print('...2', seqfile, n)
            # walk across sequences, find all high degree nodes,
            # name them and cherish them. Don't do this on identical sequences.
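            # min(...) == 0 means at least one k-mer in this read is new to
            # stop_bf2, i.e. this exact sequence has not been processed yet.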
            if min(stop_bf2.get_kmer_counts(record.sequence)) == 0:
                stop_bf2.consume(record.sequence)
                degree_nodes += graph.find_high_degree_nodes(record.sequence)
    del stop_bf2

    if not len(degree_nodes):
        print('no high degree nodes; exiting.')
        sys.exit(0)

    # get all of the degree > 2 nodes and give them IDs.
    for node in degree_nodes:
        pathy.new_segment(node)

    print('traversing linear segments from', len(degree_nodes), 'nodes')

    # now traverse from each high degree nodes into all neighboring nodes,
    # seeking adjacencies.  if neighbor is high degree node, add it to
    # adjacencies; if neighbor is not, then traverse the linear path.  also
    # track minhashes while we're at it.
    for n, k in enumerate(degree_nodes):
        if n % 10000 == 0:
            print('...', n, 'of', len(degree_nodes))

        # retrieve the segment ID of the primary node.
        k_id = pathy.segments_r[k]

        # find all the neighbors of this high-degree node.
        nbh = graph.neighbors(k)
        for nk in nbh:
            # neighbor is high degree? fine, mark its adjacencies.
            if nk in degree_nodes:
                nk_id = pathy.segments_r[nk]
                pathy.add_adjacency(k_id, nk_id)
            else:
                # linear! walk it.
                traverse_and_mark_linear_paths(graph, nk, stop_bf, pathy,
                                               degree_nodes)

    print(len(pathy.segments), 'segments, containing',
          sum(pathy.segments.values()), 'nodes')

    # save to GML
    if args.output:
        print('saving to', args.output)
        fp = open(args.output, 'w')
        w = GmlWriter(fp, [], [])

        for k, v in pathy.segments.items():
            w.add_vertex(k, v, [])

        for k, v in pathy.adjacencies.items():
            for edge in v:
                w.add_edge(k, edge, [])
예제 #31
0
def main():
    parser = sanitize_help(get_parser())
    args = parser.parse_args()
    if not args.quiet:
        info('trim-low-abund.py', ['streaming'])

    configure_logging(args.quiet)

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        log_error("Error: Cannot input the same filename multiple times.")
        sys.exit(1)

    if args.trim_at_coverage != DEFAULT_TRIM_AT_COVERAGE and \
       not args.variable_coverage:
        log_error("Error: --trim-at-coverage/-Z given, but "
                  "--variable-coverage/-V not specified.")
        sys.exit(1)

    if args.diginorm_coverage != DEFAULT_DIGINORM_COVERAGE and \
       not args.diginorm:
        log_error("Error: --diginorm-coverage given, but "
                  "--diginorm not specified.")
        sys.exit(1)

    if args.diginorm and args.single_pass:
        log_error("Error: --diginorm and --single-pass are incompatible!\n"
                  "You probably want to use normalize-by-median.py instead.")
        sys.exit(1)

    ###

    report_on_config(args)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    if ('-' in args.input_filenames or '/dev/stdin' in args.input_filenames) \
       and not args.output:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    if args.loadgraph:
        log_info('loading countgraph from {graph}', graph=args.loadgraph)
        ct = khmer.load_countgraph(args.loadgraph)
    else:
        log_info('making countgraph')
        ct = khmer_args.create_countgraph(args)

    K = ct.ksize()
    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    log_info(
        'created temporary directory {temp};\n'
        'use -T to change location',
        temp=tempdir)

    trimmer = Trimmer(ct, not args.variable_coverage, args.cutoff,
                      args.trim_at_coverage)
    if args.diginorm:
        trimmer.set_diginorm(args.diginorm_coverage)

    # ### FIRST PASS ###

    save_pass2_total = 0

    written_bp = 0
    written_reads = 0

    # only create the file writer once if outfp is specified; otherwise,
    # create it for each file.
    if args.output:
        trimfp = get_file_writer(args.output, args.gzip, args.bzip)

    pass2list = []
    for filename in args.input_filenames:
        # figure out temporary filename for 2nd pass
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        pass2fp = open(pass2filename, 'w')

        # construct output filenames
        if args.output is None:
            # note: this will be saved in trimfp.
            outfp = open(os.path.basename(filename) + '.abundtrim', 'wb')

            # get file handle w/gzip, bzip
            trimfp = get_file_writer(outfp, args.gzip, args.bzip)

        # record all this info
        pass2list.append((filename, pass2filename, trimfp))

        # input file stuff: get a broken_paired reader.
        screed_iter = screed.open(filename)
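        # broken_paired_reader yields reads in pair-aware chunks and filters
        # out reads shorter than min_length (here, the k-mer size).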
        paired_iter = broken_paired_reader(screed_iter,
                                           min_length=K,
                                           force_single=args.ignore_pairs)

        # main loop through the file.
        n_start = trimmer.n_reads
        save_start = trimmer.n_saved

        watermark = REPORT_EVERY_N_READS
        for read in trimmer.pass1(paired_iter, pass2fp):
            if (trimmer.n_reads - n_start) > watermark:
                log_info(
                    "... {filename} {n_saved} {n_reads} {n_bp} "
                    "{w_reads} {w_bp}",
                    filename=filename,
                    n_saved=trimmer.n_saved,
                    n_reads=trimmer.n_reads,
                    n_bp=trimmer.n_bp,
                    w_reads=written_reads,
                    w_bp=written_bp)
                watermark += REPORT_EVERY_N_READS

            # write out the trimmed/etc sequences that AREN'T going to be
            # revisited in a 2nd pass.
            write_record(read, trimfp)
            written_bp += len(read)
            written_reads += 1
        pass2fp.close()

        log_info("{filename}: kept aside {kept} of {total} from first pass",
                 filename=filename,
                 kept=trimmer.n_saved - save_start,
                 total=trimmer.n_reads - n_start)

    # first pass goes across all the data, so record relevant stats...
    n_reads = trimmer.n_reads
    n_bp = trimmer.n_bp
    n_skipped = trimmer.n_skipped
    bp_skipped = trimmer.bp_skipped
    save_pass2_total = trimmer.n_saved

    # ### SECOND PASS. ###

    # nothing should have been skipped yet!
    assert trimmer.n_skipped == 0
    assert trimmer.bp_skipped == 0

    if args.single_pass:
        pass2list = []

    # go back through all the files again.
    for _, pass2filename, trimfp in pass2list:
        log_info('second pass: looking at sequences kept aside in {pass2}',
                 pass2=pass2filename)

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in contrast
        # to the first loop.  Hence, force_single=True below.

        screed_iter = screed.open(pass2filename, parse_description=False)
        paired_iter = broken_paired_reader(screed_iter,
                                           min_length=K,
                                           force_single=True)

        watermark = REPORT_EVERY_N_READS
        for read in trimmer.pass2(paired_iter):
            if (trimmer.n_reads - n_start) > watermark:
                log_info('... x 2 {a} {b} {c} {d} {e} {f} {g}',
                         a=trimmer.n_reads - n_start,
                         b=pass2filename,
                         c=trimmer.n_saved,
                         d=trimmer.n_reads,
                         e=trimmer.n_bp,
                         f=written_reads,
                         g=written_bp)
                watermark += REPORT_EVERY_N_READS

            write_record(read, trimfp)
            written_reads += 1
            written_bp += len(read)

        log_info('removing {pass2}', pass2=pass2filename)
        os.unlink(pass2filename)

        # if we created our own trimfps, close 'em.
        if not args.output:
            trimfp.close()

    log_info('removing temp directory & contents ({temp})', temp=tempdir)
    shutil.rmtree(tempdir)

    trimmed_reads = trimmer.trimmed_reads

    n_passes = 1.0 + (float(save_pass2_total) / n_reads)
    percent_reads_trimmed = float(trimmed_reads + (n_reads - written_reads)) /\
        n_reads * 100.0

    log_info('read {read} reads, {bp} bp', read=n_reads, bp=n_bp)
    log_info('wrote {wr} reads, {wbp} bp', wr=written_reads, wbp=written_bp)
    log_info('looked at {st} reads twice ({np:.2f} passes)',
             st=save_pass2_total,
             np=n_passes)
    log_info('removed {r} reads and trimmed {t} reads ({p:.2f}%)',
             r=n_reads - written_reads,
             t=trimmed_reads,
             p=percent_reads_trimmed)
    log_info('trimmed or removed {p:.2f}%% of bases ({bp} total)',
             p=(1 - (written_bp / float(n_bp))) * 100.0,
             bp=n_bp - written_bp)

    if args.variable_coverage:
        percent_reads_hicov = 100.0 * float(n_reads - n_skipped) / n_reads
        log_info('{n} reads were high coverage ({p:.2f}%);',
                 n=n_reads - n_skipped,
                 p=percent_reads_hicov)
        log_info('skipped {r} reads/{bp} bases because of low coverage',
                 r=n_skipped,
                 bp=bp_skipped)

    fp_rate = \
        khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    log_info('output in *.abundtrim')

    if args.savegraph:
        log_info("Saving k-mer countgraph to {graph}", graph=args.savegraph)
        ct.save(args.savegraph)
예제 #32
0
def main():
    parser = build_counting_args()
    parser.add_argument("-t",
                        "--trusted-cutoff",
                        dest="trusted_cutoff",
                        type=int,
                        default=3)
    parser.add_argument("--bits-theta",
                        help="Tuning parameter controlling"
                        "trade off of speed vs alignment sensitivity",
                        default=1.0,
                        type=float,
                        dest="bits_theta")
    parser.add_argument('--normalize-to',
                        '-Z',
                        type=int,
                        dest='normalize_to',
                        help='base cutoff on abundance',
                        default=DEFAULT_NORMALIZE_LIMIT)
    parser.add_argument('-s', '--savehash', dest='savehash', default='')
    parser.add_argument('-l', '--loadhash', dest='loadhash', default='')
    parser.add_argument('--details-out', dest="details_out")
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    if not args.quiet:
        print('\nPARAMETERS:', file=sys.stderr)
        print(' - kmer size =    %d \t\t(-k)' % args.ksize, file=sys.stderr)
        print(' - n hashes =     %d \t\t(-N)' % args.n_tables, file=sys.stderr)
        print(' - min hashsize = %-5.2g \t(-x)' % \
            args.max_tablesize, file=sys.stderr)
        print('', file=sys.stderr)
        print('Estimated memory usage is %.2g bytes ' \
            '(n_hashes x min_hashsize)' % \
            (args.n_tables * args.max_tablesize), file=sys.stderr)
        print('-' * 8, file=sys.stderr)

    K = args.ksize
    HT_SIZE = args.max_tablesize
    N_HT = args.n_tables

    filenames = args.input_filenames

    if args.loadhash:
        print('loading hashtable from', args.loadhash)
        ht = khmer.load_countgraph(args.loadhash)
    else:
        print('making hashtable')
        ht = khmer.Countgraph(K, HT_SIZE, N_HT)

    aligner = khmer.ReadAligner(ht, args.trusted_cutoff, args.bits_theta)

    if args.details_out is not None:
        details_out = open(args.details_out, "w")
    else:
        details_out = None

    total = 0
    discarded = 0
    for input_filename in filenames:
        output_name = os.path.basename(input_filename) + '.keepvar'
        outfp = open(output_name, 'w')

        for n, record in enumerate(screed.open(input_filename)):
            if n > 0 and n % 10000 == 0:
                print('... kept', total - discarded, 'of', total, ', or', \
                    int(100. - discarded / float(total) * 100.), '%')
                print('... in file', input_filename)

            total += 1

            if len(record.sequence) < K:
                continue

            seq = record.sequence.upper().replace('N', 'A')

            ##

            # build the alignment...
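            # align() returns the alignment score, the graph- and read-side
            # alignment strings, and a flag indicating the alignment was
            # truncated before covering the whole read.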
            score, graph_alignment, read_alignment, truncated = \
                aligner.align(record.sequence)

            # next, decide whether or not to keep it.
            keep = False
            if truncated:
                keep = True  # keep all truncated alignments - why?
            else:

                # build a better sequence -- this is the corrected one.
                graph_seq = graph_alignment.replace("-", "")
                # OR?
                #graph_seq = ""
                #for i in range(len(graph_alignment)):
                #    if graph_alignment[i] == "-":
                #        graph_seq += read_alignment[i]
                #    else:
                #        graph_seq += graph_alignment[i]

                # get the minimum count for this new sequence
                mincount = ht.get_min_count(graph_seq)
                if mincount < args.normalize_to:
                    keep = True

            if details_out is not None:
                details_out.write(
                    "+{7}\t{0:0.2f}\t{3}\t{4}\nread:      "
                    "{6}\ngraph_aln: {1}\nread_aln:  {2}\nstored_seq:{5}\n"
                    "".format(score, graph_alignment, read_alignment,
                              truncated, keep, seq, record.sequence,
                              record.name))

            if keep:
                ht.consume(seq)
                outfp.write('>%s\n%s\n' % (record.name, record.sequence))
            else:
                discarded += 1

        if total:
            print('DONE with', input_filename, \
                '; kept', total - discarded, 'of', total, 'or', \
                int(100. - discarded / float(total) * 100.), '%')
        print('output in', output_name)

    if args.savehash:
        print('Saving hashfile through', input_filename)
        print('...saving to', args.savehash)
        ht.save(args.savehash)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht, args.force, max_false_pos=.2)
    print('fp rate estimated to be %1.3f' % fp_rate)
예제 #33
0
def main():  # pylint: disable=too-many-branches,too-many-statements
    info('normalize-by-median.py', ['diginorm'])
    args = get_parser().parse_args()

    report_on_config(args)

    report_fp = args.report

    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savetable:
        check_space_for_hashtable(
            args.n_tables * args.min_tablesize, args.force)

    # list to save error files along with throwing exceptions
    if args.force:
        corrupt_files = []

    if args.loadtable:
        print 'loading k-mer counting table from', args.loadtable
        htable = khmer.load_counting_hash(args.loadtable)
    else:
        print 'making k-mer counting table'
        htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                         args.n_tables)

    total = 0
    discarded = 0
    input_filename = None

    for index, input_filename in enumerate(args.input_filenames):
        if args.single_output_filename != '':
            output_name = args.single_output_filename
            outfp = open(args.single_output_filename, 'a')
        else:
            output_name = os.path.basename(input_filename) + '.keep'
            outfp = open(output_name, 'w')

        total_acc = 0
        discarded_acc = 0

        try:
            total_acc, discarded_acc = normalize_by_median(input_filename,
                                                           outfp, htable, args,
                                                           report_fp)
        except IOError as err:
            handle_error(err, output_name, input_filename, args.fail_save,
                         htable)
            if not args.force:
                print >> sys.stderr, '** Exiting!'

                sys.exit(1)
            else:
                print >> sys.stderr, '*** Skipping error file, moving on...'
                corrupt_files.append(input_filename)
        else:
            if total_acc == 0 and discarded_acc == 0:
                print 'SKIPPED empty file', input_filename
            else:
                total += total_acc
                discarded += discarded_acc
                print 'DONE with {inp}; kept {kept} of {total} or {perc:2}%'\
                      .format(inp=input_filename, kept=total - discarded,
                              total=total, perc=int(100. - discarded /
                                                    float(total) * 100.))
                print 'output in', output_name

        if (args.dump_frequency > 0 and
                index > 0 and index % args.dump_frequency == 0):
            print 'Backup: Saving k-mer counting file through', input_filename
            if args.savetable:
                hashname = args.savetable
                print '...saving to', hashname
            else:
                hashname = 'backup.ct'
                print 'Nothing given for savetable, saving to', hashname
            htable.save(hashname)

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
            htable.n_unique_kmers())

    if args.savetable:
        print 'Saving k-mer counting table through', input_filename
        print '...saving to', args.savetable
        htable.save(args.savetable)

    fp_rate = khmer.calc_expected_collisions(htable)
    print 'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate)

    if args.force and len(corrupt_files) > 0:
        print >> sys.stderr, "** WARNING: Finished with errors!"
        print >> sys.stderr, "** IOErrors occurred in the following files:"
        print >> sys.stderr, "\t", " ".join(corrupt_files)

    if fp_rate > MAX_FALSE_POSITIVE_RATE:
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the k-mer counting table is too small"
                              " for this data set. Increase tablesize/# "
                              "tables.")
        print >> sys.stderr, "**"
        print >> sys.stderr, "** Do not use these results!!"
        if not args.force:
            sys.exit(1)
예제 #34
0
def main():
    parser = argparse.ArgumentParser(description='XXX')

    env_ksize = os.environ.get('KHMER_KSIZE', DEFAULT_K)
    env_n_hashes = os.environ.get('KHMER_N_HASHES', DEFAULT_N_HT)
    env_hashsize = os.environ.get('KHMER_MIN_HASHSIZE', DEFAULT_MIN_HASHSIZE)

    parser.add_argument('--ksize',
                        '-k',
                        type=int,
                        dest='ksize',
                        default=env_ksize,
                        help='k-mer size to use')
    parser.add_argument('--n_hashes',
                        '-N',
                        type=int,
                        dest='n_hashes',
                        default=env_n_hashes,
                        help='number of hash tables to use')
    parser.add_argument('--hashsize',
                        '-x',
                        type=float,
                        dest='min_hashsize',
                        default=env_hashsize,
                        help='lower bound on hashsize to use')

    parser.add_argument('--cutoff',
                        '-C',
                        type=int,
                        dest='abund_cutoff',
                        help='remove k-mers below this abundance',
                        default=DEFAULT_CUTOFF)

    parser.add_argument('--normalize-to',
                        '-Z',
                        type=int,
                        dest='normalize_to',
                        help='base cutoff on median k-mer abundance of this',
                        default=DEFAULT_NORMALIZE_LIMIT)

    parser.add_argument('--variable-coverage',
                        '-V',
                        action='store_true',
                        dest='variable_coverage',
                        default=False,
                        help='Only trim low-abundance k-mers from sequences '
                        'that have high coverage.')
    parser.add_argument('--tempdir',
                        '-T',
                        type=str,
                        dest='tempdir',
                        default='./')

    parser.add_argument('input_filenames', nargs='+')
    args = parser.parse_args()

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        print >>sys.stderr, \
            "Error: Cannot input the same filename multiple times."
        sys.exit(1)

    ###

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    CUTOFF = args.abund_cutoff
    NORMALIZE_LIMIT = args.normalize_to

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    print 'created temporary directory %s; use -T to change location' % tempdir

    ###

    save_pass2_total = 0

    read_bp = 0
    read_reads = 0
    wrote_bp = 0
    wrote_reads = 0
    trimmed_reads = 0

    pass2list = []
    for filename in args.input_filenames:
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        trimfilename = os.path.basename(filename) + '.abundtrim'

        pass2list.append((filename, pass2filename, trimfilename))

        pass2fp = open(pass2filename, 'w')
        trimfp = open(trimfilename, 'w')

        save_pass2 = 0
        for n, read in enumerate(screed.open(filename)):
            if n % 10000 == 0:
                print '...', n, filename, save_pass2, read_reads, read_bp, \
                    wrote_reads, wrote_bp

            read_reads += 1
            read_bp += len(read.sequence)

            seq = read.sequence.replace('N', 'A')
            med, _, _ = ht.get_median_count(seq)

            # has this portion of the graph saturated? if not,
            # consume & save => pass2.
            if med < NORMALIZE_LIMIT:
                ht.consume(seq)
                pass2fp.write(output_single(read))
                save_pass2 += 1
            else:  # trim!!
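                # trim_on_abundance truncates the read at the first k-mer
                # whose count falls below CUTOFF and also returns the trim
                # position.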
                trim_seq, trim_at = ht.trim_on_abundance(seq, CUTOFF)
                if trim_at >= K:
                    trimfp.write(output_single(read, trim_at))
                    wrote_reads += 1
                    wrote_bp += trim_at
                    if trim_at != len(read.sequence):
                        trimmed_reads += 1

        pass2fp.close()
        trimfp.close()

        print '%s: kept aside %d of %d from first pass, in %s' % \
              (filename, save_pass2, n, filename)
        save_pass2_total += save_pass2

    skipped_n = 0
    skipped_bp = 0
    for orig_filename, pass2filename, trimfilename in pass2list:
        print 'second pass: looking at sequences kept aside in %s' % \
              pass2filename
        trimfp = open(trimfilename, 'a')
        for n, read in enumerate(screed.open(pass2filename)):
            if n % 10000 == 0:
                print '... x 2', n, pass2filename, read_reads, read_bp, \
                      wrote_reads, wrote_bp

            seq = read.sequence.replace('N', 'A')
            med, _, _ = ht.get_median_count(seq)

            # do we retain low-abundance components unchanged?
            if med < NORMALIZE_LIMIT and args.variable_coverage:
                trimfp.write(output_single(read))
                wrote_reads += 1
                wrote_bp += len(read.sequence)
                skipped_n += 1
                skipped_bp += len(read.sequence)

            # otherwise, examine/trim/truncate.
            else:  # med >= NORMALIZE LIMIT or not args.variable_coverage
                trim_seq, trim_at = ht.trim_on_abundance(seq, CUTOFF)
                if trim_at >= K:
                    trimfp.write(output_single(read, trim_at))
                    wrote_reads += 1
                    wrote_bp += trim_at
                    if trim_at != len(read.sequence):
                        trimmed_reads += 1

        print 'removing %s' % pass2filename
        os.unlink(pass2filename)

    print 'removing temp directory & contents (%s)' % tempdir
    shutil.rmtree(tempdir)

    print 'read %d reads, %d bp' % (
        read_reads,
        read_bp,
    )
    print 'wrote %d reads, %d bp' % (
        wrote_reads,
        wrote_bp,
    )
    print 'removed %d reads and trimmed %d reads' % (
        read_reads - wrote_reads,
        trimmed_reads,
    )
    print 'looked at %d reads twice' % (save_pass2_total, )
    print 'trimmed or removed %.2f%% of bases (%d total)' % \
        ((1 - (wrote_bp / float(read_bp))) * 100., read_bp - wrote_bp)
    if args.variable_coverage:
        print 'skipped %d reads/%d bases because of low coverage' % \
              (skipped_n, skipped_bp)
        print 'output in *.abundtrim'

    fp_rate = khmer.calc_expected_collisions(ht)
    print >>sys.stderr, \
        'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate)

    if fp_rate > MAX_FALSE_POSITIVE_RATE:
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the k-mer counting table is too small"
                              " for this data set. Increase tablesize/# "
                              "tables.")
        print >> sys.stderr, "**"
        print >> sys.stderr, "** Do not use these results!!"
        sys.exit(1)
예제 #35
0
def main():  # pylint: disable=too-many-locals,too-many-statements
    info('do-partition.py', ['graph'])
    args = get_parser().parse_args()

    report_on_config(args, hashtype='hashbits')

    for infile in args.input_filenames:
        check_file_status(infile)

    check_space(args.input_filenames)

    print 'Saving k-mer presence table to %s' % args.graphbase
    print 'Loading kmers from sequences in %s' % repr(args.input_filenames)

    print '--'
    print 'SUBSET SIZE', args.subset_size
    print 'N THREADS', args.n_threads
    print '--'

    # load-graph

    print 'making k-mer presence table'
    htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables)

    for _, filename in enumerate(args.input_filenames):
        print 'consuming input', filename
        htable.consume_fasta_and_tag(filename)

    fp_rate = khmer.calc_expected_collisions(htable)
    print 'fp rate estimated to be %1.3f' % fp_rate
    if fp_rate > 0.15:  # 0.18 is ACTUAL MAX. Do not change.
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the graph structure is too small for"
                              " this data set.  Increase k-mer presence table "
                              "size/num of tables.")
        print >> sys.stderr, "**"
        sys.exit(1)

    # partition-graph

    # do we want to exhaustively traverse the graph?
    stop_big_traversals = args.no_big_traverse
    if stop_big_traversals:
        print '** This script brakes for lumps: stop_big_traversals is true.'
    else:
        print '** Traverse all the things: stop_big_traversals is false.'

    #
    # now, partition!
    #

    # divide the tags up into subsets
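    # divide_tags_into_subsets returns the starting tag of each subset;
    # appending 0 provides an end sentinel for the last worker task below.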
    divvy = htable.divide_tags_into_subsets(int(args.subset_size))
    n_subsets = len(divvy)
    divvy.append(0)

    # build a queue of tasks:
    worker_q = Queue.Queue()

    # break up the subsets into a list of worker tasks
    for _ in range(0, n_subsets):
        start = divvy[_]
        end = divvy[_ + 1]
        worker_q.put((htable, _, start, end))

    print 'enqueued %d subset tasks' % n_subsets
    open('%s.info' % args.graphbase,
         'w').write('%d subsets total\n' % (n_subsets))

    if n_subsets < args.n_threads:
        args.n_threads = n_subsets

    # start threads!
    print 'starting %d threads' % args.n_threads
    print '---'

    threads = []
    for _ in range(args.n_threads):
        cur_thread = threading.Thread(target=worker,
                                      args=(worker_q, args.graphbase,
                                            stop_big_traversals))
        threads.append(cur_thread)
        cur_thread.start()

    print 'done starting threads'

    # wait for threads
    for _ in threads:
        _.join()

    print '---'
    print 'done making subsets! see %s.subset.*.pmap' % (args.graphbase, )

    # merge-partitions

    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')

    print 'loading %d pmap files (first one: %s)' % (len(pmap_files),
                                                     pmap_files[0])

    htable = khmer.new_hashbits(args.ksize, 1, 1)

    for pmap_file in pmap_files:
        print 'merging', pmap_file
        htable.merge_subset_from_disk(pmap_file)

    if args.remove_subsets:
        print 'removing pmap files'
        for pmap_file in pmap_files:
            os.unlink(pmap_file)

    # annotate-partitions

    for infile in args.input_filenames:
        print 'outputting partitions for', infile
        outfile = os.path.basename(infile) + '.part'
        part_count = htable.output_partitions(infile, outfile)
        print 'output %d partitions for %s' % (part_count, infile)
        print 'partitions are in', outfile
예제 #36
0
def main():
    parser = build_construct_args()
    add_threading_args(parser)
    parser.add_argument('output_filename')
    parser.add_argument('input_filenames', nargs='+')
    parser.add_argument('-b', '--no-bigcount', dest='bigcount', default=True,
                        action='store_false',
                        help='Do not count k-mers past 255')

    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    base = args.output_filename
    filenames = args.input_filenames
    n_threads = int(args.n_threads)

    print 'Saving hashtable to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)

    ###

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads)
    ht.set_use_bigcount(args.bigcount)

    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    for n, filename in enumerate(filenames):

        rparser = khmer.ReadParser(filename, n_threads)
        threads = []
        print 'consuming input', filename
        for tnum in xrange(n_threads):
            t = \
                threading.Thread(
                    target=ht.consume_fasta_with_reads_parser,
                    args=(rparser, )
                )
            threads.append(t)
            t.start()

        for t in threads:
            t.join()

        if n > 0 and n % 10 == 0:
            print 'mid-save', base
            ht.save(base)
            open(base + '.info', 'w').write('through %s' % filename)

    print 'saving', base
    ht.save(base)

    info_fp = open(base + '.info', 'w')
    info_fp.write('through end: %s\n' % filename)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate
    print >>info_fp, 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >>sys.stderr, "**"
        print >>sys.stderr, "** ERROR: the counting hash is too small for"
        print >>sys.stderr, "** this data set.  Increase hashsize/num ht."
        print >>sys.stderr, "**"
        sys.exit(-1)

    print 'DONE.'
예제 #37
0
def main():

    info('collect-reads.py', ['counting'])
    args = get_parser().parse_args()
    report_on_config(args)

    base = args.output_countingtable_filename
    filenames = args.input_sequence_filename

    for name in args.input_sequence_filename:
        check_input_files(name, False)

    check_space(args.input_sequence_filename, False)
    check_space_for_hashtable(args.n_tables * args.min_tablesize, False)

    print 'Saving k-mer counting table to %s' % base
    print 'Loading sequences from %s' % repr(filenames)
    if args.output:
        print 'Outputting sequences to', args.output

    print 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize)
    htable.set_use_bigcount(args.bigcount)
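    # set_use_bigcount(True) lets k-mer counts grow past the default 8-bit
    # cap of 255.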

    total_coverage = 0.
    n = 0

    for index, filename in enumerate(filenames):
        for record in screed.open(filename):
            seq = record.sequence.upper()
            if 'N' in seq:
                seq = seq.replace('N', 'A')

            try:
                med, _, _ = htable.get_median_count(seq)
            except ValueError:
                continue

            total_coverage += med
            n += 1

            if total_coverage / float(n) > args.coverage:
                print 'reached target average coverage:', \
                      total_coverage / float(n)
                break

            htable.consume(seq)
            if args.output:
                args.output.write(output_single(record))

            if n % 100000 == 0:
                print '...', index, filename, n, total_coverage / float(n)

        if total_coverage / float(n) > args.coverage:
            break

    print 'Collected %d reads' % (n, )

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of k-mers: {0}'.format(
            htable.n_occupied())

    print 'saving', base
    htable.save(base)

    info_fp = open(base + '.info', 'w')
    info_fp.write('through end: %s\n' % filenames[-1])

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(htable,
                                             args.force,
                                             max_false_pos=.2)
    print 'fp rate estimated to be %1.3f' % fp_rate
    print >> info_fp, 'fp rate estimated to be %1.3f' % fp_rate

    print 'DONE.'
예제 #38
0
def main():  # pylint: disable=too-many-locals,too-many-statements
    args = sanitize_help(get_parser()).parse_args()

    report_on_config(args, graphtype='nodegraph')

    for infile in args.input_filenames:
        check_input_files(infile, args.force)

    check_space(args.input_filenames, args.force)

    print('Saving k-mer nodegraph to %s' % args.graphbase, file=sys.stderr)
    print('Loading kmers from sequences in %s' % repr(args.input_filenames),
          file=sys.stderr)
    print('--', file=sys.stderr)
    print('SUBSET SIZE', args.subset_size, file=sys.stderr)
    print('N THREADS', args.threads, file=sys.stderr)
    print('--', file=sys.stderr)

    # load-graph.py

    print('making nodegraph', file=sys.stderr)
    nodegraph = khmer_args.create_nodegraph(args)

    for _, filename in enumerate(args.input_filenames):
        print('consuming input', filename, file=sys.stderr)
        nodegraph.consume_seqfile_and_tag(filename)

    # 0.18 is ACTUAL MAX. Do not change.
    fp_rate = \
        khmer.calc_expected_collisions(
            nodegraph, args.force, max_false_pos=.15)
    print('fp rate estimated to be %1.3f' % fp_rate, file=sys.stderr)

    # partition-graph

    # do we want to exhaustively traverse the graph?
    stop_big_traversals = args.no_big_traverse
    if stop_big_traversals:
        print('** This script brakes for lumps: ',
              'stop_big_traversals is true.',
              file=sys.stderr)
    else:
        print('** Traverse all the things:',
              ' stop_big_traversals is false.',
              file=sys.stderr)

    #
    # now, partition!
    #

    # divide the tags up into subsets
    divvy = nodegraph.divide_tags_into_subsets(int(args.subset_size))
    divvy = list(divvy)
    n_subsets = len(divvy)
    divvy.append(0)

    # build a queue of tasks:
    worker_q = queue.Queue()

    # break up the subsets into a list of worker tasks
    for _ in range(0, n_subsets):
        start = divvy[_]
        end = divvy[_ + 1]
        worker_q.put((nodegraph, _, start, end))

    print('enqueued %d subset tasks' % n_subsets, file=sys.stderr)
    open('%s.info' % args.graphbase,
         'w').write('%d subsets total\n' % (n_subsets))

    if n_subsets < args.threads:
        args.threads = n_subsets

    # start threads!
    print('starting %d threads' % args.threads, file=sys.stderr)
    print('---', file=sys.stderr)

    threads = []
    for _ in range(args.threads):
        cur_thread = threading.Thread(target=worker,
                                      args=(worker_q, args.graphbase,
                                            stop_big_traversals))
        threads.append(cur_thread)
        cur_thread.start()

    print('done starting threads', file=sys.stderr)

    # wait for threads
    for _ in threads:
        _.join()

    print('---', file=sys.stderr)
    print('done making subsets! see %s.subset.*.pmap' % (args.graphbase, ),
          file=sys.stderr)

    # merge-partitions

    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')

    print('loading %d pmap files (first one: %s)' %
          (len(pmap_files), pmap_files[0]),
          file=sys.stderr)

    nodegraph = khmer.Nodegraph(args.ksize, 1, 1)

    for pmap_file in pmap_files:
        print('merging', pmap_file, file=sys.stderr)
        nodegraph.merge_subset_from_disk(pmap_file)

    if not args.keep_subsets:
        print('removing pmap files', file=sys.stderr)
        for pmap_file in pmap_files:
            os.unlink(pmap_file)

    # annotate-partitions

    for infile in args.input_filenames:
        print('outputting partitions for', infile, file=sys.stderr)
        outfile = os.path.basename(infile) + '.part'
        part_count = nodegraph.output_partitions(infile, outfile)
        print('output %d partitions for %s' % (part_count, infile),
              file=sys.stderr)
        print('partitions are in', outfile, file=sys.stderr)
예제 #39
0
def main():
    parser = build_counting_args()
    parser.add_argument("-t",
                        "--trusted-cutoff",
                        dest="trusted_cutoff",
                        type=int,
                        default=3)
    parser.add_argument(
        "--bits-theta",
        help=
        "Tuning parameter controlling trade off of speed vs alignment sensitivity",
        default=1.0,
        type=float,
        dest="bits_theta")
    parser.add_argument('-C',
                        '--cutoff',
                        type=int,
                        dest='cutoff',
                        default=DEFAULT_MINIMUM_COVERAGE)
    parser.add_argument('-s', '--savehash', dest='savehash', default='')
    parser.add_argument('-l', '--loadhash', dest='loadhash', default='')
    parser.add_argument('--details-out', dest="details_out")
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    if not args.quiet:
        print >> sys.stderr, '\nPARAMETERS:'
        print >> sys.stderr, ' - kmer size =    %d \t\t(-k)' % args.ksize
        print >> sys.stderr, ' - n hashes =     %d \t\t(-N)' % args.n_hashes
        print >>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % \
            args.min_hashsize
        print >> sys.stderr, ''
        print >>sys.stderr, 'Estimated memory usage is %.2g bytes ' \
            '(n_hashes x min_hashsize)' % (
            args.n_hashes * args.min_hashsize)
        print >> sys.stderr, '-' * 8

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    DESIRED_COVERAGE = args.cutoff

    filenames = args.input_filenames

    if args.loadhash:
        print 'loading hashtable from', args.loadhash
        ht = khmer.load_counting_hash(args.loadhash)
    else:
        print 'making hashtable'
        ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    aligner = khmer.new_readaligner(ht, args.trusted_cutoff, args.bits_theta)

    if args.details_out is not None:
        details_out = open(args.details_out, "w")
    else:
        details_out = None

    total = 0
    discarded = 0
    for input_filename in filenames:
        output_name = os.path.basename(input_filename) + '.keepalign'
        outfp = open(output_name, 'w')

        for n, record in enumerate(screed.open(input_filename)):
            if n > 0 and n % 10000 == 0:
                print '... kept', total - discarded, 'of', total, ', or', \
                    int(100. - discarded / float(total) * 100.), '%'
                print '... in file', input_filename

            total += 1

            if len(record.sequence) < K:
                continue

            seq = record.sequence.upper().replace('N', 'A')

            ##
            score, graph_alignment, read_alignment, truncated = aligner.align(
                record.sequence)

            keep = False
            if truncated:
                keep = True
            else:
                if False:
                    graph_seq = graph_alignment.replace("-", "")
                else:
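                    # rebuild the corrected sequence base by base: take the
                    # read base wherever the graph alignment has a gap,
                    # otherwise the graph base.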
                    graph_seq = ""
                    for i in range(len(graph_alignment)):
                        if graph_alignment[i] == "-":
                            graph_seq += read_alignment[i]
                        else:
                            graph_seq += graph_alignment[i]

                mincount = ht.get_min_count(graph_seq)
                keep = True
                seq = graph_seq

                #if mincount < DESIRED_COVERAGE:
                #    keep = True
                #    seq = graph_seq
                #else:
                #    assert not keep

            if details_out is not None:
                details_out.write(
                    "+{7}\t{0:0.2f}\t{3}\t{4}\nread:      {6}\ngraph_aln: {1}\nread_aln:  {2}\nstored_seq:{5}\n"
                    .format(score, graph_alignment, read_alignment, truncated,
                            keep, seq, record.sequence, record.name))

            if keep:
                ht.consume(seq)
                outfp.write('>%s\n%s\n' % (record.name, seq))
            else:
                discarded += 1

        print 'DONE with', input_filename, '; kept', total - discarded, 'of',\
            total, 'or', int(100. - discarded / float(total) * 100.), '%'
        print 'output in', output_name

    if args.savehash:
        print 'Saving hashfile through', input_filename
        print '...saving to', args.savehash
        ht.save(args.savehash)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >> sys.stderr, "**"
        print >> sys.stderr, "** ERROR: the counting hash is too small for"
        print >> sys.stderr, "** this data set.  Increase hashsize/num ht."
        print >> sys.stderr, "**"
        print >> sys.stderr, "** Do not use these results!!"
        sys.exit(-1)
예제 #40
0
def main():
    parser = build_construct_args()
    parser.add_argument('-C', '--cutoff', type=int, dest='cutoff',
                        default=DEFAULT_DESIRED_COVERAGE)
    parser.add_argument('-p', '--paired', action='store_true')
    parser.add_argument('-s', '--savehash', dest='savehash', default='')
    parser.add_argument('-l', '--loadhash', dest='loadhash',
                        default='')
    parser.add_argument('-R', '--report-to-file', dest='report_file',
                        type=argparse.FileType('w'))
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MIN_HASHSIZE:
            print>>sys.stderr, "** WARNING: hashsize is default!  You absodefly want to increase this!\n** Please read the docs!"

        print>>sys.stderr, '\nPARAMETERS:'
        print>>sys.stderr, ' - kmer size =    %d \t\t(-k)' % args.ksize
        print>>sys.stderr, ' - n hashes =     %d \t\t(-N)' % args.n_hashes
        print>>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % args.min_hashsize
        print>>sys.stderr, ' - paired =	      %s \t\t(-p)' % args.paired
        print>>sys.stderr, ''
        print>>sys.stderr, 'Estimated memory usage is %.2g bytes (n_hashes x min_hashsize)' % (args.n_hashes * args.min_hashsize)
        print>>sys.stderr, '-'*8

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    DESIRED_COVERAGE = args.cutoff
    report_fp = args.report_file
    filenames = args.input_filenames

    # In paired mode we read two records at a time
    batch_size = 1
    if args.paired:
        batch_size = 2

    if args.loadhash:
        print 'loading hashtable from', args.loadhash
        ht = khmer.load_counting_hash(args.loadhash)
    else:
        print 'making hashtable'
        ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    total = 0
    discarded = 0

    for input_filename in filenames:
        output_name = os.path.basename(input_filename) + '.keep'
        outfp = open(output_name, 'w')

        n = -1
        for n, batch in enumerate(batchwise(screed.open(input_filename), batch_size)):
            if n > 0 and n % 100000 == 0:
                print '... kept', total - discarded, 'of', total, ', or', \
                    int(100. - discarded / float(total) * 100.), '%'
                print '... in file', input_filename

                if report_fp:
                    print>>report_fp, total, total - discarded, \
                        1. - (discarded / float(total))
                    report_fp.flush()

            total += batch_size

            # If in paired mode, check that the reads are properly interleaved
            if args.paired:
                if not validpair(batch[0], batch[1]):
                    print >>sys.stderr, 'Error: Improperly interleaved pairs %s %s' % (batch[0].name, batch[1].name)
                    sys.exit(-1)

            # Emit the batch of reads if any read passes the filter
            # and all reads are longer than K
            passed_filter = False
            passed_length = True
            for record in batch:
                if len(record.sequence) < K:
                    passed_length = False
                    continue

                seq = record.sequence.replace('N', 'A')
                med, _, _ = ht.get_median_count(seq)

                if med < DESIRED_COVERAGE:
                    ht.consume(seq)
                    passed_filter = True
            
            # Emit records if any passed
            if passed_length and passed_filter:
                for record in batch:
                    if hasattr(record,'accuracy'):
                        outfp.write('@%s\n%s\n+\n%s\n' % (record.name, 
                                                          record.sequence, 
                                                          record.accuracy))
                    else:
                        outfp.write('>%s\n%s\n' % (record.name, record.sequence))
            else:
                discarded += batch_size

        if -1 < n:
            print 'DONE with', input_filename, '; kept', total - discarded, 'of',\
                total, 'or', int(100. - discarded / float(total) * 100.), '%'
            print 'output in', output_name
        else:
            print 'SKIPPED empty file', input_filename

    if args.savehash:
        print 'Saving hashfile through', input_filename
        print '...saving to', args.savehash
        ht.save(args.savehash)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >>sys.stderr, "**"
        print >>sys.stderr, "** ERROR: the counting hash is too small for"
        print >>sys.stderr, "** this data set.  Increase hashsize/num ht."
        print >>sys.stderr, "**"
        print >>sys.stderr, "** Do not use these results!!"
        sys.exit(-1)
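
The snippet above calls batchwise and validpair without defining them. A minimal sketch of how such helpers could look, inferred only from the way they are used here (both names belong to the surrounding script; these bodies are assumptions, not the original implementations):

import itertools


def batchwise(record_iter, size):
    # Hypothetical helper: yield tuples of `size` consecutive records,
    # e.g. read pairs when size == 2 (matches the usage above).
    records = iter(record_iter)
    while True:
        batch = tuple(itertools.islice(records, size))
        if not batch:
            return
        yield batch


def validpair(read0, read1):
    # Hypothetical helper: accept reads whose names differ only in the
    # trailing /1 and /2 mate suffix (a common FASTQ pairing convention).
    return (read0.name.endswith('/1') and
            read1.name.endswith('/2') and
            read0.name[:-2] == read1.name[:-2])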
Example #41
0
def main():
    info('load-graph.py', ['graph'])
    args = get_parser().parse_args()
    report_on_config(args, hashtype='hashbits')

    base = args.output_filename
    filenames = args.input_filenames
    n_threads = int(args.n_threads)

    for _ in args.input_filenames:
        check_file_status(_)

    check_space(args.input_filenames)
    check_space_for_hashtable(float(args.n_tables * args.min_tablesize) / 8.)

    print 'Saving k-mer presence table to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)
    if args.no_build_tagset:
        print 'We WILL NOT build the tagset.'
    else:
        print 'We WILL build the tagset (for partitioning/traversal).'

    print 'making k-mer presence table'
    htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables)

    if args.no_build_tagset:
        target_method = htable.consume_fasta_with_reads_parser
    else:
        target_method = htable.consume_fasta_and_tag_with_reads_parser

    config = khmer.get_config()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    for _, filename in enumerate(filenames):

        rparser = khmer.ReadParser(filename, n_threads)
        threads = []
        print 'consuming input', filename
        for _ in xrange(n_threads):
            cur_thrd = threading.Thread(target=target_method, args=(rparser, ))
            threads.append(cur_thrd)
            cur_thrd.start()

        for thread in threads:
            thread.join()

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of k-mers: {0}'.format(
            htable.n_occupied())

    print 'saving k-mer presence table in', base + '.pt'
    htable.save(base + '.pt')

    if not args.no_build_tagset:
        print 'saving tagset in', base + '.tagset'
        htable.save_tagset(base + '.tagset')

    info_fp = open(base + '.info', 'w')
    info_fp.write('%d unique k-mers' % htable.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(htable)
    print 'fp rate estimated to be %1.3f' % fp_rate
    if args.write_fp_rate:
        print >> info_fp, \
            '\nfalse positive rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.15:  # 0.18 is ACTUAL MAX. Do not change.
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the graph structure is too small for "
                              "this data set. Increase table size/# tables.")
        print >> sys.stderr, "**"
        sys.exit(1)
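
khmer.calc_expected_collisions reports the expected false positive rate of the Bloom-filter-like presence table. As a rough back-of-the-envelope check, the following sketch assumes the standard occupancy-based estimate (this is not khmer's actual implementation):

def rough_fp_rate(n_occupied, tablesize, n_tables):
    # Sketch only: if each table has a fraction (n_occupied / tablesize) of
    # its slots set, a never-seen k-mer looks present roughly when it hits
    # an occupied slot in every table.
    occupancy = float(n_occupied) / tablesize
    return occupancy ** n_tables

# e.g. four tables at 40% occupancy:
# rough_fp_rate(4e8, 1e9, 4) -> 0.0256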
Example #42
0
def main():
    info('filter-abund-single.py', ['counting', 'SeqAn'])
    args = get_parser().parse_args()
    check_input_files(args.datafile, args.force)
    check_space([args.datafile], args.force)
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize,
                                  args.force)
    report_on_config(args)

    print >> sys.stderr, 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables)

    # first, load reads into hash table
    rparser = khmer.ReadParser(args.datafile)
    threads = []
    print >> sys.stderr, 'consuming input, round 1 --', args.datafile
    for _ in xrange(args.threads):
        cur_thread = \
            threading.Thread(
                target=htable.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(cur_thread)
        cur_thread.start()

    for _ in threads:
        _.join()

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
            htable.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(htable, args.force)
    print >> sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate

    # now, trim.

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
        seqN = seq.replace('N', 'A')

        _, trim_at = htable.trim_on_abundance(seqN, args.cutoff)

        if trim_at >= args.ksize:
            # be sure to not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]

        return None, None

    # the filtering loop
    print >> sys.stderr, 'filtering', args.datafile
    outfile = os.path.basename(args.datafile) + '.abundfilt'
    outfp = open(outfile, 'w')

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(args.datafile), outfp)

    print >> sys.stderr, 'output in', outfile

    if args.savetable:
        print >>sys.stderr, 'Saving k-mer counting table filename', \
            args.savetable
        print >> sys.stderr, '...saving to', args.savetable
        htable.save(args.savetable)
    print >> sys.stderr, 'wrote to: ', outfile
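
For reference, the ThreadedSequenceProcessor call above is roughly equivalent to the following single-threaded loop (a sketch that assumes screed-style records and the same process_fn contract of returning (name, sequence) or (None, None)):

import screed


def run_filter_serially(process_fn, infile, outfp):
    # Sketch: apply process_fn to every record and write the survivors
    # back out as FASTA.
    for record in screed.open(infile):
        name, seq = process_fn(record)
        if name is not None:
            outfp.write('>%s\n%s\n' % (name, seq))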
Example #43
0
File: do-partition.py Project: Xyroe/khmer
def main():  # pylint: disable=too-many-locals,too-many-statements
    info("do-partition.py", ["graph"])
    args = sanitize_help(get_parser()).parse_args()

    report_on_config(args, graphtype="nodegraph")

    for infile in args.input_filenames:
        check_input_files(infile, args.force)

    check_space(args.input_filenames, args.force)

    print("Saving k-mer nodegraph to %s" % args.graphbase, file=sys.stderr)
    print("Loading kmers from sequences in %s" % repr(args.input_filenames), file=sys.stderr)
    print("--", file=sys.stderr)
    print("SUBSET SIZE", args.subset_size, file=sys.stderr)
    print("N THREADS", args.threads, file=sys.stderr)
    print("--", file=sys.stderr)

    # load-graph.py

    print("making nodegraph", file=sys.stderr)
    nodegraph = khmer_args.create_nodegraph(args)

    for _, filename in enumerate(args.input_filenames):
        print("consuming input", filename, file=sys.stderr)
        nodegraph.consume_fasta_and_tag(filename)

    # 0.18 is ACTUAL MAX. Do not change.
    fp_rate = khmer.calc_expected_collisions(nodegraph, args.force, max_false_pos=0.15)
    print("fp rate estimated to be %1.3f" % fp_rate, file=sys.stderr)

    # partition-graph

    # do we want to exhaustively traverse the graph?
    stop_big_traversals = args.no_big_traverse
    if stop_big_traversals:
        print("** This script brakes for lumps: ", "stop_big_traversals is true.", file=sys.stderr)
    else:
        print("** Traverse all the things:", " stop_big_traversals is false.", file=sys.stderr)

    #
    # now, partition!
    #

    # divide the tags up into subsets
    divvy = nodegraph.divide_tags_into_subsets(int(args.subset_size))
    divvy = list(divvy)
    n_subsets = len(divvy)
    divvy.append(0)

    # build a queue of tasks:
    worker_q = queue.Queue()

    # break up the subsets into a list of worker tasks
    for _ in range(0, n_subsets):
        start = divvy[_]
        end = divvy[_ + 1]
        worker_q.put((nodegraph, _, start, end))

    print("enqueued %d subset tasks" % n_subsets, file=sys.stderr)
    open("%s.info" % args.graphbase, "w").write("%d subsets total\n" % (n_subsets))

    if n_subsets < args.threads:
        args.threads = n_subsets

    # start threads!
    print("starting %d threads" % args.threads, file=sys.stderr)
    print("---", file=sys.stderr)

    threads = []
    for _ in range(args.threads):
        cur_thread = threading.Thread(target=worker, args=(worker_q, args.graphbase, stop_big_traversals))
        threads.append(cur_thread)
        cur_thread.start()

    assert threading.active_count() == args.threads + 1

    print("done starting threads", file=sys.stderr)

    # wait for threads
    for _ in threads:
        _.join()

    print("---", file=sys.stderr)
    print("done making subsets! see %s.subset.*.pmap" % (args.graphbase,), file=sys.stderr)

    # merge-partitions

    pmap_files = glob.glob(args.graphbase + ".subset.*.pmap")

    print("loading %d pmap files (first one: %s)" % (len(pmap_files), pmap_files[0]), file=sys.stderr)

    nodegraph = khmer.Nodegraph(args.ksize, 1, 1)

    for pmap_file in pmap_files:
        print("merging", pmap_file, file=sys.stderr)
        nodegraph.merge_subset_from_disk(pmap_file)

    if args.remove_subsets:
        print("removing pmap files", file=sys.stderr)
        for pmap_file in pmap_files:
            os.unlink(pmap_file)

    # annotate-partitions

    for infile in args.input_filenames:
        print("outputting partitions for", infile, file=sys.stderr)
        outfile = os.path.basename(infile) + ".part"
        part_count = nodegraph.output_partitions(infile, outfile)
        print("output %d partitions for %s" % (part_count, infile), file=sys.stderr)
        print("partitions are in", outfile, file=sys.stderr)
Example #44
0
def main():  # pylint: disable=too-many-branches,too-many-statements
    parser = sanitize_help(get_parser())
    args = parser.parse_args()

    configure_logging(args.quiet)
    report_on_config(args)

    report_fp = args.report
    force_single = args.force_single

    # check for similar filenames
    # if we're using a single output file only check for identical filenames
    # otherwise, check for identical BASE names as well.
    filenames = []
    basenames = []
    for pathfilename in args.input_filenames:
        filenames.append(pathfilename)
        if args.single_output_file:
            continue  # nothing more to worry about

        basename = os.path.basename(pathfilename)
        if basename in basenames:
            log_error('ERROR: Duplicate filename--Cannot handle this!')
            log_error('** Exiting!')
            sys.exit(1)

        basenames.append(basename)

    # check that files exist and there is sufficient output disk space.
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savegraph is not None:
        graphsize = calculate_graphsize(args, 'countgraph')
        check_space_for_graph(args.savegraph, graphsize, args.force)

    # load or create counting table.
    if args.loadgraph:
        log_info('loading k-mer countgraph from {graph}',
                 graph=args.loadgraph)
        countgraph = Countgraph.load(args.loadgraph)
    else:
        log_info('making countgraph')
        countgraph = khmer_args.create_countgraph(args)

    # create an object to handle diginorm of all files
    norm = Normalizer(args.cutoff, countgraph)
    with_diagnostics = WithDiagnostics(norm, report_fp, args.report_frequency)

    # make a list of all filenames and if they're paired or not;
    # if we don't know if they're paired, default to allowing but not
    # forcing pairing.
    files = []
    for element in filenames:
        files.append([element, args.paired])
    if args.unpaired_reads:
        files.append([args.unpaired_reads, False])

    corrupt_files = []
    outfp = None
    output_name = None

    if args.single_output_file:
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)
    else:
        if '-' in filenames or '/dev/stdin' in filenames:
            print("Accepting input from stdin; output filename must "
                  "be provided with '-o'.", file=sys.stderr)
            sys.exit(1)

    #
    # main loop: iterate over all files given, do diginorm.
    #

    for filename, require_paired in files:
        if not args.single_output_file:
            output_name = os.path.basename(filename) + '.keep'
            outfp = open(output_name, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        # failsafe context manager in case an input file breaks
        with catch_io_errors(filename, outfp, args.single_output_file,
                             args.force, corrupt_files):
            screed_iter = clean_input_reads(screed.open(filename))
            reader = broken_paired_reader(screed_iter, min_length=args.ksize,
                                          force_single=force_single,
                                          require_paired=require_paired)

            # actually do diginorm
            for record in with_diagnostics(reader, filename):
                if record is not None:
                    write_record(record, outfp)

            log_info('output in {name}', name=describe_file_handle(outfp))
            if not args.single_output_file:
                outfp.close()

    # finished - print out some diagnostics.

    log_info('Total number of unique k-mers: {umers}',
             umers=countgraph.n_unique_kmers())

    if args.savegraph is not None:
        log_info('...saving to {name}', name=args.savegraph)
        countgraph.save(args.savegraph)

    fp_rate = \
        khmer.calc_expected_collisions(countgraph, False, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975

    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    if args.force and len(corrupt_files) > 0:
        log_error("** WARNING: Finished with errors!")
        log_error("** I/O Errors occurred in the following files:")
        log_error("\t" + " ".join(corrupt_files))
Example #45
0
def main():
    info('correct-reads.py', ['streaming'])
    args = sanitize_help(get_parser()).parse_args()

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        print("Error: Cannot input the same filename multiple times.",
              file=sys.stderr)
        sys.exit(1)

    ###

    report_on_config(args)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    tablesize = calculate_graphsize(args, 'countgraph')

    if args.savegraph:
        check_space_for_graph(args.savegraph, tablesize, args.force)

    K = args.ksize

    CUTOFF = args.cutoff
    NORMALIZE_LIMIT = args.normalize_to

    if args.loadgraph:
        print('loading k-mer countgraph from', args.loadgraph, file=sys.stderr)
        ct = Countgraph.load(args.loadgraph)
    else:
        print('making k-mer countgraph', file=sys.stderr)
        ct = create_countgraph(args, multiplier=8 / (9. + 0.3))
    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    print('created temporary directory %s; use -T to change location' %
          tempdir,
          file=sys.stderr)

    aligner = khmer.ReadAligner(ct, args.cutoff, args.bits_theta)

    # ### FIRST PASS ###

    save_pass2_total = 0

    n_bp = 0
    n_reads = 0
    written_bp = 0
    written_reads = 0
    corrected_reads = 0

    pass2list = []
    for filename in args.input_filenames:
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        if args.out is None:
            corrfp = open(os.path.basename(filename) + '.corr', 'w')
        else:
            corrfp = args.out

        pass2list.append((filename, pass2filename, corrfp))

        screed_iter = screed.open(filename, parse_description=False)
        pass2fp = open(pass2filename, 'w')

        save_pass2 = 0
        n = 0

        paired_iter = broken_paired_reader(screed_iter,
                                           min_length=K,
                                           force_single=args.ignore_pairs)
        for n, is_pair, read1, read2 in paired_iter:
            if n % 10000 == 0:
                print('...',
                      n,
                      filename,
                      save_pass2,
                      n_reads,
                      n_bp,
                      written_reads,
                      written_bp,
                      file=sys.stderr)

            # we want to track paired reads here, to make sure that pairs
            # are not split between first pass and second pass.

            if is_pair:
                n_reads += 2
                n_bp += len(read1.sequence) + len(read2.sequence)

                seq1 = read1.sequence.replace('N', 'A')
                seq2 = read2.sequence.replace('N', 'A')

                med1, _, _ = ct.get_median_count(seq1)
                med2, _, _ = ct.get_median_count(seq2)

                if med1 < NORMALIZE_LIMIT or med2 < NORMALIZE_LIMIT:
                    ct.consume(seq1)
                    ct.consume(seq2)
                    write_record_pair(read1, read2, pass2fp)
                    save_pass2 += 2
                else:
                    is_aligned, new_seq1 = correct_sequence(aligner, seq1)
                    if is_aligned:
                        if new_seq1 != read1.sequence:
                            corrected_reads += 1
                        read1.sequence = new_seq1
                        if hasattr(read1, 'quality'):
                            fix_quality(read1)

                    is_aligned, new_seq2 = correct_sequence(aligner, seq2)
                    if is_aligned:
                        if new_seq2 != read2.sequence:
                            corrected_reads += 1
                        read2.sequence = new_seq2
                        if hasattr(read2, 'quality'):
                            fix_quality(read2)

                    write_record_pair(read1, read2, corrfp)
                    written_reads += 2
                    written_bp += len(read1)
                    written_bp += len(read2)
            else:
                n_reads += 1
                n_bp += len(read1.sequence)

                seq = read1.sequence.replace('N', 'A')

                med, _, _ = ct.get_median_count(seq)

                # has this portion of the graph saturated? if not,
                # consume & save => pass2.
                if med < NORMALIZE_LIMIT:
                    ct.consume(seq)
                    write_record(read1, pass2fp)
                    save_pass2 += 1
                else:  # trim!!
                    is_aligned, new_seq = correct_sequence(aligner, seq)
                    if is_aligned:
                        if new_seq != read1.sequence:
                            corrected_reads += 1
                        read1.sequence = new_seq
                        if hasattr(read1, 'quality'):
                            fix_quality(read1)

                        write_record(read1, corrfp)

                        written_reads += 1
                        written_bp += len(new_seq)

        pass2fp.close()

        print('%s: kept aside %d of %d from first pass, in %s' %
              (filename, save_pass2, n, filename),
              file=sys.stderr)
        save_pass2_total += save_pass2

    # ### SECOND PASS. ###

    skipped_n = 0
    skipped_bp = 0
    for _, pass2filename, corrfp in pass2list:
        print(('second pass: looking at sequences kept aside in %s') %
              pass2filename,
              file=sys.stderr)

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in contrast
        # to the first loop.

        for n, read in enumerate(
                screed.open(pass2filename, parse_description=False)):
            if n % 10000 == 0:
                print('... x 2',
                      n,
                      pass2filename,
                      written_reads,
                      written_bp,
                      file=sys.stderr)

            seq = read.sequence.replace('N', 'A')
            med, _, _ = ct.get_median_count(seq)

            # do we retain low-abundance components unchanged?
            if med < NORMALIZE_LIMIT and args.variable_coverage:
                write_record(read, corrfp)

                written_reads += 1
                written_bp += len(read.sequence)
                skipped_n += 1
                skipped_bp += len(read.sequence)

            # otherwise, examine/correct.
            else:  # med >= NORMALIZE LIMIT or not args.variable_coverage
                is_aligned, new_seq = correct_sequence(aligner, seq)
                if is_aligned:
                    if new_seq != read.sequence:
                        corrected_reads += 1
                    read.sequence = new_seq
                    if hasattr(read, 'quality'):
                        fix_quality(read)
                    write_record(read, corrfp)

                    written_reads += 1
                    written_bp += len(new_seq)

        print('removing %s' % pass2filename, file=sys.stderr)
        os.unlink(pass2filename)

    print('removing temp directory & contents (%s)' % tempdir, file=sys.stderr)
    shutil.rmtree(tempdir)

    n_passes = 1.0 + (float(save_pass2_total) / n_reads)
    percent_reads_corrected = float(corrected_reads +
                                    (n_reads - written_reads)) /\
        n_reads * 100.0

    print('read %d reads, %d bp' % (
        n_reads,
        n_bp,
    ), file=sys.stderr)
    print('wrote %d reads, %d bp' % (
        written_reads,
        written_bp,
    ),
          file=sys.stderr)
    print('looked at %d reads twice (%.2f passes)' %
          (save_pass2_total, n_passes),
          file=sys.stderr)
    print('removed %d reads and corrected %d reads (%.2f%%)' %
          (n_reads - written_reads, corrected_reads, percent_reads_corrected),
          file=sys.stderr)
    print('removed %.2f%% of bases (%d total)' %
          ((1 - (written_bp / float(n_bp))) * 100.0, n_bp - written_bp),
          file=sys.stderr)

    if args.variable_coverage:
        percent_reads_hicov = 100.0 * float(n_reads - skipped_n) / n_reads
        print('%d reads were high coverage (%.2f%%);' %
              (n_reads - skipped_n, percent_reads_hicov),
              file=sys.stderr)
        print(('skipped %d reads/%d bases because of low coverage') %
              (skipped_n, skipped_bp),
              file=sys.stderr)

    fp_rate = \
        khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    print('fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate),
          file=sys.stderr)

    print('output in *.corr', file=sys.stderr)

    if args.savegraph:
        print("Saving k-mer countgraph to", args.savegraph, file=sys.stderr)
        ct.save(args.savegraph)
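
correct_sequence, fix_quality and the write_record helpers come from the surrounding script and are not shown here. Based on how the aligner is used elsewhere in these examples (aligner.align returning a score, a graph alignment, a read alignment and a truncation flag), hypothetical versions could look like this (the names, return values and quality padding character are all assumptions):

def correct_sequence(aligner, sequence):
    # Hypothetical sketch: align the read against the countgraph and, if the
    # alignment was not truncated, return the graph-side sequence with gap
    # columns filled in from the read.
    _, graph_aln, read_aln, truncated = aligner.align(sequence)
    if truncated:
        return False, sequence
    corrected = [r if g == '-' else g for g, r in zip(graph_aln, read_aln)]
    return True, ''.join(corrected)


def fix_quality(read):
    # Hypothetical sketch: keep the quality string the same length as the
    # (possibly corrected) sequence by padding or truncating it.
    if len(read.quality) < len(read.sequence):
        read.quality += 'I' * (len(read.sequence) - len(read.quality))
    else:
        read.quality = read.quality[:len(read.sequence)]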
Example #46
0
def main():
    info('trim-low-abund.py', ['streaming'])
    parser = get_parser()
    args = parser.parse_args()

    ###

    if len(set(args.input_filenames)) != len(args.input_filenames):
        print >>sys.stderr, \
            "Error: Cannot input the same filename multiple times."
        sys.exit(1)

    ###

    report_on_config(args)
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savetable:
        check_space_for_hashtable(args.n_tables * args.min_tablesize,
                                  args.force)

    K = args.ksize

    CUTOFF = args.cutoff
    NORMALIZE_LIMIT = args.normalize_to

    if args.loadtable:
        print >> sys.stderr, 'loading k-mer counting table from', args.loadtable
        ct = khmer.load_counting_hash(args.loadtable)
    else:
        print >> sys.stderr, 'making k-mer counting table'
        ct = khmer.new_counting_hash(K, args.min_tablesize, args.n_tables)

    tempdir = tempfile.mkdtemp('khmer', 'tmp', args.tempdir)
    print >>sys.stderr, 'created temporary directory %s; ' \
                        'use -T to change location' % tempdir

    # ### FIRST PASS ###

    save_pass2_total = 0

    n_bp = 0
    n_reads = 0
    written_bp = 0
    written_reads = 0
    trimmed_reads = 0

    pass2list = []
    for filename in args.input_filenames:
        pass2filename = os.path.basename(filename) + '.pass2'
        pass2filename = os.path.join(tempdir, pass2filename)
        if args.out is None:
            trimfp = open(os.path.basename(filename) + '.abundtrim', 'w')
        else:
            trimfp = args.out

        pass2list.append((filename, pass2filename, trimfp))

        screed_iter = screed.open(filename, parse_description=False)
        pass2fp = open(pass2filename, 'w')

        save_pass2 = 0
        n = 0

        paired_iter = broken_paired_reader(screed_iter,
                                           min_length=K,
                                           force_single=args.ignore_pairs)
        for n, is_pair, read1, read2 in paired_iter:
            if n % 10000 == 0:
                print >>sys.stderr, '...', n, filename, save_pass2, \
                    n_reads, n_bp, written_reads, written_bp

            # we want to track paired reads here, to make sure that pairs
            # are not split between first pass and second pass.

            if is_pair:
                n_reads += 2
                n_bp += len(read1.sequence) + len(read2.sequence)

                seq1 = read1.sequence.replace('N', 'A')
                seq2 = read2.sequence.replace('N', 'A')

                med1, _, _ = ct.get_median_count(seq1)
                med2, _, _ = ct.get_median_count(seq2)

                if med1 < NORMALIZE_LIMIT or med2 < NORMALIZE_LIMIT:
                    ct.consume(seq1)
                    ct.consume(seq2)
                    write_record_pair(read1, read2, pass2fp)
                    save_pass2 += 2
                else:
                    _, trim_at1 = ct.trim_on_abundance(seq1, CUTOFF)
                    _, trim_at2 = ct.trim_on_abundance(seq2, CUTOFF)

                    if trim_at1 >= K:
                        read1 = trim_record(read1, trim_at1)

                    if trim_at2 >= K:
                        read2 = trim_record(read2, trim_at2)

                    if trim_at1 != len(seq1):
                        trimmed_reads += 1
                    if trim_at2 != len(seq2):
                        trimmed_reads += 1

                    write_record_pair(read1, read2, trimfp)
                    written_reads += 2
                    written_bp += trim_at1 + trim_at2
            else:
                n_reads += 1
                n_bp += len(read1.sequence)

                seq = read1.sequence.replace('N', 'A')

                med, _, _ = ct.get_median_count(seq)

                # has this portion of the graph saturated? if not,
                # consume & save => pass2.
                if med < NORMALIZE_LIMIT:
                    ct.consume(seq)
                    write_record(read1, pass2fp)
                    save_pass2 += 1
                else:  # trim!!
                    _, trim_at = ct.trim_on_abundance(seq, CUTOFF)
                    if trim_at >= K:
                        new_read = trim_record(read1, trim_at)
                        write_record(new_read, trimfp)

                        written_reads += 1
                        written_bp += trim_at

                        if trim_at != len(read1.sequence):
                            trimmed_reads += 1

        pass2fp.close()

        print >>sys.stderr, '%s: kept aside %d of %d from first pass, in %s' \
            % (filename, save_pass2, n, filename)
        save_pass2_total += save_pass2

    # ### SECOND PASS. ###

    skipped_n = 0
    skipped_bp = 0
    for _, pass2filename, trimfp in pass2list:
        print >> sys.stderr, ('second pass: looking at sequences kept aside '
                              'in %s') % pass2filename

        # note that for this second pass, we don't care about paired
        # reads - they will be output in the same order they're read in,
        # so pairs will stay together if not orphaned.  This is in contrast
        # to the first loop.

        for n, read in enumerate(
                screed.open(pass2filename, parse_description=False)):
            if n % 10000 == 0:
                print >>sys.stderr, '... x 2', n, pass2filename, \
                    written_reads, written_bp

            seq = read.sequence.replace('N', 'A')
            med, _, _ = ct.get_median_count(seq)

            # do we retain low-abundance components unchanged?
            if med < NORMALIZE_LIMIT and args.variable_coverage:
                write_record(read, trimfp)

                written_reads += 1
                written_bp += len(read.sequence)
                skipped_n += 1
                skipped_bp += len(read.sequence)

            # otherwise, examine/trim/truncate.
            else:  # med >= NORMALIZE LIMIT or not args.variable_coverage
                _, trim_at = ct.trim_on_abundance(seq, CUTOFF)
                if trim_at >= K:
                    new_read = trim_record(read, trim_at)
                    write_record(new_read, trimfp)

                    written_reads += 1
                    written_bp += trim_at

                    if trim_at != len(read.sequence):
                        trimmed_reads += 1

        print >> sys.stderr, 'removing %s' % pass2filename
        os.unlink(pass2filename)

    print >> sys.stderr, 'removing temp directory & contents (%s)' % tempdir
    shutil.rmtree(tempdir)

    n_passes = 1.0 + (float(save_pass2_total) / n_reads)
    percent_reads_trimmed = float(trimmed_reads + (n_reads - written_reads)) /\
        n_reads * 100.0

    print >> sys.stderr, 'read %d reads, %d bp' % (
        n_reads,
        n_bp,
    )
    print >> sys.stderr, 'wrote %d reads, %d bp' % (
        written_reads,
        written_bp,
    )
    print >>sys.stderr, 'looked at %d reads twice (%.2f passes)' % \
        (save_pass2_total, n_passes)
    print >>sys.stderr, 'removed %d reads and trimmed %d reads (%.2f%%)' % \
        (n_reads - written_reads, trimmed_reads, percent_reads_trimmed)
    print >>sys.stderr, 'trimmed or removed %.2f%% of bases (%d total)' % \
        ((1 - (written_bp / float(n_bp))) * 100.0, n_bp - written_bp)

    if args.variable_coverage:
        percent_reads_hicov = 100.0 * float(n_reads - skipped_n) / n_reads
        print >>sys.stderr, '%d reads were high coverage (%.2f%%);' % \
            (n_reads - skipped_n, percent_reads_hicov)
        print >> sys.stderr, ('skipped %d reads/%d bases because of low '
                              'coverage') % (skipped_n, skipped_bp)

    fp_rate = \
        khmer.calc_expected_collisions(ct, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975
    print >>sys.stderr, \
        'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate)

    print >> sys.stderr, 'output in *.abundtrim'

    if args.savetable:
        print >> sys.stderr, "Saving k-mer counting table to", args.savetable
        ct.save(args.savetable)
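
trim_record is another helper imported by the full script. From the way it is called above (a read plus a trim position, returning a read that is then written out), a minimal hypothetical version might be:

def trim_record(read, trim_at):
    # Hypothetical sketch: cut the sequence (and quality, for FASTQ reads)
    # at the given position and hand the read back for writing.
    read.sequence = read.sequence[:trim_at]
    if hasattr(read, 'quality'):
        read.quality = read.quality[:trim_at]
    return read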
Example #47
0
def main():
    parser = build_construct_args()
    parser.add_argument('input_filename')
    parser.add_argument('read_filename')

    args = parser.parse_args()

    if not args.quiet:
        if args.min_hashsize == DEFAULT_MIN_HASHSIZE:
            print >>sys.stderr, "** WARNING: hashsize is default!  " \
                "You absodefly want to increase this!\n** " \
                "Please read the docs!"

        print >>sys.stderr, '\nPARAMETERS:'
        print >>sys.stderr, ' - kmer size =    %d \t\t(-k)' % args.ksize
        print >>sys.stderr, ' - n hashes =     %d \t\t(-N)' % args.n_hashes
        print >>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % \
            args.min_hashsize
        print >>sys.stderr, ''
        print >>sys.stderr, 'Estimated memory usage is %.2g bytes ' \
            '(n_hashes x min_hashsize / 8)' % (
            args.n_hashes * args.min_hashsize / 8.)
        print >>sys.stderr, '-' * 8

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes

    inp = args.input_filename
    readsfile = args.read_filename
    tag = args.tag

    outfile = os.path.basename(readsfile) + '.' + tag + '.sweep2'
    outfp = open(outfile, 'w')

    # create a hashbits data structure
    ht = khmer.new_hashbits(K, HT_SIZE, N_HT)

    # load contigs, connect into N partitions
    print 'loading input reads from', inp
    ht.consume_fasta(inp)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >>sys.stderr, "**"
        print >>sys.stderr, "** ERROR: the counting hash is too small for"
        print >>sys.stderr, "** this data set.  Increase hashsize/num ht."
        print >>sys.stderr, "**"
        print >>sys.stderr, "** Do not use these results!!"
        sys.exit(-1)

    print 'starting sweep.'

    n = 0
    m = 0
    for record in screed.open(readsfile):
        if len(record.sequence) < K:
            continue

        if n % 100000 == 0:
            print '...', n, m

        count = ht.get_median_count(record.sequence)[0]
        if count:
            m += 1
            outfp.write('>%s\n%s\n' % (record.name, record.sequence))
        n += 1
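
The sweep decision above keeps a read as soon as get_median_count reports a non-zero median, i.e. roughly as soon as at least half of the read's k-mers were seen in the loaded contigs. The same test as a tiny helper (only the first element of the returned tuple is used, exactly as above):

def overlaps_contigs(ht, sequence, ksize):
    # Sketch: a read "sweeps" against the loaded contigs when the median
    # count of its k-mers is non-zero.
    if len(sequence) < ksize:
        return False
    return ht.get_median_count(sequence)[0] > 0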
Example #48
0
def main():

    info('load-into-counting.py', ['counting', 'SeqAn'])

    args = get_parser().parse_args()
    report_on_config(args)

    base = args.output_countingtable_filename
    filenames = args.input_sequence_filename

    for name in args.input_sequence_filename:
        check_input_files(name, args.force)

    check_space(args.input_sequence_filename, args.force)
    check_space_for_hashtable(args.n_tables * args.min_tablesize, args.force)

    check_file_writable(base)
    check_file_writable(base + ".info")

    print >>sys.stderr, 'Saving k-mer counting table to %s' % base
    print >>sys.stderr, 'Loading kmers from sequences in %s' % repr(filenames)

    # clobber the '.info' file now, as we always open in append mode below
    if os.path.exists(base + '.info'):
        os.remove(base + '.info')

    print >>sys.stderr, 'making k-mer counting table'
    htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                     args.n_tables)
    htable.set_use_bigcount(args.bigcount)

    filename = None

    for index, filename in enumerate(filenames):

        rparser = khmer.ReadParser(filename)
        threads = []
        print >>sys.stderr, 'consuming input', filename
        for _ in xrange(args.threads):
            cur_thrd = \
                threading.Thread(
                    target=htable.consume_fasta_with_reads_parser,
                    args=(rparser, )
                )
            threads.append(cur_thrd)
            cur_thrd.start()

        for thread in threads:
            thread.join()

        if index > 0 and index % 10 == 0:
            check_space_for_hashtable(args.n_tables * args.min_tablesize,
                                      args.force)
            print >>sys.stderr, 'mid-save', base
            htable.save(base)
        with open(base + '.info', 'a') as info_fh:
            print >> info_fh, 'through', filename

    n_kmers = htable.n_unique_kmers()
    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers:', n_kmers
        with open(base + '.info', 'a') as info_fp:
            print >>info_fp, 'Total number of unique k-mers:', n_kmers

    print >>sys.stderr, 'saving', base
    htable.save(base)

    # Change max_false_pos=0.2 only if you really grok it. HINT: You don't
    fp_rate = \
        khmer.calc_expected_collisions(htable, args.force, max_false_pos=.2)

    with open(base + '.info', 'a') as info_fp:
        print >> info_fp, 'fp rate estimated to be %1.3f\n' % fp_rate

    if args.summary_info:
        mr_fmt = args.summary_info.lower()
        mr_file = base + '.info.' + mr_fmt
        print >> sys.stderr, "Writing summmary info to", mr_file
        with open(mr_file, 'w') as mr_fh:
            if mr_fmt == 'json':
                mr_data = {
                    "ht_name": os.path.basename(base),
                    "fpr": fp_rate,
                    "num_kmers": n_kmers,
                    "files": filenames,
                    "mrinfo_version": "0.1.0",
                }
                json.dump(mr_data, mr_fh)
                mr_fh.write('\n')
            elif mr_fmt == 'tsv':
                mr_fh.write("ht_name\tfpr\tnum_kmers\tfiles\n")
                mr_fh.write("{b:s}\t{fpr:1.3f}\t{k:d}\t{fls:s}\n".format(
                    b=os.path.basename(base), fpr=fp_rate, k=n_kmers,
                    fls=";".join(filenames)))

    print >> sys.stderr, 'fp rate estimated to be %1.3f' % fp_rate

    print >>sys.stderr, 'DONE.'
    print >>sys.stderr, 'wrote to:', base + '.info'
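
Because the script can also write its summary as JSON (see the summary_info handling above), the resulting <base>.info.json file can be read back with the standard library. A small usage sketch (the file name below is only an illustration of the pattern produced above):

import json

with open('reads.ct.info.json') as fh:
    summary = json.load(fh)

print('fp rate: %1.3f, unique k-mers: %d' % (summary['fpr'],
                                             summary['num_kmers']))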
Example #49
0
def main():  # pylint: disable=too-many-branches,too-many-statements
    info('normalize-by-median.py', ['diginorm'])
    args = get_parser().parse_args()

    report_on_config(args)

    report_fp = args.report

    # check for similar filenames
    filenames = []
    for pathfilename in args.input_filenames:
        filename = pathfilename.split('/')[-1]
        if (filename in filenames):
            print >>sys.stderr, "WARNING: At least two input files are named \
%s . (The script normalize-by-median.py can not handle this, only one .keep \
file for one of the input files will be generated.)" % filename
        else:
            filenames.append(filename)

    # check for others
    check_valid_file_exists(args.input_filenames)
    check_space(args.input_filenames, args.force)
    if args.savetable:
        check_space_for_hashtable(
            args.n_tables * args.min_tablesize, args.force)

    # list to save error files along with throwing exceptions
    corrupt_files = []

    if args.loadtable:
        print 'loading k-mer counting table from', args.loadtable
        htable = khmer.load_counting_hash(args.loadtable)
    else:
        print >> sys.stderr, 'making k-mer counting table'
        htable = khmer.new_counting_hash(args.ksize, args.min_tablesize,
                                         args.n_tables)

    input_filename = None

    for index, input_filename in enumerate(args.input_filenames):
        total_acc, discarded_acc, corrupt_files = \
            normalize_by_median_and_check(
                input_filename, htable, args.single_output_file,
                args.fail_save, args.paired, args.cutoff, args.force,
                corrupt_files, report_fp)

        if (args.dump_frequency > 0 and
                index > 0 and index % args.dump_frequency == 0):
            print 'Backup: Saving k-mer counting file through', input_filename
            if args.savetable:
                hashname = args.savetable
                print '...saving to', hashname
            else:
                hashname = 'backup.ct'
                print 'Nothing given for savetable, saving to', hashname
            htable.save(hashname)

    if args.paired and args.unpaired_reads:
        args.paired = False
        output_name = args.unpaired_reads
        if not args.single_output_file:
            output_name = os.path.basename(args.unpaired_reads) + '.keep'
        outfp = open(output_name, 'w')
        total_acc, discarded_acc, corrupt_files = \
            normalize_by_median_and_check(
                args.unpaired_reads, htable, args.single_output_file,
                args.fail_save, args.paired, args.cutoff, args.force,
                corrupt_files, report_fp)

    if args.report_total_kmers:
        print >> sys.stderr, 'Total number of unique k-mers: {0}'.format(
            htable.n_unique_kmers())

    if args.savetable:
        print 'Saving k-mer counting table through', input_filename
        print '...saving to', args.savetable
        htable.save(args.savetable)

    fp_rate = \
        khmer.calc_expected_collisions(htable, args.force, max_false_pos=.8)
    # for max_false_pos see Zhang et al., http://arxiv.org/abs/1309.2975

    print >> sys.stderr, \
        'fp rate estimated to be {fpr:1.3f}'.format(fpr=fp_rate)

    if args.force and len(corrupt_files) > 0:
        print >> sys.stderr, "** WARNING: Finished with errors!"
        print >> sys.stderr, "** IOErrors occurred in the following files:"
        print >> sys.stderr, "\t", " ".join(corrupt_files)
Example #50
0
def main():

    args = sanitize_help(get_parser()).parse_args()
    if not args.quiet:
        info('load-into-counting.py', ['counting', 'SeqAn'])

    configure_logging(args.quiet)
    report_on_config(args)

    base = args.output_countgraph_filename
    filenames = args.input_sequence_filename

    for name in args.input_sequence_filename:
        check_input_files(name, args.force)

    tablesize = calculate_graphsize(args, 'countgraph')
    check_space_for_graph(args.output_countgraph_filename, tablesize,
                          args.force)

    info_filename = base + ".info"
    check_file_writable(base)
    check_file_writable(info_filename)

    log_info('Saving k-mer countgraph to {base}', base=base)
    log_info('Loading kmers from sequences in {filenames}',
             filenames=repr(filenames))

    # clobber the '.info' file now, as we always open in append mode below
    with open(info_filename, 'w') as info_fp:
        print('khmer version:', khmer.__version__, file=info_fp)

    log_info('making countgraph')
    countgraph = khmer_args.create_countgraph(args)
    countgraph.set_use_bigcount(args.bigcount)

    filename = None

    total_num_reads = 0

    for index, filename in enumerate(filenames):

        rparser = khmer.ReadParser(filename)
        threads = []
        log_info('consuming input {input}', input=filename)
        for _ in range(args.threads):
            cur_thrd = \
                threading.Thread(
                    target=countgraph.consume_fasta_with_reads_parser,
                    args=(rparser, )
                )
            threads.append(cur_thrd)
            cur_thrd.start()

        for thread in threads:
            thread.join()

        if index > 0 and index % 10 == 0:
            tablesize = calculate_graphsize(args, 'countgraph')
            check_space_for_graph(base, tablesize, args.force)
            log_info('mid-save {base}', base=base)

            countgraph.save(base)
        with open(info_filename, 'a') as info_fh:
            print('through', filename, file=info_fh)
        total_num_reads += rparser.num_reads

    n_kmers = countgraph.n_unique_kmers()
    log_info('Total number of unique k-mers: {nk}', nk=n_kmers)
    with open(info_filename, 'a') as info_fp:
        print('Total number of unique k-mers:', n_kmers, file=info_fp)

    log_info('saving {base}', base=base)
    countgraph.save(base)

    # Change max_false_pos=0.2 only if you really grok it. HINT: You don't
    fp_rate = \
        khmer.calc_expected_collisions(
            countgraph, args.force, max_false_pos=.2)

    with open(info_filename, 'a') as info_fp:
        print('fp rate estimated to be %1.3f\n' % fp_rate, file=info_fp)

    if args.summary_info:
        mr_fmt = args.summary_info.lower()
        mr_file = base + '.info.' + mr_fmt
        log_info("Writing summmary info to {mr_file}", mr_file=mr_file)
        with open(mr_file, 'w') as mr_fh:
            if mr_fmt == 'json':
                mr_data = {
                    "ht_name": os.path.basename(base),
                    "fpr": fp_rate,
                    "num_kmers": n_kmers,
                    "files": filenames,
                    "mrinfo_version": "0.2.0",
                    "num_reads": total_num_reads,
                }
                json.dump(mr_data, mr_fh)
                mr_fh.write('\n')
            elif mr_fmt == 'tsv':
                mr_fh.write("ht_name\tfpr\tnum_kmers\tnum_reads\tfiles\n")
                vals = [
                    os.path.basename(base),
                    "{:1.3f}".format(fp_rate),
                    str(n_kmers),
                    str(total_num_reads),
                    ";".join(filenames),
                ]
                mr_fh.write("\t".join(vals) + "\n")

    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    log_info('DONE.')
    log_info('wrote to: {filename}', filename=info_filename)
Example #51
0
File: do-partition.py Project: RamRS/khmer
def main():  # pylint: disable=too-many-locals,too-many-statements
    info('do-partition.py', ['graph'])
    args = get_parser().parse_args()

    report_on_config(args, hashtype='hashbits')

    for infile in args.input_filenames:
        check_file_status(infile)

    check_space(args.input_filenames)

    print 'Saving k-mer presence table to %s' % args.graphbase
    print 'Loading kmers from sequences in %s' % repr(args.input_filenames)

    print '--'
    print 'SUBSET SIZE', args.subset_size
    print 'N THREADS', args.n_threads
    print '--'

    # load-graph

    print 'making k-mer presence table'
    htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables)

    for _, filename in enumerate(args.input_filenames):
        print 'consuming input', filename
        htable.consume_fasta_and_tag(filename)

    fp_rate = khmer.calc_expected_collisions(htable)
    print 'fp rate estimated to be %1.3f' % fp_rate
    if fp_rate > 0.15:          # 0.18 is ACTUAL MAX. Do not change.
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the graph structure is too small for"
                              " this data set.  Increase k-mer presence table "
                              "size/num of tables.")
        print >> sys.stderr, "**"
        sys.exit(1)

    # partition-graph

    # do we want to exhaustively traverse the graph?
    stop_big_traversals = args.no_big_traverse
    if stop_big_traversals:
        print '** This script brakes for lumps: stop_big_traversals is true.'
    else:
        print '** Traverse all the things: stop_big_traversals is false.'

    #
    # now, partition!
    #

    # divide the tags up into subsets
    divvy = htable.divide_tags_into_subsets(int(args.subset_size))
    n_subsets = len(divvy)
    divvy.append(0)

    # build a queue of tasks:
    worker_q = Queue.Queue()

    # break up the subsets into a list of worker tasks
    for _ in range(0, n_subsets):
        start = divvy[_]
        end = divvy[_ + 1]
        worker_q.put((htable, _, start, end))

    print 'enqueued %d subset tasks' % n_subsets
    open('%s.info' % args.graphbase, 'w').write('%d subsets total\n'
                                                % (n_subsets))

    if n_subsets < args.n_threads:
        args.n_threads = n_subsets

    # start threads!
    print 'starting %d threads' % args.n_threads
    print '---'

    threads = []
    for _ in range(args.n_threads):
        cur_thread = threading.Thread(target=worker,
                                      args=(worker_q, args.graphbase,
                                            stop_big_traversals))
        threads.append(cur_thread)
        cur_thread.start()

    print 'done starting threads'

    # wait for threads
    for _ in threads:
        _.join()

    print '---'
    print 'done making subsets! see %s.subset.*.pmap' % (args.graphbase,)

    # merge-partitions

    pmap_files = glob.glob(args.graphbase + '.subset.*.pmap')

    print 'loading %d pmap files (first one: %s)' % (len(pmap_files),
                                                     pmap_files[0])

    htable = khmer.new_hashbits(args.ksize, 1, 1)

    for pmap_file in pmap_files:
        print 'merging', pmap_file
        htable.merge_subset_from_disk(pmap_file)

    if args.remove_subsets:
        print 'removing pmap files'
        for pmap_file in pmap_files:
            os.unlink(pmap_file)

    # annotate-partitions

    for infile in args.input_filenames:
        print 'outputting partitions for', infile
        outfile = os.path.basename(infile) + '.part'
        part_count = htable.output_partitions(infile, outfile)
        print 'output %d partitions for %s' % (part_count, infile)
        print 'partitions are in', outfile
Example #52
0
def main():
    parser = build_counting_args()
    parser.add_argument("-t", "--trusted-cutoff", dest="trusted_cutoff", type=int, default=3)
    parser.add_argument("--bits-theta", help="Tuning parameter controlling trade off of speed vs alignment sensitivity", default=1.0, type=float, dest="bits_theta")
    parser.add_argument('-C', '--cutoff', type=int, dest='cutoff',
                        default=DEFAULT_MINIMUM_COVERAGE)
    parser.add_argument('-s', '--savehash', dest='savehash', default='')
    parser.add_argument('-l', '--loadhash', dest='loadhash',
                        default='')
    parser.add_argument('--details-out', dest="details_out")
    parser.add_argument('input_filenames', nargs='+')

    args = parser.parse_args()

    if not args.quiet:
        print >>sys.stderr, '\nPARAMETERS:'
        print >>sys.stderr, ' - kmer size =    %d \t\t(-k)' % args.ksize
        print >>sys.stderr, ' - n hashes =     %d \t\t(-N)' % args.n_hashes
        print >>sys.stderr, ' - min hashsize = %-5.2g \t(-x)' % \
            args.min_hashsize
        print >>sys.stderr, ''
        print >>sys.stderr, 'Estimated memory usage is %.2g bytes ' \
            '(n_hashes x min_hashsize)' % (
            args.n_hashes * args.min_hashsize)
        print >>sys.stderr, '-' * 8

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    DESIRED_COVERAGE = args.cutoff

    filenames = args.input_filenames

    if args.loadhash:
        print 'loading hashtable from', args.loadhash
        ht = khmer.load_counting_hash(args.loadhash)
    else:
        print 'making hashtable'
        ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    aligner = khmer.new_readaligner(ht, args.trusted_cutoff, args.bits_theta)
            
    if args.details_out != None:
        details_out = open(args.details_out, "w")
    else:
        details_out = None

    total = 0
    discarded = 0
    for input_filename in filenames:
        output_name = os.path.basename(input_filename) + '.keepalign'
        outfp = open(output_name, 'w')

        for n, record in enumerate(screed.open(input_filename)):
            if n > 0 and n % 10000 == 0:
                print '... kept', total - discarded, 'of', total, ', or', \
                    int(100. - discarded / float(total) * 100.), '%'
                print '... in file', input_filename

            total += 1

            if len(record.sequence) < K:
                continue

            seq = record.sequence.upper().replace('N', 'A')

            ##
            score, graph_alignment, read_alignment, truncated = aligner.align(
                record.sequence)

            keep = False
            if truncated:
                keep = True
            else:
                if False:
                    graph_seq = graph_alignment.replace("-", "")
                else:
                    graph_seq = ""
                    for i in range(len(graph_alignment)):
                        if graph_alignment[i] == "-":
                            graph_seq += read_alignment[i]
                        else:
                            graph_seq += graph_alignment[i]

                mincount = ht.get_min_count(graph_seq)
                keep = True
                seq = graph_seq

                #if mincount < DESIRED_COVERAGE:
                #    keep = True
                #    seq = graph_seq
                #else:
                #    assert not keep

            if details_out != None:
                details_out.write("+{7}\t{0:0.2f}\t{3}\t{4}\nread:      {6}\ngraph_aln: {1}\nread_aln:  {2}\nstored_seq:{5}\n".format(score, graph_alignment, read_alignment, truncated, keep, seq, record.sequence, record.name))


            if keep:
                ht.consume(seq)
                outfp.write('>%s\n%s\n' % (record.name, seq))
            else:
                discarded += 1

        print 'DONE with', input_filename, '; kept', total - discarded, 'of',\
            total, 'or', int(100. - discarded / float(total) * 100.), '%'
        print 'output in', output_name

    if args.savehash:
        print 'Saving hashfile through', input_filename
        print '...saving to', args.savehash
        ht.save(args.savehash)

    # Change 0.2 only if you really grok it.  HINT: You don't.
    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    if fp_rate > 0.20:
        print >>sys.stderr, "**"
        print >>sys.stderr, "** ERROR: the counting hash is too small for"
        print >>sys.stderr, "** this data set.  Increase hashsize/num ht."
        print >>sys.stderr, "**"
        print >>sys.stderr, "** Do not use these results!!"
        sys.exit(-1)
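
The inner loop above that rebuilds graph_seq from the two alignment strings can be expressed more compactly. A behavior-equivalent sketch of that step as a standalone helper:

def resolve_alignment(graph_alignment, read_alignment):
    # Sketch: take the graph base where one exists, otherwise fall back to
    # the read base (i.e. fill gap columns in the graph alignment).
    return ''.join(r if g == '-' else g
                   for g, r in zip(graph_alignment, read_alignment))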
Example #53
0
def main():
    parser = build_construct_args(
        "Filter k-mers at the given abundance (inmem version).")
    add_threading_args(parser)

    parser.add_argument('--cutoff', '-C', dest='cutoff',
                        default=DEFAULT_CUTOFF, type=int,
                        help="Trim at k-mers below this abundance.")
    parser.add_argument('--savehash', dest='savehash', default='')
    parser.add_argument('datafile')
    
    args = parser.parse_args()
    report_on_config(args)

    K = args.ksize
    HT_SIZE = args.min_hashsize
    N_HT = args.n_hashes
    n_threads = int(args.n_threads)

    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    print 'making hashtable'
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT, n_threads)

    filename = args.datafile

    ### first, load reads into hash table
    rparser = khmer.ReadParser(filename, n_threads)
    threads = []
    print 'consuming input, round 1 --', filename
    for tnum in xrange(n_threads):
        t = \
            threading.Thread(
                target=ht.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(t)
        t.start()

    for t in threads:
        t.join()

    fp_rate = khmer.calc_expected_collisions(ht)
    print 'fp rate estimated to be %1.3f' % fp_rate

    ### now, trim.

    ### the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']
        if 'N' in seq:
            return None, None

        trim_seq, trim_at = ht.trim_on_abundance(seq, args.cutoff)

        if trim_at >= K:
            return name, trim_seq

        return None, None

    ### the filtering loop
    print 'filtering', filename
    outfile = os.path.basename(filename) + '.abundfilt'
    outfp = open(outfile, 'w')

    tsp = ThreadedSequenceProcessor(process_fn)
    tsp.start(verbose_loader(filename), outfp)

    print 'output in', outfile

    if args.savehash:
        print 'Saving hashfile', args.savehash
        print '...saving to', args.savehash
        ht.save(args.savehash)
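The core of process_fn above is trim_on_abundance, which cuts a read at the first k-mer whose count falls below the cutoff. A minimal sketch, assuming khmer >= 2.0 where Countgraph exposes trim_on_abundance; the k-mer size, table size, cutoff, and sequences are illustrative:

import khmer

K, CUTOFF = 17, 2
cg = khmer.Countgraph(K, int(1e6), 4)

# count the trusted prefix a few times so its k-mers clear the cutoff
for _ in range(3):
    cg.consume('ATGGCAAATTGGCAATCGGCA')

read = 'ATGGCAAATTGGCAATCGGCA' + 'T' * 17   # low-abundance tail
trim_seq, trim_at = cg.trim_on_abundance(read, CUTOFF)
if trim_at >= K:   # same keep/discard rule as process_fn
    print('kept %d of %d bp' % (len(trim_seq), len(read)))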
Example #54
0
File: load-graph.py Project: RamRS/khmer
def main():
    info('load-graph.py', ['graph'])
    args = get_parser().parse_args()
    report_on_config(args, hashtype='hashbits')

    base = args.output_filename
    filenames = args.input_filenames
    n_threads = int(args.n_threads)

    for _ in args.input_filenames:
        check_file_status(_)

    check_space(args.input_filenames)
    check_space_for_hashtable(float(args.n_tables * args.min_tablesize) / 8.)

    print 'Saving k-mer presence table to %s' % base
    print 'Loading kmers from sequences in %s' % repr(filenames)
    if args.no_build_tagset:
        print 'We WILL NOT build the tagset.'
    else:
        print 'We WILL build the tagset (for partitioning/traversal).'

    print 'making k-mer presence table'
    htable = khmer.new_hashbits(args.ksize, args.min_tablesize, args.n_tables)

    if args.no_build_tagset:
        target_method = htable.consume_fasta_with_reads_parser
    else:
        target_method = htable.consume_fasta_and_tag_with_reads_parser

    config = khmer.get_config()
    config.set_reads_input_buffer_size(n_threads * 64 * 1024)

    for _, filename in enumerate(filenames):

        rparser = khmer.ReadParser(filename, n_threads)
        threads = []
        print 'consuming input', filename
        for _ in xrange(n_threads):
            cur_thrd = threading.Thread(target=target_method, args=(rparser, ))
            threads.append(cur_thrd)
            cur_thrd.start()

        for thread in threads:
            thread.join()

    print 'saving k-mer presence table in', base + '.pt'
    htable.save(base + '.pt')

    if not args.no_build_tagset:
        print 'saving tagset in', base + '.tagset'
        htable.save_tagset(base + '.tagset')

    info_fp = open(base + '.info', 'w')
    info_fp.write('%d unique k-mers' % htable.n_unique_kmers())

    fp_rate = khmer.calc_expected_collisions(htable)
    print 'fp rate estimated to be %1.3f' % fp_rate
    if fp_rate > 0.15:          # 0.18 is ACTUAL MAX. Do not change.
        print >> sys.stderr, "**"
        print >> sys.stderr, ("** ERROR: the graph structure is too small for "
                              "this data set.  Increase table size/# tables.")
        print >> sys.stderr, "**"
        sys.exit(1)
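For graph (presence-only) structures the tolerable false-positive rate is lower than for counting, hence the 0.15 cutoff and the 0.18 ceiling noted in the comment above. A minimal sketch of the same check with the newer Nodegraph API; the k-mer size, table size, and sequence are illustrative:

import sys
import khmer

ng = khmer.Nodegraph(20, int(1e6), 4)
ng.consume('ATGGCAAATTGGCAATCGGCATGCAGGACCTTA')
print('%d unique k-mers' % ng.n_unique_kmers())

fp_rate = khmer.calc_expected_collisions(ng, False, max_false_pos=1.0)
print('fp rate estimated to be %1.3f' % fp_rate)
if fp_rate > 0.15:   # 0.18 is the hard ceiling for reliable traversal
    sys.exit('** ERROR: the graph structure is too small for this data set.')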
Example #55
0
def main():

    args = sanitize_help(get_parser()).parse_args()

    configure_logging(args.quiet)
    report_on_config(args)

    base = args.output_countgraph_filename
    filenames = args.input_sequence_filename

    for name in args.input_sequence_filename:
        check_input_files(name, args.force)

    tablesize = calculate_graphsize(args, 'countgraph')
    check_space_for_graph(args.output_countgraph_filename, tablesize,
                          args.force)

    info_filename = base + ".info"
    check_file_writable(base)
    check_file_writable(info_filename)

    log_info('Saving k-mer countgraph to {base}', base=base)
    log_info('Loading kmers from sequences in {filenames}',
             filenames=repr(filenames))

    # clobber the '.info' file now, as we always open in append mode below
    with open(info_filename, 'w') as info_fp:
        print('khmer version:', khmer.__version__, file=info_fp)

    log_info('making countgraph')
    countgraph = khmer_args.create_countgraph(args)

    filename = None

    total_num_reads = 0

    for index, filename in enumerate(filenames):

        rparser = khmer.ReadParser(filename)
        threads = []
        log_info('consuming input {input}', input=filename)
        for _ in range(args.threads):
            cur_thrd = \
                threading.Thread(
                    target=countgraph.consume_seqfile_with_reads_parser,
                    args=(rparser, )
                )
            threads.append(cur_thrd)
            cur_thrd.start()

        for thread in threads:
            thread.join()

        if index > 0 and index % 10 == 0:
            tablesize = calculate_graphsize(args, 'countgraph')
            check_space_for_graph(base, tablesize, args.force)
            log_info('mid-save {base}', base=base)

            countgraph.save(base)
        with open(info_filename, 'a') as info_fh:
            print('through', filename, file=info_fh)
        total_num_reads += rparser.num_reads

    n_kmers = countgraph.n_unique_kmers()
    log_info('Total number of unique k-mers: {nk}', nk=n_kmers)
    with open(info_filename, 'a') as info_fp:
        print('Total number of unique k-mers:', n_kmers, file=info_fp)

    log_info('saving {base}', base=base)
    countgraph.save(base)

    # Change max_false_pos=0.2 only if you really grok it. HINT: You don't
    fp_rate = \
        khmer.calc_expected_collisions(
            countgraph, args.force, max_false_pos=.2)

    with open(info_filename, 'a') as info_fp:
        print('fp rate estimated to be %1.3f\n' % fp_rate, file=info_fp)

    if args.summary_info:
        mr_fmt = args.summary_info.lower()
        mr_file = base + '.info.' + mr_fmt
        log_info("Writing summmary info to {mr_file}", mr_file=mr_file)
        with open(mr_file, 'w') as mr_fh:
            if mr_fmt == 'json':
                mr_data = {
                    "ht_name": os.path.basename(base),
                    "fpr": fp_rate,
                    "num_kmers": n_kmers,
                    "files": filenames,
                    "mrinfo_version": "0.2.0",
                    "num_reads": total_num_reads,
                }
                json.dump(mr_data, mr_fh)
                mr_fh.write('\n')
            elif mr_fmt == 'tsv':
                mr_fh.write("ht_name\tfpr\tnum_kmers\tnum_reads\tfiles\n")
                vals = [
                    os.path.basename(base),
                    "{:1.3f}".format(fp_rate),
                    str(n_kmers),
                    str(total_num_reads),
                    ";".join(filenames),
                ]
                mr_fh.write("\t".join(vals) + "\n")

    log_info('fp rate estimated to be {fpr:1.3f}', fpr=fp_rate)

    log_info('DONE.')
    log_info('wrote to: {filename}', filename=info_filename)
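The threaded loading loop above hands one ReadParser to several threads, each running consume_seqfile_with_reads_parser. A stripped-down sketch of that pattern, assuming khmer >= 2.1; the input file name, table sizes, and thread count are illustrative:

import threading
import khmer

cg = khmer.Countgraph(20, int(5e6), 4)
rparser = khmer.ReadParser('reads.fq')   # hypothetical input file

threads = [
    threading.Thread(target=cg.consume_seqfile_with_reads_parser,
                     args=(rparser,))
    for _ in range(4)
]
for thread in threads:
    thread.start()
for thread in threads:
    thread.join()

print('processed %d reads' % rparser.num_reads)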
Example #56
0
File: call.py Project: kevlar-dev/kevlar
def main(args):
    # Input and output files
    outstream = kevlar.open(args.out, 'w')
    writer = kevlar.vcf.VCFWriter(
        outstream,
        source='kevlar::call',
        refr=args.refr,
    )
    writer.write_header()

    # Contigs = query sequences
    contigstream = kevlar.parse_partitioned_reads(
        kevlar.parse_augmented_fastx(kevlar.open(args.queryseq, 'r')))
    contigs_by_partition = load_contigs(contigstream)

    gdnastream = kevlar.parse_partitioned_reads(
        kevlar.reference.load_refr_cutouts(kevlar.open(args.targetseq, 'r')))
    mask = None
    if args.gen_mask:
        message = 'generating mask of variant-spanning k-mers'
        kevlar.plog('[kevlar::call]', message)
        ntables = 4
        buckets = args.mask_mem * _buckets_per_byte['nodegraph'] / ntables
        mask = khmer.Nodetable(args.ksize, buckets, ntables)
    progress_indicator = kevlar.ProgressIndicator(
        '[kevlar::call] processed contigs/gDNAs for {counter} partitions',
        interval=10,
        breaks=[100, 1000, 10000],
    )
    for partid, gdnas in gdnastream:
        progress_indicator.update()
        if partid not in contigs_by_partition:
            continue
        contigs = contigs_by_partition[partid]
        caller = call(
            gdnas,
            contigs,
            partid,
            match=args.match,
            mismatch=args.mismatch,
            gapopen=args.open,
            gapextend=args.extend,
            ksize=args.ksize,
            refrfile=args.refr,
            debug=args.debug,
            mindist=5,
            homopolyfilt=not args.no_homopoly_filter,
            maxtargetlen=args.max_target_length,
        )
        for varcall in caller:
            if args.gen_mask:
                window = varcall.attribute('ALTWINDOW')
                if window is not None and len(window) >= args.ksize:
                    mask.consume(window)
            writer.write(varcall)
    if args.gen_mask:
        fpr = khmer.calc_expected_collisions(mask, max_false_pos=1.0)
        if fpr > args.mask_max_fpr:
            message = 'WARNING: mask FPR is {:.4f}'.format(fpr)
            message += '; exceeds user-specified limit'
            message += ' of {:.4f}'.format(args.mask_max_fpr)
            kevlar.plog('[kevlar::call]', message)
        mask.save(args.gen_mask)
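The mask sizing above leans on kevlar's internal _buckets_per_byte lookup; for a nodegraph/nodetable each bucket is a single bit, so the arithmetic reduces to roughly 8 buckets per byte of budget. A sketch of that calculation with an illustrative 100 MB budget and k-mer size (not kevlar's code):

import khmer

mask_mem = 100 * 1000 * 1000       # memory budget in bytes (illustrative)
ntables = 4
buckets = mask_mem * 8 // ntables  # 1 bit per bucket -> 8 buckets per byte
mask = khmer.Nodetable(31, buckets, ntables)

# consume a toy variant-spanning window, then report the mask FPR as above
mask.consume('TTAACCCTTAAGGCCTTTAACCCTTAAGGCCTTAG')
fpr = khmer.calc_expected_collisions(mask, max_false_pos=1.0)
print('mask FPR is {:.4f}'.format(fpr))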