Example #1
def test_save_load(tabletype):
    kh = tabletype(5, PRIMES_1m)
    savefile = utils.get_temp_filename('tablesave.out')

    # test add(dna)
    x = kh.add("ATGGC")
    z = kh.get("ATGGC")
    assert z == 1

    kh.save(savefile)

    # should we provide a single load function here? yes, probably. @CTB
    if tabletype == _Countgraph:
        loaded = khmer.load_countgraph(savefile)
    elif tabletype == _Counttable:
        loaded = khmer.load_counttable(savefile)
    elif tabletype == _SmallCountgraph:
        loaded = khmer.load_countgraph(savefile, small=True)
    elif tabletype == _SmallCounttable:
        loaded = khmer.load_counttable(savefile, small=True)
    elif tabletype == _Nodegraph:
        loaded = khmer.load_nodegraph(savefile)
    elif tabletype == _Nodetable:
        loaded = khmer.load_nodetable(savefile)
    else:
        raise Exception("unknown tabletype")

    z = loaded.get('ATGGC')
    assert z == 1
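
A note on the @CTB comment above: a single load function dispatching on the
table type would collapse the if/elif chain. A minimal sketch reusing the
loaders called in this test (the LOADERS dict and load_table helper are
hypothetical, not part of khmer's API):

    # hypothetical helper: dispatch on table type instead of an if/elif chain
    LOADERS = {
        _Countgraph: khmer.load_countgraph,
        _Counttable: khmer.load_counttable,
        _SmallCountgraph: lambda p: khmer.load_countgraph(p, small=True),
        _SmallCounttable: lambda p: khmer.load_counttable(p, small=True),
        _Nodegraph: khmer.load_nodegraph,
        _Nodetable: khmer.load_nodetable,
    }

    def load_table(tabletype, savefile):
        try:
            return LOADERS[tabletype](savefile)
        except KeyError:
            raise ValueError("unknown tabletype: %r" % (tabletype,))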
Example #2
def test_load_gz():
    inpath = utils.get_test_data('random-20-a.fa')

    savepath = utils.get_temp_filename('tempcountingsave1.ht')
    loadpath = utils.get_temp_filename('tempcountingsave1.ht.gz')

    sizes = list(PRIMES_1m)
    sizes.append(1000005)

    # save uncompressed hashtable.
    hi = khmer._Countgraph(12, sizes)
    hi.consume_fasta(inpath)
    hi.save(savepath)

    # compress.
    in_file = open(savepath, 'rb')
    out_file = gzip.open(loadpath, 'wb')
    out_file.writelines(in_file)
    out_file.close()
    in_file.close()

    # load compressed hashtable.
    try:
        ht = khmer.load_countgraph(loadpath)
    except OSError as err:
        assert 0, "Should not produce an OSError: " + str(err)

    tracking = khmer._Nodegraph(12, sizes)
    x = hi.abundance_distribution(inpath, tracking)

    tracking = khmer._Nodegraph(12, sizes)
    y = ht.abundance_distribution(inpath, tracking)

    assert sum(x) == 3966, sum(x)
    assert x == y, (x, y)
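
A side note on the compress step in this test: the manual open/writelines/close
sequence can be written with context managers and the standard library's
shutil.copyfileobj. An equivalent sketch, assuming the same savepath/loadpath
variables:

    import gzip
    import shutil

    # gzip-compress the saved table; both files are closed automatically
    with open(savepath, 'rb') as in_file, \
            gzip.open(loadpath, 'wb') as out_file:
        shutil.copyfileobj(in_file, out_file)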
Example #3
def main():
    info("count-median.py", ["diginorm"])
    args = sanitize_help(get_parser()).parse_args()

    htfile = args.countgraph
    input_filename = args.input
    output = args.output

    infiles = [htfile, input_filename]
    for infile in infiles:
        check_input_files(infile, args.force)

    check_space(infiles, args.force)

    print("loading k-mer countgraph from", htfile, file=sys.stderr)
    countgraph = load_countgraph(htfile)
    ksize = countgraph.ksize()
    print("writing to", output.name, file=sys.stderr)

    output = csv.writer(output)
    # write headers:
    output.writerow(["name", "median", "average", "stddev", "seqlen"])

    for record in screed.open(input_filename):
        seq = record.sequence.upper()
        if "N" in seq:
            seq = seq.replace("N", "A")

        if ksize <= len(seq):
            medn, ave, stdev = countgraph.get_median_count(seq)
            ave, stdev = [round(x, 9) for x in (ave, stdev)]
            output.writerow([record.name, medn, ave, stdev, len(seq)])
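
For context, count-median.py expects a countgraph built beforehand; a typical
pairing with load-into-counting.py (file names hypothetical, flags as in the
load-into-counting.py examples elsewhere on this page):

    load-into-counting.py -x 1e7 -N 2 -k 20 counts.ct reads.fa
    count-median.py counts.ct reads.fa medians.csv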
Example #4
def main():
    counting_ht = sys.argv[1]
    infiles = sys.argv[2:]

    print('file with ht: %s' % counting_ht)
    print('-- settings:')
    print('N THREADS', WORKER_THREADS)
    print('--')

    print('making hashtable')
    ht = khmer.load_countgraph(counting_ht)
    K = ht.ksize()

    for infile in infiles:
        print('filtering', infile)
        outfile = os.path.basename(infile) + '.below'

        outfp = open(outfile, 'w')

        def process_fn(record, ht=ht):
            name = record['name']
            seq = record['sequence']
            if 'N' in seq:
                return None, None

            trim_seq, trim_at = ht.trim_below_abundance(seq, CUTOFF)

            if trim_at >= K:
                return name, trim_seq

            return None, None

        tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS, GROUPSIZE)

        tsp.start(verbose_fasta_iter(infile), outfp)
Example #5
def main():

    hashfile = sys.argv[1]
    filename = sys.argv[2]
    figure = sys.argv[3]

    ht = khmer.load_countgraph(hashfile)

    outabund = open(os.path.basename(filename) + '.counts', 'w')

    counts = []
    d = {}
    for sequence in open(filename):
        sequence = sequence.strip()

        count = ht.get(sequence)
        counts.append(count)
        d[count] = d.get(count, 0) + 1

        if count > 1000:
            print(sequence, count, file=outabund)

    outfp = open(figure + '.countshist', 'w')
    sofar = 0
    sofar_cumu = 0
    for k in sorted(d.keys()):
        sofar += d[k]
        sofar_cumu += k * d[k]
        print(k, d[k], sofar, sofar_cumu, file=outfp)

    hist(counts, density=True, cumulative=True, bins=100, range=(1, 1000))
    savefig(figure)
Example #6
def test_load_into_counting_1():
    in1 = utils.get_test_data("test-abund-read-2.fa")
    out1 = utils.get_temp_filename("out.ct")

    cmd = """
       cat {in1} |
       {scripts}/load-into-counting.py -x 1e3 -N 2 -k 20 {out1} - \
       2> /dev/null
    """

    cmd = cmd.format(scripts=scriptpath(), in1=in1, out1=out1)
    print(cmd)

    (status, out, err) = run_shell_cmd(cmd)
    assert os.path.exists(out1)
    khmer.load_countgraph(out1)
Example #7
def test_load_gz():
    inpath = utils.get_test_data('random-20-a.fa')

    savepath = utils.get_temp_filename('tempcountingsave1.ht')
    loadpath = utils.get_temp_filename('tempcountingsave1.ht.gz')

    sizes = list(PRIMES_1m)
    sizes.append(1000005)

    # save uncompressed hashtable.
    hi = khmer._Countgraph(12, sizes)
    hi.consume_seqfile(inpath)
    hi.save(savepath)

    # compress.
    in_file = open(savepath, 'rb')
    out_file = gzip.open(loadpath, 'wb')
    out_file.writelines(in_file)
    out_file.close()
    in_file.close()

    # load compressed hashtable.
    try:
        ht = khmer.load_countgraph(loadpath)
    except OSError as err:
        assert 0, "Should not produce an OSError: " + str(err)

    tracking = khmer._Nodegraph(12, sizes)
    x = hi.abundance_distribution(inpath, tracking)

    tracking = khmer._Nodegraph(12, sizes)
    y = ht.abundance_distribution(inpath, tracking)

    assert sum(x) == 3966, sum(x)
    assert x == y, (x, y)
Example #8
def test_abund_dist_gz_bigcount_compressed_first():
    infile = utils.copy_test_data('test-abund-read-2.fa')
    script = 'load-into-counting.py'
    htfile = utils.get_temp_filename('test_ct.gz')
    args = ['-x', str(1e7), '-N', str(2), '-k', str(2), htfile, infile]
    utils.runscript(script, args)  # create a bigcount table
    assert os.path.exists(htfile)
    data = gzip.open(htfile, 'rb').read()  # read compressed bigcount table

    outfile = utils.get_temp_filename('test_ct')
    f_out = open(outfile, 'wb')  # output the bigcount table
    f_out.write(data)
    f_out.close()
    # load the compressed bigcount table
    try:
        countgraph = khmer.load_countgraph(outfile)
    except OSError as err:
        assert 0, 'Should not produce OSError: ' + str(err)

    assert countgraph.n_occupied() != 0
    hashsizes = countgraph.hashsizes()
    kmer_size = countgraph.ksize()
    tracking = khmer._Nodegraph(kmer_size, hashsizes)
    # calculate abundance distribution for the compressed bigcount table
    abundances = countgraph.abundance_distribution(infile, tracking)
    # a nonzero count at an abundance > 255 means the gzipped bigcount
    # data was loaded correctly (plain 8-bit counters saturate at 255)
    flag = False
    for abund, count in enumerate(abundances):
        print(abund, count)
        if abund > 255 and count > 0:
            flag = True
            break
    assert flag
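
The flag-and-break loop above can also be collapsed into a single assertion; an
equivalent check (minus the debug prints) over the same abundances list:

    # a nonzero count past the 8-bit limit means bigcount survived the round trip
    assert any(count > 0 for abund, count in enumerate(abundances) if abund > 255)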
Example #9
def test_badload():

    try:
        countgraph = khmer.load_countgraph()
        assert 0, "this should fail"
    except TypeError as err:
        print(str(err))
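
Tests like this one are often written with pytest.raises rather than
try/except/assert; a sketch assuming a pytest-based suite:

    import pytest

    def test_badload():
        # calling load_countgraph with no filename should raise TypeError
        with pytest.raises(TypeError):
            khmer.load_countgraph()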
Example #10
def main():
    info('count-kmers.py', ['counting'])
    args = get_parser().parse_args()

    print('hashtable from', args.input_count_graph_filename, file=sys.stderr)
    countgraph = khmer.load_countgraph(args.input_count_graph_filename)

    kmer_size = countgraph.ksize()
    hashsizes = countgraph.hashsizes()
    tracking = khmer._Nodegraph(  # pylint: disable=protected-access
        kmer_size, hashsizes)

    if args.output_file is None:
        args.output_file = sys.stdout
    writer = csv.writer(args.output_file)

    for filename in args.input_sequence_filenames:
        for record in screed.open(filename):
            seq = record.sequence.replace('N', 'A')
            for i in range(len(seq) - kmer_size + 1):
                kmer = seq[i:i + kmer_size]
                if not tracking.get(kmer):
                    tracking.count(kmer)
                    writer.writerow([kmer, str(countgraph.get(kmer))])

    print('Total number of unique k-mers: {0}'.format(
        countgraph.n_unique_kmers()),
          file=sys.stderr)
Example #11
def test_load_into_counting_1():
    in1 = utils.get_test_data('test-abund-read-2.fa')
    out1 = utils.get_temp_filename('out.ct')

    cmd = """
       cat {in1} |
       {scripts}/load-into-counting.py -x 1e3 -N 2 -k 20 {out1} - \
       2> /dev/null
    """

    cmd = cmd.format(scripts=scriptpath(), in1=in1, out1=out1)
    print(cmd)

    run_shell_cmd(cmd)
    assert os.path.exists(out1)
    khmer.load_countgraph(out1)
Example #12
def main():
    counting_ht = sys.argv[1]
    infiles = sys.argv[2:]

    print('file with ht: %s' % counting_ht)
    print('-- settings:')
    print('N THREADS', WORKER_THREADS)
    print('--')

    print('making hashtable')
    ht = khmer.load_countgraph(counting_ht)
    K = ht.ksize()

    for infile in infiles:
        print('filtering', infile)
        outfile = os.path.basename(infile) + '.below'

        outfp = open(outfile, 'w')

        def process_fn(record, ht=ht):
            name = record['name']
            seq = record['sequence']
            if 'N' in seq:
                return None, None

            trim_seq, trim_at = ht.trim_below_abundance(seq, CUTOFF)

            if trim_at >= K:
                return name, trim_seq

            return None, None

        tsp = ThreadedSequenceProcessor(process_fn, WORKER_THREADS, GROUPSIZE)

        tsp.start(verbose_fasta_iter(infile), outfp)
Example #13
def main():
    info('count-kmers.py', ['counting'])
    args = get_parser().parse_args()

    print('hashtable from', args.input_count_graph_filename,
          file=sys.stderr)
    countgraph = khmer.load_countgraph(
        args.input_count_graph_filename)

    kmer_size = countgraph.ksize()
    hashsizes = countgraph.hashsizes()
    tracking = khmer._Nodegraph(  # pylint: disable=protected-access
        kmer_size, hashsizes)

    if args.output_file is None:
        args.output_file = sys.stdout
    writer = csv.writer(args.output_file)

    for filename in args.input_sequence_filenames:
        for record in screed.open(filename):
            seq = record.sequence.replace('N', 'A')
            for i in range(len(seq) - kmer_size + 1):
                kmer = seq[i:i+kmer_size]
                if not tracking.get(kmer):
                    tracking.count(kmer)
                    writer.writerow([kmer, str(countgraph.get(kmer))])

    print('Total number of unique k-mers: {0}'.format(
        countgraph.n_unique_kmers()), file=sys.stderr)
Example #14
def main():
    args = sanitize_help(get_parser()).parse_args()

    htfile = args.countgraph
    input_filename = args.input
    output = args.output

    infiles = [htfile, input_filename]
    for infile in infiles:
        check_input_files(infile, args.force)

    check_space(infiles, args.force)

    print('loading k-mer countgraph from', htfile, file=sys.stderr)
    countgraph = load_countgraph(htfile)
    ksize = countgraph.ksize()
    print('writing to', output.name, file=sys.stderr)

    output = csv.writer(output)
    # write headers:
    output.writerow(['name', 'median', 'average', 'stddev', 'seqlen'])

    for record in screed.open(input_filename):
        seq = record.sequence.upper()
        if 'N' in seq:
            seq = seq.replace('N', 'A')

        if ksize <= len(seq):
            medn, ave, stdev = countgraph.get_median_count(seq)
            ave, stdev = [round(x, 9) for x in (ave, stdev)]
            output.writerow([record.name, medn, ave, stdev, len(seq)])
Example #15
def test_abund_dist_gz_bigcount_compressed_first():
    infile = utils.copy_test_data('test-abund-read-2.fa')
    script = 'load-into-counting.py'
    htfile = utils.get_temp_filename('test_ct.gz')
    args = ['-x', str(1e7), '-N', str(2), '-k', str(2), htfile, infile]
    utils.runscript(script, args)  # create a bigcount table
    assert os.path.exists(htfile)
    data = gzip.open(htfile, 'rb').read()  # read compressed bigcount table

    outfile = utils.get_temp_filename('test_ct')
    f_out = open(outfile, 'wb')  # output the bigcount table
    f_out.write(data)
    f_out.close()
    # load the compressed bigcount table
    try:
        countgraph = khmer.load_countgraph(outfile)
    except OSError as err:
        assert 0, 'Should not produce OSError: ' + str(err)

    assert countgraph.n_occupied() != 0
    hashsizes = countgraph.hashsizes()
    kmer_size = countgraph.ksize()
    tracking = khmer._Nodegraph(kmer_size, hashsizes)
    # calculate abundance distribution for the compressed bigcount table
    abundances = countgraph.abundance_distribution(infile, tracking)
    # a nonzero count at an abundance > 255 means the gzipped bigcount
    # data was loaded correctly (plain 8-bit counters saturate at 255)
    flag = False
    for abund, count in enumerate(abundances):
        print(abund, count)
        if abund > 255 and count > 0:
            flag = True
            break
    assert flag
Example #16
def main():
    args = sanitize_help(get_parser()).parse_args()

    htfile = args.countgraph
    input_filename = args.input
    output = args.output

    infiles = [htfile, input_filename]
    for infile in infiles:
        check_input_files(infile, args.force)

    check_space(infiles, args.force)

    print('loading k-mer countgraph from', htfile, file=sys.stderr)
    countgraph = load_countgraph(htfile)
    ksize = countgraph.ksize()
    print('writing to', output.name, file=sys.stderr)

    output = csv.writer(output)
    # write headers:
    output.writerow(['name', 'median', 'average', 'stddev', 'seqlen'])

    for record in screed.open(input_filename):
        seq = record.sequence.upper()
        if 'N' in seq:
            seq = seq.replace('N', 'A')

        if ksize <= len(seq):
            medn, ave, stdev = countgraph.get_median_count(seq)
            ave, stdev = [round(x, 9) for x in (ave, stdev)]
            output.writerow([record.name, medn, ave, stdev, len(seq)])
Example #17
def main():
    info('filter-abund.py', ['counting'])
    args = sanitize_help(get_parser()).parse_args()

    check_input_files(args.input_graph, args.force)
    infiles = args.input_filename
    if ('-' in infiles or '/dev/stdin' in infiles) and not \
       args.single_output_file:
        print("Accepting input from stdin; output filename must "
              "be provided with -o.", file=sys.stderr)
        sys.exit(1)

    for filename in infiles:
        check_input_files(filename, args.force)

    check_space(infiles, args.force)

    print('loading countgraph:', args.input_graph,
          file=sys.stderr)
    countgraph = khmer.load_countgraph(args.input_graph)
    ksize = countgraph.ksize()

    print("K:", ksize, file=sys.stderr)

    # the filtering function.
    def process_fn(record):
        name = record.name
        seq = record.sequence
        seqN = seq.replace('N', 'A')

        if args.variable_coverage:  # only trim when sequence has high enough C
            med, _, _ = countgraph.get_median_count(seqN)
            if med < args.normalize_to:
                return name, seq

        _, trim_at = countgraph.trim_on_abundance(seqN, args.cutoff)

        if trim_at >= ksize:
            # be sure not to change the 'N's in the trimmed sequence -
            # so, return 'seq' and not 'seqN'.
            return name, seq[:trim_at]

        return None, None

    if args.single_output_file:
        outfile = args.single_output_file.name
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)

    # the filtering loop
    for infile in infiles:
        print('filtering', infile, file=sys.stderr)
        if not args.single_output_file:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = open(outfile, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        tsp = ThreadedSequenceProcessor(process_fn, n_workers=args.threads)
        tsp.start(verbose_loader(infile), outfp)

        print('output in', outfile, file=sys.stderr)
Example #18
def test_badload():

    try:
        countgraph = khmer.load_countgraph()
        assert 0, "this should fail"
    except TypeError as err:
        print(str(err))
Example #19
def test_counting_gz_file_type_check():
    inpath = utils.get_test_data('goodversion-k12.ht.gz')

    try:
        kh = khmer.load_countgraph(inpath)
        assert 0, "this should fail"
    except OSError as e:
        print(str(e))
Example #20
def test_load_gz_notexist_should_fail():
    savepath = utils.get_temp_filename('tempcountingsave0.ht.gz')

    try:
        hi = khmer.load_countgraph(savepath)
        assert 0, "load should fail"
    except OSError as e:
        print(str(e))
Example #21
def test_load_gz_notexist_should_fail():
    savepath = utils.get_temp_filename('tempcountingsave0.ht.gz')

    try:
        hi = khmer.load_countgraph(savepath)
        assert 0, "load should fail"
    except OSError as e:
        print(str(e))
Example #22
def test_load_notexist_should_fail():
    savepath = utils.get_temp_filename('tempnodegraphsave0.htable')

    try:
        hi = khmer.load_countgraph(savepath)
        assert 0, "load should fail"
    except OSError:
        pass
Example #23
def test_counting_gz_file_type_check():
    inpath = utils.get_test_data('goodversion-k12.ht.gz')

    try:
        kh = khmer.load_countgraph(inpath)
        assert 0, "this should fail"
    except OSError as e:
        print(str(e))
Example #24
def test_load_notexist_should_fail():
    savepath = utils.get_temp_filename('tempnodegraphsave0.htable')

    try:
        hi = khmer.load_countgraph(savepath)
        assert 0, "load should fail"
    except OSError:
        pass
Example #25
def main():
    parser = build_counting_multifile_args()
    parser.add_argument('--coverage',
                        '-C',
                        dest='coverage',
                        default=DEFAULT_COVERAGE,
                        type=int)
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print('file with ht: %s' % counting_ht)

    print('loading hashtable')
    ht = khmer.load_countgraph(counting_ht)
    K = ht.ksize()

    xxxfp = None

    print("K:", K)

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']

        med, avg, dev = ht.get_median_count(seq)
        pct = dev / avg * 100

        xxxfp.write('%s %s %s %s %s\n' % (med, avg, dev, pct, name))

        if random.randint(1, med) > args.coverage or pct > 100:
            return None, None

        return name, seq

    # the filtering loop
    for infile in infiles:
        print('filtering', infile)
        xxxfp = open(os.path.basename(infile) + '.medpctfilt.stats', 'w')
        outfile = os.path.basename(infile) + '.medpctfilt'
        outfp = open(outfile, 'w')

        for n, record in enumerate(screed.open(infile)):
            if n % 100000 == 0:
                print('...', n)

            name, seq = process_fn(record)
            if name and seq:
                print('>%s\n%s' % (name, seq), file=outfp)

        print('output in', outfile)
Example #26
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '--min-coverage', type=int, default=None)
    parser.add_argument('-M', '--max-coverage', type=int, default=None)
    parser.add_argument('input_count_graph')
    parser.add_argument('input_readfile')
    parser.add_argument('output_readfile')
    args = parser.parse_args()

    print('min_coverage: %s' % args.min_coverage, file=sys.stderr)
    print('max_coverage: %s' % args.max_coverage, file=sys.stderr)

    if not (args.min_coverage or args.max_coverage):
        print("neither min nor max coverage specified!? exiting!",
              file=sys.stderr)
        sys.exit(1)

    if args.min_coverage and args.max_coverage and \
       args.max_coverage < args.min_coverage:
        print("min_coverage > max_coverage!? exiting!", file=sys.stderr)
        sys.exit(1)

    htable = khmer.load_countgraph(args.input_count_graph)
    output_file = args.output_readfile
    output_fp = open(output_file, 'w')

    n_kept = 0
    n = 0
    for n, record in enumerate(screed.open(args.input_readfile)):
        if n % 100000 == 0:
            print('...', n, n_kept, file=sys.stderr)

        seq = record.sequence.upper()
        seq = seq.replace('N', 'A')

        try:
            med, _, _ = htable.get_median_count(seq)
        except ValueError:
            continue

        keep = True
        if args.min_coverage and med < args.min_coverage:
            keep = False

        if args.max_coverage and med > args.max_coverage:
            keep = False

        if keep:
            n_kept += 1

            output_fp.write(output_single(record))

    print('consumed %d reads; kept %d' % (n, n_kept), file=sys.stderr)
Example #27
def main():
    parser = khmer_args.build_counting_args(
        "Correct reads against an already-computed table",
        citations=['counting', 'SeqAn'])

    parser.add_argument("--trusted-cov",
                        dest="trusted_cov",
                        type=int,
                        default=DEFAULT_CUTOFF)
    parser.add_argument("--theta", dest="bits_theta", type=float, default=1.0)
    parser.add_argument('-o',
                        '--output',
                        dest='output_file',
                        help="output file for histogram; defaults to "
                        "<first filename>.corr in cwd.",
                        type=khFileType('w'),
                        default=None)

    parser.add_argument('counts_table')
    parser.add_argument('readfile')

    args = parser.parse_args()

    print('loading counts')
    ht = khmer.load_countgraph(args.counts_table)

    aligner = khmer.ReadAligner(ht, args.trusted_cov, args.bits_theta)

    print("trusted:", args.trusted_cov)

    corrfp = args.output_file
    if not corrfp:
        outfile = os.path.basename(args.readfile) + '.corr'
        corrfp = open(outfile, 'w')

    n_corrected = 0
    for n, read in enumerate(screed.open(args.readfile)):
        if n % 10000 == 0:
            print('...', n, n_corrected, file=sys.stderr)
        seq = read.sequence.replace('N', 'A')

        # build the alignment...
        score, graph_alignment, read_alignment, truncated = \
            aligner.align(seq)

        if not truncated:
            graph_seq = graph_alignment.replace("-", "")
            if graph_seq != seq:
                n_corrected += 1

            seq = graph_seq

        corrfp.write(output_single(read, seq))
Example #28
def main():
    args = sanitize_help(get_parser()).parse_args()
    if not args.quiet:
        info('filter-abund.py', ['counting'])

    configure_logging(args.quiet)

    infiles = args.input_filename
    if ('-' in infiles or '/dev/stdin' in infiles) and not \
       args.single_output_file:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    for filename in infiles:
        check_input_files(filename, args.force)

    check_space(infiles, args.force)

    log_info('loading countgraph: {graph}', graph=args.input_graph)
    countgraph = khmer.load_countgraph(args.input_graph)
    ksize = countgraph.ksize()

    log_info("K: {ksize}", ksize=ksize)

    if args.single_output_file:
        outfile = args.single_output_file.name
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)

    # the filtering loop
    for infile in infiles:
        log_info('filtering {infile}', infile=infile)
        if not args.single_output_file:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = open(outfile, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        paired_iter = broken_paired_reader(ReadParser(infile),
                                           min_length=ksize,
                                           force_single=True)

        for n, is_pair, read1, read2 in paired_iter:
            assert not is_pair
            assert read2 is None

            trimmed_record, _ = trim_record(countgraph, read1, args.cutoff,
                                            args.variable_coverage,
                                            args.normalize_to)
            if trimmed_record:
                write_record(trimmed_record, outfp)

        log_info('output in {outfile}', outfile=outfile)
Example #29
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-m', '--min-coverage', type=int, default=None)
    parser.add_argument('-M', '--max-coverage', type=int, default=None)
    parser.add_argument('input_count_graph')
    parser.add_argument('input_readfile')
    parser.add_argument('output_readfile')
    args = parser.parse_args()

    print('min_coverage: %s' % args.min_coverage, file=sys.stderr)
    print('max_coverage: %s' % args.max_coverage, file=sys.stderr)

    if not (args.min_coverage or args.max_coverage):
        print("neither min nor max coverage specified!? exiting!", file=sys.stderr)
        sys.exit(1)

    if args.min_coverage and args.max_coverage and \
       args.max_coverage < args.min_coverage:
        print("min_coverage > max_coverage!? exiting!", file=sys.stderr)
        sys.exit(1)

    htable = khmer.load_countgraph(args.input_count_graph)
    output_file = args.output_readfile
    output_fp = open(output_file, 'w')

    n_kept = 0
    n = 0
    for n, record in enumerate(screed.open(args.input_readfile)):
        if n % 100000 == 0:
            print('...', n, n_kept, file=sys.stderr)

        seq = record.sequence.upper()
        seq = seq.replace('N', 'A')

        try:
            med, _, _ = htable.get_median_count(seq)
        except ValueError:
            continue

        keep = True
        if args.min_coverage and med < args.min_coverage:
            keep = False

        if args.max_coverage and med > args.max_coverage:
            keep = False

        if keep:
            n_kept += 1

            output_fp.write(output_single(record))

    print('consumed %d reads; kept %d' % (n, n_kept), file=sys.stderr)
Example #30
def main():
    files = sys.argv[2:]

    print('loading ht')
    ht = khmer.load_countgraph(sys.argv[1])

    for infile in files:
        print('outputting', infile + '.freq')
        ht.output_fasta_kmer_pos_freq(infile, infile + ".freq")
Example #31
def test_load_truncated():
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename('save.ht')
    truncpath = utils.get_temp_filename('trunc.ht')

    sizes = khmer.get_n_primes_near_x(3, 200)

    hi = khmer._Countgraph(12, sizes)
    hi.consume_seqfile(inpath)
    hi.save(savepath)

    data = open(savepath, 'rb').read()
    for i in range(len(data)):
        fp = open(truncpath, 'wb')
        fp.write(data[:i])
        fp.close()

        try:
            khmer.load_countgraph(truncpath)
            assert 0, "this should not be reached!"
        except OSError as err:
            print(str(err))
Example #32
def main():
    args = get_parser().parse_args()
    infiles = [args.input_count_graph_filename] + args.input_sequence_filenames
    for infile in infiles:
        check_input_files(infile, False)
    counts = khmer.load_countgraph(args.input_count_graph_filename)
    results = find_N_most_abundant_kmers(args.input_sequence_filenames,
                                         args.N, counts)

    results_df = pd.DataFrame({'kmer': [str(k) for k in results.keys()],
                               'count': [int(c) for c in results.values()]})
    results_df.sort_values(by='count', inplace=True, ascending=False)
    results_df.to_csv(args.output, index=False)
Example #33
def test_load_truncated():
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename('save.ht')
    truncpath = utils.get_temp_filename('trunc.ht')

    sizes = khmer.get_n_primes_near_x(3, 200)

    hi = khmer._Countgraph(12, sizes)
    hi.consume_fasta(inpath)
    hi.save(savepath)

    data = open(savepath, 'rb').read()
    for i in range(len(data)):
        fp = open(truncpath, 'wb')
        fp.write(data[:i])
        fp.close()

        try:
            khmer.load_countgraph(truncpath)
            assert 0, "this should not be reached!"
        except OSError as err:
            print(str(err))
Example #34
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('table')
    parser.add_argument('sequences')
    parser.add_argument('-C', '--cutoff', default=3, type=int)
    parser.add_argument('--coverage', default=20, type=int)
    parser.add_argument('-V', '--variable', default=False, action='store_true')
    parser.add_argument('-o', '--outfile', type=argparse.FileType('w'),
                        default=sys.stdout)

    args = parser.parse_args()

    kh = khmer.load_countgraph(args.table)
    n_skipped_variable = 0
    n_total = 0

    print >>sys.stderr, "K:", kh.ksize()
    print >>sys.stderr, "CUTOFF:", args.cutoff
    if args.variable:
        print >>sys.stderr, "variable coverage flag set;"
        print >>sys.stderr, "NORMALIZE_LIMIT:", args.coverage
    else:
        print >>sys.stderr, "assuming even coverage - no -V"

    for n, record in enumerate(screed.open(args.sequences)):
        if n % 100000 == 0:
            print >>sys.stderr, '...', n
        seq = record.sequence.upper().replace('N', 'A')

        n_total += 1

        varskip = False
        if args.variable:
            med, _, _ = kh.get_median_count(seq)
            if med < args.coverage:
                varskip = True
                n_skipped_variable += 1
            
        name = record.name.split()[0]
        if varskip:
            print >>args.outfile, name, 'V'
        else:
            #posns = find_spectral_error_positions(kh, seq, args.cutoff)
            posns = kh.find_spectral_error_positions(seq, args.cutoff)
            posns = add_n_posns(posns, record.sequence)
            print >>args.outfile, name, ",".join(map(str, posns))


    if args.variable:
        sys.stderr.write('Skipped %d reads of %d total due to -V\n' % \
                         (n_skipped_variable, n_total))
Example #35
def main():
    args = sanitize_help(get_parser()).parse_args()

    configure_logging(args.quiet)

    infiles = args.input_filename
    if ('-' in infiles or '/dev/stdin' in infiles) and not \
       args.single_output_file:
        log_error("Accepting input from stdin; output filename must "
                  "be provided with -o.")
        sys.exit(1)

    for filename in infiles:
        check_input_files(filename, args.force)

    check_space(infiles, args.force)

    log_info('loading countgraph: {graph}', graph=args.input_graph)
    countgraph = khmer.load_countgraph(args.input_graph)
    ksize = countgraph.ksize()

    log_info("K: {ksize}", ksize=ksize)

    if args.single_output_file:
        outfile = args.single_output_file.name
        outfp = get_file_writer(args.single_output_file, args.gzip, args.bzip)

    # the filtering loop
    for infile in infiles:
        log_info('filtering {infile}', infile=infile)
        if not args.single_output_file:
            outfile = os.path.basename(infile) + '.abundfilt'
            outfp = open(outfile, 'wb')
            outfp = get_file_writer(outfp, args.gzip, args.bzip)

        paired_iter = broken_paired_reader(ReadParser(infile),
                                           min_length=ksize,
                                           force_single=True)

        for n, is_pair, read1, read2 in paired_iter:
            assert not is_pair
            assert read2 is None

            trimmed_record, _ = trim_record(countgraph, read1, args.cutoff,
                                            args.variable_coverage,
                                            args.normalize_to)
            if trimmed_record:
                write_record(trimmed_record, outfp)

        log_info('output in {outfile}', outfile=outfile)
Example #36
def main():
    parser = build_counting_multifile_args()
    parser.add_argument('--coverage', '-C', dest='coverage',
                        default=DEFAULT_COVERAGE, type=int)
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print('file with ht: %s' % counting_ht)

    print('loading hashtable')
    ht = khmer.load_countgraph(counting_ht)
    K = ht.ksize()

    xxxfp = None

    print("K:", K)

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']

        med, avg, dev = ht.get_median_count(seq)
        pct = dev / avg * 100

        xxxfp.write('%s %s %s %s %s\n' % (med, avg, dev, pct, name))

        if random.randint(1, med) > args.coverage or pct > 100:
            return None, None

        return name, seq

    # the filtering loop
    for infile in infiles:
        print('filtering', infile)
        xxxfp = open(os.path.basename(infile) + '.medpctfilt.stats', 'w')
        outfile = os.path.basename(infile) + '.medpctfilt'
        outfp = open(outfile, 'w')

        for n, record in enumerate(screed.open(infile)):
            if n % 100000 == 0:
                print('...', n)

            name, seq = process_fn(record)
            if name and seq:
                print('>%s\n%s' % (name, seq), file=outfp)

        print('output in', outfile)
Example #37
def main():
    hashfile = sys.argv[1]
    filename = sys.argv[2]
    outfile = os.path.basename(filename)

    print('loading kh file', hashfile)
    ht = khmer.load_countgraph(hashfile)

    x = ht.fasta_count_kmers_by_position(filename, 100, 1)
    write_dist(x, open(outfile + '.pos.abund=1', 'w'))
    print('wrote', outfile + '.pos.abund=1')

    y = ht.fasta_count_kmers_by_position(filename, 100, 255)
    write_dist(y, open(outfile + '.pos.abund=255', 'w'))
    print('wrote', outfile + '.pos.abund=255')
Example #38
    def do_test(ctfile):
        print('working with', ctfile)
        inpath = utils.get_test_data('random-20-a.fa')
        savepath = utils.get_temp_filename(ctfile)

        orig = khmer.Countgraph(12, 1e5, 4)
        orig.consume_fasta(inpath)
        orig.save(savepath)

        loaded = khmer.load_countgraph(savepath)

        orig_count = orig.n_occupied()
        loaded_count = loaded.n_occupied()
        assert orig_count == 3886, orig_count
        assert loaded_count == orig_count, loaded_count
Example #39
def test_save_load_occupied_small(ctfile):
    print('working with', ctfile)
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename(ctfile)

    orig = khmer.SmallCountgraph(12, 1e5, 4)
    orig.consume_seqfile(inpath)
    orig.save(savepath)

    loaded = khmer.load_countgraph(savepath, small=True)

    orig_count = orig.n_occupied()
    loaded_count = loaded.n_occupied()
    assert orig_count == 3886, orig_count
    assert loaded_count == orig_count, loaded_count
Example #40
def main():
    parser = argparse.ArgumentParser(
        description="Output k-mer abundance distribution.")

    parser.add_argument('hashname')
    parser.add_argument('seqfile')
    parser.add_argument('histout')

    args = parser.parse_args()
    hashfile = args.hashname
    seqfile = args.seqfile
    histout = args.histout

    outfp = open(histout, 'w')

    print('hashtable from', hashfile)
    ht = khmer.load_countgraph(hashfile)

    hist = {}

    for i in range(65536):
        hist[i] = 0

    for n, record in enumerate(screed.open(seqfile)):
        if n > 0 and n % 100000 == 0:
            print('...', n)

        seq = record.sequence.replace('N', 'A')

        try:
            med, _, _ = ht.get_median_count(seq)
        except ValueError:
            continue

        hist[med] = hist[med] + 1

    histlist = list(hist.items())
    histlist.sort()

    maxk = max(hist.keys())
    sumk = sum(hist.values())

    sofar = 0
    for n, m in histlist:
        sofar += m
        percent = float(sofar) / sumk
        outfp.write('%d %d %d %.3f\n' % (n, m, sofar, percent))
    outfp.close()
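
A side note on the histogram setup in this script: the 65536-bin init loop
(65535 presumably being the bigcount ceiling) can be written in one line, and
the cumulative pass can iterate sorted(hist.items()) directly. An equivalent
sketch of those two pieces under Python 3:

    # pre-seed every possible median bin with zero
    hist = dict.fromkeys(range(65536), 0)

    # ... tally medians into hist as in the script above ...

    sumk = sum(hist.values())
    sofar = 0
    for med, count in sorted(hist.items()):
        sofar += count
        outfp.write('%d %d %d %.3f\n' % (med, count, sofar, sofar / sumk))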
Example #41
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("--trusted-cov", dest="trusted_cov", type=int,
                        default=DEFAULT_CUTOFF)
    parser.add_argument("--theta", dest="bits_theta", type=float, default=1.0)
    parser.add_argument('-o', '--output', dest='output_file',
                        help="output file for histogram; defaults to "
                             "<first filename>.errhist in cwd.",
                        type=argparse.FileType('w'), default=None)

    parser.add_argument('counts_table')
    parser.add_argument('readfile')
    
    args = parser.parse_args()

    print('loading counts')
    ht = khmer.load_countgraph(args.counts_table)

    aligner = khmer.ReadAligner(ht,
                                args.trusted_cov,
                                args.bits_theta)

    print("trusted:", args.trusted_cov)

    corrfp = args.output_file
    if not corrfp:
        outfile = os.path.basename(args.readfile) + '.corr'
        corrfp = open(outfile, 'w')

    n_corrected = 0
    for n, read in enumerate(screed.open(args.readfile)):
        if n % 10000 == 0:
            print('...', n, n_corrected, file=sys.stderr)
        seq = read.sequence.replace('N', 'A')

        # build the alignment...
        score, graph_alignment, read_alignment, truncated = \
               aligner.align(seq)
        
        if not truncated:
            graph_seq = graph_alignment.replace("-", "")
            if graph_seq != seq:
                n_corrected += 1

            seq = graph_seq

        corrfp.write(output_single(read, seq))
Example #42
def main():
    parser = argparse.ArgumentParser(
        description="Output k-mer abundance distribution.")

    parser.add_argument('hashname')
    parser.add_argument('seqfile')
    parser.add_argument('histout')

    args = parser.parse_args()
    hashfile = args.hashname
    seqfile = args.seqfile
    histout = args.histout

    outfp = open(histout, 'w')

    print('hashtable from', hashfile)
    ht = khmer.load_countgraph(hashfile)

    hist = {}

    for i in range(65536):
        hist[i] = 0

    for n, record in enumerate(screed.open(seqfile)):
        if n > 0 and n % 100000 == 0:
            print('...', n)

        seq = record.sequence.replace('N', 'A')

        try:
            med, _, _ = ht.get_median_count(seq)
        except ValueError:
            continue

        hist[med] = hist[med] + 1

    histlist = list(hist.items())
    histlist.sort()

    maxk = max(hist.keys())
    sumk = sum(hist.values())

    sofar = 0
    for n, m in histlist:
        sofar += m
        percent = float(sofar) / sumk
        outfp.write('%d %d %d %.3f\n' % (n, m, sofar, percent))
    outfp.close()
Example #43
def test_save_load_large(ctfile):
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename(ctfile)

    sizes = khmer.get_n_primes_near_x(1, 2**31 + 1000)

    orig = khmer._Countgraph(12, sizes)
    orig.consume_seqfile(inpath)
    orig.save(savepath)

    loaded = khmer.load_countgraph(savepath)

    orig_count = orig.n_occupied()
    loaded_count = loaded.n_occupied()
    assert orig_count == 3966, orig_count
    assert loaded_count == orig_count, loaded_count
Example #44
    def do_test(ctfile):
        inpath = utils.get_test_data('random-20-a.fa')
        savepath = utils.get_temp_filename(ctfile)

        sizes = khmer.get_n_primes_near_x(1, 2 ** 31 + 1000)

        orig = khmer._Countgraph(12, sizes)
        orig.consume_fasta(inpath)
        orig.save(savepath)

        loaded = khmer.load_countgraph(savepath)

        orig_count = orig.n_occupied()
        loaded_count = loaded.n_occupied()
        assert orig_count == 3966, orig_count
        assert loaded_count == orig_count, loaded_count
Example #45
def test_normalize_by_median_no_bigcount():
    infile = utils.copy_test_data('test-abund-read-2.fa')
    hashfile = utils.get_temp_filename('test-out.ct')
    in_dir = os.path.dirname(infile)

    _make_counting(infile, K=8)

    script = 'normalize-by-median.py'
    args = ['-C', '1000', '-k 8', '--savegraph', hashfile, infile]

    (status, out, err) = utils.runscript(script, args, in_dir)
    assert status == 0, (out, err)
    print((out, err))

    assert os.path.exists(hashfile), hashfile
    kh = khmer.load_countgraph(hashfile)

    assert kh.get('GGTTGACG') == 255
Example #46
def test_normalize_by_median_no_bigcount():
    infile = utils.get_temp_filename('test.fa')
    hashfile = utils.get_temp_filename('test-out.ct')
    in_dir = os.path.dirname(infile)

    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
    _make_counting(infile, K=8)

    script = 'normalize-by-median.py'
    args = ['-C', '1000', '-k 8', '--savegraph', hashfile, infile]

    (status, out, err) = utils.runscript(script, args, in_dir)
    assert status == 0, (out, err)
    print((out, err))

    assert os.path.exists(hashfile), hashfile
    kh = khmer.load_countgraph(hashfile)

    assert kh.get('GGTTGACG') == 255
Example #47
def main():
    parser = build_counting_args()
    parser.add_argument('--coverage',
                        '-C',
                        dest='coverage',
                        default=DEFAULT_COVERAGE,
                        type=int)
    args = parser.parse_args()

    counting_ht = args.input_table
    infiles = args.input_filenames

    print('file with ht: %s' % counting_ht)

    print('loading hashtable')
    ht = khmer.load_countgraph(counting_ht)
    K = ht.ksize()

    print("K:", K)

    # the filtering function.
    def process_fn(record):
        name = record['name']
        seq = record['sequence']

        med, avg, dev = ht.get_median_count(seq)

        if random.randint(1, med) > args.coverage:
            return None, None

        return name, seq

    # the filtering loop
    for infile in infiles:
        print('filtering', infile)
        outfile = os.path.basename(infile) + '.medfilt'
        outfp = open(outfile, 'w')

        tsp = ThreadedSequenceProcessor(process_fn)
        tsp.start(verbose_loader(infile), outfp)

        print('output in', outfile)
Example #48
def test_maxcount_with_bigcount_save():
    # hashtable should not saturate, if use_bigcount is set.
    kh = khmer.Countgraph(4, 4**4, 4)
    kh.set_use_bigcount(True)

    for _ in range(0, 1000):
        kh.count('AAAA')
        c = kh.get('AAAA')

    savepath = utils.get_temp_filename('tempcountingsave.ht')
    kh.save(savepath)

    try:
        kh = khmer.load_countgraph(savepath)
    except OSError as err:
        assert 0, "Should not produce an OSError: " + str(err)

    c = kh.get('AAAA')
    assert c == 1000, "should be able to count to 1000: %d" % c
    assert c != MAX_COUNT, c
Example #49
def test_maxcount_with_bigcount_save():
    # hashtable should not saturate, if use_bigcount is set.
    kh = khmer.Countgraph(4, 4 ** 4, 4)
    kh.set_use_bigcount(True)

    for _ in range(0, 1000):
        kh.count('AAAA')
        c = kh.get('AAAA')

    savepath = utils.get_temp_filename('tempcountingsave.ht')
    kh.save(savepath)

    try:
        kh = khmer.load_countgraph(savepath)
    except OSError as err:
        assert 0, "Should not produce an OSError: " + str(err)

    c = kh.get('AAAA')
    assert c == 1000, "should be able to count to 1000: %d" % c
    assert c != MAX_COUNT, c
Example #50
def test_normalize_by_median_no_bigcount():
    infile = utils.get_temp_filename("test.fa")
    hashfile = utils.get_temp_filename("test-out.ct")
    outfile = infile + ".keep"
    in_dir = os.path.dirname(infile)

    shutil.copyfile(utils.get_test_data("test-abund-read-2.fa"), infile)
    counting_ht = _make_counting(infile, K=8)

    script = "normalize-by-median.py"
    args = ["-C", "1000", "-k 8", "--savegraph", hashfile, infile]

    (status, out, err) = utils.runscript(script, args, in_dir)
    assert status == 0, (out, err)
    print((out, err))

    assert os.path.exists(hashfile), hashfile
    kh = khmer.load_countgraph(hashfile)

    assert kh.get("GGTTGACG") == 255
Example #51
def test_nobigcount_save():
    kh = khmer.Countgraph(4, 4 ** 4, 4)
    # kh.set_use_bigcount(False) <-- this is the default

    savepath = utils.get_temp_filename('tempcountingsave.ht')
    kh.save(savepath)

    try:
        kh = khmer.load_countgraph(savepath)
    except OSError as err:
        assert 0, 'Should not produce an OSError: ' + str(err)

    # set_use_bigcount should still be False after load (i.e. should be saved)

    assert kh.get('AAAA') == 0

    for _ in range(0, 1000):
        kh.count('AAAA')
        kh.get('AAAA')

    assert kh.get('AAAA') == MAX_COUNT
Example #52
def test_load_gz_truncated_should_fail():
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename('tempcountingsave0.ht.gz')

    hi = khmer.Countgraph(12, 1000, 2)
    hi.consume_seqfile(inpath)
    hi.save(savepath)

    fp = open(savepath, 'rb')
    data = fp.read()
    fp.close()

    fp = open(savepath, 'wb')
    fp.write(data[:1000])
    fp.close()

    try:
        hi = khmer.load_countgraph(savepath)
        assert 0, "load should fail"
    except OSError as e:
        print(str(e))
Example #53
def test_nobigcount_save():
    kh = khmer.Countgraph(4, 4**4, 4)
    # kh.set_use_bigcount(False) <-- this is the default

    savepath = utils.get_temp_filename('tempcountingsave.ht')
    kh.save(savepath)

    try:
        kh = khmer.load_countgraph(savepath)
    except OSError as err:
        assert 0, 'Should not produce an OSError: ' + str(err)

    # set_use_bigcount should still be False after load (i.e. should be saved)

    assert kh.get('AAAA') == 0

    for _ in range(0, 1000):
        kh.count('AAAA')
        kh.get('AAAA')

    assert kh.get('AAAA') == MAX_COUNT
Example #54
def test_load_gz_truncated_should_fail():
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename('tempcountingsave0.ht.gz')

    hi = khmer.Countgraph(12, 1000, 2)
    hi.consume_fasta(inpath)
    hi.save(savepath)

    fp = open(savepath, 'rb')
    data = fp.read()
    fp.close()

    fp = open(savepath, 'wb')
    fp.write(data[:1000])
    fp.close()

    try:
        hi = khmer.load_countgraph(savepath)
        assert 0, "load should fail"
    except OSError as e:
        print(str(e))
Example #55
def test_bigcount_save():
    # hashtable should not saturate, if use_bigcount is set.
    kh = khmer.Countgraph(4, 4**4, 4)
    kh.set_use_bigcount(True)

    savepath = utils.get_temp_filename('tempcountingsave.ht')
    kh.save(savepath)

    try:
        kh = khmer.load_countgraph(savepath)
    except OSError as err:
        assert 0, "Should not produce an OSError: " + str(err)

    # set_use_bigcount should still be True after load (i.e. should be saved)

    assert kh.get('AAAA') == 0

    for _ in range(0, 1000):
        kh.count('AAAA')
        kh.get('AAAA')

    assert kh.get('AAAA') == 1000
Example #56
def test_save_load_gz():
    inpath = utils.get_test_data('random-20-a.fa')
    savepath = utils.get_temp_filename('tempcountingsave2.ht.gz')

    sizes = list(PRIMES_1m)
    sizes.append(1000005)

    hi = khmer._Countgraph(12, sizes)
    hi.consume_seqfile(inpath)
    hi.save(savepath)

    try:
        ht = khmer.load_countgraph(savepath)
    except OSError as err:
        assert 0, 'Should not produce an OSError: ' + str(err)

    tracking = khmer._Nodegraph(12, sizes)
    x = hi.abundance_distribution(inpath, tracking)

    tracking = khmer._Nodegraph(12, sizes)
    y = ht.abundance_distribution(inpath, tracking)

    assert sum(x) == 3966, sum(x)
    assert x == y, (x, y)