Exemplo n.º 1
0
def test_load_gz():
    """A countgraph saved uncompressed must load correctly from a gzip copy.

    Saves a graph, gzips the file by hand, reloads it with
    khmer.load_countgraph, and checks the abundance distribution is
    unchanged.
    """
    inpath = utils.get_test_data('random-20-a.fa')

    savepath = utils.get_temp_filename('tempcountingsave1.ht')
    loadpath = utils.get_temp_filename('tempcountingsave1.ht.gz')

    sizes = list(PRIMES_1m)
    sizes.append(1000005)

    # save uncompressed hashtable.
    hi = khmer._Countgraph(12, sizes)
    hi.consume_fasta(inpath)
    hi.save(savepath)

    # compress; 'with' guarantees both handles are closed even on error
    # (the originals were closed manually and leaked on exception).
    with open(savepath, 'rb') as in_file:
        with gzip.open(loadpath, 'wb') as out_file:
            out_file.writelines(in_file)

    # load compressed hashtable.
    try:
        ht = khmer.load_countgraph(loadpath)
    except OSError as err:
        assert 0, "Should not produce an OSError: " + str(err)

    # distributions from the original and reloaded graphs must agree.
    tracking = khmer._Nodegraph(12, sizes)
    x = hi.abundance_distribution(inpath, tracking)

    tracking = khmer._Nodegraph(12, sizes)
    y = ht.abundance_distribution(inpath, tracking)

    assert sum(x) == 3966, sum(x)
    assert x == y, (x, y)
Exemplo n.º 2
0
def test_save_load_gz():
    """A countgraph saved directly to a .gz path must reload losslessly."""
    input_fa = utils.get_test_data('random-20-a.fa')
    gz_path = utils.get_temp_filename('tempcountingsave2.ht.gz')

    table_sizes = list(PRIMES_1m) + [1000005]

    # build a graph and save it straight to the gzip-suffixed path
    original = khmer._Countgraph(12, table_sizes)
    original.consume_fasta(input_fa)
    original.save(gz_path)

    reloaded = khmer._Countgraph(12, table_sizes)
    try:
        reloaded.load(gz_path)
    except OSError as err:
        assert 0, 'Should not produce an OSError: ' + str(err)

    # both graphs must produce identical abundance distributions
    dist_orig = original.abundance_distribution(
        input_fa, khmer._Nodegraph(12, table_sizes))
    dist_new = reloaded.abundance_distribution(
        input_fa, khmer._Nodegraph(12, table_sizes))

    assert sum(dist_orig) == 3966, sum(dist_orig)
    assert dist_orig == dist_new, (dist_orig, dist_new)
Exemplo n.º 3
0
def test_bloom_c_2():  # simple one
    """Collision behaviour: one table vs. two tables.

    With a single table of size 11, later k-mers that collide with
    earlier entries are not counted as new; with two tables (sizes 11
    and 13) a k-mer is only missed if it collides in *both* tables.
    """
    ksize = 4

    # use only 1 hashtable, no bloom filter
    nodegraph = khmer._Nodegraph(ksize, [11])
    nodegraph.count('AAAA')  # 00 00 00 00 = 0
    nodegraph.count('ACTG')  # 00 10 01 11 =
    assert nodegraph.n_unique_kmers() == 2
    nodegraph.count('AACG')  # 00 00 10 11 = 11  # collision  with 1st kmer
    assert nodegraph.n_unique_kmers() == 2
    nodegraph.count('AGAC')   # 00  11 00 10 # collision  with 2nd kmer
    assert nodegraph.n_unique_kmers() == 2

    # use two hashtables with 11,13
    other_nodegraph = khmer._Nodegraph(ksize, [11, 13])
    other_nodegraph.count('AAAA')  # 00 00 00 00 = 0

    other_nodegraph.count('ACTG')  # 00 10 01 11 = 2*16 +4 +3 = 39
    assert other_nodegraph.n_unique_kmers() == 2
    # 00 00 10 11 = 11  # collision with only 1st kmer
    other_nodegraph.count('AACG')
    assert other_nodegraph.n_unique_kmers() == 3
    other_nodegraph.count('AGAC')
    # 00  11 00 10  3*16 +2 = 50
    # collision with both 2nd and 3rd kmers

    assert other_nodegraph.n_unique_kmers() == 3
Exemplo n.º 4
0
def test_abund_dist_gz_bigcount_compressed_first():
    """A gzipped bigcount table must load with counts above 255 intact.

    Builds a bigcount table via load-into-counting.py, decompresses it by
    hand, reloads it, and checks that some abundance above 255 is present.
    """
    infile = utils.copy_test_data('test-abund-read-2.fa')
    script = 'load-into-counting.py'
    htfile = utils.get_temp_filename('test_ct.gz')
    args = ['-x', str(1e7), '-N', str(2), '-k', str(2), htfile, infile]
    utils.runscript(script, args)  # create a bigcount table
    assert os.path.exists(htfile)

    # read compressed bigcount table; 'with' closes the handle
    # (previously leaked via gzip.open(...).read()).
    with gzip.open(htfile, 'rb') as gz_in:
        data = gz_in.read()

    outfile = utils.get_temp_filename('test_ct')
    with open(outfile, 'wb') as f_out:  # output the bigcount table
        f_out.write(data)

    # load the compressed bigcount table
    try:
        countgraph = khmer.load_countgraph(outfile)
    except OSError as err:
        assert 0, 'Should not produce OSError: ' + str(err)

    assert countgraph.n_occupied() != 0
    hashsizes = countgraph.hashsizes()
    kmer_size = countgraph.ksize()
    tracking = khmer._Nodegraph(kmer_size, hashsizes)
    abundances = countgraph.abundance_distribution(infile, tracking)
    # calculate abundance distribution for compressed bigcount table
    flag = False
    # check if any abundance > 255 has nonzero count;
    # if so the gzipped bigcount was loaded correctly
    for abund, count in enumerate(abundances):
        print(abund, count)
        if abund > 255 and count > 0:
            flag = True
            break
    assert flag
Exemplo n.º 5
0
 def test_save_load_merge_nexist(self):
     # loading a subset partition map from a missing path must raise OSError
     ht = khmer._Nodegraph(20, [1])
     try:
         ht.load_subset_partitionmap('this does not exist')
         assert 0, "this should not succeed"
     except OSError as e:
         print(str(e))
Exemplo n.º 6
0
def main():
    """Emit every unique k-mer and its count from a saved countgraph as CSV."""
    info('count-kmers.py', ['counting'])
    args = get_parser().parse_args()

    print ('hashtable from', args.input_count_graph_filename,
           file=sys.stderr)
    countgraph = khmer.load_countgraph(args.input_count_graph_filename)

    kmer_size = countgraph.ksize()
    hashsizes = countgraph.hashsizes()
    # the tracking nodegraph remembers which k-mers were already reported
    tracking = khmer._Nodegraph(  # pylint: disable=protected-access
        kmer_size, hashsizes)

    if args.output_file is None:
        args.output_file = sys.stdout
    writer = csv.writer(args.output_file)

    for filename in args.input_sequence_filenames:
        for record in screed.open(filename):
            sequence = record.sequence.replace('N', 'A')
            last_start = len(sequence) - kmer_size + 1
            for start in range(last_start):
                kmer = sequence[start:start + kmer_size]
                if tracking.get(kmer):
                    continue
                tracking.count(kmer)
                writer.writerow([kmer, str(countgraph.get(kmer))])

    print ('Total number of unique k-mers: {0}'.format(
        countgraph.n_unique_kmers()), file=sys.stderr)
Exemplo n.º 7
0
def test_save_load_tagset_trunc():
    """Every truncated prefix of a tagset file must fail to load."""
    nodegraph = khmer._Nodegraph(32, [1])

    outfile = utils.get_temp_filename('tagset')

    nodegraph.add_tag('A' * 32)
    nodegraph.add_tag('G' * 32)
    nodegraph.save_tagset(outfile)

    # read the tagset file back so we can write truncated prefixes of it;
    # 'with' blocks replace the manual open/close pairs.
    with open(outfile, 'rb') as fp:
        data = fp.read()

    for i in range(len(data)):
        with open(outfile, 'wb') as fp:
            fp.write(data[:i])

        # try loading it...
        try:
            nodegraph.load_tagset(outfile)
            assert 0, "this test should fail"
        except OSError as err:
            print(str(err), i)

    # try loading it...
    try:
        nodegraph.load_tagset(outfile)
        assert 0, "this test should fail"
    except OSError:
        pass
Exemplo n.º 8
0
def test__get_set_tag_density():
    """Tag density must round-trip through the setter and getter."""
    graph = khmer._Nodegraph(32, [1])

    default_density = graph._get_tag_density()
    assert default_density != 2

    graph._set_tag_density(2)
    assert graph._get_tag_density() == 2
Exemplo n.º 9
0
def test_extract_unique_paths_2():
    """Pre-consuming the first k-mer trims it from the extracted path."""
    graph = khmer._Nodegraph(10, [5, 7, 11, 13])

    graph.consume('ATGGAGAGAC')
    paths = graph.extract_unique_paths(
        'ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
    print(paths)
    # all but the 1st k-mer
    assert paths == ['TGGAGAGACACAGATAGACAGGAGTGGCGATG']
Exemplo n.º 10
0
    def test_count_A(self):
        """An all-A file collapses to one distinct k-mer with abundance 10."""
        a_file = utils.get_test_data('all-A.fa')

        tracker = khmer._Nodegraph(4, [5])
        distribution = self.kh.abundance_distribution(a_file, tracker)

        assert sum(distribution) == 1
        assert distribution[10] == 1
Exemplo n.º 11
0
def test_find_stoptags():
    """Stop tags are located at every position, on either strand."""
    graph = khmer._Nodegraph(5, [1])
    graph.add_stop_tag("AAAAA")

    cases = [
        ("AAAAA", [0]),
        ("AAAAAA", [0, 1]),
        ("TTTTT", [0]),      # reverse complement is also detected
        ("TTTTTT", [0, 1]),
    ]
    for sequence, expected in cases:
        assert graph.identify_stoptags_by_position(sequence) == expected
Exemplo n.º 12
0
def test_save_load_tagset_notexist():
    """Loading a tagset from a nonexistent path must raise OSError."""
    graph = khmer._Nodegraph(32, [1])

    missing = utils.get_temp_filename('tagset')
    try:
        graph.load_tagset(missing)
    except OSError as e:
        print(str(e))
    else:
        assert 0, "this test should fail"
Exemplo n.º 13
0
def test_tagset_filetype_check():
    """load_tagset must reject a file of the wrong type (stoptags)."""
    graph = khmer._Nodegraph(31, [1])

    stoptags_file = utils.get_test_data('goodversion-k32.stoptags')
    try:
        graph.load_tagset(stoptags_file)
    except OSError as e:
        print(str(e))
    else:
        assert 0, "this should fail"
Exemplo n.º 14
0
def test_extract_unique_paths_0():
    """Nothing consumed -> whole sequence; all consumed -> nothing."""
    graph = khmer._Nodegraph(10, [5, 7, 11, 13])
    seq = 'ATGGAGAGACACAGATAGACAGGAGTGGCGATG'

    # fresh graph: the entire sequence is a single novel path
    assert graph.extract_unique_paths(seq, 10, 1) == [seq]

    # after consuming it, no k-mer is novel any more
    graph.consume(seq)
    assert not graph.extract_unique_paths(seq, 10, 1)
Exemplo n.º 15
0
def test_count_kmer_degree():
    """kmer_degree reports how many adjacent k-mers exist in the graph."""
    graph = khmer._Nodegraph(4, [3, 5])
    graph.consume_fasta(utils.get_test_data('all-A.fa'))

    expected_degrees = [('AAAA', 2), ('AAAT', 1), ('AATA', 0), ('TAAA', 1)]
    for kmer, degree in expected_degrees:
        assert graph.kmer_degree(kmer) == degree
Exemplo n.º 16
0
def test_hashbits_file_version_check():
    """Loading a table with a stale on-disk format version must raise OSError."""
    graph = khmer._Nodegraph(12, [1])

    stale_file = utils.get_test_data('badversion-k12.htable')

    try:
        graph.load(stale_file)
    except OSError as e:
        print(str(e))
    else:
        assert 0, "this should fail"
Exemplo n.º 17
0
def test_count_within_radius_simple():
    """All-A input yields one k-mer regardless of the search radius."""
    graph = khmer._Nodegraph(4, [3, 5])

    print(graph.consume_fasta(utils.get_test_data('all-A.fa')))

    for radius in (1, 10):
        assert graph.count_kmers_within_radius('AAAA', radius) == 1
Exemplo n.º 18
0
def test_stoptags_file_version_check():
    """load_stop_tags must reject a stoptags file with a stale version."""
    graph = khmer._Nodegraph(32, [1])

    stale_file = utils.get_test_data('badversion-k32.stoptags')

    try:
        graph.load_stop_tags(stale_file)
    except OSError as e:
        print(str(e))
    else:
        assert 0, "this should fail"
Exemplo n.º 19
0
def test_fakelump_load_stop_tags_notexist():
    """load_stop_tags on a missing file must raise OSError."""
    missing_path = utils.get_temp_filename('fakelump.fa.stopfoo')

    # ok, now try loading these stop tags; should fail.
    graph = khmer._Nodegraph(32, [5, 7, 11, 13])

    try:
        graph.load_stop_tags(missing_path)
    except OSError:
        pass
    else:
        assert 0, "this test should fail"
Exemplo n.º 20
0
def test_read_cleaning_abundance_distribution(Countingtype):
    """Reads containing non-ACGTN characters do not contribute k-mers."""
    infile = utils.get_test_data('valid-read-testing.fq')

    counts = Countingtype(15, PRIMES_1m)
    tracker = _Nodegraph(15, PRIMES_1m)

    counts.consume_seqfile(infile)

    dist = counts.abundance_distribution(infile, tracker)
    # k-mers with non-ACGTN => ignored.
    assert dist[1] == 35
    assert dist[2] == 69
Exemplo n.º 21
0
def test_abund_dist_A(tabletype):
    """All-A input produces exactly one distinct k-mer with abundance > 0."""
    a_file = utils.get_test_data('all-A.fa')

    table = tabletype(4, PRIMES_1m)
    tracker = khmer._Nodegraph(4, PRIMES_1m)

    table.consume_seqfile(a_file)
    distribution = table.abundance_distribution(a_file, tracker)

    print(distribution[:10])
    # one k-mer total, and it has nonzero abundance
    assert sum(distribution) == 1
    assert distribution[0] == 0
Exemplo n.º 22
0
def test_nodegraph_file_type_check():
    """A Nodegraph must refuse to load a saved Countgraph file."""
    countgraph = khmer._Countgraph(12, [1])
    saved = utils.get_temp_filename('tempcountingsave0.ct')
    countgraph.save(saved)

    nodegraph = khmer._Nodegraph(12, [1])

    try:
        nodegraph.load(saved)
    except OSError as e:
        print(str(e))
    else:
        assert 0, "this should fail"
Exemplo n.º 23
0
def test_filter_if_present():
    """filter_if_present drops reads whose k-mers appear in the mask graph."""
    graph = khmer._Nodegraph(32, [3, 5])

    mask_fa = utils.get_test_data('filter-test-A.fa')
    reads_fa = utils.get_test_data('filter-test-B.fa')
    filtered_fa = utils.get_temp_filename('filter')

    graph.consume_fasta(mask_fa)
    graph.filter_if_present(reads_fa, filtered_fa)

    survivors = list(screed.open(filtered_fa))
    assert len(survivors) == 1
    assert survivors[0]['name'] == '3'
Exemplo n.º 24
0
def main():
    """Count k-mers in one sequence file and write each unique k-mer
    with its count as CSV.

    Builds a fresh countgraph from the input using multiple consumer
    threads, then re-reads the file serially to emit each k-mer the
    first time it is seen.
    """
    info('count-kmers-single.py', ['counting'])
    args = get_parser().parse_args()

    check_input_files(args.input_sequence_filename, False)

    print ('making k-mer countgraph', file=sys.stderr)
    countgraph = khmer.Countgraph(args.ksize, args.max_tablesize,
                                            args.n_tables)
    # @CTB countgraph.set_use_bigcount(args.bigcount)

    kmer_size = countgraph.ksize()
    hashsizes = countgraph.hashsizes()
    # tracking nodegraph remembers which k-mers were already written out
    tracking = khmer._Nodegraph(  # pylint: disable=protected-access
        kmer_size, hashsizes)

    print ('kmer_size: %s' % countgraph.ksize(), file=sys.stderr)
    print ('k-mer countgraph sizes: %s' % (countgraph.hashsizes(),),
           file=sys.stderr)

    if args.output_file is None:
        args.output_file = sys.stdout
    writer = csv.writer(args.output_file)

    # start loading
    rparser = khmer.ReadParser(args.input_sequence_filename)
    threads = []
    print ('consuming input, round 1 -- %s' % (args.input_sequence_filename),
           file=sys.stderr)
    # all threads share one ReadParser, so each read is consumed once
    for _ in range(args.threads):
        thread = \
            threading.Thread(
                target=countgraph.consume_fasta_with_reads_parser,
                args=(rparser, )
            )
        threads.append(thread)
        thread.start()

    for thread in threads:
        thread.join()

    # second pass: emit each distinct k-mer once with its final count.
    # N is replaced with A — presumably mirroring what the consume step
    # does internally; TODO confirm against the C++ implementation.
    for record in screed.open(args.input_sequence_filename):
        seq = record.sequence.replace('N', 'A')
        for i in range(len(seq) - kmer_size + 1):
            kmer = seq[i:i+kmer_size]
            if not tracking.get(kmer):
                tracking.count(kmer)
                writer.writerow([kmer, str(countgraph.get(kmer))])

    print ('Total number of unique k-mers: {0}'.format(
        countgraph.n_unique_kmers()), file=sys.stderr)
Exemplo n.º 25
0
def _build_testfiles():
    """Regenerate the 'goodversion' fixture files under /tmp.

    Produces a saved nodegraph, a tagset file, and a stoptags file used
    by the file-version-check tests.
    """
    # nodegraph file

    inpath = utils.get_test_data('random-20-a.fa')
    # Use the public constructor here: _Nodegraph expects a *list* of
    # table sizes (as everywhere else in this file), whereas the scalar 2
    # is a tablesize argument for khmer.Nodegraph — matching the
    # duplicate builder elsewhere in this file.
    hi = khmer.Nodegraph(12, 2)
    hi.consume_fasta(inpath)
    hi.save('/tmp/goodversion-k12.htable')

    # tagset file

    nodegraph = khmer._Nodegraph(32, [1])

    nodegraph.add_tag('A' * 32)
    nodegraph.add_tag('G' * 32)
    nodegraph.save_tagset('/tmp/goodversion-k32.tagset')

    # stoptags file

    fakelump_fa = utils.get_test_data('fakelump.fa')

    nodegraph = khmer.Nodegraph(32, 4, 4)
    nodegraph.consume_fasta_and_tag(fakelump_fa)

    subset = nodegraph.do_subset_partition(0, 0)
    nodegraph.merge_subset(subset)

    EXCURSION_DISTANCE = 40
    EXCURSION_KMER_THRESHOLD = 82
    EXCURSION_KMER_COUNT_THRESHOLD = 1
    counting = khmer.Countgraph(32, 4, 4)

    nodegraph.repartition_largest_partition(None, counting,
                                            EXCURSION_DISTANCE,
                                            EXCURSION_KMER_THRESHOLD,
                                            EXCURSION_KMER_COUNT_THRESHOLD)

    nodegraph.save_stop_tags('/tmp/goodversion-k32.stoptags')
Exemplo n.º 26
0
def test_load_partitioned():
    """Partitioned FASTA input is loaded into the expected two partitions."""
    graph = khmer._Nodegraph(32, [1])
    graph.consume_partitioned_fasta(utils.get_test_data('combine_parts_1.fa'))
    assert graph.count_partitions() == (2, 0)

    # the first two full sequences and a 32-mer suffix of a longer one
    # must all be present in the graph
    expected_present = [
        "CATGCAGAAGTTCCGCAACCATACCGTTCAGT",
        "CAAATGTACATGCACTTAAAATCATCCAGCCG",
        "CATGCAGAAGTTCCGCAACCATACCGTTCAGTTCCTGGTGGCTA"[-32:],
    ]
    for seq in expected_present:
        assert graph.get(seq)
Exemplo n.º 27
0
    def test_not_output_unassigned(self):
        """With output_unassigned=False, unpartitioned reads are dropped."""
        filename = utils.get_test_data('random-20-a.fa')

        graph = khmer._Nodegraph(21, [5, 7, 11, 13])
        graph.consume_seqfile_and_tag(filename)

        parts_file = utils.get_temp_filename('parttest')
        graph.output_partitions(filename, parts_file, False)

        n_input = len(list(screed.open(filename)))
        n_output = len(list(screed.open(parts_file)))

        assert n_input > 0
        assert n_output == 0, n_output
Exemplo n.º 28
0
    def test_not_output_unassigned(self):
        """With output_unassigned=False, unpartitioned reads are dropped."""
        filename = utils.get_test_data('random-20-a.fa')

        ht = khmer._Nodegraph(21, [5, 7, 11, 13])
        ht.consume_seqfile_and_tag(filename)

        output_file = utils.get_temp_filename('parttest')
        ht.output_partitions(filename, output_file, False)

        # input has reads; output must be empty since none are partitioned
        len1 = len(list(screed.open(filename)))
        len2 = len(list(screed.open(output_file)))

        assert len1 > 0
        assert len2 == 0, len2
Exemplo n.º 29
0
def test_save_load():
    """An uncompressed countgraph must round-trip through save/load."""
    input_fa = utils.get_test_data('random-20-a.fa')
    table_file = utils.get_temp_filename('tempcountingsave0.ht')

    table_sizes = list(PRIMES_1m) + [1000005]

    original = khmer._Countgraph(12, table_sizes)
    original.consume_fasta(input_fa)
    original.save(table_file)

    try:
        reloaded = khmer.load_countgraph(table_file)
    except OSError as err:
        assert 0, 'Should not produce an OSError: ' + str(err)

    # both graphs must produce identical abundance distributions
    dist_before = original.abundance_distribution(
        input_fa, khmer._Nodegraph(12, table_sizes))
    dist_after = reloaded.abundance_distribution(
        input_fa, khmer._Nodegraph(12, table_sizes))

    assert sum(dist_before) == 3966, sum(dist_before)
    assert dist_before == dist_after, (dist_before, dist_after)
Exemplo n.º 30
0
def test_n_occupied_2():  # simple one
    """n_occupied counts occupied hash bins, so k-mers that collide with
    earlier entries in the single size-11 table do not increase it."""
    ksize = 4

    nodegraph = khmer._Nodegraph(ksize, [11])
    nodegraph.count('AAAA')  # 00 00 00 00 = 0
    assert nodegraph.n_occupied() == 1

    nodegraph.count('ACTG')  # 00 10 01 11 =
    assert nodegraph.n_occupied() == 2

    nodegraph.count('AACG')  # 00 00 10 11 = 11  # collision 1

    assert nodegraph.n_occupied() == 2
    nodegraph.count('AGAC')   # 00  11 00 10 # collision 2
    assert nodegraph.n_occupied() == 2, nodegraph.n_occupied()
Exemplo n.º 31
0
def test_n_occupied_2_add_is_count():  # 'add' synonym for 'count'
    """Same scenario as test_n_occupied_2, exercised via the add() alias."""
    ksize = 4

    nodegraph = khmer._Nodegraph(ksize, [11])
    nodegraph.add('AAAA')  # 00 00 00 00 = 0
    assert nodegraph.n_occupied() == 1

    nodegraph.add('ACTG')  # 00 10 01 11 =
    assert nodegraph.n_occupied() == 2

    nodegraph.add('AACG')  # 00 00 10 11 = 11  # collision 1

    assert nodegraph.n_occupied() == 2
    nodegraph.add('AGAC')   # 00  11 00 10 # collision 2
    assert nodegraph.n_occupied() == 2, nodegraph.n_occupied()
def test_load_gz():
    """A countgraph saved uncompressed must load via .load() from a gzip copy."""
    inpath = utils.get_test_data('random-20-a.fa')

    savepath = utils.get_temp_filename('tempcountingsave1.ht')
    loadpath = utils.get_temp_filename('tempcountingsave1.ht.gz')

    sizes = list(PRIMES_1m)
    sizes.append(1000005)

    # save uncompressed hashtable.
    hi = khmer._Countgraph(12, sizes)
    hi.consume_fasta(inpath)
    hi.save(savepath)

    # compress; 'with' closes both handles even if compression fails
    # (the originals were closed manually and leaked on exception).
    with open(savepath, 'rb') as in_file:
        with gzip.open(loadpath, 'wb') as out_file:
            out_file.writelines(in_file)

    # load compressed hashtable.
    ht = khmer._Countgraph(12, sizes)
    try:
        ht.load(loadpath)
    except OSError as err:
        assert 0, "Should not produce an OSError: " + str(err)

    # distributions from the original and reloaded graphs must agree.
    tracking = khmer._Nodegraph(12, sizes)
    x = hi.abundance_distribution(inpath, tracking)

    tracking = khmer._Nodegraph(12, sizes)
    y = ht.abundance_distribution(inpath, tracking)

    assert sum(x) == 3966, sum(x)
    assert x == y, (x, y)
Exemplo n.º 33
0
def test_consume_absentfasta_with_reads_parser():
    """Missing or empty input to consume_fasta_with_reads_parser must fail."""
    graph = khmer._Nodegraph(31, [1])

    # calling with no parser at all -> TypeError
    try:
        graph.consume_fasta_with_reads_parser()
        assert 0, "this should fail"
    except TypeError as err:
        print(str(err))

    # an empty input file -> OSError or ValueError from the parser
    try:
        parser = ReadParser(utils.get_test_data('empty-file'))
        graph.consume_fasta_with_reads_parser(parser)
        assert 0, "this should fail"
    except (OSError, ValueError) as err:
        print(str(err))
    def test_output_unassigned(self):
        """With output_unassigned=True, every input read is written out."""
        import screed

        filename = utils.get_test_data('random-20-a.fa')

        graph = khmer._Nodegraph(21, [5, 7, 11, 13])
        graph.consume_fasta_and_tag(filename)

        parts_file = utils.get_temp_filename('part0test')
        graph.output_partitions(filename, parts_file, True)

        n_input = len(list(screed.open(filename)))
        n_output = len(list(screed.open(parts_file)))

        assert n_input > 0
        assert n_input == n_output, (n_input, n_output)
Exemplo n.º 35
0
def test_n_occupied_2():  # simple one
    """Collisions in a single size-11 table leave n_occupied unchanged.

    (Removed the unused ``htable_size`` / ``num_nodegraphs`` locals — the
    table configuration is passed directly to the constructor.)
    """
    ksize = 4

    nodegraph = khmer._Nodegraph(ksize, [11])
    nodegraph.count('AAAA')  # 00 00 00 00 = 0
    assert nodegraph.n_occupied() == 1

    nodegraph.count('ACTG')  # 00 10 01 11 =
    assert nodegraph.n_occupied() == 2

    nodegraph.count('AACG')  # 00 00 10 11 = 11  # collision 1

    assert nodegraph.n_occupied() == 2
    nodegraph.count('AGAC')  # 00  11 00 10 # collision 2
    assert nodegraph.n_occupied() == 2, nodegraph.n_occupied()
Exemplo n.º 36
0
def test_stop_tags_truncate_check():
    """Every truncated prefix of a stop-tags file must fail to load."""
    nodegraph = khmer._Nodegraph(32, [1])

    inpath = utils.get_test_data('goodversion-k32.tagset')
    # read the whole fixture once; 'with' closes the handle
    # (previously leaked via open(...).read()).
    with open(inpath, 'rb') as fp:
        data = fp.read()

    truncpath = utils.get_temp_filename('zzz')
    for i in range(len(data)):
        with open(truncpath, 'wb') as fp:
            fp.write(data[:i])

        try:
            nodegraph.load_stop_tags(truncpath)
            assert 0, "expect failure of previous command"
        except OSError as e:
            print(i, str(e))
Exemplo n.º 37
0
def test_fakelump_load_stop_tags_trunc():
    """Loading a truncated stop-tags file must raise OSError."""
    fakelump_fa = utils.get_test_data('fakelump.fa')
    fakelump_fa_foo = utils.get_temp_filename('fakelump.fa.stopfoo')

    ht = khmer.Nodegraph(32, 1e5, 4)
    ht.consume_fasta_and_tag(fakelump_fa)

    subset = ht.do_subset_partition(0, 0)
    ht.merge_subset(subset)

    (n_partitions, _) = ht.count_partitions()
    assert n_partitions == 1, n_partitions

    # now, break partitions on any k-mer that you see more than once
    # on big excursions, where big excursions are excursions 40 out
    # that encounter more than 82 k-mers.  This should specifically
    # identify our connected sequences in fakelump...

    EXCURSION_DISTANCE = 40
    EXCURSION_KMER_THRESHOLD = 82
    EXCURSION_KMER_COUNT_THRESHOLD = 1
    counting = khmer._Countgraph(32, [5, 7, 11, 13])

    ht.repartition_largest_partition(None, counting, EXCURSION_DISTANCE,
                                     EXCURSION_KMER_THRESHOLD,
                                     EXCURSION_KMER_COUNT_THRESHOLD)

    ht.save_stop_tags(fakelump_fa_foo)

    # truncate the stop-tags file to its first 10 bytes; 'with' blocks
    # close the handles (the read handle was previously leaked).
    with open(fakelump_fa_foo, 'rb') as fp:
        data = fp.read()
    with open(fakelump_fa_foo, 'wb') as fp:
        fp.write(data[:10])

    # ok, now try loading these stop tags; should fail.
    ht = khmer._Nodegraph(32, [5, 7, 11, 13])
    ht.consume_fasta_and_tag(fakelump_fa)

    try:
        ht.load_stop_tags(fakelump_fa_foo)
        assert 0, "this test should fail"
    except OSError:
        pass
Exemplo n.º 38
0
def test_save_load_tagset():
    """load_tagset defaults to clear_tags=True: prior tags are discarded."""
    nodegraph = khmer._Nodegraph(32, [1])

    outfile = utils.get_temp_filename('tagset')

    nodegraph.add_tag('A' * 32)
    nodegraph.save_tagset(outfile)

    nodegraph.add_tag('G' * 32)

    nodegraph.load_tagset(outfile)  # implicitly => clear_tags=True
    nodegraph.save_tagset(outfile)

    # Tags were cleared, so only the single reloaded tag is re-saved and
    # the file is the one-tag size (30 bytes, as asserted below); with
    # two tags it would match the larger size seen in the noclear test.
    # (The old comment's 34/26 byte figures were stale.)
    with open(outfile, 'rb') as fp:
        data = fp.read()
    assert len(data) == 30, len(data)
Exemplo n.º 39
0
def test_save_load_tagset_noclear():
    """load_tagset(..., False) keeps existing tags alongside loaded ones."""
    nodegraph = khmer._Nodegraph(32, [1])

    outfile = utils.get_temp_filename('tagset')

    nodegraph.add_tag('A' * 32)
    nodegraph.save_tagset(outfile)

    nodegraph.add_tag('G' * 32)

    nodegraph.load_tagset(outfile, False)  # set clear_tags => False; zero tags
    nodegraph.save_tagset(outfile)

    # Tags were NOT cleared, so both tags are re-saved and the file is the
    # two-tag size (38 bytes, as asserted below) — larger than the one-tag
    # file in the clear_tags=True test.  (The old comment's 34/26 byte
    # figures were stale.)
    with open(outfile, 'rb') as fp:
        data = fp.read()
    assert len(data) == 38, len(data)
Exemplo n.º 40
0
def test_output_partitions():
    """output_partitions annotates each read name with its partition id."""
    filename = utils.get_test_data('test-output-partitions.fa')

    ht = khmer._Nodegraph(10, [1])
    ht.set_partition_id('TTAGGACTGC', 2)
    ht.set_partition_id('TGCGTTTCAA', 3)
    ht.set_partition_id('ATACTGTAAA', 4)

    outfile = utils.get_temp_filename('part')
    ht.output_partitions(filename, outfile)

    # 'with' closes the handle (previously leaked via open(...).read())
    with open(outfile) as fp:
        data = fp.read()
    assert len(data)

    records = list(screed.open(outfile))
    names = [r.name for r in records]
    # the partition id is appended to each name after a tab
    parts = [n.rsplit('\t', 1)[1] for n in names]

    assert parts[0] == '2'
    assert parts[1] == '3'
    assert parts[2] == '4'
Exemplo n.º 41
0
def test_kmer_neighbors():
    """neighbors() accepts either a forward hash or a string k-mer and
    returns the hashes of adjacent k-mers present in the graph."""
    inpfile = utils.get_test_data('all-A.fa')
    nodegraph = khmer._Nodegraph(4, [3, 5])
    nodegraph.consume_fasta(inpfile)

    h = khmer.forward_hash('AAAA', 4)
    print(type('AAAA'))
    assert nodegraph.neighbors(h) == [0, 0]  # AAAA on both sides
    assert nodegraph.neighbors('AAAA') == [0, 0]  # AAAA on both sides

    h = khmer.forward_hash('AAAT', 4)
    assert nodegraph.neighbors(h) == [0]  # AAAA on one side
    assert nodegraph.neighbors('AAAT') == [0]  # AAAA on one side

    h = khmer.forward_hash('AATA', 4)
    assert nodegraph.neighbors(h) == []  # no neighbors
    assert nodegraph.neighbors('AATA') == []  # no neighbors

    h = khmer.forward_hash('TAAA', 4)
    assert nodegraph.neighbors(h) == [0]  # AAAA on one side
    assert nodegraph.neighbors('TAAA') == [0]  # AAAA on one side
Exemplo n.º 42
0
    def test_save_merge_from_disk_ksize(self):
        """Merging a k=20 partition map into a k=19 graph must fail."""
        graph = khmer.Nodegraph(20, 4 ** 4 + 1, 2)
        filename = utils.get_test_data('test-graph2.fa')

        (total_reads, _) = graph.consume_seqfile_and_tag(filename)
        assert total_reads == 3, total_reads

        divvy = graph.divide_tags_into_subsets(1)
        print(divvy)
        (start, end, _) = divvy

        pmap_file = utils.get_temp_filename('x.pmap')
        subset = graph.do_subset_partition(start, end)
        graph.save_subset_partitionmap(subset, pmap_file)
        del subset

        # a graph with a different ksize must refuse the partition map
        mismatched = khmer._Nodegraph(19, [1])
        try:
            mismatched.merge_subset_from_disk(pmap_file)
            assert 0, "this should fail"
        except OSError as e:
            print(str(e))
Exemplo n.º 43
0
def test_kmer_neighbors_wrong_ksize():
    """neighbors() rejects over-long strings/bytes and non-hash arguments."""
    graph = khmer._Nodegraph(4, [3, 5])
    graph.consume_fasta(utils.get_test_data('all-A.fa'))

    bad_inputs = [
        ('AAAAA', "neighbors() should fail with too long string"),
        (b'AAAAA', "neighbors() should fail with too long string"),
        ({}, "neighbors() should fail with non hash/str arg"),
    ]
    for bad_arg, message in bad_inputs:
        try:
            graph.neighbors(bad_arg)
            assert 0, message
        except ValueError:
            pass
Exemplo n.º 44
0
def test_combine_pe():
    """join_partitions merges two read partitions into one."""
    graph = khmer._Nodegraph(32, [1])
    graph.consume_partitioned_fasta(utils.get_test_data('combine_parts_1.fa'))
    assert graph.count_partitions() == (2, 0)

    seq_a = "CATGCAGAAGTTCCGCAACCATACCGTTCAGT"
    seq_b = "CAAATGTACATGCACTTAAAATCATCCAGCCG"

    pid_a = graph.get_partition_id(seq_a)
    pid_b = graph.get_partition_id(seq_b)
    assert pid_a == 2
    assert pid_b == 80293

    graph.join_partitions(pid_a, pid_b)

    # after joining, both sequences share a partition and only one remains
    assert graph.get_partition_id(seq_a) == graph.get_partition_id(seq_b)
    assert graph.count_partitions() == (1, 0)
Exemplo n.º 45
0
def _build_testfiles():
    """Regenerate the 'goodversion' fixture files under /tmp.

    Produces a saved nodegraph, a tagset file, and a stoptags file used
    by the file-version-check tests.
    """
    # nodegraph file

    inpath = utils.get_test_data('random-20-a.fa')
    hi = khmer.Nodegraph(12, 2)
    hi.consume_fasta(inpath)
    hi.save('/tmp/goodversion-k12.htable')

    # tagset file

    nodegraph = khmer._Nodegraph(32, [1])

    nodegraph.add_tag('A' * 32)
    nodegraph.add_tag('G' * 32)
    nodegraph.save_tagset('/tmp/goodversion-k32.tagset')

    # stoptags file

    fakelump_fa = utils.get_test_data('fakelump.fa')

    nodegraph = khmer.Nodegraph(32, 4, 4)
    nodegraph.consume_fasta_and_tag(fakelump_fa)

    subset = nodegraph.do_subset_partition(0, 0)
    nodegraph.merge_subset(subset)

    # repartition on big excursions so the stoptags fixture is non-trivial
    EXCURSION_DISTANCE = 40
    EXCURSION_KMER_THRESHOLD = 82
    EXCURSION_KMER_COUNT_THRESHOLD = 1
    counting = khmer.Countgraph(32, 4, 4)

    nodegraph.repartition_largest_partition(None, counting, EXCURSION_DISTANCE,
                                            EXCURSION_KMER_THRESHOLD,
                                            EXCURSION_KMER_COUNT_THRESHOLD)

    nodegraph.save_stop_tags('/tmp/goodversion-k32.stoptags')
Exemplo n.º 46
0
def test_get_ksize():
    """ksize() must report the k passed to the constructor."""
    graph = khmer._Nodegraph(22, [1])
    assert graph.ksize() == 22
Exemplo n.º 47
0
def test_bad_create():
    """Constructing a nodegraph with an empty tablesizes list must fail.

    Previously the test passed silently if no exception was raised; the
    ``assert 0`` makes a missing ValueError an explicit failure (matching
    the pattern used by test_bad_primes_list).
    """
    try:
        khmer._Nodegraph(5, [])
        assert 0, "empty tablesizes list should fail"
    except ValueError as err:
        assert 'tablesizes needs to be one or more numbers' in str(err)
Exemplo n.º 48
0
def main():
    """Write the abundance distribution of a saved countgraph as CSV.

    Loads the countgraph, computes the abundance distribution of the
    input sequences, and writes abundance/count/cumulative rows to the
    requested output file (or stdout).
    """
    args = sanitize_help(get_parser()).parse_args()

    configure_logging(args.quiet)

    infiles = [args.input_count_graph_filename, args.input_sequence_filename]
    for infile in infiles:
        check_input_files(infile, False)

    log_info('Loading counting graph from {graph}',
             graph=args.input_count_graph_filename)
    countgraph = khmer.load_countgraph(args.input_count_graph_filename)

    # warn when the user asks for bigcount output but the graph was built
    # without bigcount support: counts are capped at 255 in that case
    if not countgraph.get_use_bigcount() and args.bigcount:
        log_warn("WARNING: The loaded graph has bigcount DISABLED while "
                 "bigcount reporting is ENABLED--counts higher than 255 will "
                 "not be reported.")

    countgraph.set_use_bigcount(args.bigcount)

    kmer_size = countgraph.ksize()
    hashsizes = countgraph.hashsizes()
    tracking = khmer._Nodegraph(  # pylint: disable=protected-access
        kmer_size, hashsizes)

    log_info('K: {ksize}', ksize=kmer_size)
    log_info('outputting to {output}', output=args.output_histogram_filename)

    # refuse to overwrite an existing file unless --squash was given;
    # stdout targets are always allowed
    if args.output_histogram_filename in ('-', '/dev/stdout'):
        pass
    elif os.path.exists(args.output_histogram_filename):
        if not args.squash_output:
            log_error('ERROR: {output} exists; not squashing.',
                      output=args.output_histogram_filename)
            sys.exit(1)

        log_info('** squashing existing file {output}',
                 output=args.output_histogram_filename)

    log_info('preparing hist...')
    abundances = countgraph.abundance_distribution(
        args.input_sequence_filename, tracking)
    total = sum(abundances)

    if 0 == total:
        log_error("ERROR: abundance distribution is uniformly zero; "
                  "nothing to report.")
        log_error("\tPlease verify that the input files are valid.")
        sys.exit(1)

    if args.output_histogram_filename in ('-', '/dev/stdout'):
        countgraph_fp = sys.stdout
    else:
        countgraph_fp = open(args.output_histogram_filename, 'w')
    countgraph_fp_csv = csv.writer(countgraph_fp)
    # write headers:
    countgraph_fp_csv.writerow(
        ['abundance', 'count', 'cumulative', 'cumulative_fraction'])

    # emit one row per abundance value, tracking the running total so we
    # can stop once the whole distribution has been accounted for
    sofar = 0
    for _, i in enumerate(abundances):
        if i == 0 and not args.output_zero:
            continue

        sofar += i
        frac = sofar / float(total)

        countgraph_fp_csv.writerow([_, i, sofar, round(frac, 3)])

        if sofar == total:
            break
Exemplo n.º 49
0
def test__get_set_tag_density():
    """_set_tag_density(2) must be reflected by _get_tag_density()."""
    graph = khmer._Nodegraph(32, [1])
    assert graph._get_tag_density() != 2
    graph._set_tag_density(2)
    assert graph._get_tag_density() == 2
Exemplo n.º 50
0
def test_add_stop_tag():
    """A stop tag added to the graph is reported by get_stop_tags()."""
    graph = khmer._Nodegraph(6, [1])
    graph.add_stop_tag('AATAAG')

    tags = graph.get_stop_tags()
    print(tags)
    assert tags == ['AATAAG']
Exemplo n.º 51
0
def test_find_stoptagsecond_seq():
    """Stop-tag positions are found at every occurrence in the query."""
    graph = khmer._Nodegraph(4, [1])
    graph.add_stop_tag("ATGC")

    positions = graph.identify_stoptags_by_position("ATGCATGCGCAT")
    assert positions == [0, 2, 4, 8], positions
Exemplo n.º 52
0
def test_consume_partitioned_fail():
    """Unpartitioned input makes consume_partitioned_fasta raise ValueError."""
    reads_fa = utils.get_test_data('test-reads.fa')
    graph = khmer._Nodegraph(32, [1])

    with pytest.raises(ValueError):
        graph.consume_partitioned_fasta(reads_fa)
Exemplo n.º 53
0
def test_bad_primes_list():
    """A non-numeric primes list must be rejected with TypeError.

    (Removed the misspelled, unused ``coutingtable`` binding — the call
    is expected to raise, so its result is never used.)
    """
    try:
        khmer._Nodegraph(31, ["a", "b", "c"], 1)
        assert 0, "Bad primes list should fail"
    except TypeError as e:
        print(str(e))