Пример #1
0
def test_bloom_c_2():  # simple one
    """Exercise n_unique_kmers() on tiny tables where 4-mers collide.

    The same four k-mers are counted into a one-table and then a two-table
    LabelHash.  The hash values and table sizes quoted in the comments are
    the original author's notes.
    """
    ksize = 4
    requested_size = 10  # use 11

    # One hashtable (size 11), no bloom filter: per the original notes the
    # 3rd k-mer collides with the 1st and the 4th with the 2nd, so the
    # unique count is expected to stay at two.
    single = khmer.LabelHash(ksize, requested_size, 1)
    single.count('AAAA')  # 00 00 00 00 = 0
    for kmer in ('ACTG', 'AACG', 'AGAC'):
        single.count(kmer)
        assert single.n_unique_kmers() == 2

    # Two hashtables (sizes 11 and 13).
    double = khmer.LabelHash(ksize, requested_size, 2)
    double.count('AAAA')  # 00 00 00 00 = 0
    expectations = (
        ('ACTG', 2),  # 00 10 01 11 = 2*16 + 4 + 3 = 39
        ('AACG', 3),  # 00 00 10 11 = 11; collides with only the 1st k-mer
        ('AGAC', 3),  # 00 11 00 10 = 3*16 + 2 = 50; collides with 2nd + 3rd
    )
    for kmer, n_unique in expectations:
        double.count(kmer)
        assert double.n_unique_kmers() == n_unique
Пример #2
0
def test_count_within_radius_big():
    """Count k-mers within a large radius of a start k-mer, for K=20 and 21."""
    inpfile = utils.get_test_data('random-20-a.fa')
    cases = (
        (20, 'CGCAGGCTGGATTCTAGAGG', 3960),
        (21, 'CGCAGGCTGGATTCTAGAGGC', 39),
    )

    for ksize, start_kmer, expected in cases:
        # A fresh table is built and loaded for each k-mer size.
        ht = khmer.LabelHash(ksize, 1e6, 4)
        ht.consume_fasta(inpfile)
        n = ht.count_kmers_within_radius(start_kmer, int(1e6))
        assert n == expected
Пример #3
0
def test_tag_across_stoptraverse():
    """Partition around a stop tag when reads were tagged across it.

    The partition count is checked three times: on the main table before
    merging, on the subset alone, and on the main table after merging.
    """
    ksize, table_size, n_tables = 20, 100000, 3
    filename = utils.get_test_data('random-20-a.fa')

    graph = khmer.LabelHash(ksize, table_size, n_tables)

    # without tagging/joining across consume, this breaks into two
    # partitions; with, it is one partition.
    graph.add_stop_tag('CCGAATATATAACAGCGACG')

    # DO join reads across the stop tag
    graph.consume_fasta_and_tag_with_stoptags(filename)

    subset = graph.do_subset_partition(0, 0)

    n_main, _ = graph.count_partitions()
    assert n_main == 99  # reads only connected by traversal...

    n_subset, _ = graph.subset_count_partitions(subset)
    assert n_subset == 2  # but need main to cross stoptags.

    graph.merge_subset(subset)

    n_merged, _ = graph.count_partitions()  # ta-da!
    assert n_merged == 1, n_merged
Пример #4
0
def test__get_set_tag_density():
    """Round-trip a new value through _set_tag_density/_get_tag_density."""
    new_density = 2
    graph = khmer.LabelHash(32, 1, 1)

    # The starting density must differ, otherwise the check below is vacuous.
    assert graph._get_tag_density() != new_density

    graph._set_tag_density(new_density)
    assert graph._get_tag_density() == new_density
Пример #5
0
def test_extract_unique_paths_2():
    """After consuming only the first 10-mer, the rest of the read is unique.

    extract_unique_paths() is expected to return a single path covering
    everything except the first k-mer.
    """
    kh = khmer.LabelHash(10, 1e5, 4)

    kh.consume('ATGGAGAGAC')
    x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
    # Single-argument parenthesised print: same output on Python 2, and
    # valid syntax on Python 3 (the bare print statement is not).
    print(x)
    assert x == ['TGGAGAGACACAGATAGACAGGAGTGGCGATG']  # all but the 1st k-mer
Пример #6
0
def test_find_radius_for_volume():
    """Check find_radius_for_volume() for volumes 0, 1 and 2 on all-A data."""
    graph = khmer.LabelHash(4, 1e6, 2)
    graph.consume_fasta(utils.get_test_data('all-A.fa'))

    # (requested volume, expected radius); the last argument is always 100.
    for volume, expected_radius in ((0, 0), (1, 0), (2, 100)):
        radius = graph.find_radius_for_volume('AAAA', volume, 100)
        assert radius == expected_radius
Пример #7
0
def test_find_stoptags():
    """Locate the stop tag 'AAAAA' in all-A and all-T sequences."""
    graph = khmer.LabelHash(5, 1, 1)
    graph.add_stop_tag("AAAAA")

    # (sequence, expected positions reported for the stop tag)
    cases = (
        ("AAAAA", [0]),
        ("AAAAAA", [0, 1]),
        ("TTTTT", [0]),
        ("TTTTTT", [0, 1]),
    )
    for sequence, positions in cases:
        assert graph.identify_stoptags_by_position(sequence) == positions
Пример #8
0
def test_count_kmer_degree():
    """Check kmer_degree() for several 4-mers after loading all-A data."""
    graph = khmer.LabelHash(4, 1e6, 2)
    graph.consume_fasta(utils.get_test_data('all-A.fa'))

    # (k-mer, expected degree)
    expected_degrees = (
        ('AAAA', 2),
        ('AAAT', 1),
        ('AATA', 0),
        ('TAAA', 1),
    )
    for kmer, degree in expected_degrees:
        assert graph.kmer_degree(kmer) == degree
Пример #9
0
def test_extract_unique_paths_0():
    """A read is fully unique before it is consumed and not at all after."""
    read = 'ATGGAGAGACACAGATAGACAGGAGTGGCGATG'
    graph = khmer.LabelHash(10, 1e5, 4)

    # Empty table: the whole read comes back as one unique path.
    assert graph.extract_unique_paths(read, 10, 1) == [read]

    # Once the read itself has been consumed, nothing in it is unique.
    graph.consume(read)
    assert not graph.extract_unique_paths(read, 10, 1)
Пример #10
0
def test_count_within_radius_simple():
    """On all-A data only one k-mer is found within radius 1 and radius 10."""
    inpfile = utils.get_test_data('all-A.fa')
    ht = khmer.LabelHash(4, 1e6, 2)

    # Single-argument parenthesised print: same output on Python 2, and
    # valid syntax on Python 3 (the bare print statement is not).
    print(ht.consume_fasta(inpfile))
    n = ht.count_kmers_within_radius('AAAA', 1)
    assert n == 1

    n = ht.count_kmers_within_radius('AAAA', 10)
    assert n == 1
Пример #11
0
def test_extract_unique_paths_4():
    """Three known 10-mers split the read into two unique paths."""
    kh = khmer.LabelHash(10, 1e5, 4)

    # Make the first, the last and one interior 10-mer of the read known.
    kh.consume('ATGGAGAGAC')
    kh.consume('AGTGGCGATG')

    kh.consume('ATAGACAGGA')

    x = kh.extract_unique_paths('ATGGAGAGACACAGATAGACAGGAGTGGCGATG', 10, 1)
    # Single-argument parenthesised print: same output on Python 2, and
    # valid syntax on Python 3 (the bare print statement is not).
    print(x)
    assert x == ['TGGAGAGACACAGATAGACAGG', 'TAGACAGGAGTGGCGAT']
Пример #12
0
def test_filter_if_present():
    """filter_if_present() should leave exactly one record, named '3'.

    The table is loaded from the mask file, the input file is filtered
    against it into a temporary output file, and that output is parsed back.
    """
    ht = khmer.LabelHash(32, 1e6, 2)

    maskfile = utils.get_test_data('filter-test-A.fa')
    inputfile = utils.get_test_data('filter-test-B.fa')
    outfile = utils.get_temp_filename('filter')

    ht.consume_fasta(maskfile)
    ht.filter_if_present(inputfile, outfile)

    # Read everything inside the context manager so the handle is closed
    # deterministically instead of being left to the garbage collector.
    with open(outfile) as fp:
        records = list(fasta_iter(fp))

    assert len(records) == 1
    assert records[0]['name'] == '3'
Пример #13
0
def test_load_partitioned():
    """Load a partitioned FASTA and check partitions and k-mer presence."""
    graph = khmer.LabelHash(32, 1, 1)
    graph.consume_partitioned_fasta(utils.get_test_data('combine_parts_1.fa'))

    assert graph.count_partitions() == (2, 0)

    # Three 32-mers that must be present after loading; the last one is
    # the trailing 32 bases of a longer sequence.
    probes = (
        "CATGCAGAAGTTCCGCAACCATACCGTTCAGT",
        "CAAATGTACATGCACTTAAAATCATCCAGCCG",
        "CATGCAGAAGTTCCGCAACCATACCGTTCAGTTCCTGGTGGCTA"[-32:],
    )
    for kmer in probes:
        assert graph.get(kmer)
Пример #14
0
def test_n_occupied_1():
    """n_occupied() on a single table after consuming random-20-a.fa."""
    filename = utils.get_test_data('random-20-a.fa')

    K = 20  # size of kmer
    HT_SIZE = 100000  # size of hashtable
    N_HT = 1  # number of hashtables

    # test modified c++ n_occupied code
    ht1 = khmer.LabelHash(K, HT_SIZE, N_HT)

    # The context manager closes the FASTA handle; the record index that
    # enumerate() used to produce was never read, so it is dropped.
    with open(filename) as fp:
        for record in fasta_iter(fp):
            ht1.consume(record['sequence'])

    # this number calculated independently
    assert ht1.n_occupied() == 3877
Пример #15
0
def test_simple_median():
    """get_median_count() is all zeros before, and 1/1.0/0.0 after, consume."""
    hi = khmer.LabelHash(6, 1e6, 2)

    (median, average, stddev) = hi.get_median_count("AAAAAA")
    # Formatting into one string keeps the space-separated output of the
    # old multi-argument print statement while being valid on Python 2
    # and Python 3.
    print('%s %s %s' % (median, average, stddev))
    assert median == 0
    assert average == 0.0
    assert stddev == 0.0

    hi.consume("AAAAAA")
    (median, average, stddev) = hi.get_median_count("AAAAAA")
    print('%s %s %s' % (median, average, stddev))
    assert median == 1
    assert average == 1.0
    assert stddev == 0.0
Пример #16
0
def test_bloom_c_1():
    """Count unique k-mers with the C++ bloom-filter code on three tables."""
    # test c++ code to count unique kmers using bloom filter

    filename = utils.get_test_data('random-20-a.fa')

    K = 20  # size of kmer
    HT_SIZE = 100000  # size of hashtable
    N_HT = 3  # number of hashtables

    ht3 = khmer.LabelHash(K, HT_SIZE, N_HT)

    # The context manager closes the FASTA handle; the record index that
    # enumerate() used to produce was never read, so it is dropped.
    with open(filename) as fp:
        for record in fasta_iter(fp):
            ht3.consume(record['sequence'])

    assert ht3.n_occupied() == 3882
    assert ht3.n_unique_kmers() == 3960
Пример #17
0
def test_circumference():
    """count_kmers_on_radius() around 'GATG' grows as k-mers are added."""
    graph = khmer.LabelHash(4, 1e6, 2)

    for kmer in ('ATGC', 'GATG', 'ATGG'):
        graph.count(kmer)

    on_radius = graph.count_kmers_on_radius('GATG', 1, 200)
    assert on_radius == 2

    # Each additional k-mer is expected to raise the count by one.
    for extra_kmer, expected in (('ATGA', 3), ('TGAT', 4)):
        graph.count(extra_kmer)
        on_radius = graph.count_kmers_on_radius('GATG', 1, 200)
        assert on_radius == expected, on_radius
Пример #18
0
def test_n_occupied_2():  # simple one
    """n_occupied() on one tiny table stops growing once k-mers collide.

    The hash values in the comments are the original author's notes.
    """
    ksize = 4
    requested_size = 10  # use 11
    n_tables = 1

    table = khmer.LabelHash(ksize, requested_size, n_tables)

    steps = (
        ('AAAA', 1),  # 00 00 00 00 = 0
        ('ACTG', 2),  # 00 10 01 11 =
        ('AACG', 2),  # 00 00 10 11 = 11  # collision 1
        ('AGAC', 2),  # 00  11 00 10 # collision 2
    )
    for kmer, occupied in steps:
        table.count(kmer)
        assert table.n_occupied() == occupied
Пример #19
0
def test_notag_across_stoptraverse():
    """A stop tag on a read-boundary k-mer should leave two partitions."""
    ksize, table_size, n_tables = 20, 100000, 3
    filename = utils.get_test_data('random-20-a.fa')

    graph = khmer.LabelHash(ksize, table_size, n_tables)

    # connecting k-mer at the beginning/end of a read: breaks up into two.
    graph.add_stop_tag('TTGCATACGTTGAGCCAGCG')

    graph.consume_fasta_and_tag_with_stoptags(filename)

    graph.merge_subset(graph.do_subset_partition(0, 0))

    n_partitions, _ = graph.count_partitions()
    assert n_partitions == 2, n_partitions
Пример #20
0
def test_stop_traverse():
    """Plain tagging plus a stop tag should yield two partitions."""
    ksize, table_size, n_tables = 20, 100000, 3
    filename = utils.get_test_data('random-20-a.fa')

    graph = khmer.LabelHash(ksize, table_size, n_tables)

    # without tagging/joining across consume, this breaks into two
    # partitions; with, it is one partition.
    graph.add_stop_tag('TTGCATACGTTGAGCCAGCG')

    # DO NOT join reads across stoptags
    graph.consume_fasta_and_tag(filename)

    graph.merge_subset(graph.do_subset_partition(0, 0, True))

    n_partitions, _ = graph.count_partitions()
    assert n_partitions == 2, n_partitions
Пример #21
0
def test_save_load_tagset():
    """load_tagset() with its default arguments should clear existing tags.

    One tag is saved, a second tag is added in memory, the file is loaded
    back and the tag set is saved again.  The size of the re-saved file
    shows whether the in-memory tag survived the load.
    """
    ht = khmer.LabelHash(32, 1, 1)

    outfile = utils.get_temp_filename('tagset')

    ht.add_tag('A' * 32)
    ht.save_tagset(outfile)

    ht.add_tag('G' * 32)

    ht.load_tagset(outfile)  # implicitly => clear_tags=True
    ht.save_tagset(outfile)

    # If the tags were cleared on load, only the 'A' tag from the file is
    # left and the new tagfile is the smaller size (26 bytes); had the 'G'
    # tag been kept as well it would be larger (34 bytes).

    with open(outfile, 'rb') as fp:
        data = fp.read()
    assert len(data) == 26, len(data)
Пример #22
0
def test_save_load_tagset_noclear():
    """load_tagset(..., False) should keep the tags already in memory.

    One tag is saved, a second tag is added in memory, the file is loaded
    back without clearing and the tag set is saved again.  The size of the
    re-saved file shows whether the in-memory tag survived the load.
    """
    ht = khmer.LabelHash(32, 1, 1)

    outfile = utils.get_temp_filename('tagset')

    ht.add_tag('A' * 32)
    ht.save_tagset(outfile)

    ht.add_tag('G' * 32)

    ht.load_tagset(outfile, False)  # clear_tags => False; keep existing tags
    ht.save_tagset(outfile)

    # If the tags were NOT cleared on load, both the 'A' and the 'G' tag
    # are saved and the new tagfile is the larger size (34 bytes); had the
    # tags been cleared it would be smaller (26 bytes).

    with open(outfile, 'rb') as fp:
        data = fp.read()
    assert len(data) == 34, len(data)
Пример #23
0
def test_find_unpart_fail():
    """find_unpart() on the already-loaded file must not add partitions."""
    tagged_file = utils.get_test_data('random-20-a.odd.fa')
    # Deliberately the same "odd" file again (the original note reads
    # "switch to odd").
    unpart_file = utils.get_test_data('random-20-a.odd.fa')

    ksize, table_size, n_tables = 20, 100000, 3

    graph = khmer.LabelHash(ksize, table_size, n_tables)
    graph.consume_fasta_and_tag(tagged_file)

    graph.merge_subset(graph.do_subset_partition(0, 0))

    n_before, _ = graph.count_partitions()
    assert n_before == 49

    graph.find_unpart(unpart_file, True, False)

    n_after, _ = graph.count_partitions()
    assert n_after == 49, n_after  # only 49 sequences worth of tags
Пример #24
0
def test_find_unpart_notraverse():
    """find_unpart() without traversal leaves the new reads disconnected."""
    tagged_file = utils.get_test_data('random-20-a.odd.fa')
    unpart_file = utils.get_test_data('random-20-a.even.fa')

    ksize, table_size, n_tables = 20, 100000, 3

    graph = khmer.LabelHash(ksize, table_size, n_tables)
    graph.consume_fasta_and_tag(tagged_file)

    graph.merge_subset(graph.do_subset_partition(0, 0))

    n_before, _ = graph.count_partitions()
    assert n_before == 49

    graph.find_unpart(unpart_file, False, False)  # <-- don't traverse

    n_after, _ = graph.count_partitions()
    assert n_after == 99, n_after  # all sequences disconnected
Пример #25
0
def test_bloom_python_1():
    """Count unique k-mers in Python and compare with n_unique_kmers().

    Every k-mer of every record is looked up before it is counted; a k-mer
    that is not yet present is tallied as unique.
    """
    # test python code to count unique kmers using bloom filter
    filename = utils.get_test_data('random-20-a.fa')

    K = 20  # size of kmer
    HT_SIZE = 100000  # size of hashtable
    N_HT = 3  # number of hashtables

    ht2 = khmer.LabelHash(K, HT_SIZE, N_HT)

    n_unique = 0
    # The context manager closes the FASTA handle.  The unused record index
    # is dropped so the inner loop no longer shadows an outer variable.
    with open(filename) as fp:
        for record in fasta_iter(fp):
            sequence = record['sequence']
            for start in range(len(sequence) + 1 - K):
                kmer = sequence[start:start + K]
                if not ht2.get(kmer):
                    n_unique += 1
                ht2.count(kmer)

    assert n_unique == 3960
    assert ht2.n_occupied() == 3882
    assert ht2.n_unique_kmers() == 3960  # this number equals to n_unique
Пример #26
0
def test_combine_pe():
    """join_partitions() merges the two loaded partitions into one."""
    graph = khmer.LabelHash(32, 1, 1)
    graph.consume_partitioned_fasta(utils.get_test_data('combine_parts_1.fa'))
    assert graph.count_partitions() == (2, 0)

    first_kmer = "CATGCAGAAGTTCCGCAACCATACCGTTCAGT"
    second_kmer = "CAAATGTACATGCACTTAAAATCATCCAGCCG"

    # Partition ids as loaded from the file.
    first_pid = graph.get_partition_id(first_kmer)
    second_pid = graph.get_partition_id(second_kmer)
    assert first_pid == 2
    assert second_pid == 80293

    graph.join_partitions(first_pid, second_pid)

    # After joining, both k-mers must report the same partition id.
    first_pid = graph.get_partition_id(first_kmer)
    second_pid = graph.get_partition_id(second_kmer)
    assert first_pid == second_pid
    assert graph.count_partitions() == (1, 0)
Пример #27
0
def main():
    """Label a graph from ``input_fastp`` and sweep reads into label files.

    Steps, in order:

    1. Parse the command line and raise the table size / k-mer size to
       ``MIN_HSIZE`` / ``MIN_KSIZE`` if they are smaller.
    2. Build a ``khmer.LabelHash`` and label it from ``args.input_fastp``:
       by partition id, by sequence number, or (default) in groups of
       ``args.group_size`` sequences, each group also being written to a
       ``<prefix>_base_<g>.<ext>`` file.
    3. For every read in ``args.input_files`` look up the labels in its
       neighborhood and queue the formatted read under its single label,
       under ``'multi'`` or under ``'orphaned'``.
    4. Write the per-read label-count distribution and the per-label read
       counts into the output directory.

    Exits with status 1 if the labeling input cannot be consumed or split;
    the messages for those errors already announced an exit.
    """
    info('sweep-reads-buffered.py', ['sweep'])
    parser = get_parser()
    args = parser.parse_args()

    # Enforce the module-level lower bounds on table size and k-mer size.
    if args.min_tablesize < MIN_HSIZE:
        args.min_tablesize = MIN_HSIZE
    if args.ksize < MIN_KSIZE:
        args.ksize = MIN_KSIZE

    report_on_config(args, hashtype='hashbits')

    K = args.ksize
    HT_SIZE = args.min_tablesize
    N_HT = args.n_tables

    traversal_range = args.traversal_range
    input_fastp = args.input_fastp

    # Default the output directory to the directory of the labeling input.
    if not args.outdir:
        outdir = os.path.dirname(input_fastp)
    else:
        outdir = args.outdir

    max_buffers = args.max_buffers
    output_pref = args.output_prefix
    buf_size = args.buffer_size
    max_reads = args.max_reads

    check_file_status(args.input_fastp)
    check_valid_file_exists(args.input_files)
    all_input_files = [input_fastp]
    all_input_files.extend(args.input_files)

    # Check disk space availability
    check_space(all_input_files)

    # figure out input file type (FA/FQ) -- based on first file
    ix = iter(screed.open(args.input_files[0]))
    record = next(ix)
    del ix

    extension = 'fa'
    if hasattr(record, 'accuracy'):  # fastq!
        extension = 'fq'

    output_buffer = ReadBufferManager(max_buffers, max_reads, buf_size,
                                      output_pref, outdir, extension)

    # consume the partitioned fasta with which to label the graph
    ht = khmer.LabelHash(K, HT_SIZE, N_HT)
    try:
        print >> sys.stderr, 'consuming input sequences...'
        if args.label_by_pid:
            print >> sys.stderr, '...labeling by partition id (pid)'
            ht.consume_partitioned_fasta_and_tag_with_labels(input_fastp)
        elif args.label_by_seq:
            print >> sys.stderr, '...labeling by sequence'
            for n, record in enumerate(screed.open(input_fastp)):
                if n % 50000 == 0:
                    print >>sys.stderr, \
                        '...consumed {n} sequences...'.format(n=n)
                ht.consume_sequence_and_tag_with_labels(record.sequence, n)
        else:
            print >>sys.stderr, \
                '...labeling to create groups of size {s}'.format(
                    s=args.group_size)
            base_name = '{pref}_base_{g}.{ext}'
            label = -1
            g = 0
            outfp = None
            try:
                outfp = open(base_name.format(pref=output_pref, g=g,
                                              ext=extension), 'wb')
                for n, record in enumerate(screed.open(input_fastp)):
                    if n % args.group_size == 0:
                        label += 1
                        if label > g:
                            # Starting a new group: close the previous
                            # group's file before opening the next one so
                            # handles do not accumulate.
                            g = label
                            outfp.close()
                            outfp = open(
                                base_name.format(pref=output_pref, g=g,
                                                 ext=extension), 'wb')
                    if n % 50000 == 0:
                        print >>sys.stderr, \
                            '...consumed {n} sequences...'.format(n=n)
                    ht.consume_sequence_and_tag_with_labels(
                        record.sequence, label)

                    if hasattr(record, 'accuracy'):
                        # FASTQ records are four lines: header, sequence,
                        # '+' separator, quality string.
                        outfp.write(
                            '@{name}\n{seq}\n+\n{accuracy}\n'.format(
                                name=record.name,
                                seq=record.sequence,
                                accuracy=record.accuracy))
                    else:
                        outfp.write('>{name}\n{seq}\n'.format(
                            name=record.name, seq=record.sequence))

            except IOError as e:
                print >> sys.stderr, '!! ERROR !!', e
                print >> sys.stderr, '...error splitting input. exiting...'
                sys.exit(1)
            finally:
                # Runs on success, on error and on sys.exit(); closing an
                # already-closed file is harmless.
                if outfp is not None:
                    outfp.close()

    except IOError as e:
        print >> sys.stderr, '!! ERROR: !!', e
        print >> sys.stderr, \
            '...error consuming {i}. exiting...'.format(i=input_fastp)
        sys.exit(1)

    print >> sys.stderr, \
        'done consuming input sequence. ' \
        'added {t} tags and {l} labels...'.format(t=ht.n_tags(),
                                                  l=ht.n_labels())

    label_dict = defaultdict(int)
    label_number_dist = []

    n_orphaned = 0
    n_labeled = 0
    n_mlabeled = 0
    n_skipped = 0

    total_t = time.clock()
    start_t = time.clock()
    for read_file in args.input_files:
        print >> sys.stderr, '** sweeping {read_file} for labels...'.format(
            read_file=read_file)
        file_t = 0.0
        try:
            read_fp = screed.open(read_file)
        except IOError as error:
            print >> sys.stderr, '!! ERROR: !!', error
            print >> sys.stderr, '*** Could not open {fn}, skipping...'.format(
                fn=read_file)
            continue

        try:
            for n, record in enumerate(read_fp):
                if n % 50000 == 0:
                    end_t = time.clock()
                    batch_t = end_t - start_t
                    file_t += batch_t
                    print >>sys.stderr, \
                        '\tswept {n} reads [{nc} labeled, {no} orphaned] ' \
                        '** {sec}s ({sect}s total)'.format(
                            n=n, nc=n_labeled, no=n_orphaned,
                            sec=batch_t, sect=file_t)
                    start_t = time.clock()
                seq = record.sequence
                name = record.name
                try:
                    labels = ht.sweep_label_neighborhood(seq, traversal_range)
                except ValueError:
                    # Best-effort: a read the graph rejects is skipped, but
                    # counted so the loss shows up in the summary.
                    # NOTE(review): presumably reads shorter than K or with
                    # invalid bases -- confirm against khmer.
                    n_skipped += 1
                    continue

                if hasattr(record, 'accuracy'):
                    seq_str = fmt_fastq(name, seq, record.accuracy, labels)
                else:
                    seq_str = fmt_fasta(name, seq, labels)
                label_number_dist.append(len(labels))
                if labels:
                    n_labeled += 1
                    if len(labels) > 1:
                        output_buffer.queue(seq_str, 'multi')
                        n_mlabeled += 1
                        label_dict['multi'] += 1
                    else:
                        output_buffer.queue(seq_str, labels[0])
                        label_dict[labels[0]] += 1
                else:
                    n_orphaned += 1
                    output_buffer.queue(seq_str, 'orphaned')
                    label_dict['orphaned'] += 1
            print >> sys.stderr, '** End of file {fn}...'.format(fn=read_file)
            output_buffer.flush_all()
        finally:
            read_fp.close()

    # gotta output anything left in the buffers at the end!
    print >> sys.stderr, '** End of run...'
    output_buffer.flush_all()
    total_t = time.clock() - total_t  # measured but not currently reported

    if output_buffer.num_write_errors > 0 or output_buffer.num_file_errors > 0:
        print >> sys.stderr, '! WARNING: Sweep finished with errors !'
        print >> sys.stderr, '** {writee} reads not written'.format(
            writee=output_buffer.num_write_errors)
        print >> sys.stderr, '** {filee} errors opening files'.format(
            filee=output_buffer.num_file_errors)

    print >> sys.stderr, 'swept {n_reads} for labels...'.format(
        n_reads=n_labeled + n_orphaned)
    print >> sys.stderr, '...with {nc} labeled and {no} orphaned'.format(
        nc=n_labeled, no=n_orphaned)
    print >> sys.stderr, '...and {nmc} multilabeled'.format(nmc=n_mlabeled)
    if n_skipped:
        print >> sys.stderr, '...and {ns} skipped (could not be swept)'.format(
            ns=n_skipped)

    print >> sys.stderr, '** outputting label number distribution...'
    fn = os.path.join(outdir, '{pref}.dist.txt'.format(pref=output_pref))
    with open(fn, 'wb') as outfp:
        for nc in label_number_dist:
            outfp.write('{nc}\n'.format(nc=nc))

    fn = os.path.join(outdir, '{pref}.counts.csv'.format(pref=output_pref))
    print >> sys.stderr, '** outputting label read counts...'
    with open(fn, 'wb') as outfp:
        for k in label_dict:
            outfp.write('{l},{c}\n'.format(l=k, c=label_dict[k]))
Пример #28
0
def test_find_stoptags2():
    """Positions reported for stop tag 'ATGC' in a longer sequence."""
    graph = khmer.LabelHash(4, 1, 1)
    graph.add_stop_tag("ATGC")

    positions = graph.identify_stoptags_by_position("ATGCATGCGCAT")
    assert positions == [0, 2, 4, 8], positions
Пример #29
0
def test_get_ksize():
    """ksize() reports the k-mer size passed to the constructor."""
    requested_k = 22
    table = khmer.LabelHash(requested_k, 1, 1)
    assert table.ksize() == 22
Пример #30
0
def test_get_hashsizes():
    """hashsizes() for four tables with a requested size of 100."""
    table = khmer.LabelHash(22, 100, 4)
    expected_sizes = [101, 103, 107, 109]
    assert table.hashsizes() == expected_sizes, table.hashsizes()