Example #1
def test_read_pair_iterator_in_error_mode():

    rparser = \
        ReadParser(utils.get_test_data("test-abund-read-paired.fa"))

    # If it walks like an iterator and quacks like an iterator...
    rpi = rparser.iter_read_pairs()
    assert "__iter__" in dir(rpi)
    assert "next" in dir(rpi)

    # Are the alleged pairs actually pairs?
    read_pairs_1 = []
    for read_1, read_2 in rpi:
        read_pairs_1.append([read_1, read_2])
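        # paired reads share a common name prefix (first 19 characters)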
        assert read_1.name[:19] == read_2.name[:19]

    # Reload parser.
    # Note: No 'rewind' or 'reset' capability at the time of this writing.
    rparser = \
        ReadParser(utils.get_test_data("test-abund-read-paired.fa"))

    # Ensure that error mode is the default mode.
    read_pairs_2 = []
    for read_1, read_2 \
            in rparser.iter_read_pairs(ReadParser.PAIR_MODE_ERROR_ON_UNPAIRED):
        read_pairs_2.append([read_1, read_2])
    matches = \
        map(
            lambda rp1, rp2: rp1[0].name == rp2[0].name,
            read_pairs_1, read_pairs_2
        )
    assert all(matches)  # Assert ALL the matches. :-]
Example #2
def test_count_overlap():
    seqfile1 = utils.get_temp_filename('test-overlap1.fa')
    in_dir = os.path.dirname(seqfile1)
    seqfile2 = utils.get_temp_filename('test-overlap2.fa', in_dir)
    outfile = utils.get_temp_filename('overlap.out', in_dir)
    curvefile = utils.get_temp_filename('overlap.out.curve', in_dir)
    shutil.copy(utils.get_test_data('test-overlap1.fa'), seqfile1)
    shutil.copy(utils.get_test_data('test-overlap2.fa'), seqfile2)
    htfile = _make_graph(seqfile1, ksize=20)
    script = scriptpath('count-overlap.py')
    args = ['--ksize', '20', '--n_tables', '2', '--min-tablesize', '10000000',
            htfile + '.pt', seqfile2, outfile]
    (status, out, err) = runscript(script, args, in_dir)
    assert status == 0
    assert os.path.exists(outfile), outfile
    data = [x.strip() for x in open(outfile)]
    data = set(data)
    assert '# of unique k-mers in dataset2: 759047' in data
    assert '# of overlap unique k-mers: 245621' in data
    assert os.path.exists(curvefile), curvefile
    data = [x.strip() for x in open(curvefile)]
    data = set(data)
    assert '178633 1155' in data
    assert '496285 2970' in data
    assert '752053 238627' in data
Example #3
def test_split_paired_reads_1_fa():
    # test input file
    infile = utils.get_test_data("paired.fa")

    ex_outfile1 = utils.get_test_data("paired.fa.1")
    ex_outfile2 = utils.get_test_data("paired.fa.2")

    # actual output files...
    outfile1 = utils.get_temp_filename("paired.fa.1")
    in_dir = os.path.dirname(outfile1)
    outfile2 = utils.get_temp_filename("paired.fa.2", in_dir)

    script = scriptpath("split-paired-reads.py")
    args = [infile]

    runscript(script, args, in_dir)

    assert os.path.exists(outfile1), outfile1
    assert os.path.exists(outfile2), outfile2

    n = 0
    for r, q in zip(screed.open(ex_outfile1), screed.open(outfile1)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0

    n = 0
    for r, q in zip(screed.open(ex_outfile2), screed.open(outfile2)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0
Example #4
def test_normalize_by_median_force():
    CUTOFF = '1'

    corrupt_infile = utils.get_temp_filename('test-corrupt.fq')
    good_infile = utils.get_temp_filename('test-good.fq',
                                          tempdir=os.path.dirname(
                                              corrupt_infile))

    in_dir = os.path.dirname(good_infile)

    shutil.copyfile(utils.get_test_data('test-error-reads.fq'), corrupt_infile)
    shutil.copyfile(utils.get_test_data('test-fastq-reads.fq'), good_infile)

    script = scriptpath('normalize-by-median.py')
    args = ['-f', '-C', CUTOFF, '-k', '17', corrupt_infile, good_infile]

    (status, out, err) = runscript(script, args, in_dir)

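    # with --force, normalize-by-median saves the counting table as
    # <input>.ct.failed when an input file triggers an error, so the
    # counts collected before the failure can still be inspected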
    test_ht = khmer.load_counting_hash(corrupt_infile + '.ct.failed')
    test_good_read = 'CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT'
    test_good_read2 = 'TAGTATCATCAAGGTTCAAGATGTTAATGAATAACAATTGCGCAGCAA'
    assert test_ht.count(test_good_read[:17]) > 0
    assert test_ht.count(test_good_read2[:17]) > 0
    assert os.path.exists(corrupt_infile + '.ct.failed')
    assert '*** Skipping' in err
    assert '** IOErrors' in err
Example #5
def test_with_multiple_threads(testfile="test-reads.fq.bz2"):

    import operator
    import threading

    reads_count_1thr = 0
    rparser = ReadParser(utils.get_test_data(testfile))
    for read in rparser:
        reads_count_1thr += 1

    def count_reads(rparser, counters, tnum):
        counters[tnum] = reduce(operator.add, (1 for read in rparser))

    N_THREADS = 4
    threads = []
    reads_counts_per_thread = [0] * N_THREADS
    rparser = ReadParser(utils.get_test_data(testfile))
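    # a single ReadParser instance is shared by all threads; each read is
    # handed to exactly one thread, so the per-thread counts must sum to
    # the single-threaded total (checked below)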
    for tnum in xrange(N_THREADS):
        t = \
            threading.Thread(
                target=count_reads,
                args=[rparser, reads_counts_per_thread, tnum]
            )
        threads.append(t)
        t.start()
    for t in threads:
        t.join()

    assert reads_count_1thr == sum(reads_counts_per_thread), \
        reads_counts_per_thread
Example #6
def test_extract_paired_reads_1_fa():
    # test input file
    infile = utils.get_test_data('paired-mixed.fa')

    ex_outfile1 = utils.get_test_data('paired-mixed.fa.pe')
    ex_outfile2 = utils.get_test_data('paired-mixed.fa.se')

    # actual output files...
    outfile1 = utils.get_temp_filename('paired-mixed.fa.pe')
    in_dir = os.path.dirname(outfile1)
    outfile2 = utils.get_temp_filename('paired-mixed.fa.se', in_dir)

    script = scriptpath('extract-paired-reads.py')
    args = [infile]

    runscript(script, args, in_dir)

    assert os.path.exists(outfile1), outfile1
    assert os.path.exists(outfile2), outfile2

    n = 0
    for r, q in zip(screed.open(ex_outfile1), screed.open(outfile1)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0

    n = 0
    for r, q in zip(screed.open(ex_outfile2), screed.open(outfile2)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
    assert n > 0
Example #7
def test_split_paired_reads_2_fq():
    # test input file
    infile = utils.get_test_data('paired.fq')

    ex_outfile1 = utils.get_test_data('paired.fq.1')
    ex_outfile2 = utils.get_test_data('paired.fq.2')

    # actual output files...
    outfile1 = utils.get_temp_filename('paired.fq.1')
    in_dir = os.path.dirname(outfile1)
    outfile2 = utils.get_temp_filename('paired.fq.2', in_dir)

    script = scriptpath('split-paired-reads.py')
    args = [infile]

    runscript(script, args, in_dir)

    assert os.path.exists(outfile1), outfile1
    assert os.path.exists(outfile2), outfile2

    n = 0
    for r, q in zip(screed.open(ex_outfile1), screed.open(outfile1)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
        assert r.accuracy == q.accuracy
    assert n > 0

    n = 0
    for r, q in zip(screed.open(ex_outfile2), screed.open(outfile2)):
        n += 1
        assert r.name == q.name
        assert r.sequence == q.sequence
        assert r.accuracy == q.accuracy
    assert n > 0
Example #8
def test_with_multiple_threads():

    import operator
    import threading

    reads_count_1thr = 0
    rparser = ReadParser(utils.get_test_data("test-reads.fq.bz2"))
    for read in rparser:
        reads_count_1thr += 1

    def count_reads(rparser, counters, tnum):
        counters[tnum] = reduce(operator.add, (1 for read in rparser))

    N_THREADS = 4
    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
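    # grow the input buffer so each thread gets its own 64 KiB prefetch
    # buffer (cf. the per-thread buffer noted in test_casava_1_8_pair_mating)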
    config.set_reads_input_buffer_size(N_THREADS * 64 * 1024)
    threads = []
    reads_counts_per_thread = [0] * N_THREADS
    rparser = ReadParser(utils.get_test_data("test-reads.fq.bz2"), N_THREADS)
    for tnum in xrange(N_THREADS):
        t = \
            threading.Thread(
                target=count_reads,
                args=[rparser, reads_counts_per_thread, tnum]
            )
        threads.append(t)
        t.start()
    for t in threads:
        t.join()
    config.set_reads_input_buffer_size(bufsz)

    assert reads_count_1thr == sum(reads_counts_per_thread)
Example #9
def test_make_initial_stoptags():
    # generate the input files using load-graph.py -t, which keeps the
    # test_data directory size down; this assumes load-graph works properly
    bzinfile = utils.get_temp_filename('test-reads.fq.bz2')
    shutil.copyfile(utils.get_test_data('test-reads.fq.bz2'), bzinfile)
    in_dir = os.path.dirname(bzinfile)

    genscript = scriptpath('load-graph.py')
    genscriptargs = ['-t', 'test-reads', 'test-reads.fq.bz2']
    utils.runscript(genscript, genscriptargs, in_dir)

    # test input files generated by load-graph
    infile = utils.get_temp_filename('test-reads.pt', in_dir)
    infile2 = utils.get_temp_filename('test-reads.tagset', in_dir)

    # get file to compare against
    ex_outfile = utils.get_test_data('test-reads.stoptags')

    # actual output file
    outfile1 = utils.get_temp_filename('test-reads.stoptags', in_dir)

    script = scriptpath('make-initial-stoptags.py')
    # make-initial-stoptags has weird file argument syntax
    # read the code before modifying
    args = ['test-reads']

    utils.runscript(script, args, in_dir)
    assert os.path.exists(outfile1), outfile1
Example #10
def test_extract_long_sequences():

    script = scriptpath('extract-long-sequences.py')
    fq_infile = utils.get_temp_filename('test.fq')
    fa_infile = utils.get_temp_filename('test.fa')

    shutil.copyfile(utils.get_test_data('paired-mixed.fq'), fq_infile)
    shutil.copyfile(utils.get_test_data('paired-mixed.fa'), fa_infile)

    fq_outfile = fq_infile + '.keep.fq'
    fa_outfile = fa_infile + '.keep.fa'

    in_dir_fq = os.path.dirname(fq_infile)
    in_dir_fa = os.path.dirname(fa_infile)

    args = [fq_infile, '-l', '10', '-o', fq_outfile]
    (status, out, err) = runscript(script, args, in_dir_fq)

    countlines = sum(1 for line in open(fq_outfile))
    assert countlines == 44, countlines

    args = [fa_infile, '-l', '10', '-o', fa_outfile]
    (status, out, err) = runscript(script, args, in_dir_fa)

    countlines = sum(1 for line in open(fa_outfile))
    assert countlines == 22, countlines
Example #11
def test_normalize_by_median_indent():
    infile = utils.get_test_data('paired-mixed.fa.pe')
    hashfile = utils.get_test_data('normC20k20.kh')
    script = scriptpath('normalize-by-median.py')
    args = ['--loadtable', hashfile, infile]
    (status, out, err) = utils.runscript(script, args)
    assert status == 0, (out, err)
    print(out, err)
Example #12
def test_filter_if_present():
    ht = khmer.Hashbits(32, 1e6, 2)

    maskfile = utils.get_test_data("filter-test-A.fa")
    inputfile = utils.get_test_data("filter-test-B.fa")
    outfile = utils.get_temp_filename("filter")

    ht.consume_fasta(maskfile)
    ht.filter_if_present(inputfile, outfile)

    records = list(fasta_iter(open(outfile)))
    assert len(records) == 1
    assert records[0]["name"] == "3"
Example #13
def test_filter_if_present():
    ht = khmer.LabelHash(32, 1e6, 2)

    maskfile = utils.get_test_data('filter-test-A.fa')
    inputfile = utils.get_test_data('filter-test-B.fa')
    outfile = utils.get_temp_filename('filter')

    ht.consume_fasta(maskfile)
    ht.filter_if_present(inputfile, outfile)

    records = list(fasta_iter(open(outfile)))
    assert len(records) == 1
    assert records[0]['name'] == '3'
Example #14
def test_sweep_reads_fq():
    readfile = utils.get_temp_filename('reads.fa')
    contigfile = utils.get_temp_filename('contigs.fp')
    in_dir = os.path.dirname(contigfile)

    shutil.copyfile(utils.get_test_data('test-sweep-reads.fq'), readfile)
    shutil.copyfile(utils.get_test_data('test-sweep-contigs.fp'), contigfile)

    script = scriptpath('sweep-reads.py')
    args = ['-k', '25', '--prefix', 'test', '--label-by-pid',
            contigfile, readfile, 'junkfile.fa']

    status, out, err = utils.runscript(
        script, args, in_dir, fail_ok=True, sandbox=True)

    # the bad file should be reported as an error and then skipped
    assert 'ERROR' in err, err
    assert 'skipping' in err, err

    out1 = os.path.join(in_dir, 'test_0.fq')
    out2 = os.path.join(in_dir, 'test_1.fq')
    mout = os.path.join(in_dir, 'test_multi.fq')
    oout = os.path.join(in_dir, 'test_orphaned.fq')

    assert os.path.exists(out1)
    assert os.path.exists(out2)
    assert os.path.exists(mout)
    assert os.path.exists(oout)
    print open(out1).read()

    print os.listdir(in_dir)

    seqs1 = set([r.name for r in screed.open(out1)])
    seqs2 = set([r.name for r in screed.open(out2)])
    seqsm = set([r.name for r in screed.open(mout)])
    seqso = set([r.name for r in screed.open(oout)])

    print seqs1
    print seqs2
    print seqsm
    print seqso
    assert seqs1 == set(['read1_p0\t0', 'read2_p0\t0'])
    assert seqs2 == set(['read3_p1\t1'])
    assert (seqsm == set(['read4_multi\t0\t1']) or
            seqsm == set(['read4_multi\t1\t0']))
    assert seqso == set(['read5_orphan'])

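    # rebuilding the sets from the quality fields verifies that every
    # output record carries a quality string, i.e. the outputs are FASTQ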
    seqs1 = set([r.quality for r in screed.open(out1)])
    seqs2 = set([r.quality for r in screed.open(out2)])
    seqsm = set([r.quality for r in screed.open(mout)])
    seqso = set([r.quality for r in screed.open(oout)])
Example #15
def test_fakelump_stop():
    fakelump_fa = utils.get_test_data('fakelump.fa')
    fakelump_stoptags_txt = utils.get_test_data('fakelump.fa.stoptags.txt')

    ht = khmer.new_hashbits(32, 1e5, 4)
    ht.consume_fasta_and_tag(fakelump_fa)

    for line in open(fakelump_stoptags_txt):
        ht.add_stop_tag(line.strip())

    subset = ht.do_subset_partition(0, 0, True)
    ht.merge_subset(subset)

    (n_partitions, n_singletons) = ht.count_partitions()
    assert n_partitions == 3, n_partitions
Example #16
def test_tiny_real_partitions():
    filename = utils.get_test_data('real-partition-tiny.fa')

    ht = khmer.new_hashbits(32, 8e1, 4)
    ht.consume_fasta_and_tag(filename)

    subset = ht.do_subset_partition(0, 0)
    ht.merge_subset(subset)

    outfile = utils.get_temp_filename('part')
    ht.output_partitions(filename, outfile)

    data = open(outfile).read()

    assert len(data)

    records = [r for r in screed.open(outfile)]
    names = [r.name for r in records]
    parts = [n.rsplit('\t', 1)[1] for n in names]

    assert len(parts) == 2, len(parts)
    assert len(set(parts)) == 1
    assert set(parts) != set(['0'])

    test_tiny_real_partitions.runme = True
Example #17
    def test_abund(self):
        ht = khmer.new_hashtable(10, 4 ** 10)

        filename = utils.get_test_data('test-abund-read.fa')
        outname = utils.get_temp_filename('test_abund.out')

        ht.consume_fasta(filename)
        try:
            ht.consume_fasta()
            assert 0, "should fail"
        except TypeError as err:
            print str(err)
        try:
            ht.consume_fasta("nonexistent")
            assert 0, "should fail"
        except IOError as err:
            print str(err)
        ht.output_fasta_kmer_pos_freq(filename, outname)
        try:
            ht.output_fasta_kmer_pos_freq()
            assert 0, "should fail"
        except TypeError as err:
            print str(err)

        fd = open(outname, "r")

        output = fd.readlines()
        assert len(output) == 1

        output = output[0]
        output = output.strip().split()

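        # a 114 bp read contains 114 - 10 + 1 = 105 k-mers of size 10,
        # each of which occurs exactly once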
        assert ['1'] * (114 - 10 + 1) == output

        fd.close()
Example #18
def test_extract_partitions():
    seqfile = utils.get_test_data('random-20-a.fa')
    graphbase = _make_graph(seqfile, do_partition=True,
                            annotate_partitions=True)
    in_dir = os.path.dirname(graphbase)

    # get the final part file
    partfile = os.path.join(in_dir, 'random-20-a.fa.part')

    # ok, now run extract-partitions.
    script = scriptpath('extract-partitions.py')
    args = ['extracted', partfile]
    
    (status, out, err) = runscript(script, args, in_dir)
    print out
    print err
    assert status == 0

    distfile = os.path.join(in_dir, 'extracted.dist')
    groupfile = os.path.join(in_dir, 'extracted.group0000.fa')
    assert os.path.exists(distfile)
    assert os.path.exists(groupfile)

    dist = open(distfile).readline()
    assert dist.strip() == '99 1 1 99'

    parts = [r.name.split('\t')[1] for r in screed.open(partfile)]
    assert len(parts) == 99, len(parts)
    parts = set(parts)
    assert len(parts) == 1, len(parts)
Example #19
def test_load_graph():
    script = scriptpath('load-graph.py')
    args = ['-x', '1e7', '-N', '2', '-k', '20']

    outfile = utils.get_temp_filename('out')
    infile = utils.get_test_data('random-20-a.fa')

    args.extend([outfile, infile])

    (status, out, err) = runscript(script, args)
    assert status == 0

    ht_file = outfile + '.ht'
    assert os.path.exists(ht_file), ht_file

    tagset_file = outfile + '.tagset'
    assert os.path.exists(tagset_file), tagset_file

    ht = khmer.load_hashbits(ht_file)
    ht.load_tagset(tagset_file)

    # check to make sure we get the expected result for this data set
    # upon partitioning (all in one partition).  This is kind of a
    # roundabout way of checking that load-graph worked :)
    subset = ht.do_subset_partition(0, 0)
    x = ht.subset_count_partitions(subset)
    assert x == (1, 0), x
Example #20
def test_filter_stoptags():
    infile = utils.get_temp_filename('test.fa')
    in_dir = os.path.dirname(infile)
    stopfile = utils.get_temp_filename('stoptags', in_dir)

    # first, copy test-abund-read-2.fa to 'test.fa' in the temp dir.
    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)

    # now, create a file with some stop tags in it --
    K = 18
    kh = khmer.new_hashbits(K, 1, 1)
    kh.add_stop_tag('GTTGACGGGGCTCAGGGG')
    kh.save_stop_tags(stopfile)
    del kh
    
    # finally, run filter-stoptags.
    script = scriptpath('filter-stoptags.py')
    args = ['-k', str(K), stopfile, infile, infile]
    (status, out, err) = runscript(script, args, in_dir)
    print out
    print err
    assert status == 0

    # verify that the basic output file exists
    outfile = infile + '.stopfilt'
    assert os.path.exists(outfile), outfile

    # it should contain only one unique sequence, because we've trimmed
    # off everything after the beginning of the only long sequence in there.
    seqs = set([r.sequence for r in screed.open(outfile)])
    assert len(seqs) == 1, seqs
    assert 'GGTTGACGGGGCTCAGGG' in seqs, seqs
Example #21
def test_partition_graph_nojoin_k21():
    # test with K=21
    graphbase = _make_graph(utils.get_test_data('random-20-a.fa'), K=21)
    in_dir = os.path.dirname(graphbase)

    script = scriptpath('partition-graph.py')
    args = [graphbase]

    (status, out, err) = runscript(script, args)
    assert status == 0

    script = scriptpath('merge-partitions.py')
    args = [graphbase, '-k', str(21)]
    (status, out, err) = runscript(script, args)
    print out
    print err
    assert status == 0

    final_pmap_file = graphbase + '.pmap.merged'
    assert os.path.exists(final_pmap_file)

    ht = khmer.load_hashbits(graphbase + '.ht')
    ht.load_partitionmap(final_pmap_file)

    x = ht.count_partitions()
    assert x == (99, 0)          # should be 99 partitions at K=21
Example #22
def test_partition_graph_nojoin_stoptags():
    # test with stoptags
    graphbase = _make_graph(utils.get_test_data('random-20-a.fa'))
    in_dir = os.path.dirname(graphbase)

    # add in some stop tags
    ht = khmer.load_hashbits(graphbase + '.ht')
    ht.add_stop_tag('TTGCATACGTTGAGCCAGCG')
    stoptags_file = graphbase + '.stoptags'
    ht.save_stop_tags(stoptags_file)
    del ht

    # run script with stoptags option
    script = scriptpath('partition-graph.py')
    args = ['--stoptags', stoptags_file, graphbase]

    (status, out, err) = runscript(script, args)
    assert status == 0

    script = scriptpath('merge-partitions.py')
    args = [graphbase, '-k', str(20)]
    (status, out, err) = runscript(script, args)
    print out
    print err
    assert status == 0

    final_pmap_file = graphbase + '.pmap.merged'
    assert os.path.exists(final_pmap_file)

    ht = khmer.load_hashbits(graphbase + '.ht')
    ht.load_partitionmap(final_pmap_file)

    x = ht.count_partitions()
    assert x == (2, 0)          # should be 2 partitions
Example #23
def test_n_labels():
    lh = LabelHash(20, 1e7, 4)
    filename = utils.get_test_data('test-labels.fa')
    lh.consume_fasta_and_tag_with_labels(filename)

    print lh.n_labels()
    assert lh.n_labels() == 4
Example #24
def test_sweep_reads_3():

    infile = utils.get_temp_filename('seqs.fa')
    shutil.copyfile(utils.get_test_data('random-20-a.fa'), infile)
    wdir = os.path.dirname(infile)
    script = scriptpath('sweep-reads.py')
    args = ['-m', '75', '-k', '20', '-l', '1', '--prefix',
            'test', '--label-by-group', '10', infile, infile]
    status, out, err = utils.runscript(script, args, wdir, sandbox=True)

    for i in xrange(10):
        p = os.path.join(wdir, 'test_{i}.fa'.format(i=i))
        print p, err, out
        assert os.path.exists(p)
        os.remove(p)

    counts_fn = os.path.join(wdir, 'test.counts.csv')
    assert os.path.exists(counts_fn)
    with open(counts_fn) as cfp:
        for line in cfp:
            _, _, c = line.partition(',')
            assert int(c) in [9, 10]
    assert os.path.exists(os.path.join(wdir, 'test.dist.txt'))
    assert not os.path.exists(os.path.join(wdir, 'test_multi.fa'))
Example #25
    def test_save_merge_from_disk_2(self):
        ht = khmer.new_hashbits(20, 4 ** 7 + 1)
        filename = utils.get_test_data('random-20-a.fa')

        (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)

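        # ceil(total_reads / 2): split the tags into two subsets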
        subset_size = total_reads // 2 + total_reads % 2
        divvy = ht.divide_tags_into_subsets(subset_size)

        outfile1 = utils.get_temp_filename('x.pmap')
        outfile2 = utils.get_temp_filename('y.pmap')

        x = ht.do_subset_partition(divvy[0], divvy[1])
        ht.save_subset_partitionmap(x, outfile1)
        del x

        y = ht.do_subset_partition(divvy[1], 0)
        ht.save_subset_partitionmap(y, outfile2)
        del y

        ht.merge_subset_from_disk(outfile1)
        ht.merge_subset_from_disk(outfile2)

        outfile = utils.get_temp_filename('out.part')
        n_partitions = ht.output_partitions(filename, outfile)
        assert n_partitions == 1, n_partitions        # combined.
Example #26
    def test_save_merge_from_disk(self):
        ht = khmer.new_hashbits(20, 4 ** 4 + 1)
        filename = utils.get_test_data('test-graph2.fa')

        (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
        assert total_reads == 3, total_reads

        divvy = ht.divide_tags_into_subsets(1)
        print divvy
        (a, b, c) = divvy

        outfile1 = utils.get_temp_filename('x.pmap')
        outfile2 = utils.get_temp_filename('y.pmap')

        x = ht.do_subset_partition(a, b)
        ht.save_subset_partitionmap(x, outfile1)
        del x

        y = ht.do_subset_partition(b, 0)
        ht.save_subset_partitionmap(y, outfile2)
        del y

        ht.merge_subset_from_disk(outfile1)
        ht.merge_subset_from_disk(outfile2)

        outfile = utils.get_temp_filename('out.part')
        n_partitions = ht.output_partitions(filename, outfile)
        assert n_partitions == 1, n_partitions        # combined.
Example #27
def test_casava_1_8_pair_mating():

    import threading

    config = khmer.get_config()
    bufsz = config.get_reads_input_buffer_size()
    config.set_reads_input_buffer_size(128 * 1024)
    # Note: This file, when used in conjunction with a 64 KiB per-thread
    #       prefetch buffer, tests the paired read mating logic with the
    #       Casava >= 1.8 read name format.
    rparser = ReadParser(utils.get_test_data("test-reads.fq.bz2"), 2)

    def thread_1_runtime(rparser):
        for read in rparser:
            pass

    def thread_2_runtime(rparser):
        for readnum, read in enumerate(rparser):
            if 0 == readnum:
                assert "895:1:1:1761:13189 2:N:0:NNNNN" == read.name

    t1 = threading.Thread(target=thread_1_runtime, args=[rparser])
    t2 = threading.Thread(target=thread_2_runtime, args=[rparser])

    t1.start()
    t2.start()

    t1.join()
    t2.join()

    config.set_reads_input_buffer_size(bufsz)
Example #28
    def test_random_20_a_succ_IV_save(self):
        ht = khmer.new_hashbits(20, 4 ** 7 + 1)
        filename = utils.get_test_data('random-20-a.fa')

        savefile_ht = utils.get_temp_filename('ht')
        savefile_tags = utils.get_temp_filename('tags')
        outfile = utils.get_temp_filename('out')

        total_reads, _ = ht.consume_fasta_and_tag(filename)

        ht.save(savefile_ht)
        ht.save_tagset(savefile_tags)

        del ht
        ht = khmer.new_hashbits(20, 4 ** 7 + 1)

        ht.load(savefile_ht)
        ht.load_tagset(savefile_tags)

        divvy = ht.divide_tags_into_subsets(1)
        divvy.append(0)

        subsets = []
        for i in range(len(divvy) - 1):
            x = ht.do_subset_partition(divvy[i], divvy[i + 1])
            subsets.append(x)

        for x in reversed(subsets):
            ht.merge_subset(x)

        n_partitions = ht.output_partitions(filename, outfile)
        assert n_partitions == 1, n_partitions
Example #29
def test_tag_across_stoptraverse():
    filename = utils.get_test_data('random-20-a.fa')

    K = 20  # size of kmer
    HT_SIZE = 100000  # size of hashtable
    N_HT = 3  # number of hashtables

    ht = khmer.LabelHash(K, HT_SIZE, N_HT)

    # without tagging/joining across consume, this breaks into two partitions;
    # with, it is one partition.
    ht.add_stop_tag('CCGAATATATAACAGCGACG')

    ht.consume_fasta_and_tag_with_stoptags(filename)  # DO join reads across

    subset = ht.do_subset_partition(0, 0)
    n, _ = ht.count_partitions()
    assert n == 99                       # reads only connected by traversal...

    n, _ = ht.subset_count_partitions(subset)
    assert n == 2                        # but need main to cross stoptags.

    ht.merge_subset(subset)

    n, _ = ht.count_partitions()         # ta-da!
    assert n == 1, n
Example #30
def test_normalize_by_median_dumpfrequency():
    CUTOFF = '1'

    infiles = [utils.get_temp_filename('test-0.fq')]
    in_dir = os.path.dirname(infiles[0])
    for x in range(1, 5):
        infiles.append(utils.get_temp_filename('test-{x}.fq'.format(x=x),
                                               tempdir=in_dir))

    for infile in infiles:
        shutil.copyfile(utils.get_test_data('test-fastq-reads.fq'), infile)

    script = scriptpath('normalize-by-median.py')
    args = ['-d', '2', '-C', CUTOFF, '-k', '17']
    args.extend(infiles)

    (status, out, err) = runscript(script, args, in_dir)

    test_ht = khmer.load_counting_hash(os.path.join(in_dir, 'backup.ct'))
    test_good_read = 'CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT'
    test_good_read2 = 'TAGTATCATCAAGGTTCAAGATGTTAATGAATAACAATTGCGCAGCAA'
    assert test_ht.count(test_good_read[:17]) > 0
    assert test_ht.count(test_good_read2[:17]) > 0

    assert os.path.exists(os.path.join(in_dir, 'backup.ct'))
    assert out.count('Backup: Saving') == 2
    assert 'Nothing' in out
Example #31
def test_normalize_by_median_fpr():
    MIN_TABLESIZE_PARAM = 1

    infile = utils.get_temp_filename('test-fpr.fq')
    in_dir = os.path.dirname(infile)
    shutil.copyfile(utils.get_test_data('test-fastq-reads.fq'), infile)

    script = scriptpath('normalize-by-median.py')
    args = ['-f', '-k 17', '-x ' + str(MIN_TABLESIZE_PARAM), infile]

    (status, out, err) = utils.runscript(script, args, in_dir, fail_ok=True)

    assert os.path.exists(infile + '.keep')
    assert 'fp rate estimated to be' in out, out
    assert '** ERROR: the k-mer counting table is too small' in err, err
Example #32
def test_partition_graph_no_big_traverse():
    # do NOT exhaustively traverse
    graphbase = _make_graph(utils.get_test_data('biglump-random-20-a.fa'),
                            do_partition=True,
                            stop_big_traverse=True)

    final_pmap_file = graphbase + '.pmap.merged'
    assert os.path.exists(final_pmap_file)

    ht = khmer.load_hashbits(graphbase + '.pt')
    ht.load_tagset(graphbase + '.tagset')
    ht.load_partitionmap(final_pmap_file)

    x = ht.count_partitions()
    assert x[0] == 4, x  # should be four partitions, broken at knot.
Example #33
def test_load_partitioned():
    inpfile = utils.get_test_data('combine_parts_1.fa')
    ht = khmer.new_hashbits(32, 1, 1)

    ht.consume_partitioned_fasta(inpfile)
    assert ht.count_partitions() == (2, 0)

    s1 = "CATGCAGAAGTTCCGCAACCATACCGTTCAGT"
    assert ht.get(s1)

    s2 = "CAAATGTACATGCACTTAAAATCATCCAGCCG"
    assert ht.get(s2)

    s3 = "CATGCAGAAGTTCCGCAACCATACCGTTCAGTTCCTGGTGGCTA"[-32:]
    assert ht.get(s3)
Example #34
def test_n_occupied_1():
    filename = utils.get_test_data('random-20-a.fa')

    K = 20  # size of kmer
    HT_SIZE = 100000  # size of hashtable
    N_HT = 1  # number of hashtables

    # test modified c++ n_occupied code
    ht1 = khmer.new_hashbits(K, HT_SIZE, N_HT)

    for n, record in enumerate(fasta_iter(open(filename))):
        ht1.consume(record['sequence'])

    # this number calculated independently
    assert ht1.n_occupied() == 3877
Example #35
def test_consume_absentfasta_with_reads_parser():
    presencetable = khmer.new_hashbits(31, 1, 1)
    try:
        presencetable.consume_fasta_with_reads_parser()
        assert 0, "this should fail"
    except TypeError as err:
        print str(err)
    try:
        readparser = ReadParser(utils.get_test_data('empty-file'))
        presencetable.consume_fasta_with_reads_parser(readparser)
        assert 0, "this should fail"
    except IOError as err:
        print str(err)
    except ValueError as err:
        print str(err)
Example #36
def _make_graph(infilename,
                SIZE=1e7,
                N=2,
                K=20,
                do_partition=False,
                annotate_partitions=False,
                stop_big_traverse=False):
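    """Build a graph from infilename with load-graph.py.

    Optionally partition the graph, merge the partitions, and annotate
    the input reads with their partition IDs; returns the output basename.
    """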
    script = scriptpath('load-graph.py')
    args = ['-x', str(SIZE), '-N', str(N), '-k', str(K)]

    outfile = utils.get_temp_filename('out')
    infile = utils.get_test_data(infilename)

    args.extend([outfile, infile])

    runscript(script, args)

    ht_file = outfile + '.ht'
    assert os.path.exists(ht_file), ht_file

    tagset_file = outfile + '.tagset'
    assert os.path.exists(tagset_file), tagset_file

    if do_partition:
        script = scriptpath('partition-graph.py')
        args = [outfile]
        if stop_big_traverse:
            args.insert(0, '--no-big-traverse')
        runscript(script, args)

        script = scriptpath('merge-partitions.py')
        args = [outfile, '-k', str(K)]
        runscript(script, args)

        final_pmap_file = outfile + '.pmap.merged'
        assert os.path.exists(final_pmap_file)

        if annotate_partitions:
            script = scriptpath('annotate-partitions.py')
            args = ["-k", str(K), outfile, infilename]

            in_dir = os.path.dirname(outfile)
            runscript(script, args, in_dir)

            baseinfile = os.path.basename(infilename)
            assert os.path.exists(os.path.join(in_dir, baseinfile + '.part'))

    return outfile
Example #37
    def test_not_output_unassigned(self):
        import screed

        filename = utils.get_test_data('random-20-a.fa')

        ht = khmer.new_hashbits(21, 4, 4)
        ht.consume_fasta_and_tag(filename)

        output_file = utils.get_temp_filename('parttest')
        ht.output_partitions(filename, output_file, False)

        len1 = len(list(screed.open(filename)))
        len2 = len(list(screed.open(output_file)))

        assert len1 > 0
        assert len2 == 0, len2
Example #38
def test_abundance_dist_single_nobigcount():
    infile = utils.get_temp_filename('test.fa')
    outfile = utils.get_temp_filename('test.dist')
    in_dir = os.path.dirname(infile)

    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)

    script = scriptpath('abundance-dist-single.py')
    args = ['-x', '1e7', '-N', '2', '-k', '17', '-z', '-b', infile, outfile]
    utils.runscript(script, args, in_dir)

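    # each output line is: abundance, number of k-mers at that abundance,
    # cumulative count, and cumulative fraction of all k-mers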
    fp = iter(open(outfile))
    line = fp.next().strip()
    assert line == '1 96 96 0.98', line
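    # without bigcount (-b) counts saturate at 255; compare the
    # '1001 2 98 1.0' line in test_abundance_dist_single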
    line = fp.next().strip()
    assert line == '255 2 98 1.0', line
Example #39
def test_bloom_c_1():
    # test c++ code to count unique kmers using bloom filter

    filename = utils.get_test_data('random-20-a.fa')

    K = 20  # size of kmer
    HT_SIZE = 100000  # size of hashtable
    N_HT = 3  # number of hashtables

    ht3 = khmer.Hashbits(K, HT_SIZE, N_HT)

    for n, record in enumerate(fasta_iter(open(filename))):
        ht3.consume(record['sequence'])

    assert ht3.n_occupied() == 3882
    assert ht3.n_unique_kmers() == 3960
Example #40
def test_do_partition():
    seqfile = utils.get_test_data('random-20-a.fa')
    graphbase = utils.get_temp_filename('out')
    in_dir = os.path.dirname(graphbase)

    script = scriptpath('do-partition.py')
    args = ["-k", "20", graphbase, seqfile]

    utils.runscript(script, args, in_dir)

    partfile = os.path.join(in_dir, 'random-20-a.fa.part')

    parts = [r.name.split('\t')[1] for r in screed.open(partfile)]
    parts = set(parts)
    assert '2' in parts
    assert len(parts) == 1
Example #41
def test_bigcount_abund_dist_2():
    kh = khmer.new_counting_hash(18, 1e7, 4)
    tracking = khmer.new_hashbits(18, 1e7, 4)
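    # bigcount lets per-k-mer counts grow past the 8-bit cap of 255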
    kh.set_use_bigcount(True)

    seqpath = utils.get_test_data('test-abund-read.fa')

    kh.consume_fasta(seqpath)
    for i in range(1000):
        kh.count('GGTTGACGGGGCTCAGGG')

    dist = kh.abundance_distribution(seqpath, tracking)
    print kh.get('GGTTGACGGGGCTCAGGG')

    pdist = [(i, dist[i]) for i in range(len(dist)) if dist[i]]
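    # one occurrence in the read plus 1000 manual counts -> abundance 1001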
    assert dist[1001] == 1, pdist
Example #42
    def test_consume_build_readmask(self):
        ht = khmer.new_hashtable(10, 4**10)

        filename = utils.get_test_data('simple_2.fa')
        outname = utils.get_temp_filename('test_filter.out')

        # sequence #4 (index 3) is bad; the new readmask should have that.
        x = ht.consume_fasta_build_readmask(filename)
        (total_reads, n_consumed, readmask) = x

        assert total_reads == 4, total_reads
        assert n_consumed == 63, n_consumed
        assert readmask.get(0)
        assert readmask.get(1)
        assert readmask.get(2)
        assert not readmask.get(3)
Example #43
def test_filter_abund_1_singlefile():
    infile = utils.get_temp_filename('test.fa')
    in_dir = os.path.dirname(infile)

    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)

    script = scriptpath('filter-abund-single.py')
    args = ['-x', '1e7', '-N', '2', '-k', '17', infile]
    runscript(script, args, in_dir)

    outfile = infile + '.abundfilt'
    assert os.path.exists(outfile), outfile

    seqs = set([r.sequence for r in screed.open(outfile)])
    assert len(seqs) == 1, seqs
    assert 'GGTTGACGGGGCTCAGGG' in seqs
Example #44
    def test_filter_n(self):
        ht = khmer.new_hashtable(10, 4**10)

        filename = utils.get_test_data('simple_2.fa')
        outname = utils.get_temp_filename('test_filter.out')

        (total_reads, n_consumed) = ht.consume_fasta(filename)
        assert total_reads == 4, total_reads
        assert n_consumed == 63, n_consumed

        (total_reads,
         n_seq_kept) = khmer.filter_fasta_file_any(ht, filename, total_reads,
                                                   outname, 1)
        assert n_seq_kept == 3, n_seq_kept

        names = load_fa_seq_names(outname)
        assert names == ['1', '2', '3']
Example #45
    def test_consume_no_update_readmask(self):
        ht = khmer.new_hashtable(10, 4**10)

        filename = utils.get_test_data('simple_2.fa')
        outname = utils.get_temp_filename('test_filter.out')

        readmask = khmer.new_readmask(4)

        # sequence #4 (index 3) is bad; the new readmask should NOT have that.
        (total_reads, n_consumed) = ht.consume_fasta(filename, 0, 0, readmask,
                                                     False)
        assert total_reads == 4, total_reads
        assert n_consumed == 63, n_consumed
        assert readmask.get(0)
        assert readmask.get(1)
        assert readmask.get(2)
        assert readmask.get(3)  # NOT updated
Example #46
def test_do_partition_2():
    # test with K=21 (no joining of sequences)
    seqfile = utils.get_test_data('random-20-a.fa')
    graphbase = utils.get_temp_filename('out')
    in_dir = os.path.dirname(graphbase)

    script = scriptpath('do-partition.py')
    args = ["-k", "21", graphbase, seqfile]

    runscript(script, args, in_dir)

    partfile = os.path.join(in_dir, 'random-20-a.fa.part')

    parts = [r.name.split('\t')[1] for r in screed.open(partfile)]
    parts = set(parts)

    assert len(parts) == 99, len(parts)
Example #47
def test_filter_abund_2():
    infile = utils.get_temp_filename('test.fa')
    in_dir = os.path.dirname(infile)

    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
    counting_ht = _make_counting(infile, K=17)

    script = scriptpath('filter-abund.py')
    args = ['-C', '1', counting_ht, infile, infile]
    utils.runscript(script, args, in_dir)

    outfile = infile + '.abundfilt'
    assert os.path.exists(outfile), outfile

    seqs = set([r.sequence for r in screed.open(outfile)])
    assert len(seqs) == 2, seqs
    assert 'GGTTGACGGGGCTCAGGG' in seqs
Example #48
def test_extract_partitions_no_groups():
    empty_file = utils.get_temp_filename('empty-file')
    basefile = utils.get_test_data('empty-file')

    shutil.copyfile(basefile, empty_file)
    in_dir = os.path.dirname(empty_file)

    # ok, now run extract-partitions.
    script = scriptpath('extract-partitions.py')
    args = ['extracted', empty_file]

    utils.runscript(script, args, in_dir, fail_ok=True)

    # No group files should be created
    groupfile = os.path.join(in_dir, 'extracted.group0000.fa')

    assert not os.path.exists(groupfile)
Example #49
def test_save_fail_readonly():
    lb_pre = LabelHash(20, 1e7, 4)
    filename = utils.get_test_data('test-labels.fa')
    lb_pre.consume_fasta_and_tag_with_labels(filename)

    # save labels to a file
    savepath = utils.get_temp_filename('saved.labels')
    fp = open(savepath, 'w')
    fp.close()

    os.chmod(savepath, 0o444)  # octal 444: read-only

    try:
        lb_pre.save_labels_and_tags(savepath)
        assert 0, "this should fail: read-only file"
    except IOError as err:
        print str(err)
Example #50
def test_trim_low_abund_5_trim_high_abund():
    infile = utils.get_temp_filename('test.fa')
    in_dir = os.path.dirname(infile)

    shutil.copyfile(utils.get_test_data('test-abund-read-3.fa'), infile)

    args = ["-k", "17", "-x", "1e7", "-N", "2", '-V', infile]
    utils.runscript('trim-low-abund.py', args, in_dir, sandbox=True)

    outfile = infile + '.abundtrim'
    assert os.path.exists(outfile), outfile

    seqs = set([r.sequence for r in screed.open(outfile)])
    assert len(seqs) == 2, seqs

    # trimmed sequence @ error
    assert 'GGTTGACGGGGCTCAGGGGGCGGCTGACTCCGAGAGACAGC' in seqs
Example #51
    def test_5_merge_046(self):
        ht = khmer.new_hashbits(20, 4**4 + 1)
        filename = utils.get_test_data('test-graph5.fa')

        (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
        assert total_reads == 6, total_reads

        divvy = ht.divide_tags_into_subsets(1)

        x = ht.do_subset_partition(divvy[0], divvy[4])
        ht.merge_subset(x)

        y = ht.do_subset_partition(divvy[4], 0)
        ht.merge_subset(y)

        outfile = utils.get_temp_filename('out.part')
        n_partitions = ht.output_partitions(filename, outfile)
        assert n_partitions == 1, n_partitions  # combined.
Example #52
def test_load_graph_no_tags():
    script = scriptpath('load-graph.py')
    args = ['-x', '1e7', '-N', '2', '-k', '20', '-n']

    outfile = utils.get_temp_filename('out')
    infile = utils.get_test_data('random-20-a.fa')

    args.extend([outfile, infile])

    utils.runscript(script, args)

    ht_file = outfile + '.pt'
    assert os.path.exists(ht_file), ht_file

    tagset_file = outfile + '.tagset'
    assert not os.path.exists(tagset_file), tagset_file

    assert khmer.load_hashbits(ht_file)
Example #53
def test_count_median():
    infile = utils.get_temp_filename('test.fa')
    outfile = infile + '.counts'

    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)
    counting_ht = _make_counting(infile, K=8)

    script = scriptpath('count-median.py')
    args = [counting_ht, infile, outfile]
    utils.runscript(script, args)

    assert os.path.exists(outfile), outfile

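    # output columns: sequence name, median count, average, stddev, length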
    data = [x.strip() for x in open(outfile)]
    data = set(data)
    assert len(data) == 2, data
    assert 'seq 1001 1001.0 0.0 18' in data
    assert '895:1:37:17593:9954/1 1 103.803741455 303.702941895 114' in data
Example #54
def test_badfasta_count_kmers_by_position():
    countingtable = khmer.new_counting_hash(4, 4**4, 4)
    try:
        countingtable.fasta_count_kmers_by_position()
        assert 0, "this should fail"
    except TypeError as err:
        print str(err)

    filename = utils.get_test_data("test-short.fa")
    try:
        countingtable.fasta_count_kmers_by_position(filename, -1, 0)
        assert 0, "this should fail"
    except ValueError as err:
        print str(err)
    try:
        countingtable.fasta_count_kmers_by_position(filename, 0, -1)
        assert 0, "this should fail"
    except ValueError as err:
        print str(err)
Example #55
def test_find_all_tags_list_error():
    ct = khmer.new_counting_hash(4, 4**4, 4)

    # load each sequence but do not build tags - everything should be empty.
    for record in screed.open(utils.get_test_data('test-graph2.fa')):
        ct.consume(record.sequence)

    try:
        ct.find_all_tags_list("ATA")
        assert False, "a ValueError should be raised for incorrect k-mer size"
    except ValueError:
        pass

    try:
        ct.find_all_tags_list("ATAGA")
        assert False, "a ValueError should be raised for incorrect k-mer size"
    except ValueError:
        pass
Example #56
def test_abundance_dist_single():
    infile = utils.get_temp_filename('test.fa')
    outfile = utils.get_temp_filename('test.dist')
    in_dir = os.path.dirname(infile)

    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)

    script = scriptpath('abundance-dist-single.py')
    args = ['-x', '1e7', '-N', '2', '-k', '17', '-z', '-t', infile, outfile]
    (status, out, err) = utils.runscript(script, args, in_dir)

    assert 'Total number of k-mers: 98' in err, err

    fp = iter(open(outfile))
    line = fp.next().strip()
    assert line == '1 96 96 0.98', line
    line = fp.next().strip()
    assert line == '1001 2 98 1.0', line
Example #57
def test_abundance_dist_nobigcount():
    infile = utils.get_temp_filename('test.fa')
    outfile = utils.get_temp_filename('test.dist')
    in_dir = os.path.dirname(infile)

    shutil.copyfile(utils.get_test_data('test-abund-read-2.fa'), infile)

    htfile = _make_counting(infile, K=17, BIGCOUNT=False)

    script = scriptpath('abundance-dist.py')
    args = ['-z', htfile, infile, outfile]
    utils.runscript(script, args, in_dir)

    fp = iter(open(outfile))
    line = fp.next().strip()
    assert line == '1 96 96 0.98', line
    line = fp.next().strip()
    assert line == '255 2 98 1.0', line
Example #58
    def test_abundance_by_pos(self):
        kh = self.kh

        for _ in range(0, 300):
            kh.count('ATCG')

        for _ in range(0, 10):
            kh.count('ATGG')

        short_filename = utils.get_test_data('test-short.fa')
        dist = kh.fasta_count_kmers_by_position(short_filename, 6, 10)
        assert dist[4] == 1
        assert sum(dist) == 1

        dist = kh.fasta_count_kmers_by_position(short_filename, 6, MAX_COUNT)
        assert dist[0] == 1, dist[0]
        assert dist[2] == 1
        assert sum(dist) == 2
Example #59
    def test_3_merge_023(self):
        ht = khmer.new_hashbits(20, 4**10 + 1)
        filename = utils.get_test_data('test-graph2.fa')

        (total_reads, total_kmers) = ht.consume_fasta_and_tag(filename)
        assert total_reads == 3, total_reads

        (a, b, c) = ht.divide_tags_into_subsets(1)

        x = ht.do_subset_partition(b, c)
        ht.merge_subset(x)

        y = ht.do_subset_partition(a, b)
        ht.merge_subset(y)

        outfile = utils.get_temp_filename('out.part')
        n_partitions = ht.output_partitions(filename, outfile)
        assert n_partitions == 1, n_partitions  # combined.
Example #60
    def test_random_20_a_succ_II(self):
        ht = khmer.new_hashbits(20, 4**7 + 1)
        filename = utils.get_test_data('random-20-a.fa')
        outfile = utils.get_temp_filename('out')

        total_reads, _ = ht.consume_fasta_and_tag(filename)

        subset_size = total_reads / 2 + total_reads % 2
        divvy = ht.divide_tags_into_subsets(subset_size)
        assert len(divvy) == 4

        x = ht.do_subset_partition(divvy[0], divvy[2])
        y = ht.do_subset_partition(divvy[2], 0)
        ht.merge_subset(x)
        ht.merge_subset(y)

        n_partitions = ht.output_partitions(filename, outfile)
        assert n_partitions == 1, n_partitions