예제 #1
0
def test_label_tag_correctness_save_load():
    """Check swept labels after a save/load round trip.

    Labels are built from ``test-labels.fa`` with a ``NodeGraphLabels``,
    saved, loaded into a fresh instance, and then four query reads are
    swept and compared against the label ids expected for each.
    """
    original = GraphLabels.NodeGraphLabels(20, 1e7, 4)
    source_path = utils.get_test_data('test-labels.fa')
    original.consume_seqfile_and_tag_with_labels(source_path)

    # persist, then drop the instance that produced the file
    savepath = utils.get_temp_filename('saved.labels')
    original.save_labels_and_tags(savepath)
    del original

    # everything below runs against a freshly loaded instance
    restored = GraphLabels.NodeGraphLabels(20, 1e7, 4)
    restored.load_labels_and_tags(savepath)

    read_a = (
        'ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAG'
        'CTAGGCTAGGTGTGCTCTGCCTAGAGCTAGGCTAGGTGT')
    tag_query = (
        'TTCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAG'
        'CTAGGCTAGGTGTGCTCTGCTAGAGCTAGGCTAGGTGT')
    read_b = (
        'GCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAG'
        'ATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA')
    read_c = (
        'TGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAG'
        'CTAGGCTAGGTGTTGGGGATAGATAGATAGATGAGTTGGGGATAGATAGATAGATGAGTGTAGATCCA'
        'ACAACACATACA')
    read_d = 'TATATATATAGCTAGCTAGCTAACTAGCTAGCATCGATCGATCGATC'

    # read A (also prints some diagnostics)
    labels = list(restored.sweep_label_neighborhood(read_a))
    print(restored.sweep_tag_neighborhood(tag_query))
    print(labels)
    print(len('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG') - 19)
    assert len(labels) == 2
    assert 0 in labels
    assert 1 in labels

    # reads B, C and D share the same check shape
    remaining = (
        (read_b, (0, 1, 2)),
        (read_c, (1, 2)),
        (read_d, (3,)),
    )
    for sequence, wanted in remaining:
        labels = list(restored.sweep_label_neighborhood(sequence))
        print(labels)
        assert len(labels) == len(wanted)
        for label in wanted:
            assert label in labels
예제 #2
0
def test_n_labels():
    """After consuming ``test-labels.fa``, ``n_labels()`` must report 4."""
    labeller = GraphLabels(20, 1e7, 4)
    labeller.consume_fasta_and_tag_with_labels(
        utils.get_test_data('test-labels.fa'))

    print(labeller.n_labels())
    assert labeller.n_labels() == 4
예제 #3
0
def test_save_load_corrupted():
    """Every truncated prefix of a saved labels file must fail to load.

    Labels built from ``test-labels.fa`` are saved; each proper prefix of
    the saved bytes is written to a temporary file and loading it is
    expected to raise ``OSError``.
    """
    lb_pre = GraphLabels(20, 1e7, 4)
    filename = utils.get_test_data('test-labels.fa')
    lb_pre.consume_fasta_and_tag_with_labels(filename)

    # save labels to a file
    savepath = utils.get_temp_filename('saved.labels')
    lb_pre.save_labels_and_tags(savepath)

    # trash the old GraphLabels
    del lb_pre

    lb = GraphLabels(20, 1e7, 4)

    # context managers guarantee the handles are closed even on failure
    with open(savepath, 'rb') as saved:
        data = saved.read()

    # produce all possible truncated versions of this file
    for i in range(len(data)):
        truncated = utils.get_temp_filename('trunc.labels')
        with open(truncated, 'wb') as fp:
            fp.write(data[:i])

        try:
            lb.load_labels_and_tags(truncated)
        except OSError as err:
            print('expected failure for', i, ': ', str(err))
        else:
            assert 0, "this should not succeed -- truncated file len %d" % (
                i, )
예제 #4
0
def test_sweep_tag_neighborhood():
    """Sweeping the query after tagging ``single-read.fq`` finds one tag."""
    query = 'CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT'
    expected_tag = 173473779682

    labeller = GraphLabels(20, 1e7, 4)
    labeller.graph.consume_fasta_and_tag(
        utils.get_test_data('single-read.fq'))

    found = labeller.sweep_tag_neighborhood(query)
    assert len(found) == 1
    assert list(found) == [expected_tag]
예제 #5
0
def test_sweep_tag_neighborhood():
    """Sweeping the query after tagging ``single-read.fq`` finds one tag."""
    query = 'CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT'
    expected_tag = 173473779682

    labeller = GraphLabels(20, 1e7, 4)
    labeller.graph.consume_fasta_and_tag(
        utils.get_test_data('single-read.fq'))

    found = labeller.sweep_tag_neighborhood(query)
    assert len(found) == 1
    assert found.pop() == expected_tag
예제 #6
0
def test_sweep_label_neighborhood():
    """Sweeping the query after labelling ``single-read.fq`` finds label 0."""
    query = 'CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT'

    labeller = GraphLabels(20, 1e7, 4)
    labeller.consume_fasta_and_tag_with_labels(
        utils.get_test_data('single-read.fq'))

    found = labeller.sweep_label_neighborhood(query)
    assert len(found) == 1
    assert found.pop() == 0
예제 #7
0
def test_sweep_label_neighborhood():
    """Sweeping the query after labelling ``single-read.fq`` finds label 0."""
    query = 'CAGGCGCCCACCACCGTGCCCTCCAACCTGATGGT'

    labeller = GraphLabels(20, 1e7, 4)
    labeller.consume_fasta_and_tag_with_labels(
        utils.get_test_data('single-read.fq'))

    found = labeller.sweep_label_neighborhood(query)
    assert len(found) == 1
    assert found.pop() == 0
예제 #8
0
def test_consume_partitioned_seqfile_and_label(Graphtype):
    """Consuming ``valid-read-testing.fq`` as partitioned input gives 9 labels.

    ``Graphtype`` is the graph class to wrap; it is built with k=15 and
    the ``params_1m`` table parameters.
    """
    reads_path = utils.get_test_data('valid-read-testing.fq')

    # wrap a freshly built graph and label it from the partitioned reads
    labeller = GraphLabels(Graphtype(15, *params_1m))
    labeller.consume_partitioned_fasta_and_tag_with_labels(reads_path)

    assert labeller.n_labels == 9
예제 #9
0
def test_consume_partitioned_seqfile_and_label(Graphtype):
    """Consuming ``valid-read-testing.fq`` as partitioned input gives 9 labels.

    ``Graphtype`` is the graph class to wrap; it is built with k=15 and
    the ``params_1m`` table parameters.
    """
    reads_path = utils.get_test_data('valid-read-testing.fq')

    # wrap a freshly built graph and label it from the partitioned reads
    labeller = GraphLabels(Graphtype(15, *params_1m))
    labeller.consume_partitioned_fasta_and_tag_with_labels(reads_path)

    assert labeller.n_labels == 9
예제 #10
0
def test_get_tag_labels():
    """Tag 173473779682 from ``single-read.fq`` carries exactly label 0."""
    known_tag = 173473779682

    labeller = GraphLabels(20, 1e7, 4)
    labeller.consume_fasta_and_tag_with_labels(
        utils.get_test_data('single-read.fq'))

    attached = labeller.get_tag_labels(known_tag)
    assert len(attached) == 1
    assert attached.pop() == 0
예제 #11
0
def test_get_tag_labels():
    """Tag 173473779682 from ``single-read.fq`` carries exactly label 0."""
    known_tag = 173473779682

    labeller = GraphLabels(20, 1e7, 4)
    labeller.consume_fasta_and_tag_with_labels(
        utils.get_test_data('single-read.fq'))

    attached = labeller.get_tag_labels(known_tag)
    assert len(attached) == 1
    assert attached.pop() == 0
예제 #12
0
def test_consume_sequence_and_tag_with_labels():
    """A sequence consumed with label 0 sweeps back to only that label."""
    wanted_label = 0
    read = 'ATGCATCGATCGATCGATCGATCGATCGATCGATCGATCG'

    labeller = GraphLabels(20, 1e6, 4)
    labeller.consume_sequence_and_tag_with_labels(read, wanted_label)

    # collapse whatever the sweep returns into a set of distinct labels
    swept = set(labeller.sweep_label_neighborhood(read))

    assert wanted_label in swept
    assert len(swept) == 1
예제 #13
0
def test_consume_sequence_and_tag_with_labels():
    """A sequence consumed with label 0 sweeps back to only that label."""
    lb = GraphLabels(20, 1e6, 4)
    label = 0
    sequence = 'ATGCATCGATCGATCGATCGATCGATCGATCGATCGATCG'

    # the return value is not checked here, so it is not bound to a name
    lb.consume_sequence_and_tag_with_labels(sequence, label)
    labels = set()
    labels.update(lb.sweep_label_neighborhood(sequence))

    assert label in labels
    assert len(labels) == 1
예제 #14
0
def test_load_wrong_fileversion():
    """Loading ``badversion-k32.tagset`` must fail with a version error."""
    labeller = GraphLabels(20, 1e7, 4)

    # this data file is a tagset written with an old format version
    stale_path = utils.get_test_data('badversion-k32.tagset')
    try:
        labeller.load_labels_and_tags(stale_path)
    except OSError as err:
        print(str(err))
        assert "Incorrect file format version" in str(err)
    else:
        assert 0, "this should not succeed - bad file type"
예제 #15
0
def test_get_label_dict():
    """The label dict for ``test-labels.fa`` holds exactly labels 0-3."""
    labeller = GraphLabels(20, 1e7, 4)
    labeller.consume_fasta_and_tag_with_labels(
        utils.get_test_data('test-labels.fa'))

    found = labeller.get_label_dict()
    wanted = [0, 1, 2, 3]
    # containment is checked both ways: nothing missing, nothing extra
    assert all(label in found for label in wanted)
    assert all(label in wanted for label in found)
예제 #16
0
def test_load_wrong_fileversion():
    """Loading ``badversion-k32.tagset`` must fail with a version error."""
    labeller = GraphLabels(20, 1e7, 4)

    # this data file is a tagset written with an old format version
    stale_path = utils.get_test_data('badversion-k32.tagset')
    try:
        labeller.load_labels_and_tags(stale_path)
    except OSError as err:
        print(str(err))
        assert "Incorrect file format version" in str(err)
    else:
        assert 0, "this should not succeed - bad file type"
예제 #17
0
def test_get_all_labels():
    """``get_all_labels`` for ``test-labels.fa`` yields exactly labels 0-3."""
    labeller = GraphLabels(20, 1e7, 4)
    labeller.consume_fasta_and_tag_with_labels(
        utils.get_test_data('test-labels.fa'))

    found = labeller.get_all_labels()
    wanted = [0, 1, 2, 3]
    # containment is checked both ways: nothing missing, nothing extra
    assert all(label in found for label in wanted)
    assert all(label in wanted for label in found)
예제 #18
0
def test_save_load_corrupted():
    """Every truncated prefix of a saved labels file must fail to load.

    Labels built from ``test-labels.fa`` are saved; each proper prefix of
    the saved bytes is written to a temporary file and loading it is
    expected to raise ``OSError``.
    """
    lb_pre = GraphLabels(20, 1e7, 4)
    filename = utils.get_test_data('test-labels.fa')
    lb_pre.consume_fasta_and_tag_with_labels(filename)

    # save labels to a file
    savepath = utils.get_temp_filename('saved.labels')
    lb_pre.save_labels_and_tags(savepath)

    # trash the old GraphLabels
    del lb_pre

    lb = GraphLabels(20, 1e7, 4)

    # context managers guarantee the handles are closed even on failure
    with open(savepath, 'rb') as saved:
        data = saved.read()

    # produce all possible truncated versions of this file
    for i in range(len(data)):
        truncated = utils.get_temp_filename('trunc.labels')
        with open(truncated, 'wb') as fp:
            fp.write(data[:i])

        try:
            lb.load_labels_and_tags(truncated)
        except OSError as err:
            print('expected failure for', i, ': ', str(err))
        else:
            assert 0, "this should not succeed -- truncated file len %d" % (i,)
예제 #19
0
def test_consume_partitioned_fasta_and_tag_with_labels():
    """``real-partition-small.fa`` must produce the single label 2.

    Every record of the file is swept again after consumption and the
    union of the swept labels is checked, along with ``n_labels()``.
    """
    labeller = GraphLabels(20, 1e7, 4)
    partition_path = utils.get_test_data('real-partition-small.fa')

    # the call returns a pair; neither value is checked by this test
    _total_reads, _n_consumed = (
        labeller.consume_partitioned_fasta_and_tag_with_labels(
            partition_path))

    swept = set()
    for entry in screed.open(partition_path):
        swept.update(
            labeller.sweep_label_neighborhood(entry.sequence, 0, False,
                                              False))

    assert len(swept) == 1
    assert swept.pop() == 2
    assert labeller.n_labels() == 1
def test_consume_partitioned_fasta_and_tag_with_labels():
    """``real-partition-small.fa`` must produce the single label 2.

    Every record of the file is swept again after consumption and the
    union of the swept labels is checked, along with ``n_labels()``.
    """
    labeller = GraphLabels(20, 1e7, 4)
    partition_path = utils.get_test_data('real-partition-small.fa')

    # the call returns a pair; neither value is checked by this test
    _total_reads, _n_consumed = (
        labeller.consume_partitioned_fasta_and_tag_with_labels(
            partition_path))

    swept = set()
    for entry in screed.open(partition_path):
        swept.update(
            labeller.sweep_label_neighborhood(entry.sequence, 0, False,
                                              False))

    assert len(swept) == 1
    assert swept.pop() == 2
    assert labeller.n_labels() == 1
예제 #21
0
def test_n_labels():
    """After consuming ``test-labels.fa``, ``n_labels`` must equal 4."""
    labeller = GraphLabels.NodeGraphLabels(20, 1e7, 4)
    labeller.consume_seqfile_and_tag_with_labels(
        utils.get_test_data('test-labels.fa'))

    print(labeller.n_labels)
    assert labeller.n_labels == 4
예제 #22
0
def test_save_fail_readonly():
    """Saving labels onto a read-only file must raise ``OSError``.

    An empty file is created at the save path and made read-only before
    ``save_labels_and_tags`` is pointed at it.
    """
    lb_pre = GraphLabels(20, 1e7, 4)
    filename = utils.get_test_data('test-labels.fa')
    lb_pre.consume_fasta_and_tag_with_labels(filename)

    # create an empty file at the save path
    savepath = utils.get_temp_filename('saved.labels')
    with open(savepath, 'w'):
        pass

    # octal 0o444 is r--r--r--; the hex literal 0x444 used previously
    # is octal 02104, a different set of permission bits
    os.chmod(savepath, 0o444)

    try:
        lb_pre.save_labels_and_tags(savepath)
    except OSError as err:
        print(str(err))
    else:
        assert 0, "this should fail: read-only file"
예제 #23
0
def test_link_tag_and_label_using_string():
    """A tag added and linked by k-mer string reports label 1 by string."""
    labeller = GraphLabels(20, 1, 1)

    # work with the k-mer string that reverse_hash returns for this hash
    kmer_text = labeller.reverse_hash(173473779682)
    labeller.add_tag(kmer_text)
    labeller.link_tag_and_label(kmer_text, 1)

    linked = labeller.get_tag_labels(kmer_text)
    assert len(linked) == 1
    assert linked.pop() == 1
예제 #24
0
def test_link_tag_and_label_using_string_2():
    """A tag linked by k-mer string reports label 1 when queried by hash."""
    hash_value = 173473779682
    labeller = GraphLabels(20, 1, 1)

    kmer_text = labeller.reverse_hash(hash_value)
    labeller.add_tag(kmer_text)
    labeller.link_tag_and_label(kmer_text, 1)

    # the lookup deliberately uses the integer hash, not the string
    linked = labeller.get_tag_labels(hash_value)
    assert len(linked) == 1
    assert linked.pop() == 1
예제 #25
0
def test_link_tag_and_label():
    """A tag added and linked by hash value reports exactly label 1."""
    hash_value = 173473779682
    labeller = GraphLabels.NodeGraphLabels(20, 1, 1)

    labeller.add_tag(hash_value)
    labeller.link_tag_and_label(hash_value, 1)

    linked = list(labeller.get_tag_labels(hash_value))
    assert len(linked) == 1
    assert linked.pop() == 1
예제 #26
0
def test_consume_sequence_and_tag_with_labels_2():
    """A sequence consumed with a non-zero label sweeps back to only it."""
    wanted_label = 56  # randomly chosen / non-zero
    read = 'ATGCATCGATCGATCGATCGATCGATCGATCGATCGATCG'

    labeller = GraphLabels.NodeGraphLabels(20, 1e6, 4)
    labeller.consume_sequence_and_tag_with_labels(read, wanted_label)

    # collapse whatever the sweep returns into a set of distinct labels
    swept = set(labeller.sweep_label_neighborhood(read))

    assert wanted_label in swept
    assert len(swept) == 1
예제 #27
0
def test_get_labels_save_load_wrong_ksize():
    """Labels saved with k=19 must be rejected by a k=20 instance."""
    writer = GraphLabels(19, 1e7, 4)
    writer.consume_fasta_and_tag_with_labels(
        utils.get_test_data('test-labels.fa'))

    # write the k=19 labels out, then discard their owner
    savepath = utils.get_temp_filename('saved.labels')
    writer.save_labels_and_tags(savepath)
    del writer

    # the loading instance is built with a different k-mer size
    reader = GraphLabels(20, 1e7, 4)
    try:
        reader.load_labels_and_tags(savepath)
    except OSError as err:
        print(str(err))
        assert "Incorrect k-mer size 19" in str(err)
    else:
        assert 0, "this should not succeed - different ksize"
예제 #28
0
def test_load_wrong_filetype():
    """Loading a tagset or a FASTA file as labels must raise ``OSError``.

    Each case names the data file, the message used if loading
    unexpectedly succeeds, and the text expected in the error.
    """
    labeller = GraphLabels(20, 1e7, 4)

    cases = (
        # a tagset file
        ('goodversion-k32.tagset',
         "this should not succeed - bad file type",
         "Incorrect file format type"),
        # a nonsense file
        ('all-A.fa',
         "this should not succeed - bad file signature",
         "Incorrect file signature"),
    )
    for data_name, unexpected_success, error_text in cases:
        path = utils.get_test_data(data_name)
        try:
            labeller.load_labels_and_tags(path)
        except OSError as err:
            print(str(err))
            assert error_text in str(err)
        else:
            assert 0, unexpected_success
예제 #29
0
def test_label_tag_correctness():
    """Check which labels are swept up for four query reads.

    Labels are built from ``test-labels.fa``; each query read is then
    swept and compared against the label ids expected for it.
    """
    labeller = GraphLabels(20, 1e7, 4)
    labeller.consume_fasta_and_tag_with_labels(
        utils.get_test_data('test-labels.fa'))

    read_a = (
        'ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAG'
        'CTAGGCTAGGTGTGCTCTGCCTAGAGCTAGGCTAGGTGT')
    tag_query = (
        'TTCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAG'
        'CTAGGCTAGGTGTGCTCTGCTAGAGCTAGGCTAGGTGT')
    read_b = (
        'GCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAG'
        'ATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA')
    read_c = (
        'TGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAG'
        'CTAGGCTAGGTGTTGGGGATAGATAGATAGATGAGTTGGGGATAGATAGATAGATGAGTGTAGATCCA'
        'ACAACACATACA')
    read_d = 'TATATATATAGCTAGCTAGCTAACTAGCTAGCATCGATCGATCGATC'

    # read A (also prints some diagnostics)
    labels = labeller.sweep_label_neighborhood(read_a)
    print(labeller.sweep_tag_neighborhood(tag_query))
    print(labels)
    print(len('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG') - 19)
    assert len(labels) == 2
    assert 0 in labels
    assert 1 in labels

    # reads B, C and D share the same check shape
    remaining = (
        (read_b, (0, 1, 2)),
        (read_c, (1, 2)),
        (read_d, (3,)),
    )
    for sequence, wanted in remaining:
        labels = labeller.sweep_label_neighborhood(sequence)
        print(labels)
        assert len(labels) == len(wanted)
        for label in wanted:
            assert label in labels
예제 #30
0
def test_get_labels_save_load():
    """Labels 0-3 must come back after saving and ``GraphLabels.load``."""
    writer = GraphLabels.NodeGraphLabels(20, 1e7, 4)
    writer.consume_seqfile_and_tag_with_labels(
        utils.get_test_data('test-labels.fa'))

    # write the labels out, then discard their owner
    savepath = utils.get_temp_filename('saved.labels')
    writer.save_labels_and_tags(savepath)
    del writer

    # load into a wrapper around a brand-new graph
    fresh_graph = Nodegraph(20, 1e7, 4)
    reader = GraphLabels.load(savepath, fresh_graph)

    found = list(reader.labels())
    wanted = [0, 1, 2, 3]
    # containment is checked both ways: nothing missing, nothing extra
    assert all(label in found for label in wanted)
    assert all(label in wanted for label in found)
예제 #31
0
def test_consume_seqfile_and_tag_with_labels():
    """Consuming ``test-transcript.fa`` reads 3 records and makes 3 labels.

    Besides the assertions, the test prints the tags, hashes and swept
    neighbourhoods as diagnostics.
    """
    labeller = GraphLabels(20, 1e7, 4)
    read_1 = 'ACGTAACCGGTTAAACCCGGGTTTAAAACCCCGGGGTTTT'
    first_kmer = read_1[:20]
    transcript_path = utils.get_test_data('test-transcript.fa')

    total_reads, _ = labeller.consume_seqfile_and_tag_with_labels(
        transcript_path)
    print("doing get")
    assert labeller.get(first_kmer)
    assert total_reads == 3
    print("doing n_labels")
    print(labeller.n_labels())
    print("doing all labels")
    print(labeller.get_all_labels())
    print("get tagset")
    for tag in labeller.get_tagset():
        print("forward hash")
        print(tag, khmer.forward_hash(tag, 20))
    for entry in screed.open(transcript_path):
        sequence = entry.sequence
        print("Sweeping tags")
        print(labeller.sweep_tag_neighborhood(sequence, 40))
        print("Sweeping labels...")
        print(labeller.sweep_label_neighborhood(entry.sequence, 40))
    assert labeller.n_labels() == 3
예제 #32
0
def test_get_labels_for_sequence():
    """Labels for the read's sequence match the labels of its known tag."""
    known_tag = 173473779682
    read_path = utils.get_test_data('single-read.fq')

    labeller = GraphLabels.NodeGraphLabels(20, 1e7, 4)
    labeller.consume_seqfile_and_tag_with_labels(read_path)

    # read every record, then use the first one's sequence
    sequences = [entry.sequence for entry in screed.open(read_path)]
    by_sequence = list(labeller.get_labels_for_sequence(sequences[0]))

    by_tag = list(labeller.get_tag_labels(known_tag))

    assert by_sequence == by_tag
    assert len(by_sequence) == 1
    assert by_sequence.pop() == 0
예제 #33
0
def test_label_tag_correctness():
    """Check which labels are swept up for four query reads.

    Labels are built from ``test-labels.fa``; each query read is then
    swept and compared against the label ids expected for it.
    """
    labeller = GraphLabels(20, 1e7, 4)
    labeller.consume_fasta_and_tag_with_labels(
        utils.get_test_data('test-labels.fa'))

    read_a = (
        'ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAG'
        'CTAGGCTAGGTGTGCTCTGCCTAGAGCTAGGCTAGGTGT')
    tag_query = (
        'TTCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAG'
        'CTAGGCTAGGTGTGCTCTGCTAGAGCTAGGCTAGGTGT')
    read_b = (
        'GCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAG'
        'ATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA')
    read_c = (
        'TGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAG'
        'CTAGGCTAGGTGTTGGGGATAGATAGATAGATGAGTTGGGGATAGATAGATAGATGAGTGTAGATCCA'
        'ACAACACATACA')
    read_d = 'TATATATATAGCTAGCTAGCTAACTAGCTAGCATCGATCGATCGATC'

    # read A (also prints some diagnostics)
    labels = labeller.sweep_label_neighborhood(read_a)
    print(labeller.sweep_tag_neighborhood(tag_query))
    print(labels)
    print(len('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG') - 19)
    assert len(labels) == 2
    assert 0 in labels
    assert 1 in labels

    # reads B, C and D share the same check shape
    remaining = (
        (read_b, (0, 1, 2)),
        (read_c, (1, 2)),
        (read_d, (3,)),
    )
    for sequence, wanted in remaining:
        labels = labeller.sweep_label_neighborhood(sequence)
        print(labels)
        assert len(labels) == len(wanted)
        for label in wanted:
            assert label in labels
예제 #34
0
def test_get_label_dict_save_load():
    """The label dict holds exactly labels 0-3 after a save/load cycle."""
    writer = GraphLabels(20, 1e7, 4)
    writer.consume_fasta_and_tag_with_labels(
        utils.get_test_data('test-labels.fa'))

    # write the labels out, then discard their owner
    savepath = utils.get_temp_filename('saved.labels')
    writer.save_labels_and_tags(savepath)
    del writer

    # load into a brand-new instance
    reader = GraphLabels(20, 1e7, 4)
    reader.load_labels_and_tags(savepath)

    found = reader.get_label_dict()
    wanted = [0, 1, 2, 3]
    # containment is checked both ways: nothing missing, nothing extra
    assert all(label in found for label in wanted)
    assert all(label in wanted for label in found)
예제 #35
0
def test_save_fail_readonly():
    """Saving labels onto a read-only file must raise ``OSError``.

    An empty file is created at the save path and made read-only before
    ``save_labels_and_tags`` is pointed at it.
    """
    lb_pre = GraphLabels.NodeGraphLabels(20, 1e7, 4)
    filename = utils.get_test_data('test-labels.fa')
    lb_pre.consume_seqfile_and_tag_with_labels(filename)

    # create an empty file at the save path
    savepath = utils.get_temp_filename('saved.labels')
    with open(savepath, 'w'):
        pass

    # octal 0o444 is r--r--r--; the hex literal 0x444 used previously
    # is octal 02104, a different set of permission bits
    os.chmod(savepath, 0o444)

    try:
        lb_pre.save_labels_and_tags(savepath)
    except OSError as err:
        print(str(err))
    else:
        assert 0, "this should fail: read-only file"
예제 #36
0
def test_get_label_dict_save_load_wrong_ksize():
    """Labels saved with k=19 must be rejected by a k=20 instance."""
    writer = GraphLabels(19, 1e7, 4)
    writer.consume_fasta_and_tag_with_labels(
        utils.get_test_data('test-labels.fa'))

    # write the k=19 labels out, then discard their owner
    savepath = utils.get_temp_filename('saved.labels')
    writer.save_labels_and_tags(savepath)
    del writer

    # the loading instance is built with a different k-mer size
    reader = GraphLabels(20, 1e7, 4)
    try:
        reader.load_labels_and_tags(savepath)
    except OSError as err:
        print(str(err))
        assert "Incorrect k-mer size 19" in str(err)
    else:
        assert 0, "this should not succeed - different ksize"
예제 #37
0
def test_load_wrong_filetype():
    """Loading a tagset or a FASTA file as labels must raise ``OSError``.

    Each case names the data file, the message used if loading
    unexpectedly succeeds, and the text expected in the error.
    """
    labeller = GraphLabels.NodeGraphLabels(20, 1e7, 4)

    cases = (
        # a tagset file
        ('goodversion-k32.tagset',
         "this should not succeed - bad file type",
         "Incorrect file format type"),
        # a nonsense file
        ('all-A.fa',
         "this should not succeed - bad file signature",
         "Incorrect file signature"),
    )
    for data_name, unexpected_success, error_text in cases:
        path = utils.get_test_data(data_name)
        try:
            labeller.load_labels_and_tags(path)
        except OSError as err:
            print(str(err))
            assert error_text in str(err)
        else:
            assert 0, unexpected_success
예제 #38
0
def test_consume_fasta_and_tag_with_labels():
    """Consuming ``test-transcript.fa`` reads 3 records and makes 3 labels.

    Besides the assertions, the test prints the tags, hashes and swept
    neighbourhoods as diagnostics.
    """
    labeller = GraphLabels(20, 1e7, 4)
    read_1 = 'ACGTAACCGGTTAAACCCGGGTTTAAAACCCCGGGGTTTT'
    first_kmer = read_1[:20]
    transcript_path = utils.get_test_data('test-transcript.fa')

    # only the read count is checked; the second value is ignored
    total_reads, _ = labeller.consume_fasta_and_tag_with_labels(
        transcript_path)
    print("doing get")
    assert labeller.graph.get(first_kmer)
    assert total_reads == 3
    print("doing n_labels")
    print(labeller.n_labels())
    print("doing label dict")
    print(labeller.get_label_dict())
    print("get tagset")
    for tag in labeller.graph.get_tagset():
        print("forward hash")
        print(tag, khmer.forward_hash(tag, 20))
    for entry in screed.open(transcript_path):
        sequence = entry.sequence
        print("Sweeping tags")
        print(labeller.sweep_tag_neighborhood(sequence, 40))
        print("Sweeping labels...")
        print(labeller.sweep_label_neighborhood(entry.sequence, 40))
    assert labeller.n_labels() == 3
예제 #39
0
def main():
    """Run the buffered sweep: label a graph, then bin reads by label.

    Steps, as implemented below:

    1. Parse arguments, raise the table size and k-mer size to at least
       ``MAX_HSIZE`` / ``MIN_KSIZE``, and check the input files and the
       available disk space.
    2. Build a ``NodeGraphLabels`` from ``args.input_fastp``, labelling by
       partition id, by sequence index, or in groups of
       ``args.group_size`` (group mode also writes each group's records
       to a ``<prefix>_base_<g>.<ext>`` file).
    3. Sweep every read in ``args.input_files`` for labels and queue it
       in the ``ReadBufferManager`` under its single label, ``'multi'``
       or ``'orphaned'``.
    4. Report totals on stderr and write the label-number distribution
       and the per-label read counts into the output directory.

    I/O errors while consuming the labelling input are reported on
    stderr; execution then continues with whatever was consumed.
    """
    info('sweep-reads-buffered.py', ['sweep'])
    parser = sanitize_help(get_parser())
    args = parser.parse_args()

    if args.max_tablesize < MAX_HSIZE:
        args.max_tablesize = MAX_HSIZE
    if args.ksize < MIN_KSIZE:
        args.ksize = MIN_KSIZE

    report_on_config(args, graphtype='nodegraph')

    K = args.ksize
    HT_SIZE = args.max_tablesize
    N_HT = args.n_tables

    traversal_range = args.traversal_range
    input_fastp = args.input_fastp

    # default the output directory to wherever the labelling input lives
    if not args.outdir:
        outdir = os.path.dirname(input_fastp)
    else:
        outdir = args.outdir

    max_buffers = args.max_buffers
    output_pref = args.output_prefix
    buf_size = args.buffer_size
    max_reads = args.max_reads

    check_input_files(args.input_fastp, args.force)
    check_valid_file_exists(args.input_files)
    all_input_files = [input_fastp]
    all_input_files.extend(args.input_files)

    # Check disk space availability
    check_space(all_input_files, args.force)

    # figure out input file type (FA/FQ) -- based on first file
    ix = iter(screed.open(args.input_files[0]))
    record = next(ix)
    del ix

    extension = 'fa'
    if hasattr(record, 'quality'):  # fastq!
        extension = 'fq'

    output_buffer = ReadBufferManager(max_buffers, max_reads, buf_size,
                                      output_pref, outdir, extension)

    # consume the partitioned fasta with which to label the graph
    ht = GraphLabels.NodeGraphLabels(K, HT_SIZE, N_HT)
    try:
        print('consuming input sequences...', file=sys.stderr)
        if args.label_by_pid:
            print('...labeling by partition id (pid)', file=sys.stderr)
            ht.consume_partitioned_fasta_and_tag_with_labels(input_fastp)
        elif args.label_by_seq:
            print('...labeling by sequence', file=sys.stderr)
            for n, record in enumerate(screed.open(input_fastp)):
                if n % 50000 == 0:
                    print('...consumed {n} sequences...'.format(n=n),
                          file=sys.stderr)
                ht.consume_sequence_and_tag_with_labels(record.sequence, n)
        else:
            print('...labeling to create groups of size {s}'.format(
                s=args.group_size),
                  file=sys.stderr)
            label = -1
            g = 0
            outfp = None
            try:
                outfp = open(
                    '{pref}_base_{g}.{ext}'.format(pref=output_pref,
                                                   g=g,
                                                   ext=extension), 'wb')
                for n, record in enumerate(screed.open(input_fastp)):
                    if n % args.group_size == 0:
                        label += 1
                        if label > g:
                            g = label
                            # finish the previous group's file before
                            # starting the next, so handles don't pile up
                            outfp.close()
                            outfp = open(
                                '{pref}_base_{g}.{ext}'.format(
                                    pref=output_pref, g=g, ext=extension),
                                'wb')
                    if n % 50000 == 0:
                        print('...consumed {n} sequences...'.format(n=n),
                              file=sys.stderr)
                    ht.consume_sequence_and_tag_with_labels(
                        record.sequence, label)

                    write_record(record, outfp)

            except (IOError, OSError) as e:
                print('!! ERROR !!', e, file=sys.stderr)
                print('...error splitting input. exiting...', file=sys.stderr)
            finally:
                # closing an already-closed file is a no-op
                if outfp is not None:
                    outfp.close()

    except (IOError, OSError) as e:
        print('!! ERROR: !!', e, file=sys.stderr)
        # NOTE(review): the message says "exiting" but execution goes on;
        # kept as-is, confirm whether an exit is intended here.
        print('...error consuming {i}. exiting...'.format(i=input_fastp),
              file=sys.stderr)

    print('done consuming input sequence. added {t} tags and {l} '
          'labels...'.format(t=ht.graph.n_tags, l=ht.n_labels))

    label_dict = defaultdict(int)
    label_number_dist = []

    n_orphaned = 0
    n_labeled = 0
    n_mlabeled = 0

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # replacement for measuring elapsed intervals
    total_t = time.perf_counter()
    start_t = time.perf_counter()
    for read_file in args.input_files:
        print('** sweeping {read_file} for labels...'.format(
            read_file=read_file),
              file=sys.stderr)
        file_t = 0.0
        try:
            read_fp = screed.open(read_file)
        except (IOError, OSError) as error:
            print('!! ERROR: !!', error, file=sys.stderr)
            print('*** Could not open {fn}, skipping...'.format(fn=read_file),
                  file=sys.stderr)
        else:
            for n, record in enumerate(read_fp):
                if n % 50000 == 0:
                    # progress line with the time taken by the last batch
                    end_t = time.perf_counter()
                    batch_t = end_t - start_t
                    file_t += batch_t
                    print('\tswept {n} reads [{nc} labeled, {no} orphaned] '
                          '** {sec}s ({sect}s total)'.format(
                              n=n, nc=n_labeled, no=n_orphaned,
                              sec=batch_t, sect=file_t),
                          file=sys.stderr)
                    start_t = time.perf_counter()
                seq = record.sequence
                name = record.name
                try:
                    labels = list(
                        ht.sweep_label_neighborhood(seq, traversal_range))
                except ValueError:
                    # best effort: a read the sweep rejects is skipped
                    # and counted in none of the totals
                    continue

                if hasattr(record, 'quality'):
                    seq_str = fmt_fastq(name, seq, record.quality, labels)
                else:
                    seq_str = fmt_fasta(name, seq, labels)
                label_number_dist.append(len(labels))
                if labels:
                    n_labeled += 1
                    if len(labels) > 1:
                        output_buffer.queue(seq_str, 'multi')
                        n_mlabeled += 1
                        label_dict['multi'] += 1
                    else:
                        output_buffer.queue(seq_str, labels[0])
                        label_dict[labels[0]] += 1
                else:
                    n_orphaned += 1
                    output_buffer.queue(seq_str, 'orphaned')
                    label_dict['orphaned'] += 1
            print('** End of file {fn}...'.format(fn=read_file),
                  file=sys.stderr)
            output_buffer.flush_all()
            read_fp.close()

    # gotta output anything left in the buffers at the end!
    print('** End of run...', file=sys.stderr)
    output_buffer.flush_all()
    total_t = time.perf_counter() - total_t

    if output_buffer.num_write_errors > 0 or output_buffer.num_file_errors > 0:
        print('! WARNING: Sweep finished with errors !', file=sys.stderr)
        print('** {writee} reads not written'.format(
            writee=output_buffer.num_write_errors),
              file=sys.stderr)
        print('** {filee} errors opening files'.format(
            filee=output_buffer.num_file_errors),
              file=sys.stderr)

    print('swept {n_reads} for labels...'.format(n_reads=n_labeled +
                                                 n_orphaned),
          file=sys.stderr)
    print('...with {nc} labeled and {no} orphaned'.format(nc=n_labeled,
                                                          no=n_orphaned),
          file=sys.stderr)
    print('...and {nmc} multilabeled'.format(nmc=n_mlabeled), file=sys.stderr)

    # one line per swept read: how many labels it received
    print('** outputting label number distribution...', file=sys.stderr)
    fn = os.path.join(outdir, '{pref}.dist.txt'.format(pref=output_pref))
    with open(fn, 'w', encoding='utf-8') as outfp:
        for nc in label_number_dist:
            outfp.write('{nc}\n'.format(nc=nc))

    # one "label,count" row per output bucket
    fn = os.path.join(outdir, '{pref}.counts.csv'.format(pref=output_pref))
    print('** outputting label read counts...', file=sys.stderr)
    with open(fn, 'w', encoding='utf-8') as outfp:
        for k in label_dict:
            outfp.write('{l},{c}\n'.format(l=k, c=label_dict[k]))
예제 #40
0
def test_toobig():
    """Requesting a 1e13-sized table must raise ``MemoryError``."""
    try:
        GraphLabels(20, 1e13, 1)
    except MemoryError as err:
        print(str(err))
    else:
        assert 0, "This should fail."
예제 #41
0
def test_error_create():
    """Constructing with ``None`` arguments must raise ``TypeError``."""
    try:
        GraphLabels.NodeGraphLabels(None, None, None)
    except TypeError as err:
        print(str(err))
    else:
        assert 0, "This should fail."
예제 #42
0
def test_label_tag_correctness_save_load():
    """Check swept labels after a save/load round trip.

    Labels are built from ``test-labels.fa``, saved, loaded into a fresh
    ``GraphLabels``, and then four query reads are swept and compared
    against the label ids expected for each.
    """
    original = GraphLabels(20, 1e7, 4)
    source_path = utils.get_test_data('test-labels.fa')
    original.consume_fasta_and_tag_with_labels(source_path)

    # persist, then drop the instance that produced the file
    savepath = utils.get_temp_filename('saved.labels')
    original.save_labels_and_tags(savepath)
    del original

    # everything below runs against a freshly loaded instance
    restored = GraphLabels(20, 1e7, 4)
    restored.load_labels_and_tags(savepath)

    read_a = (
        'ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAG'
        'CTAGGCTAGGTGTGCTCTGCCTAGAGCTAGGCTAGGTGT')
    tag_query = (
        'TTCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGGCTCTGCCTAGAG'
        'CTAGGCTAGGTGTGCTCTGCTAGAGCTAGGCTAGGTGT')
    read_b = (
        'GCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAGCTCTGCCTAGAGCTAGGCTAGGTGTTGGGGATAG'
        'ATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGA')
    read_c = (
        'TGGGATAGATAGATAGATGACCTAGAGCTAGGCTAGGTGTTGGGGATAGATAGATAGATGACCTAGAG'
        'CTAGGCTAGGTGTTGGGGATAGATAGATAGATGAGTTGGGGATAGATAGATAGATGAGTGTAGATCCA'
        'ACAACACATACA')
    read_d = 'TATATATATAGCTAGCTAGCTAACTAGCTAGCATCGATCGATCGATC'

    # read A (also prints some diagnostics)
    labels = restored.sweep_label_neighborhood(read_a)
    print(restored.sweep_tag_neighborhood(tag_query))
    print(labels)
    print(len('ATCGTGTAAGCTATCGTAATCGTAAGCTCTGCCTAGAGCTAGGCTAG') - 19)
    assert len(labels) == 2
    assert 0 in labels
    assert 1 in labels

    # reads B, C and D share the same check shape
    remaining = (
        (read_b, (0, 1, 2)),
        (read_c, (1, 2)),
        (read_d, (3,)),
    )
    for sequence, wanted in remaining:
        labels = restored.sweep_label_neighborhood(sequence)
        print(labels)
        assert len(labels) == len(wanted)
        for label in wanted:
            assert label in labels