Exemplo n.º 1
0
def test_permutation_decoder():
    """Check how PermutationDecoder decodes 'aba' for two training texts.

    A decoder is built from the canonicalized contents of each data file,
    and the decoding of 'aba' must be one of the words accepted for it.
    """
    gutenberg_text = open_data("gutenberg.txt").read()
    flatland_text = open_data("EN-text/flatland.txt").read()

    # Accepted decodings when the decoder is trained on gutenberg.txt.
    gutenberg_candidates = ('ece', 'ete', 'tat', 'tit', 'txt')
    decoder = PermutationDecoder(canonicalize(gutenberg_text))
    assert decoder.decode('aba') in gutenberg_candidates

    # Accepted decodings when the decoder is trained on flatland.txt.
    flatland_candidates = (
        'ded', 'did', 'ece', 'ele', 'eme', 'ere', 'eve',
        'eye', 'iti', 'mom', 'ses', 'tat', 'tit',
    )
    decoder = PermutationDecoder(canonicalize(flatland_text))
    assert decoder.decode('aba') in flatland_candidates
Exemplo n.º 2
0
def test_samples():
    """Check that each word model's samples(10) yields ten words.

    Unigram, bigram and trigram models are built from the concatenated
    flatland and gutenberg texts. Each sample is split on single spaces
    and must contain exactly ten pieces.
    """
    story = open_data("EN-text/flatland.txt").read()
    story += open_data("gutenberg.txt").read()
    wordseq = words(story)
    P1 = UnigramWordModel(wordseq)
    P2 = NgramWordModel(2, wordseq)
    P3 = NgramWordModel(3, wordseq)

    s1 = P1.samples(10)
    # Sample from the bigram model here; previously P3 was sampled twice
    # and P2 was never exercised.
    s2 = P2.samples(10)
    s3 = P3.samples(10)

    assert len(s1.split(' ')) == 10
    assert len(s2.split(' ')) == 10
    assert len(s3.split(' ')) == 10
Exemplo n.º 3
0
def test_viterbi_segmentation():
    """Check that viterbi_segment splits an unspaced sentence into words.

    The unigram model is built from the words of flatland.txt; only the
    segmentation (first element of the result) is checked.
    """
    corpus = open_data("EN-text/flatland.txt").read()
    model = UnigramWordModel(words(corpus))
    unspaced = "itiseasytoreadwordswithoutspaces"

    # The second element of the returned pair is not asserted on.
    segmentation, _ = viterbi_segment(unspaced, model)

    expected = 'it is easy to read words without spaces'.split()
    assert segmentation == expected
Exemplo n.º 4
0
def test_text_models():
    """Exercise the unigram, bigram and trigram word models.

    Models built from flatland.txt are checked for their top-5 entries,
    selected probabilities and conditional-probability lookups. Small
    hand-written strings are then used to check the models' dictionaries.
    """
    corpus_words = words(open_data("EN-text/flatland.txt").read())
    unigram = UnigramWordModel(corpus_words)
    bigram = NgramWordModel(2, corpus_words)
    trigram = NgramWordModel(3, corpus_words)

    # Top-5 entries, as (count, key) pairs.
    expected_unigram_top = [
        (2081, 'the'),
        (1479, 'of'),
        (1021, 'and'),
        (1008, 'to'),
        (850, 'a'),
    ]
    assert unigram.top(5) == expected_unigram_top

    expected_bigram_top = [
        (368, ('of', 'the')),
        (152, ('to', 'the')),
        (152, ('in', 'the')),
        (86, ('of', 'a')),
        (80, ('it', 'is')),
    ]
    assert bigram.top(5) == expected_bigram_top

    expected_trigram_top = [
        (30, ('a', 'straight', 'line')),
        (19, ('of', 'three', 'dimensions')),
        (16, ('the', 'sense', 'of')),
        (13, ('by', 'the', 'sense')),
        (13, ('as', 'well', 'as')),
    ]
    assert trigram.top(5) == expected_trigram_top

    # Probabilities, compared with a relative tolerance.
    assert isclose(unigram['the'], 0.0611, rel_tol=0.001)
    assert isclose(bigram[('of', 'the')], 0.0108, rel_tol=0.01)
    assert isclose(trigram[('so', 'as', 'to')], 0.000323, rel_tol=0.001)

    # Conditional-probability tables.
    assert bigram.cond_prob.get(('went',)) is None
    assert trigram.cond_prob[('in', 'order')].dictionary == {'to': 6}

    # Dictionaries of models built from tiny inputs.
    # The unigram key is a plain string, not a 1-tuple.
    tiny_unigram = UnigramWordModel(words('unigram'))
    assert tiny_unigram.dictionary == {'unigram': 1}

    tiny_bigram = NgramWordModel(2, words('bigram text'))
    assert tiny_bigram.dictionary == {('bigram', 'text'): 1}

    tiny_trigram = NgramWordModel(3, words('test trigram text here'))
    for expected_key in (('test', 'trigram', 'text'),
                         ('trigram', 'text', 'here')):
        assert expected_key in tiny_trigram.dictionary
Exemplo n.º 5
0
def test_rot13_decoding():
    """Check that ShiftDecoder recovers a message encoded with rot13.

    The decoder is constructed from the text of flatland.txt.
    """
    plaintext = 'Hello, world!'
    decoder = ShiftDecoder(open_data("EN-text/flatland.txt").read())

    decoded = decoder.decode(rot13(plaintext))
    assert decoded == plaintext
Exemplo n.º 6
0
def test_shift_decoding():
    """Check that ShiftDecoder decodes a fixed ciphertext to its plaintext.

    The decoder is constructed from the text of flatland.txt.
    """
    ciphertext = 'Kyzj zj r jvtivk dvjjrxv.'
    expected = 'This is a secret message.'

    decoder = ShiftDecoder(open_data("EN-text/flatland.txt").read())
    assert decoder.decode(ciphertext) == expected
Exemplo n.º 7
0
def test_parse_csv():
    """Check the first element parse_csv returns for the iris.csv contents."""
    iris_text = open_data('iris.csv').read()
    first_row = parse_csv(iris_text)[0]
    assert first_row == [5.1, 3.5, 1.4, 0.2, 'setosa']