Exemplo n.º 1
0
    def test_01_word_trie(self):
        """Compare word tries built from three short texts with stored expectations."""
        # Each entry: (input text, file name handed to read_expected).
        samples = (
            ('toonces was a cat who could drive a car very fast until he crashed.',
             '6.pickle'),
            ('a man at the market murmered that he had met a mermaid. '
             'mark didnt believe the man had met a mermaid.',
             '7.pickle'),
            ('what happened to the cat who had eaten the ball of yarn?  she had mittens!',
             '8.pickle'),
        )
        for text, expected_file in samples:
            word_trie = lab.make_word_trie(text)
            expected = read_expected(expected_file)
            self.assertEqual(expected, dictify(word_trie))
Exemplo n.º 2
0
    def test_02_big_autocomplete_1(self):
        """Repeat 'ap' autocompletion on a trie crowded with 26**4 'aa....' words."""
        letters = "abcdefghijklmnopqrstuvwxyz"

        # Every six-letter word starting with "aa", each occurring once.
        corpus = [
            "aa" + c1 + c2 + c3 + c4
            for c1 in letters
            for c2 in letters
            for c3 in letters
            for c4 in letters
        ]
        # apple occurs three times, apricot twice, application once.
        corpus += ["apple", "application", "apple", "apricot", "apricot", "apple"]
        corpus += ["bruteforceisbad"]

        trie = lab.make_word_trie(' '.join(corpus))
        for _ in range(10):
            best_one = lab.autocomplete(trie, 'ap', 1)
            best_two = lab.autocomplete(trie, 'ap', 2)
            best_three = lab.autocomplete(trie, 'ap', 3)
            unlimited = lab.autocomplete(trie, 'ap')

            # Sizes first, then contents.
            self.assertEqual(1, len(best_one))
            self.assertEqual(2, len(best_two))
            self.assertEqual(3, len(best_three))
            self.assertEqual(3, len(unlimited))
            self.assertEqual(["apple"], best_one)
            self.assertEqual({"apple", "apricot"}, set(best_two))
            self.assertEqual({"apple", "apricot", "application"},
                             set(best_three))
            self.assertEqual(set(unlimited), set(best_three))
Exemplo n.º 3
0
def test_autocomplete_big_2():
    """Check autocomplete on testing_data/frankenstein.txt for several prefixes.

    Each prefix is tried with several max counts (including None), and the
    result is compared, ignoring order, with the matching stored expectation.
    Finally a non-string prefix must raise TypeError.
    """
    limits_by_prefix = {
        't': [0, 1, 25, None],
        'th': [0, 1, 21, None],
        'the': [0, 5, 21, None],
        'thes': [0, 1, 21, None]
    }
    corpus_path = os.path.join(TEST_DIRECTORY, 'testing_data',
                               'frankenstein.txt')
    with open(corpus_path, encoding='utf-8') as handle:
        contents = handle.read()
    trie = lab.make_word_trie(contents)

    for prefix, limits in sorted(limits_by_prefix.items()):
        for limit in limits:
            got = lab.autocomplete(trie, prefix, limit)
            want = read_expected('frank_autocomplete_%s_%s.pickle' %
                                 (prefix, limit))
            # Count check first so the message can say which way it is off.
            assert len(want) == len(got), (
                ('missing' if len(got) < len(want) else 'too many') +
                ' autocomplete results for ' + repr(prefix) +
                ' with maxcount = ' + str(limit))
            assert set(want) == set(got), (
                'autocomplete included ' + repr(set(got) - set(want)) +
                ' instead of ' + repr(set(want) - set(got)) +
                ' for ' + repr(prefix) +
                ' with maxcount = ' + str(limit))

    with pytest.raises(TypeError):
        lab.autocomplete(trie, ('tuple', ), None)
Exemplo n.º 4
0
 def test_02_big_autocomplete(self):
     """Check autocomplete on testing_data/frankenstein.txt for several prefixes.

     Results are compared by size and as sets with the stored expectations;
     a tuple prefix must raise TypeError.
     """
     limits_by_prefix = {
         't': [0, 1, 25, None],
         'th': [0, 1, 21, None],
         'the': [0, 5, 21, None],
         'thes': [0, 1, 21, None]
     }
     corpus_path = os.path.join(TEST_DIRECTORY, 'testing_data',
                                'frankenstein.txt')
     with open(corpus_path, encoding='utf-8') as handle:
         contents = handle.read()
     trie = lab.make_word_trie(contents)

     for prefix, limits in sorted(limits_by_prefix.items()):
         for limit in limits:
             got = lab.autocomplete(trie, prefix, limit)
             want = read_expected('frank_autocomplete_%s_%s.pickle' %
                                  (prefix, limit))
             # Both assertions report the same failure text.
             note = ('wrong autocomplete of ' + repr(prefix) +
                     ' with maxcount = ' + str(limit))
             self.assertEqual(len(got), len(want), msg=note)
             self.assertEqual(set(got), set(want), msg=note)

     with self.assertRaises(TypeError):
         lab.autocomplete(trie, ('tuple', ), None)
Exemplo n.º 5
0
    def test_01_autocomplete(self):
        """Autocomplete on small tries: few matches, ties, ranking, no match, bad prefix."""

        def check_list_of_str(candidates):
            # The result must be a list whose items are all strings.
            self.assertIsInstance(candidates, list, "result not a list.")
            for item in candidates:
                self.assertIsInstance(item, str, "expecting list of strings.")

        failure = "incorrect result from autocomplete."

        # Fewer matching words than the requested maximum.
        trie = lab.make_word_trie("cat car carpet")
        completions = lab.autocomplete(trie, 'car', 3)
        check_list_of_str(completions)
        completions.sort()
        self.assertEqual(completions, ["car", "carpet"], msg=failure)

        # "an" and "ant" occur equally often, so either may take the second slot.
        trie = lab.make_word_trie("a an ant anteater a an ant a")
        completions = lab.autocomplete(trie, 'a', 2)
        check_list_of_str(completions)
        completions.sort()
        self.assertIn(completions, [["a", "an"], ["a", "ant"]], msg=failure)

        # The three most frequent words starting with "m".
        trie = lab.make_word_trie(
            "man mat mattress map me met a man a a a map man met")
        completions = lab.autocomplete(trie, 'm', 3)
        check_list_of_str(completions)
        completions.sort()
        self.assertEqual(completions, ["man", "map", "met"], msg=failure)

        # A prefix that no word in the trie starts with.
        trie = lab.make_word_trie("hello hell history")
        completions = lab.autocomplete(trie, 'help', 3)
        check_list_of_str(completions)
        self.assertEqual(completions, [], msg=failure)

        # A tuple prefix is rejected with TypeError.
        with self.assertRaises(TypeError):
            lab.autocomplete(trie, ('tuple', ), None)
Exemplo n.º 6
0
def load_corpus_file(path):
    """Build word and phrase tries for the text file at ``path`` and register them.

    The tries are stored in the ``corpusTries`` mapping (defined outside this
    function) as a ``(word_trie, phrase_trie)`` tuple keyed by the corpus
    name, which is the file's base name without its final extension.

    Parameters:
        path: location of a UTF-8 encoded text file.

    Returns:
        The corpus name used as the key in ``corpusTries``.
    """
    # splitext strips only the final extension, so inner dots and
    # extension-less names survive ("a.b.txt" -> "a.b", "notes" -> "notes").
    # Joining the dot-separated pieces would give "ab" and "" instead.
    corpus_name = os.path.splitext(os.path.basename(path))[0]
    with open(path, encoding="utf-8") as f:
        text = f.read()
    # The file is no longer needed once its contents are in memory.
    word_trie = lab.make_word_trie(text)
    phrase_trie = lab.make_phrase_trie(text)
    corpusTries[corpus_name] = (word_trie, phrase_trie)
    return corpus_name
Exemplo n.º 7
0
 def test_01_autocorrect(self):
     """Autocorrect 'cat' against a small corpus and check the four suggestions."""
     corpus = "cats cattle hat car act at chat crate act car act"
     suggestions = lab.autocorrect(lab.make_word_trie(corpus), 'cat', 4)

     # The result must be a list of strings.
     self.assertIsInstance(suggestions, list, "result not a list.")
     for suggestion in suggestions:
         self.assertIsInstance(suggestion, str, "expecting list of strings.")

     # Order is not checked: compare the sorted suggestions.
     self.assertEqual(["act", "car", "cats", "cattle"],
                      sorted(suggestions),
                      msg="incorrect result from autocorrect.")
Exemplo n.º 8
0
 def test_03_big_filter_2(self):
     """Run several wildcard patterns through word_filter on testing_data/frankenstein.txt."""
     wildcard_patterns = ('*ing', '*ing?', '****ing', '**ing**', '????', 'mon*',
                          '*?*?*?*', '*???')
     corpus_path = os.path.join(TEST_DIRECTORY, 'testing_data',
                                'frankenstein.txt')
     with open(corpus_path, encoding='utf-8') as handle:
         contents = handle.read()
     trie = lab.make_word_trie(contents)

     # Expectation files are numbered by the pattern's position in the tuple.
     for position, pattern in enumerate(wildcard_patterns):
         matches = lab.word_filter(trie, pattern)
         wanted = read_expected('frank_filter_%s.pickle' % (position, ))
         note = 'incorrect word_filter of '+repr(pattern)
         self.assertEqual(len(wanted), len(matches), msg=note)
         self.assertEqual(set(wanted), set(matches), msg=note)
Exemplo n.º 9
0
def test_autocomplete_small():
    """Autocomplete on small tries: few matches, ties, ranking, no match, bad prefix."""
    # Fewer matching words than the requested maximum.
    small = lab.make_word_trie("cat car carpet")
    assert set(lab.autocomplete(small, 'car', 3)) == {"car", "carpet"}

    # "an" and "ant" occur equally often, so either may accompany "a".
    tied = lab.make_word_trie("a an ant anteater a an ant a")
    acceptable = [{"a", "an"}, {"a", "ant"}]
    assert set(lab.autocomplete(tied, 'a', 2)) in acceptable

    # The three most frequent words starting with "m".
    ranked = lab.make_word_trie(
        "man mat mattress map me met a man a a a map man met")
    assert set(lab.autocomplete(ranked, 'm', 3)) == {"man", "map", "met"}

    # A prefix that no word in the trie starts with yields an empty list.
    unmatched = lab.make_word_trie("hello hell history")
    assert lab.autocomplete(unmatched, 'help', 3) == []

    # A tuple prefix is rejected with TypeError.
    with pytest.raises(TypeError):
        lab.autocomplete(unmatched, ('tuple', ), None)
Exemplo n.º 10
0
 def test_03_big_autocomplete_2(self):
     """Autocomplete every prefix of 'accompany' (from '' up to the full word) with no limit."""
     corpus_path = os.path.join(TEST_DIRECTORY, 'testing_data',
                                'frankenstein.txt')
     with open(corpus_path, encoding='utf-8') as handle:
         contents = handle.read()
     trie = lab.make_word_trie(contents)

     target = 'accompany'
     for cut in range(len(target)+1):
         prefix = target[:cut]
         got = lab.autocomplete(trie, prefix)
         want = read_expected('frank_autocomplete_%s_%s.pickle' % (prefix, None))
         note = 'wrong autocomplete of '+repr(prefix)
         self.assertEqual(len(got), len(want), msg=note)
         self.assertEqual(set(got), set(want), msg=note)
Exemplo n.º 11
0
def test_big_corpora(bigtext):
    """Build word and phrase tries for testing_data/<bigtext>.txt and compare with stored ones."""
    corpus_path = os.path.join(TEST_DIRECTORY, 'testing_data',
                               '%s.txt' % bigtext)
    with open(corpus_path, encoding='utf-8') as handle:
        contents = handle.read()
        word_trie = lab.make_word_trie(contents)
        phrase_trie = lab.make_phrase_trie(contents)

        expected_words = read_expected('%s_words.pickle' % bigtext)
        expected_phrases = read_expected('%s_phrases.pickle' % bigtext)

        assert expected_words == dictify(word_trie), (
            'word trie does not match for %s' % bigtext)
        assert expected_phrases == dictify(phrase_trie), (
            'phrase trie does not match for %s' % bigtext)
Exemplo n.º 12
0
def test_filter_big_2():
    """Run several wildcard patterns through word_filter on testing_data/frankenstein.txt."""
    wildcard_patterns = ('*ing', '*ing?', '****ing', '**ing**', '????', 'mon*',
                         '*?*?*?*', '*???')
    corpus_path = os.path.join(TEST_DIRECTORY, 'testing_data',
                               'frankenstein.txt')
    with open(corpus_path, encoding='utf-8') as handle:
        contents = handle.read()
    trie = lab.make_word_trie(contents)

    # Expectation files are numbered by the pattern's position in the tuple.
    for position, pattern in enumerate(wildcard_patterns):
        matches = lab.word_filter(trie, pattern)
        wanted = read_expected('frank_filter_%s.pickle' % (position, ))
        assert len(wanted) == len(matches), (
            'incorrect word_filter of %r' % pattern)
        assert set(wanted) == set(matches), (
            'incorrect word_filter of %r' % pattern)
Exemplo n.º 13
0
    def test_03_big_corpora(self):
        """Build word and phrase tries for three large texts and compare with stored ones."""
        for corpus in ('holmes', 'earnest', 'frankenstein'):
            corpus_path = os.path.join(TEST_DIRECTORY, 'testing_data',
                                       '%s.txt' % corpus)
            with open(corpus_path, encoding='utf-8') as handle:
                contents = handle.read()
                word_trie = lab.make_word_trie(contents)
                phrase_trie = lab.make_phrase_trie(contents)

                expected_words = read_expected('%s_words.pickle' % corpus)
                expected_phrases = read_expected('%s_phrases.pickle' % corpus)

                self.assertEqual(expected_words, dictify(word_trie),
                                 'word trie does not match for '+corpus)
                self.assertEqual(expected_phrases, dictify(phrase_trie),
                                 'phrase trie does not match for '+corpus)
Exemplo n.º 14
0
    def test_02_big_filter_1(self):
        """Repeat word_filter('ap*') on a trie crowded with 26**4 'aa....' words."""
        letters = "abcdefghijklmnopqrstuvwxyz"

        # Every six-letter word starting with "aa", each occurring once.
        corpus = [
            "aa" + c1 + c2 + c3 + c4
            for c1 in letters
            for c2 in letters
            for c3 in letters
            for c4 in letters
        ]
        corpus += ["apple", "application", "apple", "apricot", "apricot", "apple"]
        corpus += ["bruteforceisbad"]

        trie = lab.make_word_trie(' '.join(corpus))
        # (word, count) pairs for the only words starting with "ap".
        wanted = [('apple', 3), ('apricot', 2), ('application', 1)]
        note = 'incorrect word_filter of ap*'
        for _ in range(20):
            matches = lab.word_filter(trie, "ap*")
            self.assertEqual(len(wanted), len(matches), msg=note)
            self.assertEqual(set(wanted), set(matches), msg=note)
Exemplo n.º 15
0
 def test_02_big_autocorrect(self):
     """Check autocorrect on testing_data/frankenstein.txt for several words and max counts."""
     limits_by_word = {'thin': [0, 8, 10, None],
                       'tom': [0, 2, 4, None],
                       'mon': [0, 2, 15, 17, 20, None]}
     corpus_path = os.path.join(TEST_DIRECTORY, 'testing_data',
                                'frankenstein.txt')
     with open(corpus_path, encoding='utf-8') as handle:
         contents = handle.read()
     trie = lab.make_word_trie(contents)

     for word, limits in sorted(limits_by_word.items()):
         for limit in limits:
             got = lab.autocorrect(trie, word, limit)
             want = read_expected('frank_autocorrect_%s_%s.pickle' % (word, limit))
             # Both assertions report the same failure text.
             note = 'wrong autocorrect of '+repr(word)+' with maxcount = '+str(limit)
             self.assertEqual(len(got), len(want), msg=note)
             self.assertEqual(set(got), set(want), msg=note)
Exemplo n.º 16
0
 def test_04_big_autocomplete_3(self):
     """Autocomplete every prefix of 'accompany' with no limit, with detailed failure text.

     A tuple prefix must raise TypeError.
     """
     corpus_path = os.path.join(TEST_DIRECTORY, 'testing_data',
                                'frankenstein.txt')
     with open(corpus_path, encoding='utf-8') as handle:
         contents = handle.read()
     trie = lab.make_word_trie(contents)

     target = 'accompany'
     for cut in range(len(target)+1):
         prefix = target[:cut]
         got = lab.autocomplete(trie, prefix)
         want = read_expected('frank_autocomplete_%s_%s.pickle' % (prefix, None))

         # Count check first so the message can say which way it is off.
         direction = 'missing' if len(got) < len(want) else 'too many'
         count_note = (direction + ' autocomplete results for ' + repr(prefix)
                       + ' with maxcount = None')
         self.assertEqual(len(want), len(got), msg=count_note)

         content_note = ('autocomplete included ' + repr(set(got) - set(want))
                         + ' instead of ' + repr(set(want) - set(got))
                         + ' for ' + repr(prefix) + ' with maxcount = None')
         self.assertEqual(set(want), set(got), msg=content_note)

     with self.assertRaises(TypeError):
         lab.autocomplete(trie, ('tuple', ), None)
Exemplo n.º 17
0
    def test_tiny2(self):
        """Autocomplete on a tiny corpus in which several words share a count.

        Word counts in the corpus: dead=4; done, drown, car=3;
        down, doing, cat=2; do, at, and=1.  Where counts tie, any of the tied
        words is an acceptable pick and list order is not checked, so results
        are sorted before comparison.
        """
        trie = lab.make_word_trie('do down down drown drown drown doing doing \
                                  done done done dead dead dead dead at and cat cat car car car'
                                  )

        # 'done' (3) always qualifies; 'down' and 'doing' tie at 2, so either
        # one may fill the second slot.
        result = sorted(lab.autocomplete(trie, 'do', 2))
        self.assertIn(result, [['done', 'down'], ['doing', 'done']])

        # The top three are unambiguous as a set, but 'done' and 'drown' tie,
        # so their relative order is not pinned.
        result = sorted(lab.autocomplete(trie, 'd', 3))
        expect = ['dead', 'done', 'drown']
        self.assertEqual(result, expect)

        # No limit: every word starting with 'do'.
        result = sorted(lab.autocomplete(trie, 'do', None))
        expect = sorted(['done', 'down', 'doing', 'do'])
        self.assertEqual(result, expect)
Exemplo n.º 18
0
    def test_01_filter(self):
        """Run a series of wildcard patterns through word_filter on one small trie."""
        trie = lab.make_word_trie(
            "man mat mattress map me met a man a a a map man met")
        every_word = [("a", 4), ("man", 3), ("map", 2), ("mat", 1),
                      ("mattress", 1), ("me", 1), ("met", 2)]

        # Each entry: (pattern, sorted (word, count) pairs expected for it).
        scenarios = (
            # Filter to select all words in trie
            ('*', every_word),
            # All three-letter words in trie
            ('???', [("man", 3), ("map", 2), ("mat", 1), ("met", 2)]),
            # Words beginning with 'mat'
            ('mat*', [("mat", 1), ("mattress", 1)]),
            # Words beginning with 'm', third letter is t
            ('m?t*', [("mat", 1), ("mattress", 1), ("met", 2)]),
            # Words with at least 4 letters
            ('*????', [("mattress", 1)]),
            # All words
            ('**', every_word),
        )
        for pattern, wanted in scenarios:
            matches = lab.word_filter(trie, pattern)
            self.assertIsInstance(matches, list, "result not a list.")
            matches.sort()
            self.assertEqual(matches, wanted,
                             msg="incorrect result from filter.")
Exemplo n.º 19
0
def test_filter_big_1():
    """Repeat word_filter('ap*') on a trie crowded with 26**4 'aa....' words."""
    letters = "abcdefghijklmnopqrstuvwxyz"

    # Every six-letter word starting with "aa", each occurring once.
    corpus = [
        "aa" + c1 + c2 + c3 + c4
        for c1 in letters
        for c2 in letters
        for c3 in letters
        for c4 in letters
    ]
    corpus += ["apple", "application", "apple", "apricot", "apricot", "apple"]
    corpus += ["bruteforceisbad"]

    trie = lab.make_word_trie(' '.join(corpus))
    # (word, count) pairs for the only words starting with "ap".
    wanted = {('apple', 3), ('apricot', 2), ('application', 1)}
    for _ in range(1000):
        matches = lab.word_filter(trie, "ap*")
        assert len(wanted) == len(matches), 'incorrect word_filter of ap*'
        assert wanted == set(matches), 'incorrect word_filter of ap*'
Exemplo n.º 20
0
    def test_tiny1(self):
        """Autocomplete on a four-word corpus in which two words share a count.

        Word counts in the corpus: bat=2, bark=1, bar=1.  Where counts tie,
        either tied word is an acceptable pick and list order is not checked,
        so multi-word results are sorted before comparison.
        """
        trie = lab.make_word_trie("bat bat bark bar")

        # 'bat' is the single most frequent word starting with 'ba'.
        result = lab.autocomplete(trie, 'ba', 1)
        expect = ['bat']
        self.assertEqual(result, expect)

        # 'bar' and 'bark' tie at 1, so either may accompany 'bat'.
        result = sorted(lab.autocomplete(trie, 'ba', 2))
        self.assertIn(result, [['bar', 'bat'], ['bark', 'bat']])

        # No word starts with 'c'.
        result = lab.autocomplete(trie, 'c', 2)
        expect = []
        self.assertEqual(result, expect)

        # No limit: every word starting with 'b', in any order.
        result = sorted(lab.autocomplete(trie, 'b', None))
        expect = sorted(['bat', 'bar', 'bark'])
        self.assertEqual(result, expect)
Exemplo n.º 21
0
 def test_02_big_autocorrect(self):
     """Check autocorrect on resources/testing_data/frankenstein.txt.

     Each word is tried with several max counts (including None), and the
     result is compared, ignoring order, with the matching stored expectation.
     """
     nums = {
         'thin': [0, 8, 10, None],
         'tom': [0, 2, 4, None],
         'mon': [0, 2, 15, 17, 20, None]
     }
     with open(os.path.join(TEST_DIRECTORY, 'resources', 'testing_data',
                            'frankenstein.txt'),
               encoding='utf-8') as f:
         text = f.read()
     w = lab.make_word_trie(text)
     for i in sorted(nums):
         for n in nums[i]:
             result = lab.autocorrect(w, i, n)
             expected = read_expected('frank_autocorrect_%s_%s.pickle' %
                                      (i, n))
             # Count check first so the message can say which way it is off.
             # The message previously misspelled "maxcount" as "macount".
             self.assertEqual(len(result), len(expected), msg=('missing' if len(result) < len(expected)\
                 else 'too many') + ' autocorrect results for ' + repr(i) + ' with maxcount = ' + str(n))
             self.assertEqual(set(result), set(expected), msg='autocorrect included ' + repr(set(result) - set(expected))\
                 + ' instead of ' + repr(set(expected) - set(result)) + ' for ' + repr(i) + ' with maxcount = '+str(n))
Exemplo n.º 22
0
def test_filter_small():
    """Run a series of wildcard patterns through word_filter on one small trie."""
    trie = lab.make_word_trie(
        "man mat mattress map me met a man a a a map man met")
    every_word = [("a", 4), ("man", 3), ("map", 2), ("mat", 1),
                  ("mattress", 1), ("me", 1), ("met", 2)]

    # Each entry: (pattern, sorted (word, count) pairs expected for it).
    scenarios = (
        # Filter to select all words in trie
        ('*', every_word),
        # All three-letter words in trie
        ('???', [("man", 3), ("map", 2), ("mat", 1), ("met", 2)]),
        # Words beginning with 'mat'
        ('mat*', [("mat", 1), ("mattress", 1)]),
        # Words beginning with 'm', third letter is t
        ('m?t*', [("mat", 1), ("mattress", 1), ("met", 2)]),
        # Words with at least 4 letters
        ('*????', [("mattress", 1)]),
        # All words
        ('**', every_word),
    )
    for pattern, wanted in scenarios:
        matches = lab.word_filter(trie, pattern)
        assert isinstance(matches, list)
        matches.sort()
        assert matches == wanted
Exemplo n.º 23
0
def test_autocomplete_big_1():
    """Repeat 'ap' autocompletion on a trie crowded with 26**4 'aa....' words."""
    letters = "abcdefghijklmnopqrstuvwxyz"

    # Every six-letter word starting with "aa", each occurring once.
    corpus = [
        "aa" + c1 + c2 + c3 + c4
        for c1 in letters
        for c2 in letters
        for c3 in letters
        for c4 in letters
    ]
    # apple occurs three times, apricot twice, application once.
    corpus += ["apple", "application", "apple", "apricot", "apricot", "apple"]
    corpus += ["bruteforceisbad"]

    trie = lab.make_word_trie(' '.join(corpus))
    for _ in range(1000):
        best_one = lab.autocomplete(trie, 'ap', 1)
        best_two = lab.autocomplete(trie, 'ap', 2)
        best_three = lab.autocomplete(trie, 'ap', 3)
        unlimited = lab.autocomplete(trie, 'ap')

        assert set(best_one) == {'apple'}
        assert set(best_two) == {'apple', 'apricot'}
        assert set(unlimited) == set(best_three) == {
            'apple', 'apricot', 'application'
        }
Exemplo n.º 24
0
def test_autocorrect_small():
    """Autocorrect 'cat' against a small corpus and check the four suggestions."""
    corpus = "cats cattle hat car act at chat crate act car act"
    suggestions = lab.autocorrect(lab.make_word_trie(corpus), 'cat', 4)
    # Order is not checked: compare as sets.
    assert set(suggestions) == {"act", "car", "cats", "cattle"}