def get_defects(usertext_filename, corpuses=None):
    """Return the bigrams of the user text that the corpus does not contain.

    Args:
        usertext_filename: Source handed to ``generate_words`` to obtain the
            user's words; also echoed in the progress message.
        corpuses: Forwarded unchanged to ``Corpus``. Defaults to ``None``.

    Returns:
        A list of the bigrams for which ``Corpus.bigram_exists`` is falsy,
        in the order ``get_bigrams`` produced them.
    """
    usertext_words = generate_words(usertext_filename)
    # Build the bigrams exactly once. Calling get_bigrams() a second time on
    # the same word stream (as was done just to count them) drains it when
    # the stream is a one-shot iterator, leaving nothing to check below.
    # NOTE(review): generate_words looks like a generator -- confirm.
    bigrams = list(get_bigrams(usertext_words))
    print('Found {0} bigrams in {1}'.format(len(bigrams), usertext_filename))
    corpus = Corpus(corpuses)
    return [bigram for bigram in bigrams if not corpus.bigram_exists(bigram)]
예제 #2
0
 def test_generate_words_from_one_corpus(self):
     """generate_words on ``self.corpus_one`` yields the expected tokens.

     The expected sequence is a list of byte strings in which punctuation
     marks and the apostrophe appear as separate tokens.
     """
     # Whitespace-splitting the bytes literal gives one token per element.
     expected_tokens = b"I like AI . It ' s the ever . I like AI .".split()
     tokens = generate_words(self.corpus_one)
     self.assertEqual(expected_tokens, list(tokens))
예제 #3
0
 def test_generate_words_from_multiple_corpus(self):
     """generate_words on a list of two corpora yields the combined tokens.

     ``self.corpus_one`` and ``self.corpus_two`` are passed together as a
     list; the result must equal the full expected byte-string sequence,
     in order.
     """
     # Whitespace-splitting each bytes literal gives one token per element.
     leading_tokens = b"I like AI . It ' s the ever . I like AI .".split()
     trailing_tokens = (
         b"Duckhunt is the best assignment . "
         b"I really enjoy shooting ducks . "
         b"Especially with hmm ."
     ).split()
     tokens = generate_words([self.corpus_one, self.corpus_two])
     self.assertEqual(leading_tokens + trailing_tokens, list(tokens))