def test_trigram(self):
    """Check ``n_gram`` output for ``n=3``.

    For each of the two fixture segments the expected stream is the
    segment's own tokens, then its two bigrams, then its single trigram,
    with n-gram parts joined by ``+``.
    """
    case = TestNGramTransformer
    lower_grams = ['a+b', 'b+c'] + ['a+b+c']
    upper_grams = ['A+B', 'B+C'] + ['A+B+C']
    wanted = case.tokensA + lower_grams + case.tokensB + upper_grams

    transformer = NGramTransformer(Sentinel(), n=3)
    produced = list(transformer.n_gram(case.segments))

    self.assertListEqual(wanted, produced)
def test_bigram(self):
    """Check ``n_gram`` output for ``n=2``.

    For each fixture segment the expected stream is the segment's own
    tokens followed by its two ``+``-joined bigrams.
    """
    case = TestNGramTransformer
    lower_bigrams = ['a+b', 'b+c']
    upper_bigrams = ['A+B', 'B+C']
    wanted = case.tokensA + lower_bigrams + case.tokensB + upper_bigrams

    transformer = NGramTransformer(Sentinel(), n=2)
    produced = list(transformer.n_gram(case.segments))

    self.assertListEqual(wanted, produced)
def test_extract(self):
    """Check that ``_extract`` rewrites every column of a row in place.

    The row holds the fixture segments in column 0 and an empty list in
    column 1.  After ``_extract`` has been applied to each column index
    with ``n=1``, column 0 must be the flat list of all unigrams and the
    empty column must have become ``['~void~']``.

    NOTE: this method used to be defined twice in a row with identical
    logic; the later definition silently replaced the earlier one in the
    class namespace, so a single definition is kept.
    """
    row = [TestNGramTransformer.segments, []]
    expected = [
        TestNGramTransformer.tokensA + TestNGramTransformer.tokensB,
        ['~void~'],
    ]
    t = NGramTransformer(Sentinel(), n=1)
    # Index-based loop on purpose: ``_extract`` takes the row plus a column
    # index, its return value is ignored, and the row itself is compared
    # afterwards -- so the test relies on the row being modified in place.
    for i in range(len(row)):
        t._extract(row, i)
    # Compare only once every column has been processed.
    self.assertListEqual(expected, row)
def test_iter(self):
    """Check iteration over an ``NGramTransformer`` wrapping three rows.

    Every yielded row must equal ``[1, <all unigrams>]`` and exactly three
    rows must be produced.
    """
    case = TestNGramTransformer
    wanted_row = [1, case.tokensA + case.tokensB]
    # Start below zero so the final count check also fails when the
    # transformer yields nothing at all.
    last_index = -1
    # ``* 3`` repeats the *same* inner list object three times; that
    # aliasing is kept deliberately.
    source = Rows([[1, case.segments]] * 3)
    for last_index, produced in enumerate(NGramTransformer(source, n=1)):
        self.assertListEqual(wanted_row, produced)
    # Index of the last row seen: 2 means three rows were yielded.
    self.assertEqual(2, last_index)
def test_unigram(self):
    """Check ``n_gram`` output for ``n=1``.

    With unigrams only, the expected stream is simply the tokens of the
    first fixture segment followed by those of the second.
    """
    case = TestNGramTransformer
    wanted = case.tokensA + case.tokensB

    transformer = NGramTransformer(Sentinel(), n=1)
    produced = list(transformer.n_gram(case.segments))

    self.assertListEqual(wanted, produced)
def test_setup(self):
    """Check the attributes of a transformer built with only a row source.

    The source passed to the constructor must be exposed as ``rows``,
    ``N`` must be 1 and ``text_columns`` must be ``None``.
    """
    source = Sentinel()
    transformer = NGramTransformer(source)
    # (expected value, attribute name) pairs, checked in this order.
    checks = (
        (source, 'rows'),
        (1, 'N'),
        (None, 'text_columns'),
    )
    for wanted, attribute in checks:
        self.assertEqual(wanted, getattr(transformer, attribute))