    def test_normalizer(self):
        class PrefixNormalizer(analysis.TokenNormalizer):
            def __init__(self, length, prefix=None):
                self.length = length
                super(PrefixNormalizer, self).__init__(prefix=prefix)

            def normalize(self, token):
                yield token[:self.length]

        # Create an analyzer and add a LowercaseNormalizer and this
        # PrefixNormalizer.
        analyzer = analysis.WhitespaceAnalyzer()

        ret = analyzer.add_token_normalizer(PrefixNormalizer(3))
        ret = analyzer.add_token_normalizer(analysis.LowercaseNormalizer())

        # Make sure add_token_normalizer returns the analyzer, for chaining
        self.assertIs(analyzer, ret)

        expected = [("PrefixNormalizer", u"Foo"),
                    ("LowercaseNormalizer", u"foobarbaz")]

        result = analyzer.normalize_token(u"Foobarbaz")
        self.assertEqual(expected, result)
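    def test_normalizer_prefix(self):
        # A minimal sketch, not part of the original suite: it assumes the
        # optional prefix argument accepted by TokenNormalizer.__init__
        # (and passed through by PrefixNormalizer above) replaces the class
        # name as the first element of the (prefix, token) tuples returned
        # by normalize_token.
        class PrefixNormalizer(analysis.TokenNormalizer):
            def __init__(self, length, prefix=None):
                self.length = length
                super(PrefixNormalizer, self).__init__(prefix=prefix)

            def normalize(self, token):
                yield token[:self.length]

        analyzer = analysis.WhitespaceAnalyzer()
        analyzer.add_token_normalizer(PrefixNormalizer(3, prefix="pre"))

        self.assertEqual([("pre", u"Foo")],
                         analyzer.normalize_token(u"Foobarbaz"))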
    def test_empty_normalizer(self):
        class EmptyNormalizer(analysis.TokenNormalizer):
            def normalize(self, token):
                # yield nothing
                return (i for i in [])

        analyzer = analysis.WhitespaceAnalyzer()
        analyzer.add_token_normalizer(EmptyNormalizer())

        self.assertEqual([], list(analyzer.normalize_token(u"Foobarbaz")))
    def test_tokens(self):
        analyzer = analysis.WhitespaceAnalyzer()

        # WhitespaceAnalyzer simply splits on whitespace.
        expected = ["foo", "bar", "baz"]

        self.assertListEqual(expected, analyzer.tokens(u"foo bar baz"))
        self.assertListEqual(expected, analyzer.tokens(u"foo  bar baz"))
        self.assertListEqual(expected, analyzer.tokens(u" foo bar baz"))
        self.assertListEqual(expected, analyzer.tokens(u"foo bar baz "))
        self.assertListEqual(expected, analyzer.tokens(u"foo bar baz\n"))
        self.assertListEqual(expected, analyzer.tokens(u"foo\nbar baz"))
        self.assertListEqual(expected, analyzer.tokens(u"\nfoo bar baz"))
    def test_query(self):
        analyzer = analysis.WhitespaceAnalyzer()

        query = analyzer.query(u"foo bar baz")
        self.assertIsInstance(query, search.Query)

        expected_terms = [dict(term="foo", pos=0),
                          dict(term="bar", pos=1),
                          dict(term="baz", pos=2)]

        self.assertItemsEqual(expected_terms, query.terms)
    def test_normalizer_multiple(self):
        # Test a normalizer that maps a token to multiple things
        class BigramNormalizer(analysis.TokenNormalizer):
            def normalize(self, token):
                # Yield all 2-character sequences in token
                for i in xrange(len(token) - 1):
                    yield token[i:i + 2]

        norm = BigramNormalizer()
        self.assertEqual(["te", "er", "rm"], list(norm.normalize(u"term")))

        analyzer = analysis.WhitespaceAnalyzer()
        analyzer.add_token_normalizer(norm)

        expected = [("BigramNormalizer", u"te"),
                    ("BigramNormalizer", u"er"),
                    ("BigramNormalizer", u"rm")]

        self.assertEqual(expected, analyzer.normalize_token(u"term"))
    def test_conflated_query(self):
        analyzer = analysis.WhitespaceAnalyzer()
        analyzer.add_token_normalizer(analysis.LowercaseNormalizer())

        m = model.Model(analyzer, park.SQLiteStore(":memory:"))

        m.train(u"This is a test")
        m.train(u"this is a test")

        query = analyzer.query(u"this is a query", m)

        expected = [dict(term="this", pos=0),
                    dict(term="This", pos=0),
                    dict(term="is", pos=1),
                    dict(term="a", pos=2),
                    dict(term="query", pos=3)]

        self.assertListEqual(expected, query.terms)
    def test_join(self):
        analyzer = analysis.WhitespaceAnalyzer()

        self.assertEqual("foo bar baz", analyzer.join(["foo", "bar", "baz"]))
    def test_tokens_str(self):
        analyzer = analysis.WhitespaceAnalyzer()

        with self.assertRaises(TypeError):
            analyzer.tokens("non-unicode string")
    def test_normalizer_str(self):
        analyzer = analysis.WhitespaceAnalyzer()

        with self.assertRaises(TypeError):
            analyzer.normalize_token("non-unicode")