Example #1
    def test_normalizer(self):
        class PrefixNormalizer(analysis.TokenNormalizer):
            def __init__(self, length, prefix=None):
                self.length = length
                super(PrefixNormalizer, self).__init__(prefix=prefix)

            def normalize(self, token):
                yield token[:self.length]

        # Create an analyzer and add this PrefixNormalizer and a
        # LowercaseNormalizer.
        analyzer = analysis.WhitespaceAnalyzer()

        ret = analyzer.add_token_normalizer(PrefixNormalizer(3))
        ret = analyzer.add_token_normalizer(analysis.LowercaseNormalizer())

        # Make sure add_token_normalizer returns the analyzer, for chaining
        self.assertIs(analyzer, ret)

        expected = [("PrefixNormalizer", u"Foo"),
                    ("LowercaseNormalizer", u"foobarbaz")]

        result = analyzer.normalize_token(u"Foobarbaz")

        self.assertEqual(expected, result)
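The test pins down the contract of add_token_normalizer and normalize_token: every registered normalizer sees the original token independently, each result is tagged with its normalizer's class name, and add_token_normalizer returns the analyzer so calls can be chained. A minimal sketch that would satisfy it (not cobe's actual implementation; the prefix keyword accepted by TokenNormalizer, which presumably overrides the class-name tag, is ignored here):

    class SketchAnalyzer(object):
        def __init__(self):
            self.normalizers = []

        def add_token_normalizer(self, normalizer):
            # Return self so add_token_normalizer calls can be chained.
            self.normalizers.append(normalizer)
            return self

        def normalize_token(self, token):
            # Each normalizer gets the original token; results are
            # tagged with the normalizer's class name.
            ret = []
            for normalizer in self.normalizers:
                name = type(normalizer).__name__
                for norm in normalizer.normalize(token):
                    ret.append((name, norm))
            return ret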
Example #2
    def test_empty_normalizer(self):
        class EmptyNormalizer(analysis.TokenNormalizer):
            def normalize(self, token):
                # Return an empty generator: this normalizer produces
                # no normalized forms for any token.
                return (i for i in [])

        analyzer = analysis.WhitespaceAnalyzer()
        analyzer.add_token_normalizer(EmptyNormalizer())

        self.assertEqual([], list(analyzer.normalize_token(u"Foobarbaz")))
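Returning a generator expression over an empty list works, but the idiomatic way to write a generator function that yields nothing is a bare return placed before an unreachable yield:

    class EmptyNormalizer(analysis.TokenNormalizer):
        def normalize(self, token):
            # The bare return makes this generator terminate
            # immediately; the unreachable yield marks the function
            # as a generator.
            return
            yield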
Example #3
    def test_tokens(self):
        analyzer = analysis.WhitespaceAnalyzer()

        # WhitespaceAnalyzer simply splits on whitespace.
        expected = ["foo", "bar", "baz"]

        self.assertListEqual(expected, analyzer.tokens(u"foo bar baz"))
        self.assertListEqual(expected, analyzer.tokens(u"foo  bar baz"))
        self.assertListEqual(expected, analyzer.tokens(u" foo bar baz"))
        self.assertListEqual(expected, analyzer.tokens(u"foo bar baz "))
        self.assertListEqual(expected, analyzer.tokens(u"foo bar baz\n"))
        self.assertListEqual(expected, analyzer.tokens(u"foo\nbar baz"))
        self.assertListEqual(expected, analyzer.tokens(u"\nfoo bar baz"))
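All seven assertions fall out of str.split() with no arguments, which collapses runs of whitespace (spaces and newlines alike) and drops leading and trailing whitespace. A plausible sketch of tokens(), in the tests' Python 2 idiom and with the type check that Example #8 below expects:

    # A sketch only; cobe's real tokens() may differ.
    def tokens(self, text):
        if not isinstance(text, unicode):
            raise TypeError("token text must be unicode")
        # split() with no argument collapses whitespace runs and
        # ignores leading/trailing whitespace, including newlines.
        return text.split()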
Example #4
    def test_query(self):
        analyzer = analysis.WhitespaceAnalyzer()

        query = analyzer.query(u"foo bar baz")

        self.assertIsInstance(query, search.Query)

        expected_terms = [
            dict(term="foo", pos=0),
            dict(term="bar", pos=1),
            dict(term="baz", pos=2)
        ]

        self.assertItemsEqual(expected_terms, query.terms)
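assertItemsEqual compares the terms regardless of order, so the test only requires one dict per token recording the token and its position. A hedged sketch, assuming a hypothetical search.Query constructor that takes the term list directly:

    # Hypothetical constructor signature; cobe's search.Query may differ.
    def query(self, text):
        terms = [dict(term=token, pos=pos)
                 for pos, token in enumerate(self.tokens(text))]
        return search.Query(terms)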
Example #5
    def test_normalizer_multiple(self):
        # Test a normalizer that maps a token to multiple things
        class BigramNormalizer(analysis.TokenNormalizer):
            def normalize(self, token):
                # Yield all 2-character sequences in token
                for i in xrange(len(token) - 1):
                    yield token[i:i + 2]

        norm = BigramNormalizer()
        self.assertEqual(["te", "er", "rm"], list(norm.normalize(u"term")))

        analyzer = analysis.WhitespaceAnalyzer()
        analyzer.add_token_normalizer(norm)

        expected = [("BigramNormalizer", u"te"), ("BigramNormalizer", u"er"),
                    ("BigramNormalizer", u"rm")]

        self.assertEqual(expected, analyzer.normalize_token(u"term"))
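The loop bound len(token) - 1 means a token needs at least two characters to produce any bigrams, so one-character tokens normalize to nothing. A quick standalone check of that edge case, restating the normalizer outside the test class:

    class BigramNormalizer(analysis.TokenNormalizer):
        def normalize(self, token):
            for i in xrange(len(token) - 1):
                yield token[i:i + 2]

    norm = BigramNormalizer()
    assert list(norm.normalize(u"a")) == []        # too short for a bigram
    assert list(norm.normalize(u"ab")) == [u"ab"]  # exactly one bigram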
Example #6
    def test_conflated_query(self):
        analyzer = analysis.WhitespaceAnalyzer()
        analyzer.add_token_normalizer(analysis.LowercaseNormalizer())

        m = model.Model(analyzer, park.SQLiteStore(":memory:"))
        m.train(u"This is a test")
        m.train(u"this is a test")

        query = analyzer.query(u"this is a query", m)

        expected = [
            dict(term="this", pos=0),
            dict(term="This", pos=0),
            dict(term="is", pos=1),
            dict(term="a", pos=2),
            dict(term="query", pos=3)
        ]

        self.assertListEqual(expected, query.terms)
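Both training lines share the lowercased form u"this", so the query conflates them: the surface forms "this" and "This" both appear at position 0. A hedged sketch of how query() might do that when given a model; get_norm_tokens, a lookup from (normalizer name, normalized form) back to trained surface tokens, is hypothetical:

    # Sketch only; cobe's model lookup API is assumed, not confirmed.
    def query(self, text, model=None):
        terms = []
        for pos, token in enumerate(self.tokens(text)):
            terms.append(dict(term=token, pos=pos))
            if model is None:
                continue
            # Add every trained surface form that shares a normalized
            # form with this token, at the same position.
            for name, norm in self.normalize_token(token):
                for surface in model.get_norm_tokens(name, norm):
                    if surface != token:
                        terms.append(dict(term=surface, pos=pos))
        return search.Query(terms)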
Example #7
    def test_join(self):
        analyzer = analysis.WhitespaceAnalyzer()
        self.assertEqual("foo bar baz", analyzer.join(["foo", "bar", "baz"]))
Example #8
    def test_tokens_str(self):
        analyzer = analysis.WhitespaceAnalyzer()

        with self.assertRaises(TypeError):
            analyzer.tokens("non-unicode string")
Example #9
    def test_normalizer_str(self):
        analyzer = analysis.WhitespaceAnalyzer()

        with self.assertRaises(TypeError):
            analyzer.normalize_token("non-unicode")
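Examples #8 and #9 pin the same contract: both entry points reject byte strings outright rather than guessing an encoding. A shared guard like the following sketch would satisfy both tests:

    def require_unicode(text):
        # Refuse str/bytes instead of decoding implicitly; mixing
        # encodings silently is worse than failing loudly.
        if not isinstance(text, unicode):
            raise TypeError("expected unicode, got %s" % type(text).__name__)
        return text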