Exemplo n.º 1
0
    def testStopList(self):
        """
        An analyzer built from a custom stop set must not emit any word
        contained in that set.
        """
        stopWords = ["good", "test", "analyzer"]
        stopWordsSet = StopFilter.makeStopSet(stopWords)

        newStop = StopAnalyzer(stopWordsSet)
        reader = StringReader("This is a good test of the english stop analyzer")
        stream = newStop.tokenStream("test", reader)
        self.assertIsNotNone(stream)

        try:
            stream.reset()
            termAtt = stream.getAttribute(CharTermAttribute.class_)

            while stream.incrementToken():
                self.assertNotIn(termAtt.toString(), stopWordsSet)

            # Complete the consumer workflow once the last token was read.
            stream.end()
        finally:
            # Release the stream even when one of the assertions failed.
            stream.close()
Exemplo n.º 2
0
    def testStopListPositions(self):
        """
        Removing stop words must be reflected in the position increments.

        Every emitted token is checked against the custom stop set and its
        position increment is compared with the matching entry of
        ``expectedIncr``; entries greater than 1 are the tokens expected to
        follow one or more removed stop words.
        """
        stopWords = ["good", "test", "analyzer"]
        stopWordsSet = StopFilter.makeStopSet(stopWords)

        newStop = StopAnalyzer(stopWordsSet)
        reader = StringReader("This is a good test of the english stop analyzer with positions")
        expectedIncr = [ 1,   1, 1,          3, 1,  1,      1,            2,   1]
        stream = newStop.tokenStream("test", reader)
        self.assertIsNotNone(stream)

        try:
            stream.reset()

            i = 0
            termAtt = stream.getAttribute(CharTermAttribute.class_)
            posIncrAtt = stream.addAttribute(PositionIncrementAttribute.class_)

            while stream.incrementToken():
                self.assertNotIn(termAtt.toString(), stopWordsSet)
                # Fail cleanly (instead of raising IndexError) when the
                # stream yields more tokens than there are expectations.
                self.assertLess(i, len(expectedIncr))
                self.assertEqual(expectedIncr[i],
                                 posIncrAtt.getPositionIncrement())
                i += 1

            # Every expected increment must have been matched by a token;
            # otherwise a short or empty stream would pass unnoticed.
            self.assertEqual(len(expectedIncr), i)

            # Complete the consumer workflow once the last token was read.
            stream.end()
        finally:
            # Release the stream even when one of the assertions failed.
            stream.close()
Exemplo n.º 3
0
class StopAnalyzerTestCase(unittest.TestCase):
    """
    Unit tests ported from Java Lucene

    Each test tokenizes a fixed English sentence with a StopAnalyzer and
    checks the emitted tokens against the stop set in use.
    """

    def setUp(self):
        """
        Create a default StopAnalyzer and remember the default English
        stop set for use by the tests.
        """
        self.stop = StopAnalyzer()
        self.invalidTokens = StopAnalyzer.ENGLISH_STOP_WORDS_SET

    def testDefaults(self):
        """
        The default analyzer must not emit any word of the default English
        stop set.
        """
        self.assertIsNotNone(self.stop)
        reader = StringReader("This is a test of the english stop analyzer")
        stream = self.stop.tokenStream("test", reader)
        self.assertIsNotNone(stream)

        try:
            stream.reset()
            termAtt = stream.getAttribute(CharTermAttribute.class_)

            while stream.incrementToken():
                self.assertNotIn(termAtt.toString(), self.invalidTokens)

            # Complete the consumer workflow once the last token was read.
            stream.end()
        finally:
            # Release the stream even when one of the assertions failed.
            stream.close()

    def testStopList(self):
        """
        An analyzer built from a custom stop set must not emit any word
        contained in that set.
        """
        stopWords = ["good", "test", "analyzer"]
        stopWordsSet = StopFilter.makeStopSet(stopWords)

        newStop = StopAnalyzer(stopWordsSet)
        reader = StringReader("This is a good test of the english stop analyzer")
        stream = newStop.tokenStream("test", reader)
        self.assertIsNotNone(stream)

        try:
            stream.reset()
            termAtt = stream.getAttribute(CharTermAttribute.class_)

            while stream.incrementToken():
                self.assertNotIn(termAtt.toString(), stopWordsSet)

            # Complete the consumer workflow once the last token was read.
            stream.end()
        finally:
            # Release the stream even when one of the assertions failed.
            stream.close()

    def testStopListPositions(self):
        """
        Removing stop words must be reflected in the position increments.

        Every emitted token is checked against the custom stop set and its
        position increment is compared with the matching entry of
        ``expectedIncr``; entries greater than 1 are the tokens expected to
        follow one or more removed stop words.
        """
        stopWords = ["good", "test", "analyzer"]
        stopWordsSet = StopFilter.makeStopSet(stopWords)

        newStop = StopAnalyzer(stopWordsSet)
        reader = StringReader("This is a good test of the english stop analyzer with positions")
        expectedIncr = [ 1,   1, 1,          3, 1,  1,      1,            2,   1]
        stream = newStop.tokenStream("test", reader)
        self.assertIsNotNone(stream)

        try:
            stream.reset()

            i = 0
            termAtt = stream.getAttribute(CharTermAttribute.class_)
            posIncrAtt = stream.addAttribute(PositionIncrementAttribute.class_)

            while stream.incrementToken():
                self.assertNotIn(termAtt.toString(), stopWordsSet)
                # Fail cleanly (instead of raising IndexError) when the
                # stream yields more tokens than there are expectations.
                self.assertLess(i, len(expectedIncr))
                self.assertEqual(expectedIncr[i],
                                 posIncrAtt.getPositionIncrement())
                i += 1

            # Every expected increment must have been matched by a token;
            # otherwise a short or empty stream would pass unnoticed.
            self.assertEqual(len(expectedIncr), i)

            # Complete the consumer workflow once the last token was read.
            stream.end()
        finally:
            # Release the stream even when one of the assertions failed.
            stream.close()