def testPayloadsPos0(self):
        """Index one document and verify term positions, spans and payloads.

        The document is analyzed with ``TestPayloadAnalyzer`` (defined outside
        this block), so the positions and payload strings asserted below are
        the ones that analyzer is expected to produce -- presumably not the
        plain one-position-per-token layout; confirm against the analyzer.
        """
        writer = self.getWriter(analyzer=TestPayloadAnalyzer())

        doc = Document()
        doc.add(Field("content", "a a b c d e a f g h i j a b k k",
                      TextField.TYPE_STORED))
        writer.addDocument(doc)
        # Grab the reader before closing the writer; it is used for the rest
        # of the test.
        reader = writer.getReader()
        writer.close()

        tp = MultiFields.getTermPositionsEnum(reader,
                                              MultiFields.getLiveDocs(reader),
                                              "content", BytesRef("a"))

        # At least one document must contain "a".
        self.assertNotEqual(tp.nextDoc(), tp.NO_MORE_DOCS)
        # "a" occurs 4 times
        self.assertEqual(4, tp.freq())

        # The four occurrences are expected at these positions, in order.
        self.assertEqual(0, tp.nextPosition())
        self.assertEqual(1, tp.nextPosition())
        self.assertEqual(3, tp.nextPosition())
        self.assertEqual(6, tp.nextPosition())

        # only one doc has "a"
        self.assertEqual(tp.nextDoc(), tp.NO_MORE_DOCS)

        searcher = self.getSearcher(reader=reader)

        # Span query matching "a" near "k" (slop 30, in-order flag False).
        stq1 = SpanTermQuery(Term("content", "a"))
        stq2 = SpanTermQuery(Term("content", "k"))
        sqs = [stq1, stq2]
        snq = SpanNearQuery(sqs, 30, False)

        # Pass 1: count every payload carried by the matching spans and
        # check that at least one span starts at position 0.
        count = 0
        sawZero = False
        pspans = MultiSpansWrapper.wrap(searcher.getTopReaderContext(), snq)
        while pspans.next():
            payloads = pspans.getPayload()
            if pspans.start() == 0:
                sawZero = True

            it = payloads.iterator()
            while it.hasNext():
                count += 1
                it.next()

        self.assertEqual(5, count)
        self.assertTrue(sawZero)

        # Pass 2: count the spans themselves with a fresh enumeration.
        spans = MultiSpansWrapper.wrap(searcher.getTopReaderContext(), snq)
        count = 0
        sawZero = False
        while spans.next():
            count += 1
            if spans.start() == 0:
                sawZero = True

        self.assertEqual(4, count)
        self.assertTrue(sawZero)

        # Pass 3: collect the payloads through PayloadSpanUtil and look for
        # the one whose decoded text is "pos: 0".
        sawZero = False
        psu = PayloadSpanUtil(searcher.getTopReaderContext())
        pls = psu.getPayloadsForQuery(snq)
        count = pls.size()
        it = pls.iterator()
        while it.hasNext():
            # Each element is cast to a Java byte[] so it can be read as text.
            payloadBytes = JArray('byte').cast_(it.next())
            s = payloadBytes.string_
            if s == "pos: 0":
                sawZero = True

        self.assertEqual(5, count)
        self.assertTrue(sawZero)
示例#2
0
    def testSetPosition(self):
        """Check phrase matching against explicit token position increments.

        A stub tokenizer ignores the field text and always replays the tokens
        "1".."5" with position increments 1, 2, 1, 0, 1.  The test then checks
        the indexed positions of the first two tokens and which phrase
        queries (with implicit or explicit positions) match the document.
        """

        class _tokenizer(PythonTokenizer):
            # Replays a fixed token list, one token per incrementToken() call.

            def __init__(_self):
                super(_tokenizer, _self).__init__()

                _self.TOKENS = ["1", "2", "3", "4", "5"]
                _self.INCREMENTS = [1, 2, 1, 0, 1]
                _self.i = 0
                _self.posIncrAtt = _self.addAttribute(PositionIncrementAttribute.class_)
                _self.termAtt = _self.addAttribute(CharTermAttribute.class_)
                _self.offsetAtt = _self.addAttribute(OffsetAttribute.class_)

            def incrementToken(_self):
                cursor = _self.i
                if cursor == len(_self.TOKENS):
                    # Every token has been emitted.
                    return False

                _self.clearAttributes()
                _self.termAtt.append(_self.TOKENS[cursor])
                _self.offsetAtt.setOffset(cursor, cursor)
                _self.posIncrAtt.setPositionIncrement(_self.INCREMENTS[cursor])
                _self.i = cursor + 1

                return True

            def reset(_self):
                super(_tokenizer, _self).reset()
                # Rewind so the stream can be consumed again.
                _self.i = 0

        class _analyzer(PythonAnalyzer):
            # Analyzer whose only component is the stub tokenizer above.

            def createComponents(_self, fieldName):
                return Analyzer.TokenStreamComponents(_tokenizer())

            def initReader(_self, fieldName, reader):
                return reader

        writer = self.getWriter(analyzer=_analyzer())

        # The field text is irrelevant: the tokenizer never reads it.
        doc = Document()
        doc.add(Field("field", "bogus", TextField.TYPE_STORED))

        writer.addDocument(doc)
        writer.commit()
        writer.close()

        searcher = self.getSearcher()
        reader = searcher.getIndexReader()

        def firstPosition(text):
            # Position of the first occurrence of `text` in the first
            # document that contains it.
            postings = MultiFields.getTermPositionsEnum(reader, "field",
                                                        BytesRef(text))
            postings.nextDoc()
            return postings.nextPosition()

        def countPhraseHits(*clauses):
            # Build a PhraseQuery on "field" and return its number of hits.
            # A clause is either a term text (implicit position) or a
            # (term text, position) tuple (explicit position).
            builder = PhraseQuery.Builder()
            for clause in clauses:
                if isinstance(clause, tuple):
                    text, position = clause
                    builder.add(Term("field", text), position)
                else:
                    builder.add(Term("field", clause))
            return len(searcher.search(builder.build(), 1000).scoreDocs)

        # first token should be at position 0
        self.assertEqual(0, firstPosition("1"))
        # second token should be at position 2
        self.assertEqual(2, firstPosition("2"))

        # "1 2" as an adjacent phrase must not match.
        self.assertEqual(0, countPhraseHits("1", "2"))

        # same as previous, just specify positions explicitly.
        self.assertEqual(0, countPhraseHits(("1", 0), ("2", 1)))

        # specifying correct positions should find the phrase.
        self.assertEqual(1, countPhraseHits(("1", 0), ("2", 2)))

        self.assertEqual(1, countPhraseHits("2", "3"))

        self.assertEqual(0, countPhraseHits("3", "4"))

        # phrase query would find it when correct positions are specified.
        self.assertEqual(1, countPhraseHits(("3", 0), ("4", 0)))

        # phrase query should fail for a non-existing searched term
        # even if other searched terms exist at the same searched
        # position.
        self.assertEqual(0, countPhraseHits(("3", 0), ("9", 0)))

        # multi-phrase query should succeed for a non-existing searched term
        # because other searched terms exist at the same searched
        # position.
        multiBuilder = MultiPhraseQuery.Builder()
        multiBuilder.add([Term("field", "3"), Term("field", "9")], 0)
        multiHits = searcher.search(multiBuilder.build(), 1000).scoreDocs
        self.assertEqual(1, len(multiHits))

        self.assertEqual(1, countPhraseHits("2", "4"))

        self.assertEqual(1, countPhraseHits("3", "5"))

        self.assertEqual(1, countPhraseHits("4", "5"))

        self.assertEqual(0, countPhraseHits("2", "5"))
示例#3
0
    def testPayloadsPos0(self):
        """Index one document and verify term positions, spans and payloads.

        The document is analyzed with ``TestPayloadAnalyzer`` (defined outside
        this block), so the positions and payload strings asserted below are
        the ones that analyzer is expected to produce -- presumably not the
        plain one-position-per-token layout; confirm against the analyzer.
        """
        writer = self.getWriter(analyzer=TestPayloadAnalyzer())

        doc = Document()
        doc.add(
            Field("content", "a a b c d e a f g h i j a b k k",
                  TextField.TYPE_STORED))
        writer.addDocument(doc)
        # Grab the reader before closing the writer; it is used for the rest
        # of the test.
        reader = writer.getReader()
        writer.close()

        tp = MultiFields.getTermPositionsEnum(reader,
                                              MultiFields.getLiveDocs(reader),
                                              "content", BytesRef("a"))

        # At least one document must contain "a".
        self.assertNotEqual(tp.nextDoc(), tp.NO_MORE_DOCS)
        # "a" occurs 4 times
        self.assertEqual(4, tp.freq())

        # The four occurrences are expected at these positions, in order.
        self.assertEqual(0, tp.nextPosition())
        self.assertEqual(1, tp.nextPosition())
        self.assertEqual(3, tp.nextPosition())
        self.assertEqual(6, tp.nextPosition())

        # only one doc has "a"
        self.assertEqual(tp.nextDoc(), tp.NO_MORE_DOCS)

        searcher = self.getSearcher(reader=reader)

        # Span query matching "a" near "k" (slop 30, in-order flag False).
        stq1 = SpanTermQuery(Term("content", "a"))
        stq2 = SpanTermQuery(Term("content", "k"))
        sqs = [stq1, stq2]
        snq = SpanNearQuery(sqs, 30, False)

        # Pass 1: count every payload carried by the matching spans and
        # check that at least one span starts at position 0.
        count = 0
        sawZero = False
        pspans = MultiSpansWrapper.wrap(searcher.getTopReaderContext(), snq)
        while pspans.next():
            payloads = pspans.getPayload()
            if pspans.start() == 0:
                sawZero = True

            it = payloads.iterator()
            while it.hasNext():
                count += 1
                it.next()

        self.assertEqual(5, count)
        self.assertTrue(sawZero)

        # Pass 2: count the spans themselves with a fresh enumeration.
        spans = MultiSpansWrapper.wrap(searcher.getTopReaderContext(), snq)
        count = 0
        sawZero = False
        while spans.next():
            count += 1
            if spans.start() == 0:
                sawZero = True

        self.assertEqual(4, count)
        self.assertTrue(sawZero)

        # Pass 3: collect the payloads through PayloadSpanUtil and look for
        # the one whose decoded text is "pos: 0".
        sawZero = False
        psu = PayloadSpanUtil(searcher.getTopReaderContext())
        pls = psu.getPayloadsForQuery(snq)
        count = pls.size()
        it = pls.iterator()
        while it.hasNext():
            # Each element is cast to a Java byte[] so it can be read as text.
            payloadBytes = JArray('byte').cast_(it.next())
            s = payloadBytes.string_
            if s == "pos: 0":
                sawZero = True

        self.assertEqual(5, count)
        self.assertTrue(sawZero)