def testPayloadsPos0(self):
    """Index a single document with TestPayloadAnalyzer and check that
    term positions, span-near matches and their payloads are reported
    correctly — in particular that a span match starting at position 0
    is observed (the historical bug this test guards against).
    """

    # NOTE(review): TestPayloadAnalyzer is presumably responsible for the
    # non-consecutive position expectations (0, 1, 3, 6) asserted below
    # and for payload strings of the form "pos: N" — confirm against its
    # definition elsewhere in the project.
    writer = self.getWriter(analyzer=TestPayloadAnalyzer())

    doc = Document()
    doc.add(Field("content", "a a b c d e a f g h i j a b k k",
                  TextField.TYPE_STORED))
    writer.addDocument(doc)

    # Get a near-real-time reader, then release the writer.
    reader = writer.getReader()
    writer.close()

    # Positions enum for term "a" in field "content".
    tp = MultiFields.getTermPositionsEnum(reader,
                                          MultiFields.getLiveDocs(reader),
                                          "content", BytesRef("a"))
    count = 0
    self.assert_(tp.nextDoc() != tp.NO_MORE_DOCS)

    # "a" occurs 4 times
    self.assertEqual(4, tp.freq())

    expected = 0
    self.assertEqual(expected, tp.nextPosition())
    self.assertEqual(1, tp.nextPosition())
    self.assertEqual(3, tp.nextPosition())
    self.assertEqual(6, tp.nextPosition())

    # only one doc has "a"
    self.assert_(tp.nextDoc() == tp.NO_MORE_DOCS)

    searcher = self.getSearcher(reader=reader)

    # "a" near "k" (slop 30, unordered): spans that may start at pos 0.
    stq1 = SpanTermQuery(Term("content", "a"))
    stq2 = SpanTermQuery(Term("content", "k"))
    sqs = [stq1, stq2]
    snq = SpanNearQuery(sqs, 30, False)

    count = 0
    sawZero = False

    # First pass: walk the payload-carrying spans and count every payload,
    # remembering whether any match started at position 0.
    pspans = MultiSpansWrapper.wrap(searcher.getTopReaderContext(), snq)
    while pspans.next():
        payloads = pspans.getPayload()
        sawZero |= pspans.start() == 0
        # java.util.Iterator protocol: hasNext()/next().
        it = payloads.iterator()
        while it.hasNext():
            count += 1
            it.next()

    self.assertEqual(5, count)
    self.assert_(sawZero)

    # Second pass: count the span matches themselves (ignoring payloads).
    spans = MultiSpansWrapper.wrap(searcher.getTopReaderContext(), snq)
    count = 0
    sawZero = False
    while spans.next():
        count += 1
        sawZero |= spans.start() == 0

    self.assertEqual(4, count)
    self.assert_(sawZero)

    # Third pass: collect payloads through PayloadSpanUtil and verify the
    # position-0 payload ("pos: 0") is among them.
    sawZero = False
    psu = PayloadSpanUtil(searcher.getTopReaderContext())
    pls = psu.getPayloadsForQuery(snq)
    count = pls.size()
    it = pls.iterator()
    while it.hasNext():
        # Each element is a Java byte[]; cast and decode it as a string.
        bytes = JArray('byte').cast_(it.next())
        s = bytes.string_
        sawZero |= s == "pos: 0"

    self.assertEqual(5, count)
    self.assert_(sawZero)
def testSetPosition(self):
    """Index tokens "1".."5" with custom position increments [1, 2, 1, 0, 1]
    (so positions are 0, 2, 3, 3, 4) and verify that PhraseQuery and
    MultiPhraseQuery honor those positions.
    """

    class _tokenizer(PythonTokenizer):
        # Emits the fixed token stream "1".."5" with the increments above.
        # Note: inner defs use ``_self`` to avoid shadowing the enclosing
        # test method's ``self``.

        def __init__(_self):
            super(_tokenizer, _self).__init__()
            _self.TOKENS = ["1", "2", "3", "4", "5"]
            _self.INCREMENTS = [1, 2, 1, 0, 1]
            _self.i = 0  # index of the next token to emit
            # Register the attributes this tokenizer populates.
            _self.posIncrAtt = _self.addAttribute(PositionIncrementAttribute.class_)
            _self.termAtt = _self.addAttribute(CharTermAttribute.class_)
            _self.offsetAtt = _self.addAttribute(OffsetAttribute.class_)

        def incrementToken(_self):
            if _self.i == len(_self.TOKENS):
                return False
            _self.clearAttributes()
            _self.termAtt.append(_self.TOKENS[_self.i])
            # Offsets are synthetic (i, i) — only positions matter here.
            _self.offsetAtt.setOffset(_self.i, _self.i)
            _self.posIncrAtt.setPositionIncrement(_self.INCREMENTS[_self.i])
            _self.i += 1
            return True

        def reset(_self):
            super(_tokenizer, _self).reset()
            _self.i = 0

    class _analyzer(PythonAnalyzer):
        # Wraps _tokenizer so the writer tokenizes every field with it;
        # the actual field text ("bogus") is ignored by the tokenizer.

        def createComponents(_self, fieldName):
            return Analyzer.TokenStreamComponents(_tokenizer())

        def initReader(_self, fieldName, reader):
            return reader

    writer = self.getWriter(analyzer=_analyzer())
    d = Document()
    d.add(Field("field", "bogus", TextField.TYPE_STORED))
    writer.addDocument(d)
    writer.commit()
    writer.close()

    searcher = self.getSearcher()
    reader = searcher.getIndexReader()

    pos = MultiFields.getTermPositionsEnum(reader, "field", BytesRef("1"))
    pos.nextDoc()
    # first token should be at position 0
    self.assertEqual(0, pos.nextPosition())

    pos = MultiFields.getTermPositionsEnum(reader, "field", BytesRef("2"))
    pos.nextDoc()
    # second token should be at position 2 (its increment was 2)
    self.assertEqual(2, pos.nextPosition())

    # "1" and "2" are not adjacent, so the default phrase fails.
    b = PhraseQuery.Builder()
    b.add(Term("field", "1"))
    b.add(Term("field", "2"))
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(0, len(hits))

    # same as previous, just specify positions explicitly.
    b = PhraseQuery.Builder()
    b.add(Term("field", "1"), 0)
    b.add(Term("field", "2"), 1)
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(0, len(hits))

    # specifying correct positions should find the phrase.
    b = PhraseQuery.Builder()
    b.add(Term("field", "1"), 0)
    b.add(Term("field", "2"), 2)
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(1, len(hits))

    # "2" (pos 2) and "3" (pos 3) are adjacent.
    b = PhraseQuery.Builder()
    b.add(Term("field", "2"))
    b.add(Term("field", "3"))
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(1, len(hits))

    # "3" and "4" share position 3, so they are not a sequential phrase.
    b = PhraseQuery.Builder()
    b.add(Term("field", "3"))
    b.add(Term("field", "4"))
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(0, len(hits))

    # phrase query would find it when correct positions are specified.
    b = PhraseQuery.Builder()
    b.add(Term("field", "3"), 0)
    b.add(Term("field", "4"), 0)
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(1, len(hits))

    # phrase query should fail for non existing searched term
    # even if there exist another searched terms in the same searched
    # position.
    b = PhraseQuery.Builder()
    b.add(Term("field", "3"), 0)
    b.add(Term("field", "9"), 0)
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(0, len(hits))

    # multi-phrase query should succeed for non existing searched term
    # because there exist another searched terms in the same searched
    # position.
    b = MultiPhraseQuery.Builder()
    b.add([Term("field", "3"), Term("field", "9")], 0)
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(1, len(hits))

    # "2" (pos 2) followed by "4" (pos 3) is a valid phrase.
    b = PhraseQuery.Builder()
    b.add(Term("field", "2"))
    b.add(Term("field", "4"))
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(1, len(hits))

    # "3" (pos 3) followed by "5" (pos 4).
    b = PhraseQuery.Builder()
    b.add(Term("field", "3"))
    b.add(Term("field", "5"))
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(1, len(hits))

    # "4" (pos 3) followed by "5" (pos 4).
    b = PhraseQuery.Builder()
    b.add(Term("field", "4"))
    b.add(Term("field", "5"))
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(1, len(hits))

    # "2" (pos 2) and "5" (pos 4) are not adjacent.
    b = PhraseQuery.Builder()
    b.add(Term("field", "2"))
    b.add(Term("field", "5"))
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(0, len(hits))
def testPayloadsPos0(self):
    """Index a single document with TestPayloadAnalyzer and check that
    term positions, span-near matches and their payloads are reported
    correctly, including a span match that starts at position 0.

    NOTE(review): this is a near-verbatim duplicate of an earlier
    ``testPayloadsPos0`` in this file. If both definitions belong to the
    same class, this later one silently replaces the first — confirm
    which copy is intended and remove the other.
    """

    writer = self.getWriter(analyzer=TestPayloadAnalyzer())

    doc = Document()
    doc.add(
        Field("content", "a a b c d e a f g h i j a b k k",
              TextField.TYPE_STORED))
    writer.addDocument(doc)

    # Get a near-real-time reader, then release the writer.
    reader = writer.getReader()
    writer.close()

    # Positions enum for term "a" in field "content".
    tp = MultiFields.getTermPositionsEnum(reader,
                                          MultiFields.getLiveDocs(reader),
                                          "content", BytesRef("a"))
    count = 0
    self.assert_(tp.nextDoc() != tp.NO_MORE_DOCS)

    # "a" occurs 4 times
    self.assertEqual(4, tp.freq())

    expected = 0
    self.assertEqual(expected, tp.nextPosition())
    self.assertEqual(1, tp.nextPosition())
    self.assertEqual(3, tp.nextPosition())
    self.assertEqual(6, tp.nextPosition())

    # only one doc has "a"
    self.assert_(tp.nextDoc() == tp.NO_MORE_DOCS)

    searcher = self.getSearcher(reader=reader)

    # "a" near "k" (slop 30, unordered).
    stq1 = SpanTermQuery(Term("content", "a"))
    stq2 = SpanTermQuery(Term("content", "k"))
    sqs = [stq1, stq2]
    snq = SpanNearQuery(sqs, 30, False)

    count = 0
    sawZero = False

    # Count every payload across all span matches; remember whether any
    # match started at position 0.
    pspans = MultiSpansWrapper.wrap(searcher.getTopReaderContext(), snq)
    while pspans.next():
        payloads = pspans.getPayload()
        sawZero |= pspans.start() == 0
        # java.util.Iterator protocol: hasNext()/next().
        it = payloads.iterator()
        while it.hasNext():
            count += 1
            it.next()

    self.assertEqual(5, count)
    self.assert_(sawZero)

    # Count the span matches themselves (ignoring payloads).
    spans = MultiSpansWrapper.wrap(searcher.getTopReaderContext(), snq)
    count = 0
    sawZero = False
    while spans.next():
        count += 1
        sawZero |= spans.start() == 0

    self.assertEqual(4, count)
    self.assert_(sawZero)

    # Collect payloads through PayloadSpanUtil and verify the position-0
    # payload ("pos: 0") is among them.
    sawZero = False
    psu = PayloadSpanUtil(searcher.getTopReaderContext())
    pls = psu.getPayloadsForQuery(snq)
    count = pls.size()
    it = pls.iterator()
    while it.hasNext():
        # Each element is a Java byte[]; cast and decode it as a string.
        bytes = JArray('byte').cast_(it.next())
        s = bytes.string_
        sawZero |= s == "pos: 0"

    self.assertEqual(5, count)
    self.assert_(sawZero)