def testIndexRelationTermOutput(self):
    """Index=value converts to a term query; quoted values convert to a
    (lowercased) phrase query regardless of input casing."""
    self.assertConversion(TermQuery(Term("animal", "cats")), 'animal=cats')
    phrase = PhraseQuery()
    phrase.add(Term("animal", "cats"))
    phrase.add(Term("animal", "dogs"))
    self.assertConversion(phrase, 'animal="cats dogs"')
    self.assertConversion(phrase, 'animal="catS Dogs"')
def testExcludeUnqualifiedFieldForWhichNoPhraseQueryIsPossibleInCaseOfPhraseQuery(self):
    """A field registered without term frequencies cannot carry a phrase
    query, so only the plain 'unqualified' field appears in the result."""
    registry = FieldRegistry()
    registry.register('noTermFreqField', NO_TERMS_FREQUENCY_FIELDTYPE)
    self.composer = LuceneQueryComposer(
        unqualifiedTermFields=[("unqualified", 1.0), ('noTermFreqField', 2.0)],
        luceneSettings=LuceneSettings(fieldRegistry=registry))
    expected = PhraseQuery()
    expected.add(Term("unqualified", "phrase query"))
    self.assertConversion(expected, '"phrase query"')
def lucene_sample_query_parse(sampleq, ftypes):
    """Build a BooleanQuery from a list of "field<op>value" query strings.

    Each entry is dispatched by the field's type character:
    'i'/'f' -> numeric range query; values containing whitespace ->
    exact (slop 0) phrase query; otherwise a lowercased term query.
    Exits the process on an unrecognized range operator.
    """
    # Removed unused accumulators (fields/queries/booleans) that were never read.
    bq = BooleanQuery()
    for query_tuple in sampleq:
        (field, op_, value) = re.split(snapconf.RANGE_QUERY_OPS, query_tuple)
        m = snapconf.RANGE_QUERY_FIELD_PATTERN.search(query_tuple)
        if m is None or field is None:
            continue
        op = m.group(1)
        if op not in snapconf.operators:
            sys.stderr.write("bad operator %s in range query,exiting\n" % (str(op)))
            sys.exit(-1)
        field_w_type = snapconf.SAMPLE_HEADER_FIELDS_TYPE_MAP[field]
        (fieldtypechar, ftype_method) = ftypes[field_w_type]
        # range query
        if fieldtypechar == 'i' or fieldtypechar == 'f':
            bq.add(lucene_range_query_parse(field_w_type, op, value,
                                            fieldtypechar, ftype_method),
                   BOOLEAN_OCCUR)
        # phrase query
        elif ' ' in value or '\t' in value:
            pquery = PhraseQuery()
            # Plain loop instead of a side-effect-only list comprehension.
            for v in re.split(r'\s+', value):
                pquery.add(Term(field_w_type, v.lower()))
            # force exact phrase matching only
            pquery.setSlop(0)
            bq.add(pquery, BOOLEAN_OCCUR)
        # term query
        else:
            bq.add(TermQuery(Term(field_w_type, value.lower())), BOOLEAN_OCCUR)
        sys.stderr.write("value + fields: %s %s\n" % (value.lower(), field_w_type))
    return bq
def _termOrPhraseQuery(self, index, termString):
    """Analyze termString: one token yields a prefix or term query,
    several tokens yield a phrase query over all of them."""
    tokens = self._analyzeToken(termString)
    if len(tokens) == 1:
        singleTerm = self._createTerm(index, tokens[0])
        if prefixRegexp.match(termString):
            return PrefixQuery(singleTerm)
        return TermQuery(singleTerm)
    phrase = PhraseQuery()
    for token in tokens:
        phrase.add(self._createTerm(index, token))
    return phrase
def _termOrPhraseQuery(self, index, termString):
    """Build a prefix, term or phrase query from the analyzed token(s)."""
    analyzed = self._analyzeToken(termString)
    if len(analyzed) > 1:
        phrase = PhraseQuery()
        for tokenText in analyzed:
            phrase.add(self._createTerm(index, tokenText))
        return phrase
    queryClass = PrefixQuery if prefixRegexp.match(termString) else TermQuery
    return queryClass(self._createTerm(index, analyzed[0]))
def setUp(self):
    """Index a single five-word document, then prepare a searcher and an
    empty phrase query for the individual tests to populate."""
    super(PhraseQueryTestCase, self).setUp()

    document = Document()
    document.add(Field("field", "one two three four five", TextField.TYPE_STORED))
    indexWriter = self.getWriter()
    indexWriter.addDocument(document)
    indexWriter.close()

    self.searcher = self.getSearcher()
    self.query = PhraseQuery()
def runSearch(self, runCount, mainThread=False): """ search for runCount number of times """ # problem: if there are any assertion errors in the child # thread, the calling thread is not notified and may still # consider the test case pass. We are using self.totalQueries # to double check that work has actually been done. if not mainThread: getVMEnv().attachCurrentThread() time.sleep(0.5) searcher = self.getSearcher() try: self.query = PhraseQuery() for word, count in self.testData[0:runCount]: query = TermQuery(Term("field", word)) topDocs = searcher.search(query, 50) self.assertEqual(topDocs.totalHits, count) self.lock.acquire() self.totalQueries += 1 self.lock.release() finally: del searcher
def testExact(self):
    """
    Ensures slop of 0 works for exact matches, but not reversed
    """
    # slop is zero by default
    for word in ("four", "five"):
        self.query.add(Term("field", word))
    hits = self.searcher.search(self.query, 50)
    self.assertEqual(1, hits.totalHits, "exact match")

    self.query = PhraseQuery()
    for word in ("two", "one"):
        self.query.add(Term("field", word))
    hits = self.searcher.search(self.query, 50)
    self.assertEqual(0, hits.totalHits, "reverse not exact")
def testOrderDoesntMatter(self):
    """
    As long as slop is at least 2, terms can be reversed
    """
    self.query.setSlop(2)  # must be at least two for reverse order match
    for word in ("two", "one"):
        self.query.add(Term("field", word))
    matches = self.searcher.search(self.query, 50)
    self.assertEqual(1, matches.totalHits, "just sloppy enough")

    self.query = PhraseQuery()
    self.query.setSlop(2)
    for word in ("three", "one"):
        self.query.add(Term("field", word))
    matches = self.searcher.search(self.query, 50)
    self.assertEqual(0, matches.totalHits, "not sloppy enough")
def testSlop1(self):
    """Slop of 1 matches in-order neighbours but cannot reverse a pair."""
    self.query.setSlop(1)
    for word in ("one", "two"):
        self.query.add(Term("field", word))
    result = self.searcher.search(self.query, 50)
    self.assertEqual(1, result.totalHits, "in order")

    # Reversing two adjacent terms costs two moves, so slop 1 fails.
    self.query = PhraseQuery()
    self.query.setSlop(1)
    for word in ("two", "one"):
        self.query.add(Term("field", word))
    result = self.searcher.search(self.query, 50)
    self.assertEqual(0, result.totalHits, "reversed, slop not 2 or more")
def get_phrase_query(self, query, field):
    """Creates phrase query for searching exact phrase."""
    # NOTE: "slop" argument in phrasequery constructor would implement fuzzy matching
    builder = PhraseQuery.Builder()
    for token in query.split():
        builder.add(Term(field, token))
    return builder.build()
def testDiacriticsShouldBeNormalizedNFC(self):
    # Input contains 'e' followed by a combining acute accent; the analyzer
    # splits this input into the two terms asserted below.
    pq = PhraseQuery()
    pq.add(Term("title", "more"))
    pq.add(Term("title", "e"))
    self.assertConversion(pq, 'title=More\xcc\x81e') # Combined `
    from unicodedata import normalize
    # NOTE(review): `unicode` implies Python 2 — the byte escapes are the
    # UTF-8 encoding of the combining accent. After explicit NFC
    # normalization the input collapses to a single term query.
    self.assertConversion(TermQuery(Term('title', 'moree')), normalize('NFC', unicode('title=More\xcc\x81e')))
def testMultipleTerms(self):
    """
    slop is the total number of positional moves allowed to line up a phrase
    """
    self.query.setSlop(2)
    for word in ("one", "three", "five"):
        self.query.add(Term("field", word))
    found = self.searcher.search(self.query, 50)
    self.assertEqual(1, found.totalHits, "two total moves")

    self.query = PhraseQuery()
    self.query.setSlop(5)  # it takes six moves to match this phrase
    for word in ("five", "three", "one"):
        self.query.add(Term("field", word))
    found = self.searcher.search(self.query, 50)
    self.assertEqual(0, found.totalHits, "slop of 5 not close enough")

    self.query.setSlop(6)
    found = self.searcher.search(self.query, 50)
    self.assertEqual(1, found.totalHits, "slop of 6 just right")
def testPhraseQueryWithStopAnalyzer(self):
    """An exact phrase of non-stopwords matches a StopAnalyzer-built index."""
    writer = self.getWriter(analyzer=StopAnalyzer(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET))
    document = Document()
    document.add(Field("field", "the stop words are here", TextField.TYPE_STORED))
    writer.addDocument(document)
    writer.close()
    searcher = self.getSearcher()

    # valid exact phrase query
    phraseBuilder = PhraseQuery.Builder()
    for word in ("stop", "words"):
        phraseBuilder.add(Term("field", word))
    scoreDocs = searcher.search(phraseBuilder.build(), 50).scoreDocs
    self.assertEqual(1, len(scoreDocs))
def get_phrase_query(self, query, field):
    """Creates phrase query for searching exact phrase.

    Characters outside Latin-1 (e.g. CJK) get a trailing space inserted so
    each becomes its own whitespace-separated token; the resulting tokens
    are passed as plain strings to the PhraseQuery constructor.
    """
    # Single-pass join instead of quadratic per-character += concatenation;
    # dead commented-out Term-building code removed.
    trans_query = ''.join('%s ' % c if ord(c) >= 256 else c for c in query)
    terms = trans_query.split()
    return PhraseQuery(field, terms)
def _parse_query(self, field_name, query):
    """Analyze *query* and OR together one phrase query per analyzed token."""
    # Standard TokenStream lifecycle: reset -> incrementToken* -> end -> close.
    ts = self.analyzer.tokenStream("dummy", StringReader(query))
    termAtt = ts.getAttribute(CharTermAttribute.class_)
    ts.reset()
    tokens = []
    while ts.incrementToken():
        tokens.append(termAtt.toString())
    ts.end()
    ts.close()
    booleanQuery = BooleanQuery.Builder()
    for token in tokens:
        builder = PhraseQuery.Builder()
        # NOTE(review): analyzer tokens rarely contain spaces, so this split
        # usually yields a one-word "phrase" — confirm intent.
        for i, word in enumerate(token.split(' ')):
            builder.add(Term(field_name, word), i)  # explicit position i
        pq = builder.build()
        booleanQuery.add(pq, BooleanClause.Occur.SHOULD)
    final_query = booleanQuery.build()
    return final_query
def testPhraseQueryWithStopAnalyzer(self):
    """Exact phrase of non-stopwords still matches with a StopAnalyzer index."""
    writer = self.getWriter(analyzer=StopAnalyzer(Version.LUCENE_CURRENT))
    document = Document()
    document.add(Field("field", "the stop words are here", TextField.TYPE_STORED))
    writer.addDocument(document)
    writer.close()
    searcher = self.getSearcher()

    # valid exact phrase query
    phrase = PhraseQuery()
    for word in ("stop", "words"):
        phrase.add(Term("field", word))
    scoreDocs = searcher.search(phrase, None, 50).scoreDocs
    self.assertEqual(1, len(scoreDocs))
def search(self, words, words_orig, stopwords=[], min_length=0, slop=2,
           remove_digits=False, any_one_word_occur=False):
    """Three-stage fallback search over wiki-name fields.

    1. Sloppy phrase queries over three analyzed-name fields (OR-ed).
    2. Per-word term queries (each word MUST appear in one of two fields).
    3. All original words MUST appear in "wiki_name_analyzed".
    Returns the first stage's non-empty scoreDocs, else stage 3's result.

    NOTE(review): mutable default `stopwords=[]` is shared across calls —
    harmless only if never mutated; confirm.
    """
    words_without_digits = re.sub(r'\w*\d\w*', '', " ".join(words)).strip().split(" ")
    if remove_digits and len(words_without_digits) > 0:
        words = words_without_digits
    # Drop stopwords and too-short words from both token lists.
    words = [x for x in words
             if x.lower() not in stopwords and len(x) > min_length]
    words_orig = [x for x in words_orig
                  if x.lower() not in stopwords and len(x) > min_length]
    if len(words) == 0:
        return []
    # Stage 1: three sloppy phrase queries, any may match (SHOULD).
    query = BooleanQuery()
    query1 = PhraseQuery()
    query1.setSlop(slop)
    query2 = PhraseQuery()
    query2.setSlop(slop)
    query3 = PhraseQuery()
    query3.setSlop(slop)
    for word in words:
        query2.add(Term("wiki_name_analyzed_nopunct", word))
        query3.add(Term("wiki_name_analyzed_nopunct_nostop", word))
    for word in words_orig:
        query1.add(Term("wiki_name_analyzed", word))
    query.add(query1, BooleanClause.Occur.SHOULD)
    query.add(query2, BooleanClause.Occur.SHOULD)
    query.add(query3, BooleanClause.Occur.SHOULD)
    # print "1. query ", query
    scoreDocs = self.searcher.search(query, self.num_docs_to_return).scoreDocs
    if len(scoreDocs) > 0:
        # self.printDocs(scoreDocs)
        return scoreDocs
    # Stage 2: every word must occur in at least one of the two fields.
    query = BooleanQuery()
    for word in words:
        query_word = BooleanQuery()
        query_word.add(TermQuery(Term("wiki_name_analyzed_nopunct", word)),
                       BooleanClause.Occur.SHOULD)
        query_word.add(
            TermQuery(Term("wiki_name_analyzed_nopunct_nostop", word)),
            BooleanClause.Occur.SHOULD)
        query.add(query_word, BooleanClause.Occur.MUST)
    # print "2. query ", query
    scoreDocs = self.searcher.search(query, self.num_docs_to_return).scoreDocs
    if len(scoreDocs) > 0:
        return scoreDocs
    # Stage 3: all original words required in the analyzed name.
    query = BooleanQuery()
    for word in words_orig:
        query.add(TermQuery(Term("wiki_name_analyzed", word)),
                  BooleanClause.Occur.MUST)
    # print "3. query ", query
    scoreDocs = self.searcher.search(query, self.num_docs_to_return).scoreDocs
    if len(stopwords) > 0 and any_one_word_occur:
        # NOTE(review): this SHOULD-query is built but never searched —
        # the stage-3 scoreDocs are returned unchanged. Looks like a missing
        # `self.searcher.search(...)` call; confirm intended behavior.
        query = BooleanQuery()
        for word in words_orig:
            query.add(TermQuery(Term("wiki_name_analyzed", word)),
                      BooleanClause.Occur.SHOULD)
    return scoreDocs
def testSimilarity(self):
    """SimpleSimilarity scoring for term, boolean and (sloppy) phrase queries."""
    writer = self.getWriter(analyzer=SimpleAnalyzer(Version.LUCENE_CURRENT),
                            similarity=SimpleSimilarity())
    d1 = Document()
    d1.add(Field("field", "a c", TextField.TYPE_STORED))
    d2 = Document()
    d2.add(Field("field", "a b c", TextField.TYPE_STORED))
    writer.addDocument(d1)
    writer.addDocument(d2)
    writer.commit()
    writer.close()

    searcher = self.getSearcher()
    searcher.setSimilarity(SimpleSimilarity())

    a = Term("field", "a")
    b = Term("field", "b")
    c = Term("field", "c")

    # Term query: single match scores 1.0 under SimpleSimilarity.
    class collector1(PythonCollector):
        def collect(_self, doc, score):
            self.assertEqual(1.0, score)
        def setNextReader(_self, context):
            pass
        def acceptsDocsOutOfOrder(_self):
            return True

    searcher.search(TermQuery(b), collector1())

    bq = BooleanQuery()
    bq.add(TermQuery(a), BooleanClause.Occur.SHOULD)
    bq.add(TermQuery(b), BooleanClause.Occur.SHOULD)

    # Boolean query: expected score derives from the global doc id
    # (per-segment doc + docBase + 1).
    class collector2(PythonCollector):
        def collect(_self, doc, score):
            self.assertEqual(doc + _self.base + 1, score)
        def setNextReader(_self, context):
            _self.base = context.docBase
        def acceptsDocsOutOfOrder(_self):
            return True

    searcher.search(bq, collector2())

    # Exact phrase "a c": scores 1.0.
    pq = PhraseQuery()
    pq.add(a)
    pq.add(c)

    class collector3(PythonCollector):
        def collect(_self, doc, score):
            self.assertEqual(1.0, score)
        def setNextReader(_self, context):
            pass
        def acceptsDocsOutOfOrder(_self):
            return True

    searcher.search(pq, collector3())

    # Same phrase with slop 2: both docs match, each scoring 2.0.
    pq.setSlop(2)

    class collector4(PythonCollector):
        def collect(_self, doc, score):
            self.assertEqual(2.0, score)
        def setNextReader(_self, context):
            pass
        def acceptsDocsOutOfOrder(_self):
            return True

    searcher.search(pq, collector4())
def testSimilarity(self):
    """SimpleSimilarity scoring with the newer collector/builder APIs."""
    writer = self.getWriter(analyzer=SimpleAnalyzer(
        Version.LUCENE_CURRENT), similarity=SimpleSimilarity())
    d1 = Document()
    d1.add(Field("field", "a c", TextField.TYPE_STORED))
    d2 = Document()
    d2.add(Field("field", "a c b", TextField.TYPE_STORED))
    writer.addDocument(d1)
    writer.addDocument(d2)
    writer.commit()
    writer.close()

    searcher = self.getSearcher()
    searcher.setSimilarity(SimpleSimilarity())

    a = Term("field", "a")
    b = Term("field", "b")
    c = Term("field", "c")

    # Term query: single match scores 1.0 under SimpleSimilarity.
    class collector1(PythonSimpleCollector):
        def collect(_self, doc, score):
            self.assertEqual(1.0, score)
        def doSetNextReader(_self, context):
            pass
        def scoreMode(_self):
            return ScoreMode.COMPLETE

    searcher.search(TermQuery(b), collector1())

    builder = BooleanQuery.Builder()
    builder.add(TermQuery(a), BooleanClause.Occur.SHOULD)
    builder.add(TermQuery(b), BooleanClause.Occur.SHOULD)
    bq = builder.build()

    # Boolean query: expected score derives from the global doc id.
    class collector2(PythonSimpleCollector):
        def collect(_self, doc, score):
            self.assertEqual(doc + _self.base + 1, score)
        def doSetNextReader(_self, context):
            _self.base = context.docBase
        def scoreMode(_self):
            return ScoreMode.COMPLETE

    searcher.search(bq, collector2())

    # Exact phrase "a c" built via the terms-array constructor.
    pq = PhraseQuery(a.field(), [a.bytes(), c.bytes()])

    class collector3(PythonSimpleCollector):
        def collect(_self, doc, score):
            self.assertEqual(1.0, score)
        def doSetNextReader(_self, context):
            pass
        def scoreMode(_self):
            return ScoreMode.COMPLETE

    searcher.search(pq, collector3())

    # Phrase "a b" with slop 2 via the (slop, field, terms) constructor.
    pq = PhraseQuery(2, a.field(), [a.bytes(), b.bytes()])

    class collector4(PythonSimpleCollector):
        def collect(_self, doc, score):
            self.assertEqual(0.5, score)
        def doSetNextReader(_self, context):
            pass
        def scoreMode(_self):
            return ScoreMode.COMPLETE

    searcher.search(pq, collector4())
def testPhraseOutputDutchStemming(self):
    """Dutch stemming reduces the plural phrase words to their stems."""
    self.composer = LuceneQueryComposer(
        unqualifiedTermFields=[("unqualified", 1.0)],
        luceneSettings=LuceneSettings(analyzer=MerescoDutchStemmingAnalyzer()))
    expected = PhraseQuery()
    for stem in ("kat", "hond"):
        expected.add(Term("unqualified", stem))
    self.assertConversion(expected, '"katten honden"')
def testOneTermPhraseQueryUsesStandardAnalyzed(self):
    """The standard analyzer splits 'aap:noot' into a two-term phrase."""
    expected = PhraseQuery()
    for token in ('aap', 'noot'):
        expected.add(Term('unqualified', token))
    self.assertConversion(expected, 'aap:noot')
def testPhraseOutput(self):
    """A quoted pair of words converts to a two-term phrase query."""
    expected = PhraseQuery()
    for word in ("cats", "dogs"):
        expected.add(Term("unqualified", word))
    self.assertConversion(expected, '"cats dogs"')
def testSetPosition(self):
    """Custom position increments drive phrase matching; explicit positions
    passed to PhraseQuery.add override the default consecutive positions."""

    class _tokenizer(PythonTokenizer):

        def __init__(_self, reader):
            super(_tokenizer, _self).__init__(reader)
            # Tokens "1".."5" with increments 1,2,1,0,1 land at
            # positions 0,2,3,3,4 ("4" stacks on "3").
            _self.TOKENS = ["1", "2", "3", "4", "5"]
            _self.INCREMENTS = [1, 2, 1, 0, 1]
            _self.i = 0
            _self.posIncrAtt = _self.addAttribute(PositionIncrementAttribute.class_)
            _self.termAtt = _self.addAttribute(CharTermAttribute.class_)
            _self.offsetAtt = _self.addAttribute(OffsetAttribute.class_)

        def incrementToken(_self):
            if _self.i == len(_self.TOKENS):
                return False
            _self.clearAttributes()
            _self.termAtt.append(_self.TOKENS[_self.i])
            _self.offsetAtt.setOffset(_self.i, _self.i)
            _self.posIncrAtt.setPositionIncrement(_self.INCREMENTS[_self.i])
            _self.i += 1
            return True

        def end(_self):
            pass

        def reset(_self):
            pass

        def close(_self):
            pass

    class _analyzer(PythonAnalyzer):
        def createComponents(_self, fieldName, reader):
            return Analyzer.TokenStreamComponents(_tokenizer(reader))

    # Index a single document; field text is irrelevant ("bogus") because
    # the custom tokenizer emits its own fixed token stream.
    writer = self.getWriter(analyzer=_analyzer())
    d = Document()
    d.add(Field("field", "bogus", TextField.TYPE_STORED))
    writer.addDocument(d)
    writer.commit()
    writer.close()

    searcher = self.getSearcher()
    reader = searcher.getIndexReader()
    pos = MultiFields.getTermPositionsEnum(reader,
                                           MultiFields.getLiveDocs(reader),
                                           "field", BytesRef("1"))
    pos.nextDoc()
    # first token should be at position 0
    self.assertEqual(0, pos.nextPosition())
    pos = MultiFields.getTermPositionsEnum(reader,
                                           MultiFields.getLiveDocs(reader),
                                           "field", BytesRef("2"))
    pos.nextDoc()
    # second token should be at position 2
    self.assertEqual(2, pos.nextPosition())

    # Default (consecutive) positions 0,1 do not match the 0,2 layout.
    q = PhraseQuery()
    q.add(Term("field", "1"))
    q.add(Term("field", "2"))
    hits = searcher.search(q, None, 1000).scoreDocs
    self.assertEqual(0, len(hits))

    # same as previous, just specify positions explicitely.
    q = PhraseQuery()
    q.add(Term("field", "1"), 0)
    q.add(Term("field", "2"), 1)
    hits = searcher.search(q, None, 1000).scoreDocs
    self.assertEqual(0, len(hits))

    # specifying correct positions should find the phrase.
    q = PhraseQuery()
    q.add(Term("field", "1"), 0)
    q.add(Term("field", "2"), 2)
    hits = searcher.search(q, None, 1000).scoreDocs
    self.assertEqual(1, len(hits))

    q = PhraseQuery()
    q.add(Term("field", "2"))
    q.add(Term("field", "3"))
    hits = searcher.search(q, None, 1000).scoreDocs
    self.assertEqual(1, len(hits))

    # "3" and "4" share position 3, so consecutive positions fail.
    q = PhraseQuery()
    q.add(Term("field", "3"))
    q.add(Term("field", "4"))
    hits = searcher.search(q, None, 1000).scoreDocs
    self.assertEqual(0, len(hits))

    # phrase query would find it when correct positions are specified.
    q = PhraseQuery()
    q.add(Term("field", "3"), 0)
    q.add(Term("field", "4"), 0)
    hits = searcher.search(q, None, 1000).scoreDocs
    self.assertEqual(1, len(hits))

    # phrase query should fail for non existing searched term
    # even if there exist another searched terms in the same searched
    # position.
    q = PhraseQuery()
    q.add(Term("field", "3"), 0)
    q.add(Term("field", "9"), 0)
    hits = searcher.search(q, None, 1000).scoreDocs
    self.assertEqual(0, len(hits))

    # multi-phrase query should succed for non existing searched term
    # because there exist another searched terms in the same searched
    # position.
    mq = MultiPhraseQuery()
    mq.add([Term("field", "3"), Term("field", "9")], 0)
    hits = searcher.search(mq, None, 1000).scoreDocs
    self.assertEqual(1, len(hits))

    q = PhraseQuery()
    q.add(Term("field", "2"))
    q.add(Term("field", "4"))
    hits = searcher.search(q, None, 1000).scoreDocs
    self.assertEqual(1, len(hits))

    q = PhraseQuery()
    q.add(Term("field", "3"))
    q.add(Term("field", "5"))
    hits = searcher.search(q, None, 1000).scoreDocs
    self.assertEqual(1, len(hits))

    q = PhraseQuery()
    q.add(Term("field", "4"))
    q.add(Term("field", "5"))
    hits = searcher.search(q, None, 1000).scoreDocs
    self.assertEqual(1, len(hits))

    q = PhraseQuery()
    q.add(Term("field", "2"))
    q.add(Term("field", "5"))
    hits = searcher.search(q, None, 1000).scoreDocs
    self.assertEqual(0, len(hits))
def testPhraseQueryIsStandardAnalyzed(self):
    """The quoted input is analyzed (lowercased, parens dropped) into terms."""
    expected = PhraseQuery()
    for token in ("vol.118", "2008", "nr.3", "march", "p.435-444"):
        expected.add(Term("unqualified", token))
    self.assertConversion(expected, '"vol.118 (2008) nr.3 (March) p.435-444"')
def get_phrase_query(self, query, field):
    """Creates phrase query for searching exact phrase."""
    phrase = PhraseQuery()
    for token in query.split():
        phrase.add(Term(field, token))
    return phrase
def testStandardAnalyserWithoutStopWords(self):
    """Stopwords are retained: every word of the phrase becomes a term."""
    expected = PhraseQuery()
    for word in "no is the only option".split():
        expected.add(Term("unqualified", word))
    self.assertConversion(expected, '"no is the only option"')
def testPhraseQueryInConjunctionScorer(self):
    """Phrase queries combined with term queries under MUST clauses."""
    writer = self.getWriter()

    doc = Document()
    doc.add(Field("source", "marketing info", TextField.TYPE_STORED))
    writer.addDocument(doc)

    doc = Document()
    doc.add(Field("contents", "foobar", TextField.TYPE_STORED))
    doc.add(Field("source", "marketing info", TextField.TYPE_STORED))
    writer.addDocument(doc)

    writer.close()

    searcher = self.getSearcher()

    # Phrase alone matches both documents.
    phraseQuery = PhraseQuery()
    phraseQuery.add(Term("source", "marketing"))
    phraseQuery.add(Term("source", "info"))
    topDocs = searcher.search(phraseQuery, 50)
    self.assertEqual(2, topDocs.totalHits)

    # AND-ing with the "foobar" term narrows to the second document.
    termQuery = TermQuery(Term("contents", "foobar"))
    booleanQuery = BooleanQuery()
    booleanQuery.add(termQuery, BooleanClause.Occur.MUST)
    booleanQuery.add(phraseQuery, BooleanClause.Occur.MUST)
    topDocs = searcher.search(booleanQuery, 50)
    self.assertEqual(1, topDocs.totalHits)

    # Second index: three docs, two containing the exact phrase "map entry".
    writer = self.getWriter()

    doc = Document()
    doc.add(Field("contents", "map entry woo", TextField.TYPE_STORED))
    writer.addDocument(doc)

    doc = Document()
    doc.add(Field("contents", "woo map entry", TextField.TYPE_STORED))
    writer.addDocument(doc)

    doc = Document()
    doc.add(Field("contents", "map foobarword entry woo", TextField.TYPE_STORED))
    writer.addDocument(doc)

    writer.close()

    searcher = self.getSearcher()

    termQuery = TermQuery(Term("contents", "woo"))
    phraseQuery = PhraseQuery()
    phraseQuery.add(Term("contents", "map"))
    phraseQuery.add(Term("contents", "entry"))

    topDocs = searcher.search(termQuery, 50)
    self.assertEqual(3, topDocs.totalHits)
    topDocs = searcher.search(phraseQuery, 50)
    self.assertEqual(2, topDocs.totalHits)

    # Conjunction result is independent of clause order.
    booleanQuery = BooleanQuery()
    booleanQuery.add(termQuery, BooleanClause.Occur.MUST)
    booleanQuery.add(phraseQuery, BooleanClause.Occur.MUST)
    topDocs = searcher.search(booleanQuery, 50)
    self.assertEqual(2, topDocs.totalHits)

    booleanQuery = BooleanQuery()
    booleanQuery.add(phraseQuery, BooleanClause.Occur.MUST)
    booleanQuery.add(termQuery, BooleanClause.Occur.MUST)
    topDocs = searcher.search(booleanQuery, 50)
    self.assertEqual(2, topDocs.totalHits)
def testSetPosition(self):
    """Custom position increments drive phrase matching (Builder API);
    explicit positions in PhraseQuery.Builder.add override defaults."""

    class _tokenizer(PythonTokenizer):

        def __init__(_self):
            super(_tokenizer, _self).__init__()
            # Tokens "1".."5" with increments 1,2,1,0,1 land at
            # positions 0,2,3,3,4 ("4" stacks on "3").
            _self.TOKENS = ["1", "2", "3", "4", "5"]
            _self.INCREMENTS = [1, 2, 1, 0, 1]
            _self.i = 0
            _self.posIncrAtt = _self.addAttribute(PositionIncrementAttribute.class_)
            _self.termAtt = _self.addAttribute(CharTermAttribute.class_)
            _self.offsetAtt = _self.addAttribute(OffsetAttribute.class_)

        def incrementToken(_self):
            if _self.i == len(_self.TOKENS):
                return False
            _self.clearAttributes()
            _self.termAtt.append(_self.TOKENS[_self.i])
            _self.offsetAtt.setOffset(_self.i, _self.i)
            _self.posIncrAtt.setPositionIncrement(_self.INCREMENTS[_self.i])
            _self.i += 1
            return True

        def reset(_self):
            # Rewind so the stream can be consumed again.
            super(_tokenizer, _self).reset()
            _self.i = 0

    class _analyzer(PythonAnalyzer):
        def createComponents(_self, fieldName):
            return Analyzer.TokenStreamComponents(_tokenizer())
        def initReader(_self, fieldName, reader):
            return reader

    # Index a single document; field text is irrelevant ("bogus") because
    # the custom tokenizer emits its own fixed token stream.
    writer = self.getWriter(analyzer=_analyzer())
    d = Document()
    d.add(Field("field", "bogus", TextField.TYPE_STORED))
    writer.addDocument(d)
    writer.commit()
    writer.close()

    searcher = self.getSearcher()
    reader = searcher.getIndexReader()
    pos = MultiFields.getTermPositionsEnum(reader, "field", BytesRef("1"))
    pos.nextDoc()
    # first token should be at position 0
    self.assertEqual(0, pos.nextPosition())
    pos = MultiFields.getTermPositionsEnum(reader, "field", BytesRef("2"))
    pos.nextDoc()
    # second token should be at position 2
    self.assertEqual(2, pos.nextPosition())

    # Default (consecutive) positions 0,1 do not match the 0,2 layout.
    b = PhraseQuery.Builder()
    b.add(Term("field", "1"))
    b.add(Term("field", "2"))
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(0, len(hits))

    # same as previous, just specify positions explicitely.
    b = PhraseQuery.Builder()
    b.add(Term("field", "1"), 0)
    b.add(Term("field", "2"), 1)
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(0, len(hits))

    # specifying correct positions should find the phrase.
    b = PhraseQuery.Builder()
    b.add(Term("field", "1"), 0)
    b.add(Term("field", "2"), 2)
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(1, len(hits))

    b = PhraseQuery.Builder()
    b.add(Term("field", "2"))
    b.add(Term("field", "3"))
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(1, len(hits))

    # "3" and "4" share position 3, so consecutive positions fail.
    b = PhraseQuery.Builder()
    b.add(Term("field", "3"))
    b.add(Term("field", "4"))
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(0, len(hits))

    # phrase query would find it when correct positions are specified.
    b = PhraseQuery.Builder()
    b.add(Term("field", "3"), 0)
    b.add(Term("field", "4"), 0)
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(1, len(hits))

    # phrase query should fail for non existing searched term
    # even if there exist another searched terms in the same searched
    # position.
    b = PhraseQuery.Builder()
    b.add(Term("field", "3"), 0)
    b.add(Term("field", "9"), 0)
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(0, len(hits))

    # multi-phrase query should succed for non existing searched term
    # because there exist another searched terms in the same searched
    # position.
    b = MultiPhraseQuery.Builder()
    b.add([Term("field", "3"), Term("field", "9")], 0)
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(1, len(hits))

    b = PhraseQuery.Builder()
    b.add(Term("field", "2"))
    b.add(Term("field", "4"))
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(1, len(hits))

    b = PhraseQuery.Builder()
    b.add(Term("field", "3"))
    b.add(Term("field", "5"))
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(1, len(hits))

    b = PhraseQuery.Builder()
    b.add(Term("field", "4"))
    b.add(Term("field", "5"))
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(1, len(hits))

    b = PhraseQuery.Builder()
    b.add(Term("field", "2"))
    b.add(Term("field", "5"))
    hits = searcher.search(b.build(), 1000).scoreDocs
    self.assertEqual(0, len(hits))
# Fuzzy search for "shakespeare" in the capital pages' HTML.
query1b = Term("capital_html", "shakespeare")
fuzzy_query_b = FuzzyQuery(query1b)
get_query_results(reader, fuzzy_query_b, n_docs, "capital")
#Results:
#Found 4 hits:
#1. https://en.wikipedia.org/wiki/London
#2. https://en.wikipedia.org/wiki/Prague
#3. https://en.wikipedia.org/wiki/Cairo
#4. https://en.wikipedia.org/wiki/Washington,_D.C.

#### part(c)
# Sloppy (slop 10) phrase search for "located below sea level".
phrase_c = PhraseQuery()
phrase_c.setSlop(10)
term_phrase_c = 'located below sea level'
token_phrase_c = lucene_english_normalizer(term_phrase_c)


def get_phrase(token_phrase):
    # Appends one Term per normalized token to the enclosing phrase_c
    # (mutates the module-level query via closure).
    for word in token_phrase:
        term = Term('capital_html', word.encode('ascii', 'ignore'))
        phrase_c.add(term)


get_phrase(token_phrase_c)
get_query_results(reader, phrase_c, n_docs, 'capital')
#Found 1 hits:
#Results:
# Regex query: integer or decimal numbers in "contents".
q_regex = RegexpQuery(Term("contents", "[0-9]+\.?[0-9]*"))
print(f'regex results: {searcher.search(q_regex,1000000).totalHits}')

# Span query: a number within 20 positions of "tiger", in order.
span1 = SpanMultiTermQueryWrapper(q_regex)
span2 = SpanMultiTermQueryWrapper(RegexpQuery(Term("contents", "tiger")))
spannearquery = SpanNearQuery([span1, span2], 20, True)
print(
    f'spanquery results: {searcher.search(spannearquery, 1000000).totalHits}'
)

parser = QueryParser('contents', StandardAnalyzer())
q = parser.parse('"tiger leopard"')
print(q)  # prints contents:"tiger leopard"
print(searcher.search(q, 10000000).totalHits)

# NOTE(review): the varargs PhraseQuery constructor treats each string as one
# term — 'tiger leopard' here is a single term, not two; confirm against the
# parsed '"tiger leopard"~10' query below, which analyzes into two terms.
phrase_query = PhraseQuery(10, 'contents', 'tiger leopard')
print(phrase_query)
print(searcher.search(phrase_query, 10000000).totalHits)

parser = QueryParser('contents', StandardAnalyzer())
q = parser.parse('"tiger leopard"~10')
print(q)  # prints contents:"tiger leopard"~10
print(searcher.search(q, 10000000).totalHits)

# Iterate every stored document for custom pattern matching.
for i in range(0, reader.numDocs()):
    doc = reader.document(i)
    text = doc.get("contents")
    articleID = doc.get("articleID")
    # Do your pattern matching and record patterns for document articleID
class PhraseQueryTestCase(PyLuceneTestCase):
    """
    Unit tests ported from Java Lucene
    """

    def setUp(self):
        # One five-word document; every test searches this index.
        super(PhraseQueryTestCase, self).setUp()

        doc = Document()
        doc.add(Field("field", "one two three four five", TextField.TYPE_STORED))
        writer = self.getWriter()
        writer.addDocument(doc)
        writer.close()

        self.searcher = self.getSearcher()
        self.query = PhraseQuery()

    def testNotCloseEnough(self):
        # "one" and "five" are 4 positions apart; slop 2 is insufficient.
        self.query.setSlop(2)
        self.query.add(Term("field", "one"))
        self.query.add(Term("field", "five"))
        topDocs = self.searcher.search(self.query, 50)
        self.assertEqual(0, topDocs.totalHits)

    def testBarelyCloseEnough(self):
        # Slop 3 exactly covers the gap between "one" and "five".
        self.query.setSlop(3)
        self.query.add(Term("field", "one"))
        self.query.add(Term("field", "five"))
        topDocs = self.searcher.search(self.query, 50)
        self.assertEqual(1, topDocs.totalHits)

    def testExact(self):
        """
        Ensures slop of 0 works for exact matches, but not reversed
        """
        # slop is zero by default
        self.query.add(Term("field", "four"))
        self.query.add(Term("field", "five"))
        topDocs = self.searcher.search(self.query, 50)
        self.assertEqual(1, topDocs.totalHits, "exact match")

        self.query = PhraseQuery()
        self.query.add(Term("field", "two"))
        self.query.add(Term("field", "one"))
        topDocs = self.searcher.search(self.query, 50)
        self.assertEqual(0, topDocs.totalHits, "reverse not exact")

    def testSlop1(self):
        # Ensures slop of 1 works with terms in order.
        self.query.setSlop(1)
        self.query.add(Term("field", "one"))
        self.query.add(Term("field", "two"))
        topDocs = self.searcher.search(self.query, 50)
        self.assertEqual(1, topDocs.totalHits, "in order")

        # Ensures slop of 1 does not work for phrases out of order
        # must be at least 2.
        self.query = PhraseQuery()
        self.query.setSlop(1)
        self.query.add(Term("field", "two"))
        self.query.add(Term("field", "one"))
        topDocs = self.searcher.search(self.query, 50)
        self.assertEqual(0, topDocs.totalHits, "reversed, slop not 2 or more")

    def testOrderDoesntMatter(self):
        """
        As long as slop is at least 2, terms can be reversed
        """
        self.query.setSlop(2)  # must be at least two for reverse order match
        self.query.add(Term("field", "two"))
        self.query.add(Term("field", "one"))
        topDocs = self.searcher.search(self.query, 50)
        self.assertEqual(1, topDocs.totalHits, "just sloppy enough")

        self.query = PhraseQuery()
        self.query.setSlop(2)
        self.query.add(Term("field", "three"))
        self.query.add(Term("field", "one"))
        topDocs = self.searcher.search(self.query, 50)
        self.assertEqual(0, topDocs.totalHits, "not sloppy enough")

    def testMultipleTerms(self):
        """
        slop is the total number of positional moves allowed to line up a phrase
        """
        self.query.setSlop(2)
        self.query.add(Term("field", "one"))
        self.query.add(Term("field", "three"))
        self.query.add(Term("field", "five"))
        topDocs = self.searcher.search(self.query, 50)
        self.assertEqual(1, topDocs.totalHits, "two total moves")

        self.query = PhraseQuery()
        self.query.setSlop(5)  # it takes six moves to match this phrase
        self.query.add(Term("field", "five"))
        self.query.add(Term("field", "three"))
        self.query.add(Term("field", "one"))
        topDocs = self.searcher.search(self.query, 50)
        self.assertEqual(0, topDocs.totalHits, "slop of 5 not close enough")

        self.query.setSlop(6)
        topDocs = self.searcher.search(self.query, 50)
        self.assertEqual(1, topDocs.totalHits, "slop of 6 just right")

    def testPhraseQueryWithStopAnalyzer(self):
        # An exact phrase of non-stopwords matches a StopAnalyzer index.
        writer = self.getWriter(analyzer=StopAnalyzer(Version.LUCENE_CURRENT))
        doc = Document()
        doc.add(Field("field", "the stop words are here", TextField.TYPE_STORED))
        writer.addDocument(doc)
        writer.close()
        searcher = self.getSearcher()

        # valid exact phrase query
        query = PhraseQuery()
        query.add(Term("field", "stop"))
        query.add(Term("field", "words"))
        scoreDocs = searcher.search(query, None, 50).scoreDocs
        self.assertEqual(1, len(scoreDocs))

    def testPhraseQueryInConjunctionScorer(self):
        # Phrase queries combined with term queries under MUST clauses.
        writer = self.getWriter()

        doc = Document()
        doc.add(Field("source", "marketing info", TextField.TYPE_STORED))
        writer.addDocument(doc)

        doc = Document()
        doc.add(Field("contents", "foobar", TextField.TYPE_STORED))
        doc.add(Field("source", "marketing info", TextField.TYPE_STORED))
        writer.addDocument(doc)

        writer.close()

        searcher = self.getSearcher()

        # Phrase alone matches both documents.
        phraseQuery = PhraseQuery()
        phraseQuery.add(Term("source", "marketing"))
        phraseQuery.add(Term("source", "info"))
        topDocs = searcher.search(phraseQuery, 50)
        self.assertEqual(2, topDocs.totalHits)

        # AND-ing with the "foobar" term narrows to the second document.
        termQuery = TermQuery(Term("contents","foobar"))
        booleanQuery = BooleanQuery()
        booleanQuery.add(termQuery, BooleanClause.Occur.MUST)
        booleanQuery.add(phraseQuery, BooleanClause.Occur.MUST)
        topDocs = searcher.search(booleanQuery, 50)
        self.assertEqual(1, topDocs.totalHits)

        # Second index: three docs, two containing the exact phrase "map entry".
        writer = self.getWriter()

        doc = Document()
        doc.add(Field("contents", "map entry woo", TextField.TYPE_STORED))
        writer.addDocument(doc)

        doc = Document()
        doc.add(Field("contents", "woo map entry", TextField.TYPE_STORED))
        writer.addDocument(doc)

        doc = Document()
        doc.add(Field("contents", "map foobarword entry woo", TextField.TYPE_STORED))
        writer.addDocument(doc)

        writer.close()

        searcher = self.getSearcher()

        termQuery = TermQuery(Term("contents", "woo"))
        phraseQuery = PhraseQuery()
        phraseQuery.add(Term("contents", "map"))
        phraseQuery.add(Term("contents", "entry"))

        topDocs = searcher.search(termQuery, 50)
        self.assertEqual(3, topDocs.totalHits)
        topDocs = searcher.search(phraseQuery, 50)
        self.assertEqual(2, topDocs.totalHits)

        # Conjunction result is independent of clause order.
        booleanQuery = BooleanQuery()
        booleanQuery.add(termQuery, BooleanClause.Occur.MUST)
        booleanQuery.add(phraseQuery, BooleanClause.Occur.MUST)
        topDocs = searcher.search(booleanQuery, 50)
        self.assertEqual(2, topDocs.totalHits)

        booleanQuery = BooleanQuery()
        booleanQuery.add(phraseQuery, BooleanClause.Occur.MUST)
        booleanQuery.add(termQuery, BooleanClause.Occur.MUST)
        topDocs = searcher.search(booleanQuery, 50)
        self.assertEqual(2, topDocs.totalHits)
def testPhraseQueryInConjunctionScorer(self):
    """A PhraseQuery combined with other MUST clauses in a BooleanQuery
    has to return the same hits as it does standalone (exercises the
    conjunction scorer)."""
    index_writer = self.getWriter()

    first = Document()
    first.add(Field("source", "marketing info", TextField.TYPE_STORED))
    index_writer.addDocument(first)

    second = Document()
    second.add(Field("contents", "foobar", TextField.TYPE_STORED))
    second.add(Field("source", "marketing info", TextField.TYPE_STORED))
    index_writer.addDocument(second)
    index_writer.close()

    searcher = self.getSearcher()

    # Both documents share the phrase "marketing info".
    phrase_builder = PhraseQuery.Builder()
    phrase_builder.add(Term("source", "marketing"))
    phrase_builder.add(Term("source", "info"))
    phrase_query = phrase_builder.build()
    self.assertEqual(2, searcher.search(phrase_query, 50).totalHits)

    # AND-ed with the term query, only the second document qualifies.
    term_query = TermQuery(Term("contents", "foobar"))
    bool_builder = BooleanQuery.Builder()
    bool_builder.add(term_query, BooleanClause.Occur.MUST)
    bool_builder.add(phrase_query, BooleanClause.Occur.MUST)
    self.assertEqual(1, searcher.search(bool_builder.build(), 50).totalHits)

    # Re-index: three documents with "woo", two with the phrase "map entry".
    index_writer = self.getWriter()
    for text in ("map entry woo",
                 "woo map entry",
                 "map foobarword entry woo"):
        doc = Document()
        doc.add(Field("contents", text, TextField.TYPE_STORED))
        index_writer.addDocument(doc)
    index_writer.close()

    searcher = self.getSearcher()

    term_query = TermQuery(Term("contents", "woo"))
    phrase_builder = PhraseQuery.Builder()
    phrase_builder.add(Term("contents", "map"))
    phrase_builder.add(Term("contents", "entry"))
    phrase_query = phrase_builder.build()

    # Standalone baselines.
    self.assertEqual(3, searcher.search(term_query, 50).totalHits)
    self.assertEqual(2, searcher.search(phrase_query, 50).totalHits)

    # Clause order must not affect the conjunction's result.
    bool_builder = BooleanQuery.Builder()
    bool_builder.add(term_query, BooleanClause.Occur.MUST)
    bool_builder.add(phrase_query, BooleanClause.Occur.MUST)
    self.assertEqual(2, searcher.search(bool_builder.build(), 50).totalHits)

    bool_builder = BooleanQuery.Builder()
    bool_builder.add(phrase_query, BooleanClause.Occur.MUST)
    bool_builder.add(term_query, BooleanClause.Occur.MUST)
    self.assertEqual(2, searcher.search(bool_builder.build(), 50).totalHits)
def search(self, value, stopwords=None, min_length=0):
    """Search the index for the tokens of *value*.

    First tries slop-2 phrase queries over the three analyzed name
    fields; if any of those produce hits, they are returned.  Otherwise
    falls back to requiring every token to appear in at least one of
    the three fields (term queries, any order).

    :param value: free-text query string, tokenized with nltk.
    :param stopwords: optional collection of tokens to drop
        (default: none).  Was a mutable default argument (``[]``);
        replaced with the ``None`` sentinel to avoid the shared-default
        pitfall — behavior is unchanged.
    :param min_length: tokens of this length or shorter are dropped.
    :return: list of scoreDocs from the Lucene searcher.
    """
    # Set lookup is O(1) per token vs O(n) on the original list.
    stop = set(stopwords) if stopwords else set()
    words = [x for x in nltk.word_tokenize(value)
             if x not in stop and len(x) > min_length]

    # Pass 1: slop-2 phrase queries, one per analyzed field variant,
    # OR-ed together.
    fields = ("wiki_name_analyzed",
              "wiki_name_analyzed_nopunct",
              "wiki_name_analyzed_nopunct_nostop")
    phrase_queries = []
    for _ in fields:
        pq = PhraseQuery()
        pq.setSlop(2)
        phrase_queries.append(pq)
    for word in words:
        for pq, field in zip(phrase_queries, fields):
            pq.add(Term(field, word))

    query = BooleanQuery()
    for pq in phrase_queries:
        query.add(pq, BooleanClause.Occur.SHOULD)

    scoreDocs = self.searcher.search(query,
                                     self.num_docs_to_return).scoreDocs
    if scoreDocs:
        return scoreDocs

    # Pass 2: every token MUST match at least one field.
    query = BooleanQuery()
    for word in words:
        per_word = BooleanQuery()
        for field in fields:
            per_word.add(TermQuery(Term(field, word)),
                         BooleanClause.Occur.SHOULD)
        query.add(per_word, BooleanClause.Occur.MUST)
    return self.searcher.search(query, self.num_docs_to_return).scoreDocs
def testSimilarity(self):
    """Index two docs under a custom SimpleSimilarity and check the
    per-hit scores delivered to collectors for term, boolean and
    phrase queries."""
    writer = self.getWriter(analyzer=SimpleAnalyzer(
        Version.LUCENE_CURRENT), similarity=SimpleSimilarity())
    d1 = Document()
    d1.add(Field("field", "a c", TextField.TYPE_STORED))
    d2 = Document()
    d2.add(Field("field", "a b c", TextField.TYPE_STORED))
    writer.addDocument(d1)
    writer.addDocument(d2)
    writer.commit()
    writer.close()

    searcher = self.getSearcher()
    searcher.setSimilarity(SimpleSimilarity())

    a = Term("field", "a")
    b = Term("field", "b")
    c = Term("field", "c")

    # Single term query: every hit must score exactly 1.0.
    # NOTE: collectors use `_self` so that `self` still refers to the
    # enclosing test case inside collect().
    class collector1(PythonCollector):
        def collect(_self, doc, score):
            self.assertEqual(1.0, score)
        def setNextReader(_self, context):
            pass
        def acceptsDocsOutOfOrder(_self):
            return True

    searcher.search(TermQuery(b), collector1())

    # Boolean OR of two terms: expected score is the doc id (segment
    # id plus the segment's docBase) + 1.
    bq = BooleanQuery()
    bq.add(TermQuery(a), BooleanClause.Occur.SHOULD)
    bq.add(TermQuery(b), BooleanClause.Occur.SHOULD)

    class collector2(PythonCollector):
        def collect(_self, doc, score):
            self.assertEqual(doc + _self.base + 1, score)
        def setNextReader(_self, context):
            # Remember the segment's base so collect() can compute the
            # global doc id.
            _self.base = context.docBase
        def acceptsDocsOutOfOrder(_self):
            return True

    searcher.search(bq, collector2())

    # Phrase "a c" with default slop: expected score 1.0.
    pq = PhraseQuery()
    pq.add(a)
    pq.add(c)

    class collector3(PythonCollector):
        def collect(_self, doc, score):
            self.assertEqual(1.0, score)
        def setNextReader(_self, context):
            pass
        def acceptsDocsOutOfOrder(_self):
            return True

    searcher.search(pq, collector3())

    # Same phrase with slop 2: expected score 2.0.
    pq.setSlop(2)

    class collector4(PythonCollector):
        def collect(_self, doc, score):
            self.assertEqual(2.0, score)
        def setNextReader(_self, context):
            pass
        def acceptsDocsOutOfOrder(_self):
            return True

    searcher.search(pq, collector4())