def testTerm(self):
    """Single-term searches return the expected hit counts per subject."""
    searcher = IndexSearcher(self.directory, True)

    antDocs = searcher.search(TermQuery(Term("subject", "ant")), 50).scoreDocs
    self.assertEqual(1, len(antDocs), "JDwA")

    junitDocs = searcher.search(TermQuery(Term("subject", "junit")),
                                50).scoreDocs
    self.assertEqual(2, len(junitDocs))

    searcher.close()
def testSecurityFilter(self):
    """A QueryWrapperFilter restricts hits to documents owned by 'jake'.

    Without the filter both documents match "info"; with the owner
    filter only jake's document remains visible.
    """
    query = TermQuery(Term("keywords", "info"))
    searcher = IndexSearcher(self.directory, True)
    try:
        topDocs = searcher.search(query, 50)
        self.assertEqual(2, topDocs.totalHits, "Both documents match")

        jakeFilter = QueryWrapperFilter(TermQuery(Term("owner", "jake")))
        scoreDocs = searcher.search(query, jakeFilter, 50).scoreDocs
        self.assertEqual(1, len(scoreDocs))
        self.assertEqual("jakes sensitive info",
                         searcher.doc(scoreDocs[0].doc).get("keywords"),
                         "elwood is safe")
    finally:
        # the original leaked the searcher; close it like the sibling tests do
        searcher.close()
def search(self, topic):
    """Expand a topic-title query with its top co-occurring terms.

    Parses topic.title, takes the top_n hits, tallies (field, term)
    pairs across the title/heading/text fields of those hits, then
    re-searches with the original query (heavily boosted) OR'ed with
    the 25 highest-scoring terms.  Returns the TopDocs of the expanded
    search (up to 5000 hits).
    """
    query = self.query_parser.parse(topic.title)
    results = self.searcher.search(query, self.top_n)

    score_pairs = {}
    for hit in results.scoreDocs:
        doc = self.searcher.doc(hit.doc)
        for field in ["title", "heading", "text"]:
            # doc.get() returns None when a field is absent on this
            # document; the original would raise AttributeError here
            terms = (doc.get(field) or "").split()
            for term in terms:
                key = (field, term)
                if key in score_pairs:
                    score_pairs[key].increment()
                else:
                    score_pairs[key] = ScorePair(self.reader, field, term)

    # keep only the 25 best-scoring expansion terms
    top_terms = sorted(score_pairs.values(),
                       key=lambda pair: pair.score(), reverse=True)[:25]

    bq = BooleanQuery()
    # make the original query dominate the expansion terms
    query.setBoost(float(10000000))
    bq.add(query, BooleanClause.Occur.SHOULD)
    for score_pair in top_terms:
        bq.add(TermQuery(score_pair.to_term()), BooleanClause.Occur.SHOULD)

    return self.searcher.search(bq, 5000)
def testChinese(self):
    """The indexed Chinese character 道 is found by an exact term query."""
    searcher = IndexSearcher(self.directory, True)
    hits = searcher.search(TermQuery(Term("contents", "道")), 50).scoreDocs
    self.assertEqual(1, len(hits), "tao")
def purgeDocuments(self, txn, counter, indexSearcher, indexReader, uItem,
                   toVersion=None):
    """Delete this item's documents from the full-text index.

    With no toVersion, every document for the item is removed in one
    call.  Otherwise only documents at or below toVersion whose value
    is no longer among the values kept at that version are deleted.
    counter.documentCount is incremented by the number of deletions.
    """
    term = Term("item", uItem.str64())

    if toVersion is None:
        # deleteDocuments returns the number of documents it removed
        counter.documentCount += indexReader.deleteDocuments(term)
    else:
        # values still referenced at toVersion must be preserved
        x, keep = self.store._items.findValues(None, toVersion, uItem,
                                               None, True)
        keep = set(keep)
        for hit in indexSearcher.search(TermQuery(term)):
            hit = Hit.cast_(hit)
            doc = hit.getDocument()
            ver = long(doc['version'])
            # delete only old-enough documents whose value was dropped
            if ver <= toVersion and UUID(doc['value']) not in keep:
                indexReader.deleteDocument(hit.getId())
                counter.documentCount += 1
def searchWithDrillDown(cls, indexReader, taxoReader):
    """
    Search an index with facets drill-down.
    returns a List<FacetResult>
    """
    # the base query the user is interested in
    baseQuery = TermQuery(Term(TEXT, "white"))
    # the facet count we want accumulated
    request = CountFacetRequest(createCategoryPath(["root", "a"]), 10)

    # first pass: all docs matching the base query contribute to the counts
    initialResults = cls.searchWithRequest(indexReader, taxoReader, None,
                                           request)
    # a single request was made, so there is exactly one result
    facetResult = initialResults.get(0)

    # assume the user picks the second sub-result (we know there are 3).
    # getSubResults() yields an "Iterable<? extends FacetResultNode>":
    # its elements arrive as plain Objects and must be narrowed with
    # FacetResultNode.cast_() before use
    subIterator = facetResult.getFacetResultNode().getSubResults().iterator()
    subIterator.next()  # skip first result
    chosenNode = FacetResultNode.cast_(subIterator.next())
    categoryOfInterest = chosenNode.getLabel()

    # drill-down preparation: turn the base query into a drill-down
    # query for the category of interest
    drillDownQuery = DrillDown.query(baseQuery, [
        categoryOfInterest,
    ])

    # second pass: only documents both matching the base query AND
    # containing the category of interest contribute this time
    return cls.searchWithRequestAndQuery(drillDownQuery, indexReader,
                                         taxoReader, None, request)
def main(cls, argv):
    """Highlight 'ipsum' matches in cls.text and emit them as HTML."""
    query = TermQuery(Term("f", "ipsum"))
    highlighter = Highlighter(
        SimpleHTMLFormatter("<span class=\"highlight\">", "</span>"),
        QueryScorer(query))
    highlighter.setTextFragmenter(SimpleFragmenter(50))

    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    tokenStream = analyzer.tokenStream("f", StringReader(cls.text))
    result = highlighter.getBestFragments(tokenStream, cls.text, 5, "...")

    for chunk in ("<html>",
                  "<style>\n",
                  ".highlight {\n",
                  " background: yellow\n",
                  "}\n",
                  "</style>",
                  "<body>",
                  result,
                  "</body></html>\n"):
        stdout.write(chunk)
    stdout.flush()
def testKeyword(self):
    """An ISBN indexed as a keyword matches exactly one document."""
    searcher = IndexSearcher(self.directory, True)
    hits = searcher.search(TermQuery(Term("isbn", "1930110995")),
                           50).scoreDocs
    self.assertEqual(1, len(hits), "JUnit in Action")
def undoDocuments(self, indexSearcher, indexReader, uItem, version):
    """Delete the item's documents written at exactly the given version."""
    itemTerm = Term("item", uItem.str64())
    for result in indexSearcher.search(TermQuery(itemTerm)):
        result = Hit.cast_(result)
        if long(result.getDocument()['version']) == version:
            indexReader.deleteDocument(result.getId())
def testPhraseQuery(self):
    """Stop words drop out of quoted phrases; one-word phrases collapse."""
    analyzer = StandardAnalyzer(Version.LUCENE_24)
    parsed = QueryParser(Version.LUCENE_24, "field",
                         analyzer).parse('"This is Some Phrase*"')
    self.assertEqual('"some phrase"', parsed.toString("field"), "analyzed")

    parsed = QueryParser(Version.LUCENE_CURRENT, "field",
                         self.analyzer).parse('"term"')
    self.assert_(TermQuery.instance_(parsed), "reduced to TermQuery")
def main(cls, argv):
    """Search a Berkeley DB backed Lucene index for documents with 'fox'.

    argv: [program, dbHome].  Opens the DB environment and the two
    B-tree databases that back the DbDirectory, runs a TermQuery,
    prints the hit count, then tears everything down.
    """
    if len(argv) != 2:
        print "Usage: BerkeleyDbSearcher <index dir>"
        return

    dbHome = argv[1]

    env = DBEnv()
    env.set_flags(DB_LOG_INMEMORY, 1);
    # give the cache 64MB on platforms known to need/support it
    if os.name == 'nt':
        env.set_cachesize(0, 0x4000000, 1)
    elif os.name == 'posix':
        from commands import getstatusoutput
        if getstatusoutput('uname') == (0, 'Linux'):
            env.set_cachesize(0, 0x4000000, 1)
    env.open(dbHome, (DB_THREAD | DB_INIT_MPOOL | DB_INIT_LOCK | DB_INIT_TXN),
             0)

    index = DB(env)
    blocks = DB(env)
    txn = None
    # first transaction: open the two databases making up the directory;
    # commit on success, abort and re-raise on failure
    try:
        txn = env.txn_begin(None)
        index.open(filename = '__index__', dbtype = DB_BTREE,
                   flags = DB_THREAD, txn = txn)
        blocks.open(filename = '__blocks__', dbtype = DB_BTREE,
                    flags = DB_THREAD, txn = txn)
    except:
        if txn is not None:
            txn.abort()
            txn = None
        raise
    else:
        txn.commit()
        txn = None

    # second transaction: read-only search; aborted either way since
    # nothing needs to be persisted
    try:
        txn = env.txn_begin(None)
        directory = DbDirectory(txn, index, blocks, 0)
        searcher = IndexSearcher(directory, True)
        topDocs = searcher.search(TermQuery(Term("contents", "fox")), 50)
        print topDocs.totalHits, "document(s) found"
        searcher.close()
    except:
        if txn is not None:
            txn.abort()
            txn = None
        raise
    else:
        txn.abort()

    index.close()
    blocks.close()
    env.close()
def getHitCount(self, fieldName, searchString):
    """Return the number of documents matching an exact term (capped at 50).

    Opens a read-only searcher over self.dir and always closes it,
    even when the search raises (the original leaked the searcher on
    exception).
    """
    searcher = IndexSearcher(self.dir, True)
    try:
        query = TermQuery(Term(fieldName, searchString))
        return len(searcher.search(query, 50).scoreDocs)
    finally:
        searcher.close()
def testPhraseQuery(self):
    """CustomQueryParser keeps single terms but turns phrases into spans."""
    parser = CustomQueryParser("field", self.analyzer)

    single = parser.parse("singleTerm")
    self.assert_(TermQuery.instance_(single), "TermQuery")

    phrase = parser.parse("\"a phrase\"")
    self.assert_(SpanNearQuery.instance_(phrase), "SpanNearQuery")
def searchWithRequest(cls, indexReader, taxoReader, indexingParams,
                      facetRequest):
    """
    Search an index with facets for given facet requests.
    returns a List<FacetResult>
    """
    baseQuery = TermQuery(Term(TEXT, "white"))
    return cls.searchWithRequestAndQuery(baseQuery, indexReader, taxoReader,
                                         indexingParams, facetRequest)
def testToString(self):
    """BooleanQuery.toString renders fuzzy and cross-field clauses."""
    booleanQuery = BooleanQuery()
    booleanQuery.add(FuzzyQuery(Term("field", "kountry")),
                     BooleanClause.Occur.MUST)
    booleanQuery.add(TermQuery(Term("title", "western")),
                     BooleanClause.Occur.SHOULD)

    self.assertEqual("+kountry~0.5 title:western",
                     booleanQuery.toString("field"), "both kinds")
def testPhraseQuery(self):
    """Stop words are removed from phrases; single-term phrases collapse."""
    analyzer = StandardAnalyzer(Version.LUCENE_24)
    parsed = QueryParser(Version.LUCENE_24, "field",
                         analyzer).parse('"This is Some Phrase*"')
    self.assertEqual("\"some phrase\"", parsed.toString("field"), "analyzed")

    parsed = QueryParser(Version.LUCENE_CURRENT, "field",
                         self.analyzer).parse('"term"')
    self.assert_(TermQuery.instance_(parsed), "reduced to TermQuery")
def testSearchByAPI(self):
    """Programmatic TermQuery and PhraseQuery each match one document.

    Uses assertEqual consistently; the original mixed in the
    deprecated assertEquals alias.
    """
    tq = TermQuery(Term("content", "hops"))
    topDocs = self.searcher.search(tq, 50)
    self.assertEqual(1, topDocs.totalHits)

    pq = PhraseQuery()
    pq.add(Term("content", "fox"))
    pq.add(Term("content", "hops"))
    topDocs = self.searcher.search(pq, 50)
    self.assertEqual(1, topDocs.totalHits)
def testOr(self): methodologyBooks = TermQuery( Term("category", "/technology/computers/programming/methodology")) easternPhilosophyBooks = TermQuery( Term("category", "/philosophy/eastern")) enlightenmentBooks = BooleanQuery() enlightenmentBooks.add(methodologyBooks, BooleanClause.Occur.SHOULD) enlightenmentBooks.add(easternPhilosophyBooks, BooleanClause.Occur.SHOULD) searcher = IndexSearcher(self.directory, True) scoreDocs = searcher.search(enlightenmentBooks, 50).scoreDocs print "or =", enlightenmentBooks self.assertHitsIncludeTitle(searcher, scoreDocs, "Extreme Programming Explained") self.assertHitsIncludeTitle(searcher, scoreDocs, u"Tao Te Ching \u9053\u5FB7\u7D93")
def getSynonyms(self, word):
    """Return every stored synonym of the given word (up to 50 docs)."""
    topDocs = self.searcher.search(TermQuery(Term("word", word)), 50)
    return [syn
            for scoreDoc in topDocs.scoreDocs
            for syn in self.searcher.doc(scoreDoc.doc).getValues("syn")]
def testHighlighting(self):
    """The matched term is wrapped in <B> tags by the default formatter."""
    text = "The quick brown fox jumps over the lazy dog"
    highlighter = Highlighter(QueryScorer(TermQuery(Term("field", "fox"))))

    stream = SimpleAnalyzer(Version.LUCENE_CURRENT).tokenStream(
        "field", StringReader(text))
    self.assertEqual("The quick brown <B>fox</B> jumps over the lazy dog",
                     highlighter.getBestFragment(stream, text))
def testAnd(self):
    """AND of a term query and a pubmonth range finds 'Lucene in Action'."""
    searchingBooks = TermQuery(Term("subject", "search"))
    books2004 = NumericRangeQuery.newIntRange("pubmonth", Integer(200401),
                                              Integer(200412), True, True)

    # MUST + MUST == boolean AND
    searchingBooks2004 = BooleanQuery()
    searchingBooks2004.add(searchingBooks, BooleanClause.Occur.MUST)
    searchingBooks2004.add(books2004, BooleanClause.Occur.MUST)

    searcher = IndexSearcher(self.directory, True)
    scoreDocs = searcher.search(searchingBooks2004, 50).scoreDocs
    self.assertHitsIncludeTitle(searcher, scoreDocs, "Lucene in Action")
def setUp(self):
    """Index four restaurants at known (x, y) coordinates."""
    self.directory = RAMDirectory()
    writer = IndexWriter(self.directory, WhitespaceAnalyzer(), True,
                         IndexWriter.MaxFieldLength.UNLIMITED)
    for name, x, y in (("El Charro", 1, 2),
                       ("Cafe Poca Cosa", 5, 9),
                       ("Los Betos", 9, 6),
                       ("Nico's Taco Shop", 3, 8)):
        self.addPoint(writer, name, "restaurant", x, y)
    writer.close()

    self.searcher = IndexSearcher(self.directory, True)
    self.query = TermQuery(Term("type", "restaurant"))
def docsLike(self, id, doc, max): authors = doc.getValues("author") authorQuery = BooleanQuery() for author in authors: authorQuery.add(TermQuery(Term("author", author)), BooleanClause.Occur.SHOULD) authorQuery.setBoost(2.0) vector = self.reader.getTermFreqVector(id, "subject") subjectQuery = BooleanQuery() for term in vector.getTerms(): tq = TermQuery(Term("subject", term)) subjectQuery.add(tq, BooleanClause.Occur.SHOULD) likeThisQuery = BooleanQuery() likeThisQuery.add(authorQuery, BooleanClause.Occur.SHOULD) likeThisQuery.add(subjectQuery, BooleanClause.Occur.SHOULD) # exclude myself likeThisQuery.add(TermQuery(Term("isbn", doc.get("isbn"))), BooleanClause.Occur.MUST_NOT) print " Query:", likeThisQuery.toString("contents") scoreDocs = self.searcher.search(likeThisQuery, 50).scoreDocs docs = [] for scoreDoc in scoreDocs: doc = self.searcher.doc(scoreDoc.doc) if len(docs) < max: docs.append(doc) else: break return docs
def testCollecting(self):
    """A custom collector gathers URL->title links for every 'junit' hit."""
    query = TermQuery(Term("contents", "junit"))
    searcher = IndexSearcher(self.directory, True)

    collector = BookLinkCollector(searcher)
    searcher.search(query, collector)

    links = collector.getLinks()
    self.assertEqual("java development with ant",
                     links["http://www.manning.com/antbook"])

    self.dumpHits(searcher, searcher.search(query, 10).scoreDocs)
    searcher.close()
def testPrefix(self):
    """PrefixQuery matches subcategories that an exact TermQuery misses."""
    searcher = IndexSearcher(self.directory, True)
    term = Term("category", "/technology/computers/programming")

    # programming books, including subcategories
    programmingAndBelow = searcher.search(PrefixQuery(term), 50).totalHits
    # only programming books, not subcategories
    justProgramming = searcher.search(TermQuery(term), 50).totalHits

    self.assert_(programmingAndBelow > justProgramming)
def testHits(self): searcher = IndexSearcher(self.directory, True) query = TermQuery(Term("title", "action")) scoreDocs = searcher.search(query, 50).scoreDocs scorer = QueryScorer(query) highlighter = Highlighter(scorer) for scoreDoc in scoreDocs: doc = searcher.doc(scoreDoc.doc) title = doc["title"] stream = SimpleAnalyzer(Version.LUCENE_CURRENT).tokenStream( "title", StringReader(title)) fragment = highlighter.getBestFragment(stream, title) print fragment
def testFilteredQuery(self): isbns = ["0854402624"] # Steiner accessor = TestSpecialsAccessor(isbns) filter = SpecialsFilter(accessor) educationBooks = WildcardQuery(Term("category", "*education*")) edBooksOnSpecial = FilteredQuery(educationBooks, filter) logoBooks = TermQuery(Term("subject", "logo")) logoOrEdBooks = BooleanQuery() logoOrEdBooks.add(logoBooks, BooleanClause.Occur.SHOULD) logoOrEdBooks.add(edBooksOnSpecial, BooleanClause.Occur.SHOULD) topDocs = self.searcher.search(logoOrEdBooks, 50) print logoOrEdBooks self.assertEqual(2, topDocs.totalHits, "Papert and Steiner")
def testSimple(self):
    # A Similarity whose factors are all neutral (1.0) makes the final
    # score reduce to the raw term frequency; with a single "x" term,
    # the one hit must score exactly 1.0.
    class SimpleSimilarity(PythonSimilarity):

        # no document-length normalization
        def lengthNorm(_self, field, numTerms):
            return 1.0

        # no query normalization
        def queryNorm(_self, sumOfSquaredWeights):
            return 1.0

        # raw term frequency passes through unchanged
        def tf(_self, freq):
            return freq

        # sloppy-phrase frequency (unused by this plain TermQuery)
        def sloppyFreq(_self, distance):
            return 2.0

        # flat inverse-document-frequency for a set of terms
        def idfTerms(_self, terms, searcher):
            return 1.0

        # flat inverse-document-frequency for one term
        def idf(_self, docFreq, numDocs):
            return 1.0

        # no coordination factor
        def coord(_self, overlap, maxOverlap):
            return 1.0

        # flat payload score
        def scorePayload(_self, docId, fieldName, start, end, payload,
                         offset, length):
            return 1.0

    self.indexSingleFieldDocs([Field("contents", "x", Field.Store.YES,
                                     Field.Index.ANALYZED)])
    searcher = IndexSearcher(self.directory)
    searcher.setSimilarity(SimpleSimilarity())

    query = TermQuery(Term("contents", "x"))
    explanation = searcher.explain(query, 0)
    print explanation

    scoreDocs = searcher.search(query, 50).scoreDocs
    self.assertEqual(1, len(scoreDocs))

    # every factor neutralized => score == tf == 1.0
    self.assertEqual(scoreDocs[0].score, 1.0)

    searcher.close()
def testTermQuery(self):
    """An exact part-number term matches the single indexed document."""
    hits = self.searcher.search(TermQuery(Term("partnum", "Q36")),
                                50).scoreDocs
    self.assertEqual(1, len(hits))
def searchDocuments(self, view, version, query=None, attribute=None):
    """Search the text index, lazily yielding (item UUID, attribute UUID).

    `query` may be a Lucene query string (None matches all documents);
    `attribute`, when given, restricts hits to that attribute.  The
    returned object starts a store transaction and opens a searcher
    only when iteration begins; both are released by the iterator's
    __del__.  Hits are yielded best-score first, filtered to documents
    visible at `version`.
    """
    store = self.store

    if query is None:
        query = MatchAllDocsQuery()
    else:
        query = QueryParser("contents", StandardAnalyzer()).parse(query)

    if attribute:
        # constrain the parsed query to a single attribute
        combinedQuery = BooleanQuery()
        combinedQuery.add(query, BooleanClause.Occur.MUST)
        combinedQuery.add(TermQuery(Term("attribute", attribute.str64())),
                          BooleanClause.Occur.MUST)
        query = combinedQuery

    class _collector(PythonHitCollector):

        def __init__(_self):
            super(_collector, _self).__init__()
            _self.hits = []

        def collect(_self, id, score):
            # negate the score so the min-heap pops best scores first
            _self.hits.append((-score, id))

    class _iterator(object):

        def __init__(_self):
            _self.txnStatus = 0
            _self.searcher = None
            _self.collector = None

        def __del__(_self):
            # release the searcher and transaction when the iterator is
            # garbage-collected; never let an exception escape __del__
            try:
                if _self.searcher is not None:
                    _self.searcher.close()
                store.abortTransaction(view, _self.txnStatus)
            except:
                store.repository.logger.exception("in __del__")
            _self.txnStatus = 0
            _self.searcher = None
            _self.collector = None

        def __iter__(_self):
            _self.txnStatus = store.startTransaction(view)
            _self.searcher = searcher = self.getIndexSearcher()
            _self.collector = _collector()

            searcher.search(query, _self.collector)
            hits = _self.collector.hits

            if hits:
                heapify(hits)
                while hits:
                    score, id = heappop(hits)
                    doc = searcher.doc(id)
                    uItem = UUID(doc['item'])

                    # skip documents written after the requested version
                    if long(doc['version']) <= version:
                        # and documents whose value is no longer current
                        if store._items.isValue(view, version, uItem,
                                                UUID(doc['value'])):
                            yield uItem, UUID(doc['attribute'])

    return _iterator()