def main(): _vm = lucene.initVM(vmargs=['-Djava.awt.headless=true']) query = BooleanQuery() query.add(MatchAllDocsQuery(), BooleanClause.Occur.MUST) query.add(TermQuery(Term('type', 'user')), BooleanClause.Occur.MUST) i = 0 with zh_iatd.create_searcher() as searcher: with open('pagerank_data.txt', 'w') as fout: reslst = searcher.searcher.search(query, 100) initval = 1.0 / reslst.totalHits while len(reslst.scoreDocs) > 0: for x in reslst.scoreDocs: realdoc = searcher.searcher.doc(x.doc) obj = document_to_obj(realdoc) if not obj.data.followed_users is None: print '{0:8}'.format(i), ' user', obj.index, len( obj.data.followed_users) fout.write('{0}\t{1}\t{2}\n'.format( obj.index, initval, ' '.join( (x.encode('utf8') for x in obj.data.followed_users)))) else: print '{0:8}'.format(i), 'I user', obj.index i += 1 reslst = searcher.searcher.searchAfter(reslst.scoreDocs[-1], query, 100)
def testFacetAndTopsMultiCollector(self):
    """Collect facet counts and top-scoring docs in a single search pass.

    Indexes 99 documents whose 'facet1' value cycles over 10 labels, then
    runs one MatchAllDocsQuery through a MultiSuperCollector feeding both a
    top-docs collector and a facet collector.
    """
    I = Index(path=self.tempdir, settings=LuceneSettings())
    for i in xrange(99):
        document1 = createDocument(fields=[("field1", str(i)),
                                           ("field2", str(i) * 1000)],
                                   facets=[("facet1", "value%s" % (i % 10))
                                           ])
        # Facet fields must be rewritten via the facets config (taxonomy
        # writer) before the document can be indexed.
        document1 = I._facetsConfig.build(I._taxoWriter, document1)
        I._indexWriter.addDocument(document1)
    I.commit()
    I.close()
    # Reopen so the searcher sees the committed segments.
    I = Index(path=self.tempdir, settings=LuceneSettings())
    f = FacetSuperCollector(I._indexAndTaxonomy.taxoReader,
                            I._facetsConfig, I._ordinalsReader)
    t = TopScoreDocSuperCollector(10, True)
    collectors = ArrayList().of_(SuperCollector)
    collectors.add(t)
    collectors.add(f)
    C = MultiSuperCollector(collectors)
    Q = MatchAllDocsQuery()
    I.search(Q, None, C)
    # All 99 docs are counted; only the requested top 10 are materialized.
    self.assertEquals(99, t.topDocs(0).totalHits)
    self.assertEquals(10, len(t.topDocs(0).scoreDocs))
    tc = f.getTopChildren(10, "facet1", [])
    # 99 docs over 10 facet values: 10 each, except 'value9' which gets 9.
    self.assertEquals([('value0', 10), ('value1', 10), ('value2', 10),
                       ('value3', 10), ('value4', 10), ('value5', 10),
                       ('value6', 10), ('value7', 10), ('value8', 10),
                       ('value9', 9)],
                      [(l.label, l.value.intValue()) for l in tc.labelValues])
def searchWithFacets(cls, indexReader, taxoReader, facets_config):
    """Browse the whole index and count facets for every live document.

    Returns a list of FacetResult instances.
    """
    # MatchAllDocsQuery makes this a pure facet-browse: every non-deleted
    # document in the index contributes to the counts.
    browse_all = MatchAllDocsQuery()
    return cls.searchWithQuery(browse_all, indexReader, taxoReader,
                               facets_config)
def facets(self, facets, filterQueries, drilldownQueries=None, filter=None):
    """Yield facet results for the given facets, optionally drilled down.

    Generator in the framework's coroutine style: the result is delivered
    through generatorReturn, the trailing yield makes this a generator.
    """
    collector = self._facetCollector() if facets else None
    combinedFilter = self._filterFor(filterQueries, filter=filter)
    # Start from "match everything"; drilldown queries narrow it.
    baseQuery = MatchAllDocsQuery()
    if drilldownQueries:
        baseQuery = self.createDrilldownQuery(baseQuery, drilldownQueries)
    self._index.search(baseQuery, combinedFilter, collector)
    generatorReturn(self._facetResult(collector, facets))
    yield
def main():
    """Rewrite the index into .newdb, optionally merging PageRank scores.

    With no arguments: copy every document verbatim.
    With 'mergerank': read prrank.txt ("<user-index> <rank>" per line) and
    set user.data.rank on matching user documents before copying.
    """
    _vm = lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    db_writer = zh_iatd.create_index_writer('.newdb')
    db_reader = zh_iatd.create_searcher(INDEXED_FOLDER)
    # Hoisted: one query object reused for paging instead of re-creating
    # a MatchAllDocsQuery per page.
    query = MatchAllDocsQuery()
    if len(sys.argv) < 2:
        res = db_reader.searcher.search(query, 100)
        tot = 0
        while len(res.scoreDocs) > 0:
            for x in res.scoreDocs:
                realdoc = db_reader.searcher.doc(x.doc)
                obj = document_to_obj(realdoc)
                db_writer.addDocument(obj_to_document(obj))
                tot += 1
                sys.stdout.write('\r{0}'.format(tot))
                sys.stdout.flush()
            res = db_reader.searcher.searchAfter(res.scoreDocs[-1], query, 100)
    elif sys.argv[1] == 'mergerank':
        ranks = {}
        with open('prrank.txt', 'r') as fin:
            # Stream the file instead of materializing it via readlines().
            for line in fin:
                v = line.split()
                ranks[v[0]] = float(v[1])
        res = db_reader.searcher.search(query, 100)
        tot = 0
        while len(res.scoreDocs) > 0:
            for x in res.scoreDocs:
                realdoc = db_reader.searcher.doc(x.doc)
                obj = document_to_obj(realdoc)
                if isinstance(obj, zh_pganlz.user):
                    # BUGFIX (perf): 'in ranks' is an O(1) dict lookup;
                    # 'in ranks.keys()' built and scanned a list for every
                    # document in Python 2.
                    if obj.index in ranks:
                        obj.data.rank = ranks[obj.index]
                db_writer.addDocument(obj_to_document(obj))
                tot += 1
                sys.stdout.write('\r{0}'.format(tot))
                sys.stdout.flush()
            res = db_reader.searcher.searchAfter(res.scoreDocs[-1], query, 100)
    db_writer.commit()
def testSearch(self):
    """Total hit count goes from 0 to 1 after a document is added."""
    counter = TotalHitCountSuperCollector()
    index = Index(path=self.tempdir, settings=LuceneSettings())
    everything = MatchAllDocsQuery()
    # Fresh index: no hits yet.
    index.search(everything, None, counter)
    self.assertEquals(0, counter.getTotalHits())
    index._indexWriter.addDocument(document(name="one", price="2"))
    index.close()
    # Reopen so the added document becomes visible to the searcher.
    index = Index(path=self.tempdir, settings=LuceneSettings())
    index.search(everything, None, counter)
    self.assertEquals(1, counter.getTotalHits())
def main():
    """Copy the index to .newdb, renaming article field 'contents' to 'text'."""
    _vm = lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    db_writer = zh_iatd.create_index_writer('.newdb')
    db_reader = zh_iatd.create_searcher(INDEXED_FOLDER)
    # Hoisted: reuse one query object for the whole paging loop.
    query = MatchAllDocsQuery()
    res = db_reader.searcher.search(query, 100)
    tot = 0
    while len(res.scoreDocs) > 0:
        for x in res.scoreDocs:
            realdoc = db_reader.searcher.doc(x.doc)
            obj = document_to_obj(realdoc)
            if isinstance(obj, zh_pganlz.article):
                # Idiom fix: membership test on the dict view directly;
                # the old "in vars(...).keys()" built a fresh list per doc.
                if 'contents' in vars(obj.data):
                    obj.data.text = obj.data.contents
                    del obj.data.contents
            db_writer.addDocument(obj_to_document(obj))
            tot += 1
            sys.stdout.write('\r{0}'.format(tot))
            sys.stdout.flush()
        res = db_reader.searcher.searchAfter(res.scoreDocs[-1], query, 100)
    db_writer.commit()
def getQueryBuiler():
    """Build a BooleanQuery.Builder from the parsed args.search clauses.

    Each clause is a space-separated string
    "<occur> <kind> <field> <args...>" where occur is one of
    must/should/filter/must_not and kind is query/intrange/termrange.
    With no clauses at all, the builder matches every document.

    Raises:
        ValueError: on an unknown clause kind or occur value.
    """
    boolean_query = BooleanQuery.Builder()
    if len(args.search) == 0:
        # No constraints: match everything.
        boolean_query.add(MatchAllDocsQuery(), BooleanClause.Occur.MUST)
        return boolean_query
    for clause in args.search:
        curSearch = clause.split(' ')
        if curSearch[1] == 'query':
            parser = QueryParser(curSearch[2], analyzer)
            query = parser.parse(curSearch[3])
        elif curSearch[1] == 'intrange':
            query = IntPoint.newRangeQuery(curSearch[2], curSearch[3],
                                           curSearch[4])
        elif curSearch[1] == 'termrange':
            lowerDate = handleDate(curSearch[3], '%d/%b/%Y:%H:%M:%S')
            upperDate = handleDate(curSearch[4], '%d/%b/%Y:%H:%M:%S')
            query = TermRangeQuery.newStringRange(curSearch[2], lowerDate,
                                                  upperDate, True, True)
        else:
            # BUGFIX: an unknown kind previously fell through and reused a
            # stale `query` (or raised NameError on the first clause).
            raise ValueError('unknown query kind: {0}'.format(curSearch[1]))
        if curSearch[0] == 'must':
            boolean_query.add(query, BooleanClause.Occur.MUST)
        elif curSearch[0] == 'should':
            boolean_query.add(query, BooleanClause.Occur.SHOULD)
        elif curSearch[0] == 'filter':
            boolean_query.add(query, BooleanClause.Occur.FILTER)
        elif curSearch[0] == 'must_not':
            boolean_query.add(query, BooleanClause.Occur.MUST_NOT)
        else:
            # BUGFIX: was print('raise exception') with the raise commented
            # out, silently dropping the clause from the query.
            raise ValueError('unknown occur value: {0}'.format(curSearch[0]))
    return boolean_query
def _sinqleQuery(self, query):
    """Execute a composed query that targets a single core."""
    startTime = time()
    coreName = query.resultsFrom
    luceneQuery = query.queryFor(core=coreName)
    if luceneQuery is None:
        # No core-specific query given: browse everything.
        luceneQuery = MatchAllDocsQuery()
    response = yield self.any[coreName].executeQuery(
        luceneQuery=luceneQuery,
        facets=query.facetsFor(coreName),
        filterQueries=query.filterQueriesFor(coreName),
        drilldownQueries=query.drilldownQueriesFor(coreName),
        **query.otherKwargs())
    response.queryTime = millis(time() - startTime)
    generatorReturn(response)
def getLastStampId(self, prefix='oai_dc', setSpec=None):
    """Return the stamp of the newest record for prefix/setSpec, or None.

    With neither prefix nor setSpec, considers every record; otherwise a
    given prefix takes precedence over setSpec.
    """
    searcher = self._getSearcher()
    # Newest first: descending sort on the numeric stamp field.
    sort = Sort(SortField(NUMERIC_STAMP_FIELD, SortField.Type.LONG, True))
    if prefix is None and setSpec is None:
        query = MatchAllDocsQuery()
    elif prefix is None:
        query = TermQuery(Term(SETS_FIELD, setSpec))
    else:
        query = TermQuery(Term(PREFIX_FIELD, prefix))
    results = searcher.search(query, 1, sort)
    if results.totalHits.value < 1:
        return None
    return _stampFromDocument(searcher.doc(results.scoreDocs[0].doc))
def testSearchTopDocs(self):
    """A top-2 collector reports all 3 hits but keeps only 2 score docs."""
    index = Index(path=self.tempdir, settings=LuceneSettings())
    for name, price in (("one", "aap noot mies"),
                        ("two", "aap vuur boom"),
                        ("three", "noot boom mies")):
        index._indexWriter.addDocument(document(name=name, price=price))
    index.close()
    # Reopen so the documents become searchable.
    index = Index(path=self.tempdir, settings=LuceneSettings())
    collector = TopScoreDocSuperCollector(2, True)
    index.search(MatchAllDocsQuery(), None, collector)
    topDocs = collector.topDocs(0)
    self.assertEquals(3, collector.getTotalHits())
    self.assertEquals(3, topDocs.totalHits)
    self.assertEquals(2, len(topDocs.scoreDocs))
def testCollectorFiltersTwoSimilar(self):
    """Two docs with the same __isformatof__ key collapse to one result."""
    self._addDocument("urn:1", 2, 1)
    self._addDocument("urn:2", 2, 2)
    inner = TopScoreDocSuperCollector(100, True)
    dedup = DeDupFilterSuperCollector("__isformatof__", "__sort__", inner)
    self.lucene.search(query=MatchAllDocsQuery(), collector=dedup)
    topDocs = inner.topDocs(0)
    self.assertEquals(1, topDocs.totalHits)
    self.assertEquals(1, len(topDocs.scoreDocs))
    winningDocId = topDocs.scoreDocs[0].doc
    key = dedup.keyForDocId(winningDocId)
    identifier = self.lucene._index.getDocument(key.getDocId()).get(IDFIELD)
    # The duplicate with the higher __sort__ value (urn:2) represents the
    # pair, and the key remembers both occurrences.
    self.assertEquals('urn:2', identifier)
    self.assertEquals(2, key.count)
def testShouldAddResultsWithoutIsFormatOf(self):
    """Documents without an __isformatof__ value are never deduplicated."""
    # urn:1 and urn:3 share isformatof value 2; the other nine have none.
    # The eleventh document (urn:B) triggers a segment merge.
    isformatof = {"1": 2, "3": 2}
    for suffix in "123456789AB":
        self._addDocument("urn:%s" % suffix, isformatof.get(suffix))
    inner = TopScoreDocSuperCollector(100, True)
    dedup = DeDupFilterSuperCollector("__isformatof__", "__sort__", inner)
    self.lucene.search(query=MatchAllDocsQuery(), collector=dedup)
    # 11 docs, one pair deduplicated -> 10 results.
    self.assertEquals(10, inner.topDocs(0).totalHits)
def visitSEARCH_CLAUSE(self, node):
    """Translate a CQL SEARCH_CLAUSE parse node into a Lucene Query.

    Raises UnsupportedCQL for relations this translator cannot express.
    """
    # possible children:
    # CQL_QUERY
    # SEARCH_TERM
    # INDEX, RELATION, SEARCH_TERM
    firstChild = node.children[0].name
    results = CqlVisitor.visitSEARCH_CLAUSE(self, node)
    if firstChild == 'SEARCH_TERM':
        # Unqualified term: search it across all configured term fields.
        (unqualifiedRhs, ) = results
        if unqualifiedRhs == '*':
            # Bare '*' means: match every document.
            return MatchAllDocsQuery()
        subQueries = []
        for fieldname, boost in self._unqualifiedTermFields:
            subQuery = self._termOrPhraseQuery(fieldname, unqualifiedRhs)
            # Skip phrase queries on fields that cannot support them.
            if isinstance(
                    subQuery, PhraseQuery
            ) and not self._fieldRegistry.phraseQueryPossible(fieldname):
                continue
            subQuery.setBoost(boost)
            subQueries.append(subQuery)
        if len(subQueries) == 1:
            query = subQueries[0]
        else:
            # OR the per-field queries together.
            query = BooleanQuery()
            for subQuery in subQueries:
                query.add(subQuery, BooleanClause.Occur.SHOULD)
        return query
    elif firstChild == 'INDEX':
        # Qualified clause: field, relation and term.
        (left, (relation, boost), right) = results
        if relation in [
                '==', 'exact'
        ] or (relation == '=' and self._fieldRegistry.isUntokenized(left)):
            # Exact match (or '=' on an untokenized field) -> TermQuery.
            query = TermQuery(self._createTerm(left, right))
        elif relation == '=':
            query = self._termOrPhraseQuery(left, right)
        elif relation in ['<', '<=', '>=', '>']:
            query = self._termRangeQuery(left, relation, right)
        else:
            raise UnsupportedCQL("'%s' not supported for the field '%s'" %
                                 (relation, left))
        query.setBoost(boost)
        return query
    else:
        # Nested CQL_QUERY: unwrap the already-visited sub-query.
        ((query, ), ) = results
        return query
def get_title_id_map(self):
    """Return (title -> id, id -> title) dicts covering every document."""
    # Request as many hits as there are docs so one search covers the index.
    total = self.searcher.getIndexReader().numDocs()
    title_id = {}
    id_title = {}
    for hit in self.searcher.search(MatchAllDocsQuery(), total).scoreDocs:
        stored = self.searcher.doc(hit.doc)
        doc_id = int(stored['id'])
        doc_title = stored['title']
        title_id[doc_title] = doc_id
        id_title[doc_id] = doc_title
    return title_id, id_title
def _multipleCoreQuery(self, query):
    """Execute a composed query spanning multiple cores.

    Collects matching keys from the non-result cores, filters the result
    core by them, then enriches the result with facets from those cores
    restricted to the keys that actually matched.
    """
    t0 = time()
    resultCoreName = query.resultsFrom
    resultCoreKey = query.keyName(resultCoreName)
    otherCoreNames = [
        coreName for coreName in query.cores if coreName != resultCoreName
    ]
    # Intersect keys from the unite filter and each other core's queries;
    # None means "no key restriction so far".
    finalKeys = self._uniteFilter(query)
    for otherCoreName in otherCoreNames:
        finalKeys = self._coreQueries(otherCoreName, query, finalKeys)
    summaryFilter = None
    if finalKeys is not None:
        summaryFilter = KeyFilter(finalKeys, resultCoreKey)
    resultCoreQuery = self._luceneQueryForCore(resultCoreName, query)
    aggregateScoreCollector = self._createAggregateScoreCollector(
        query, resultCoreKey)
    # Also collect the result core's keys for the facet pass below.
    keyCollector = KeySuperCollector(
        resultCoreKey) if self._multithreaded else KeyCollector(
            resultCoreKey)
    result = yield self.any[resultCoreName].executeQuery(
        luceneQuery=resultCoreQuery or MatchAllDocsQuery(),
        filter=summaryFilter,
        facets=query.facetsFor(resultCoreName),
        scoreCollector=aggregateScoreCollector,
        keyCollector=keyCollector,
        **query.otherKwargs())
    # Facets requested on other cores are computed against only the keys
    # that matched in the result core.
    for otherCoreName in otherCoreNames:
        if query.facetsFor(otherCoreName):
            keyFilter = KeyFilter(keyCollector.getCollectedKeys(),
                                  query.keyName(otherCoreName))
            result.drilldownData.extend(
                (yield self.any[otherCoreName].facets(
                    facets=query.facetsFor(otherCoreName),
                    filterQueries=query.queriesFor(otherCoreName) +
                    query.uniteQueriesFor(otherCoreName),
                    drilldownQueries=query.drilldownQueriesFor(
                        otherCoreName),
                    filter=keyFilter)))
    result.queryTime = millis(time() - t0)
    generatorReturn(result)
def testCollectorFiltersTwoTimesTwoSimilarOneNot(self):
    """Dedup keeps one representative per __isformatof__ group."""
    self._addDocument("urn:1", 1, 2001)
    self._addDocument("urn:2", 3, 2009)  # result 2x
    self._addDocument("urn:3", 50, 2010)  # result 1x
    self._addDocument("urn:4", 3, 2001)
    self._addDocument("urn:5", 1, 2009)  # result 2x
    # expected: "urn:2', "urn:3" and "urn:5" in no particular order
    inner = TopScoreDocSuperCollector(100, True)
    dedup = DeDupFilterSuperCollector("__isformatof__", "__sort__", inner)
    self.lucene.search(query=MatchAllDocsQuery(), collector=dedup)
    topDocs = inner.topDocs(0)
    self.assertEquals(3, topDocs.totalHits)
    self.assertEquals(3, len(topDocs.scoreDocs))
    # Map each raw hit to the doc id its dedup key chose as representative.
    representativeIds = [dedup.keyForDocId(scoreDoc.doc).docId
                         for scoreDoc in topDocs.scoreDocs]
    identifiers = set(
        self.lucene._index.getDocument(docId).get(IDFIELD)
        for docId in representativeIds)
    self.assertEquals(set(["urn:2", "urn:3", "urn:5"]), identifiers)
    # One singleton group and two pairs.
    self.assertEquals(
        [1, 2, 2],
        list(sorted(dedup.keyForDocId(docId).count
                    for docId in representativeIds)))
def _luceneQueryBuilder(self, prefix, sets=None, setsMask=None,
                        partition=None):
    """Build a BooleanQuery.Builder selecting records by prefix, sets,
    sets-mask and hash partition.

    Falls back to a MatchAllDocsQuery clause when no constraint applies,
    so the built query never matches nothing by accident.
    """
    numberOfClausesAdded = 0
    queryBuilder = BooleanQuery.Builder()
    if prefix:
        queryBuilder.add(TermQuery(Term(PREFIX_FIELD, prefix)),
                         BooleanClause.Occur.MUST)
        numberOfClausesAdded += 1
    if sets:
        # Any of the given sets may match (OR), but the group as a whole
        # is required.
        setQueryBuilder = BooleanQuery.Builder()
        for setSpec in sets:
            setQueryBuilder.add(TermQuery(Term(SETS_FIELD, setSpec)),
                                BooleanClause.Occur.SHOULD)
        queryBuilder.add(setQueryBuilder.build(),
                         BooleanClause.Occur.MUST)
        numberOfClausesAdded += 1
    # Each mask set is individually required (AND semantics).
    for set_ in setsMask or []:
        queryBuilder.add(TermQuery(Term(SETS_FIELD, set_)),
                         BooleanClause.Occur.MUST)
        numberOfClausesAdded += 1
    if partition:
        partitionQueries = []
        for start, stop in partition.ranges():
            # partition.ranges() yields half-open [start, stop); IntPoint
            # ranges are inclusive, hence stop - 1.
            partitionQueries.append(
                IntPoint.newRangeQuery(HASH_FIELD, start, stop - 1))
        if len(partitionQueries) == 1:
            pQuery = partitionQueries[0]
        else:
            pQueryBuilder = BooleanQuery.Builder()
            for q in partitionQueries:
                pQueryBuilder.add(q, BooleanClause.Occur.SHOULD)
            pQuery = pQueryBuilder.build()
        queryBuilder.add(pQuery, BooleanClause.Occur.MUST)
        numberOfClausesAdded += 1
    if numberOfClausesAdded == 0:
        # No constraints at all: match every document.
        queryBuilder.add(MatchAllDocsQuery(), BooleanClause.Occur.MUST)
    return queryBuilder
def testFacetSuperCollector(self):
    """Facet counting over 1000 docs with 100 distinct facet values.

    Each 'facet1' value occurs exactly 10 times; the top-10 children are
    returned in label order, which here is value90..value99.
    """
    I = Index(path=self.tempdir, settings=LuceneSettings())
    for i in xrange(1000):
        document1 = createDocument(fields=[("field1", str(i)),
                                           ("field2", str(i) * 1000)],
                                   facets=[("facet1",
                                            "value%s" % (i % 100))])
        # Facet fields must be rewritten by the facets config before
        # the document can be indexed.
        document1 = I._facetsConfig.build(I._taxoWriter, document1)
        I._indexWriter.addDocument(document1)
    I.close()
    # Reopen so the searcher sees the committed segments.
    I = Index(path=self.tempdir, settings=LuceneSettings())
    C = FacetSuperCollector(I._indexAndTaxonomy.taxoReader,
                            I._facetsConfig, I._ordinalsReader)
    Q = MatchAllDocsQuery()
    I.search(Q, None, C)
    tc = C.getTopChildren(10, "facet1", [])
    self.assertEquals([('value90', 10), ('value91', 10), ('value92', 10),
                       ('value93', 10), ('value94', 10), ('value95', 10),
                       ('value96', 10), ('value97', 10), ('value98', 10),
                       ('value99', 10)],
                      [(l.label, l.value.intValue()) for l in tc.labelValues])
def testSearchTopField(self):
    """A sorted top-2 collector returns docs ordered by name descending."""
    index = Index(path=self.tempdir, settings=LuceneSettings())
    for docId, name, price in (('1', "one", "aap noot mies"),
                               ('2', "two", "aap vuur boom"),
                               ('3', "three", "noot boom mies")):
        index._indexWriter.addDocument(
            document(__id__=docId, name=name, price=price))
        # Commit after each document to create separate segments.
        index.commit()
    index.close()
    index = Index(path=self.tempdir, settings=LuceneSettings())
    sort = Sort(SortField("name", SortField.Type.STRING, True))
    collector = TopFieldSuperCollector(sort, 2, True, False, True)
    index.search(MatchAllDocsQuery(), None, collector)
    topDocs = collector.topDocs(0)
    self.assertEquals(3, collector.getTotalHits())
    self.assertEquals(3, topDocs.totalHits)
    self.assertEquals(2, len(topDocs.scoreDocs))
    # Descending on 'name': "two" then "three" -> ids '2', '3'.
    self.assertEquals(
        ['2', '3'],
        [index.getDocument(s.doc).get("__id__") for s in topDocs.scoreDocs])
def get_all_docs(self, n_hits=1000):
    """Debug helper: return up to n_hits processed results for all docs."""
    hits = self.searcher.search(MatchAllDocsQuery(), n_hits).scoreDocs
    return [self._process_search_result(hit) for hit in hits]
def index_images_until_stop(session, handler, lbound):
    """Walk finished crawl-task documents and feed every image URL found
    to handler(url, image_type, object_id) until the global _stop flag is set.

    lbound, when given, restricts the walk to tasks whose finish_time is
    strictly greater than lbound. Progress is appended to a console logger;
    per-document failures are logged and skipped.
    """
    global _stop, _stopped, _vm
    # This runs on a worker thread: attach it to the JVM first.
    _vm.attachCurrentThread()
    searcher = IndexSearcher(
        DirectoryReader.open(SimpleFSDirectory(File(TASK_FOLDER))))
    # Finished tasks only (finish_time != '0'), optionally above lbound.
    query = BooleanQuery()
    query.add(TermQuery(Term('finish_time', '0')),
              BooleanClause.Occur.MUST_NOT)
    query.add(MatchAllDocsQuery(), BooleanClause.Occur.MUST)
    # NOTE(review): 'not lbound is None' would read better as
    # 'lbound is not None' (kept byte-identical here).
    if not lbound is None:
        query.add(
            TermRangeQuery.newStringRange('finish_time', lbound,
                                          '9999999999', False, True),
            BooleanClause.Occur.MUST)
    sort = Sort(SortField('finish_time', SortField.Type.INT))
    tmpbk = None  # bookmark: last score doc of the previous page
    res = searcher.search(query, 100, sort)
    answer_content_searcher = zh_iatd.create_searcher()
    logger = external_console_logger('/tmp/zh_imgc_info')
    while not _stop:
        print 'got', len(res.scoreDocs), 'docs'
        for x in res.scoreDocs:
            try:
                imgsgot = 0  # images handled for this document
                realdoc = searcher.doc(x.doc)
                doctype = realdoc['func_name']
                objid = realdoc['id']
                logger.write(' ft:{0}'.format(realdoc['finish_time']))
                if doctype == 'user_data':
                    # Profile page: optional cover image plus avatar.
                    soup = bs4.BeautifulSoup(
                        session.opener.open(
                            urllib2.Request(
                                url='https://www.zhihu.com/people/{0}'.format(
                                    objid))), HTML_PARSER)
                    cover = soup.select(
                        '#ProfileHeader .ProfileHeader-userCover img')
                    if len(cover) > 0:
                        cover_img = cover[0]['src']
                        imgsgot += 1
                        handler(cover_img, ZH_IMGTYPE_USERINFO_COVER, objid)
                    avatar_img = soup.select(
                        '#ProfileHeader .ProfileHeader-main .UserAvatar img'
                    )[0]['src']
                    imgsgot += 1
                    handler(avatar_img, ZH_IMGTYPE_USER_AVATAR, objid)
                elif doctype == 'article_data':
                    # Article JSON: optional title image plus inline images.
                    jsondata = session.get_article_content_raw(objid)
                    if 'titleImage' in jsondata.keys():
                        cover_img = jsondata['titleImage']
                        if len(cover_img) > 0:
                            imgsgot += 1
                            handler(cover_img, ZH_IMGTYPE_ARTICLE_COVER,
                                    objid)
                    soup = bs4.BeautifulSoup(jsondata['content'], HTML_PARSER)
                    for x in soup.select('img'):
                        imgsgot += 1
                        handler(x['src'], ZH_IMGTYPE_IN_ARTICLE, objid)
                elif doctype == 'topic_data':
                    # Topic page: the topic avatar image.
                    soup = bs4.BeautifulSoup(
                        session.opener.open(
                            urllib2.Request(
                                url='https://www.zhihu.com/topic/{0}/hot'.
                                format(objid))), HTML_PARSER)
                    topic_img = soup.select(
                        '.zu-main-content .topic-avatar .zm-entry-head-avatar-link img'
                    )[0]['src']
                    imgsgot += 1
                    handler(topic_img, ZH_IMGTYPE_TOPIC_ICON, objid)
                elif doctype == 'answer_comments' and realdoc['start'] == '0':
                    # Only the first comments page: images inside the answer.
                    obj, q = zh_iatd.query_object(answer_content_searcher,
                                                  objid, zh_pganlz.answer)
                    for x in obj.data.text.as_soup().select('img'):
                        imgsgot += 1
                        handler(x['src'], ZH_IMGTYPE_IN_ANSWER, objid)
                elif doctype == 'question_data':
                    # Question page: images inside the question detail.
                    soup = bs4.BeautifulSoup(
                        session.opener.open(
                            urllib2.Request(
                                url='https://www.zhihu.com/question/{0}'.
                                format(objid))), HTML_PARSER)
                    for x in soup.select('#zh-question-detail img'):
                        imgsgot += 1
                        handler(x['src'], ZH_IMGTYPE_IN_QUESTION, objid)
                else:
                    # Unhandled task type: log the bare line and move on.
                    logger.write('\n')
                    continue
                logger.write(' ({0}, +{1})\n'.format(doctype, imgsgot))
                if _stop:
                    break
                # Throttle between documents to be polite to the site.
                time.sleep(3)
            except Exception as e:
                # Best-effort: log the failure and continue with the next doc.
                logger.write('\n## ERROR ################################\n')
                logger.write(traceback.format_exc())
        if len(res.scoreDocs) > 0:
            tmpbk = res.scoreDocs[-1]
        # Page onward from the last seen doc (re-polls the last page when
        # no new docs arrived).
        res = searcher.searchAfter(tmpbk, query, 100, sort)
    print 'stopped'
    _stopped = True
def testCollectorTransparentlyDelegatesToNextCollector(self):
    """With a single document, dedup passes the hit straight through."""
    self._addDocument("urn:1", 2)
    inner = TopScoreDocSuperCollector(100, True)
    dedup = DeDupFilterSuperCollector("__isformatof__", "__sort__", inner)
    self.lucene.search(query=MatchAllDocsQuery(), collector=dedup)
    self.assertEquals(1, inner.topDocs(0).totalHits)
def _collectKeys(self, filter, keyName, query):
    """Collect the keys named keyName for all docs matching query/filter."""
    if self._multithreaded:
        collector = KeySuperCollector(keyName)
    else:
        collector = KeyCollector(keyName)
    # A missing query means: collect keys for every document.
    effectiveQuery = query or MatchAllDocsQuery()
    self.search(query=effectiveQuery, filterQuery=filter,
                collector=collector)
    return collector.getCollectedKeys()
def testSilentyYieldsWrongResultWhenFieldNameDoesNotMatch(self):
    """Dedup on a nonexistent field silently leaves results untouched."""
    self._addDocument("urn:1", 2)
    inner = TopScoreDocSuperCollector(100, True)
    dedup = DeDupFilterSuperCollector("__wrong_field__", "__sort__", inner)
    self.lucene.search(query=MatchAllDocsQuery(), collector=dedup)
    self.assertEquals(1, inner.topDocs(0).totalHits)
def testMatchAllQuery(self):
    """The CQL wildcard '*' converts to a MatchAllDocsQuery."""
    expected = MatchAllDocsQuery()
    self.assertConversion(expected, '*')