def retrieve_sents(self):
    indexDir = self.indexDir
    query = self.query
    sent_ind_list = []

    fsDir = SimpleFSDirectory(Paths.get(indexDir))
    searcher = IndexSearcher(DirectoryReader.open(fsDir))

    analyzer = StandardAnalyzer()
    parser = QueryParser("contents", analyzer)
    parser.setDefaultOperator(QueryParser.Operator.OR)
    query = parser.parse(query)

    start = datetime.now()
    scoreDocs = searcher.search(query, 50).scoreDocs
    duration = datetime.now() - start

    if self.stats:
        print("Found %d sentences (in %s) that matched query '%s':" % (
            len(scoreDocs), duration, query), file=sys.stderr)

    # Collect only the internal Lucene document ids of the matching sentences
    for scoreDoc in scoreDocs:
        sent_ind_list.append(scoreDoc.doc)
    return sent_ind_list
def buscar(indexDir, args, options=None):
    # lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    fsDir = SimpleFSDirectory(File(indexDir))

    # Searcher built on the index directory supplied by the user
    searcher = IndexSearcher(DirectoryReader.open(fsDir))
    # Analyzer used to filter the tokens
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    # QueryParser that searches the "keywords" field by default,
    # holding the constraints of the search
    parser = QueryParser(Version.LUCENE_CURRENT, "keywords", analyzer)
    parser.setDefaultOperator(QueryParser.Operator.AND)

    # Join the given terms, marking each one as required (+) in the query
    command = ' +'.join(args)
    query = parser.parse(command)
    print(query)

    # Return a JArray with the results of the search
    return searcher.search(query, 200).scoreDocs
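# Hypothetical invocation of buscar() above, a sketch only: the index path is
# an assumption, and the JVM must be started by the caller because initVM()
# is commented out inside the function.
import lucene
lucene.initVM(vmargs=['-Djava.awt.headless=true'])
for scoreDoc in buscar('indice', ['python', 'lucene']):
    print(scoreDoc.doc, scoreDoc.score)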
class Searcher:
    def __init__(self, indexDir):
        self.directory = SimpleFSDirectory(Paths.get(indexDir))
        self.searcher = IndexSearcher(DirectoryReader.open(self.directory))
        self.nameQueryParser = QueryParser('name', StandardAnalyzer())
        self.nameQueryParser.setDefaultOperator(QueryParser.Operator.AND)
        self.idQueryParser = QueryParser('id', StandardAnalyzer())
        self.idQueryParser.setDefaultOperator(QueryParser.Operator.AND)

    def find_by_name(self, name):
        query = self.nameQueryParser.parse(name)
        docs = self.searcher.search(query, 100).scoreDocs
        tables = []
        for scoreDoc in docs:
            doc = self.searcher.doc(scoreDoc.doc)
            table = dict((field.name(), field.stringValue())
                         for field in doc.getFields())
            tables.append(table)
        return tables

    def find_by_id(self, id):
        query = self.idQueryParser.parse(id)
        docs = self.searcher.search(query, 100).scoreDocs
        tables = []
        for scoreDoc in docs:
            doc = self.searcher.doc(scoreDoc.doc)
            table = dict((field.name(), field.stringValue())
                         for field in doc.getFields())
            tables.append(table)
        return tables
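# A minimal usage sketch for the Searcher class above; the index path is a
# hypothetical example and the JVM is assumed not to be running yet.
import lucene
lucene.initVM(vmargs=['-Djava.awt.headless=true'])
searcher = Searcher('/tmp/table-index')
for table in searcher.find_by_name('users'):
    print(table.get('id'), table.get('name'))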
class LuceneRanker(object):
    def __init__(self, tfidf_path, strict=True):
        lucene.initVM()
        analyzer = StandardAnalyzer()
        reader = DirectoryReader.open(SimpleFSDirectory(Paths.get(tfidf_path)))
        self.searcher = IndexSearcher(reader)
        self.parser = QueryParser("text", analyzer)
        self.parser.setDefaultOperator(QueryParser.Operator.OR)

    def closest_docs(self, query, k=1):
        """Closest docs by dot product between query and documents
        in tfidf weighted word vector space.
        """
        # Strip characters that would otherwise trip up the query parser
        query = self.parser.parse(
            query.replace('/', '//').replace('?', '').replace('"', ''))
        hits = self.searcher.search(query, k)
        docids = []
        docs = []
        for hit in hits.scoreDocs:
            doc = self.searcher.doc(hit.doc)
            docs.append(doc['text'])
            docids.append(doc['title'])
        return docids, docs

    def batch_closest_docs(self, queries, k=1, num_workers=None):
        """Process a batch of closest_docs requests sequentially
        (num_workers is accepted for interface compatibility but unused).
        """
        batch = []
        for i, q in enumerate(queries):
            if i % 100 == 0:
                print(i)
            docids, docs = self.closest_docs(q, k)
            batch.append((docids, docs))
        return batch

    # The methods below only satisfy the ranker interface; Lucene handles
    # parsing and scoring internally, so they are deliberate no-ops.
    def parse(self, query):
        return None

    def text2spvec(self, query):
        return None

    def get_doc_index(self, doc_id):
        return 0

    def get_doc_id(self, doc_index):
        return 0

    def __exit__(self, *args):
        pass
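# A minimal usage sketch for LuceneRanker, assuming an index whose documents
# carry "text" and "title" fields exists at the hypothetical path below; the
# constructor itself starts the JVM via lucene.initVM().
ranker = LuceneRanker('/path/to/tfidf-index')
docids, docs = ranker.closest_docs('who wrote hamlet', k=5)
for docid, doc in zip(docids, docs):
    print(docid, doc[:80])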
def perform_search(self, searchterm):
    # Parse the query
    parser = QueryParser(Version.LUCENE_CURRENT, "content", self.analyzer)
    parser.setDefaultOperator(QueryParser.Operator.AND)
    query = parser.parse(searchterm)

    # Conduct and time the search
    searcher = IndexSearcher(DirectoryReader.open(self.store))
    start = datetime.now()
    scoreDocs = searcher.search(query, 50).scoreDocs
    duration = datetime.now() - start

    print(scoreDocs)
    print(duration)
def getQueryParser(self):
    analyzers = self.getSearchingAnalyzers()

    # Map each field to its analyzer; unmapped fields fall back to the default
    analyzerMap = HashMap()
    analyzerMap.put('name', analyzers['name'])
    analyzerMap.put('parent', analyzers['parent'])
    analyzerMap.put('content', analyzers['default'])
    analyzerMap.put('id', analyzers['id'])
    analyzerWrapper = PerFieldAnalyzerWrapper(analyzers['default'], analyzerMap)

    queryParser = QueryParser(Version.LUCENE_CURRENT, 'content', analyzerWrapper)
    queryParser.setAutoGeneratePhraseQueries(PHRASE_QUERY_BY_DEFAULT)
    queryParser.setPhraseSlop(PHRASE_SLOP)
    queryParser.setFuzzyMinSim(FUZZY_MIN_SIM)
    queryParser.setDefaultOperator(DEFAULT_OPERATOR)
    return queryParser
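# A standalone sketch of the same per-field wiring with concrete analyzers
# (import paths assume a Lucene 4.x-era PyLucene; the 'id' field choice is
# illustrative): the wrapper answers with KeywordAnalyzer for 'id' and falls
# back to StandardAnalyzer for every other field.
from java.util import HashMap
from org.apache.lucene.analysis.core import KeywordAnalyzer
from org.apache.lucene.analysis.miscellaneous import PerFieldAnalyzerWrapper
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.util import Version

perField = HashMap()
perField.put('id', KeywordAnalyzer())  # ids are matched verbatim, not tokenized
wrapper = PerFieldAnalyzerWrapper(StandardAnalyzer(Version.LUCENE_CURRENT), perField)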
def search(termo, **args):
    indexDir = os.environ.get('MANDEX') or '3iteracao'
    fsDir = SimpleFSDirectory(File(indexDir))
    searcher = IndexSearcher(DirectoryReader.open(fsDir))

    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    # `field` (the default search field) must be defined in the enclosing scope
    parser = QueryParser(Version.LUCENE_CURRENT, field, analyzer)
    parser.setDefaultOperator(QueryParser.Operator.OR)
    # Join the main term and any extra keyword-argument values with spaces
    query = parser.parse(' '.join([termo] + list(args.values())))

    start = datetime.now()
    scoreDocs = searcher.search(query, 50).scoreDocs
    duration = datetime.now() - start

    politicos = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        table = dict((field.name(), field.stringValue())
                     for field in doc.getFields())
        politicos.append(table)
    return politicos
        format = a
    elif o == "--index":
        indexDir = a
    elif o == "--stats":
        stats = True


class CustomTemplate(Template):
    delimiter = '#'


template = CustomTemplate(format)

fsDir = SimpleFSDirectory(Paths.get(indexDir))
searcher = IndexSearcher(DirectoryReader.open(fsDir))

analyzer = StandardAnalyzer()
parser = QueryParser("keywords", analyzer)
parser.setDefaultOperator(QueryParser.Operator.AND)
query = parser.parse(' '.join(args))

start = datetime.now()
scoreDocs = searcher.search(query, 50).scoreDocs
duration = datetime.now() - start

if stats:
    print("Found %d document(s) (in %s) that matched query '%s':" % (
        len(scoreDocs), duration, query), file=sys.stderr)

for scoreDoc in scoreDocs:
    doc = searcher.doc(scoreDoc.doc)
    table = dict((field.name(), field.stringValue())
                 for field in doc.getFields())
    print(template.substitute(table))
def search(request):
    query = request.GET.get('q', None)
    page = int(request.GET.get('page', 1))
    perPage = 5
    nodes = []
    usage = {}
    usage["time"] = time.time()

    if not query:
        count = 0
        nodes = []
        keywords = []
    else:
        conn = MongoClient('localhost')
        db = conn.sina
        CACHE = db.cache
        keywords = query.split(' ')
        cache = CACHE.find_one({"query": keywords, "page": page})
        if cache is None:
            print("query cache not found")
            VM_ENV.attachCurrentThread()
            fsDir = SimpleFSDirectory(File(settings.ROOT_DIR + '/index'))
            searcher = IndexSearcher(DirectoryReader.open(fsDir))
            analyzer = SmartChineseAnalyzer(Version.LUCENE_CURRENT)
            parser = QueryParser(Version.LUCENE_CURRENT, 'text', analyzer)
            parser.setDefaultOperator(QueryParser.Operator.AND)
            lucene_query = parser.parse(query)
            scoreDocs = searcher.search(lucene_query, 3000000).scoreDocs

            # Collect the node ids stored in the matching documents
            ids = []
            for scoreDoc in scoreDocs:
                doc = searcher.doc(scoreDoc.doc)
                for field in doc.getFields():
                    ids.append(field.stringValue())
            print("got ids from lucene", len(ids))
            ids = [int(x) for x in ids]

            # Fetch one page of matching nodes from MongoDB, ordered by in-degree
            NODES = conn.sina.nodes
            count = 0
            for n in NODES.find({"node_id": {"$in": ids}}).sort(
                    "in_degree", -1).skip((page - 1) * perPage):
                count += 1
                print("doing", n["node_id"], count, "/", perPage)
                n["js"] = similarity(n["node_id"], topk=10)
                nodes.append(n)
                if len(nodes) == perPage:
                    break
            count = len(ids)
            CACHE.insert({"query": keywords, "page": page,
                          "cache": nodes, "count": len(ids)})
            usage["isCache"] = False
        else:
            print("found query cache")
            usage["isCache"] = True
            nodes = cache["cache"]
            count = cache["count"]

    pagenav = {}
    if page == 1:
        pagenav["has_pre"] = None
    else:
        pagenav["has_pre"] = page - 1
    if page > count // perPage:
        pagenav["has_next"] = None
    else:
        pagenav["has_next"] = page + 1
    pagenav["page"] = page
    usage["time"] = time.time() - usage["time"]

    return {
        'q': request.GET.get('q', ''),
        'keywords': keywords,
        'nodes': nodes,
        'count': count,
        'page': pagenav,
        'usage': usage,
    }
class TASearcher():
    def __init__(self, queries=[], criteria=[], conjunctions=[],
                 orderby=["ta"], ascending=True, limit=10000):
        vm.attachCurrentThread()
        self.queries = [query for query in queries if len(query.strip()) > 0]
        self.criteria = criteria
        self.conjunctions = conjunctions
        self.orderby = orderby
        self.ascending = ascending
        self.queryString = ""
        self.limit = limit
        # `fields` is expected to be defined at module scope
        self.fields = fields

        self.analyzer = PorterStemmerAnalyzer()
        self.queryParser = QueryParser(Version.LUCENE_30, "freetext", self.analyzer)
        self.queryParser.setAllowLeadingWildcard(True)
        self.queryParser.setDefaultOperator(QueryParser.Operator.AND)

        indexDir = settings.LUCENE_INDEX_DIRECTORY
        self.index = MMapDirectory(File(indexDir))

    def createQueryString(self):
        # Simple search: match the free text fields
        if len(self.criteria) == 0:
            self.queryString = "(%s) OR freetext-normalized:(%s)" % (
                self.queries[0], self.queries[0])
        # Advanced search: pair each criterion with its query
        else:
            queryPairs = []
            criteriaQueries = list(zip(self.criteria, self.queries))
            self.criteria = list(dict(criteriaQueries).keys())
            for criterion, query in criteriaQueries:
                if criterion in ("volume", "number", "category-label",
                                 "pubtype", "author-sort"):
                    queryPairs.append("%s:%s" % (criterion, query))
                elif criterion == "year":
                    queryPairs.append("year-start:%s OR year-end:%s" % (query, query))
                else:
                    queryPairs.append('%s:%s OR %s-normalized:%s' % (
                        criterion, query, criterion, query))
            try:
                queryString = "%s %s" % (queryPairs[0], " ".join(
                    ["%s (%s)" % (conj, pair) for conj, pair in
                     zip(self.conjunctions, queryPairs[1:])]))
                self.queryString = queryString
                return queryString
            except IndexError:
                self.queryString = "freetext"
        return self.queryString

    def getQueryString(self):
        return self.queryString

    def _getHits(self):
        reader = IndexReader.open(self.index)
        searcher = IndexSearcher(reader)

        # Sort by volume and entry number (values are compared as integers)
        sortDict = {
            "ta": (("volume", SortField.Type.INT),
                   ("number", SortField.Type.INT)),
            "year": (("year-start", SortField.Type.INT),
                     ("year-end", SortField.Type.INT)),
            "author-title": (("author-sort", SortField.Type.STRING),
                             ("title-sort", SortField.Type.STRING)),
            "title": (("title-sort", Locale.GERMAN),),
            "author": (("author-sort", Locale.GERMAN),),
        }
        sortFields = []
        reverse = not self.ascending
        for name in self.orderby:
            for fieldName, typeNum in sortDict.get(name, []):
                sortFields.append(SortField(fieldName, typeNum, reverse))
        if len(sortFields) == 0:
            sortFields = [SortField("volume", SortField.Type.INT),
                          SortField("number", SortField.Type.INT)]
        sort = Sort(sortFields)

        topDocs = searcher.search(self.query, None, 80000, sort)
        hits = topDocs.scoreDocs
        self.hits = hits
        self.searcher = searcher

        lang = translation.get_language()
        if lang != "de":
            lang = "en"
        facets = {"author": {}, "pubtype": {}, "category-%s" % lang: {}}

        # Highlighting
        highlighter = Highlighter(
            SimpleHTMLFormatter('<span class="highlight">', '</span>'),
            QueryScorer(self.query))

        hitObjects = []
        for hit in hits:
            fields = {}
            doc = searcher.doc(hit.doc)
            fields["score"] = hit.score
            fields["volume"] = doc["volume"]
            fields["number"] = doc["number"]
            fields["id"] = doc["id"]
            fields["title"] = doc["title"]
            fields["author"] = doc["author"]
            fields["authors"] = [field.stringValue()
                                 for field in doc.getFields("author")]
            for author in fields["authors"]:
                facets["author"][author] = facets["author"].get(author, 0) + 1
            fields["categories"] = [field.stringValue()
                                    for field in doc.getFields("category-%s" % lang)]
            for cat in fields["categories"]:
                facets["category-%s" % lang][cat] = \
                    facets["category-%s" % lang].get(cat, 0) + 1

            maxNumFragmentsRequired = 2
            fragmentSeparator = "..."
            pubtype = doc["pubtype"]
            fields["pubtype"] = pubtype
            facets["pubtype"][pubtype] = facets["pubtype"].get(pubtype, 0) + 1
            fields["city"] = doc["city"]
            fields["year"] = doc["year-start"]
            if fields["year"] and doc["year-end"] and doc["year-end"] != fields["year"]:
                fields["year"] += " - " + doc["year-end"]

            highlightFields = ("title", "author", "city", "year", "category")
            if "freetext" in self.criteria:
                for fieldName in highlightFields:
                    # Highlighting is best-effort: skip fields that fail
                    try:
                        tokenStream = self.analyzer.tokenStream(
                            fieldName, lucene.StringReader(fields[fieldName]))
                        newVal = highlighter.getBestFragments(
                            tokenStream, fields[fieldName],
                            maxNumFragmentsRequired, fragmentSeparator)
                        if len(newVal) > 0:
                            fields[fieldName] = newVal
                    except:
                        continue
            for fieldName in highlightFields:
                if (fieldName in self.criteria
                        or fieldName + "-de" in self.criteria
                        or fieldName + "-en" in self.criteria):
                    try:
                        tokenStream = self.analyzer.tokenStream(
                            fieldName, lucene.StringReader(fields[fieldName]))
                        newVal = highlighter.getBestFragments(
                            tokenStream, fields[fieldName],
                            maxNumFragmentsRequired, fragmentSeparator)
                        if len(newVal) > 0:
                            fields[fieldName] = newVal
                    except:
                        continue

            hitObjects.append(
                Hit(fields["id"], fields["volume"], fields["number"],
                    fields["title"], fields["author"], fields["city"],
                    fields["year"], fields["categories"], fields["pubtype"],
                    fields["score"]))

        # Keep only facets with at least two values; cap each at 25 entries
        facetsToDelete = []
        for facet in facets:
            if len(facets[facet]) < 2:
                facetsToDelete.append(facet)
                continue
            values = sorted(facets[facet].items(), key=itemgetter(0))
            values = sorted(values, key=itemgetter(1), reverse=True)
            facets[facet] = values[:25]
        for facet in facetsToDelete:
            del facets[facet]
        self.facets = facets

        reader.close()
        self.hitObjects = hitObjects
        return hitObjects

    def search(self):
        self.createQueryString()
        querystr = self.getQueryString()
        self.query = self.queryParser.parse(querystr)
        return self._getHits()

    def getAll(self):
        self.query = MatchAllDocsQuery()
        return self._getHits()
    raise Exception(
        'At least one between TAGS_AND_GENRES and DESCR should be True')

lucene.initVM(vmargs=['-Djava.awt.headless=true'])
fsDir = SimpleFSDirectory(Paths.get('index'))
searcher = IndexSearcher(DirectoryReader.open(fsDir))
if CLASSIC_SIMILARITY:
    searcher.setSimilarity(ClassicSimilarity())

analyzer = EnglishAnalyzer()
tags_parser = QueryParser(TAGS_LABEL, analyzer)
genres_parser = QueryParser(GENRES_LABEL, analyzer)
descr_parser = QueryParser(DESCR_LABEL, analyzer)
tags_parser.setDefaultOperator(QueryParser.Operator.OR)
genres_parser.setDefaultOperator(QueryParser.Operator.OR)
descr_parser.setDefaultOperator(QueryParser.Operator.OR)

# Prevents the 1024-clause limit error for very long queries
BooleanQuery.setMaxClauseCount(2000000)

############################## Build user queries ##########################

ratings = ML1M('../datasets/ml-1m').ratings
movies_descriptions = pd.read_csv('../datasets/movies-descriptions.csv')
movies_tags = pd.read_csv('../datasets/movies-tags.csv')
movies_genres = pd.read_csv('../datasets/movies-genres.csv')
users = set(ratings[['user']].values.flatten())
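# Hypothetical next step, sketching why the clause limit above was raised:
# fold all tags of a user's liked movies into one long OR query (the
# `movie`/`tag` column names and the liked-movie ids are assumptions).
liked = [1, 2, 3]
user_tags = movies_tags[movies_tags['movie'].isin(liked)]['tag']
profile = ' OR '.join(QueryParser.escape(str(t)) for t in user_tags)
top_docs = searcher.search(tags_parser.parse(profile), 10)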