def _create_query(self, fields):
    """
    Build query with Term, Phrase and Fuzzy clauses.

    :param fields: list of (field, text) tuples
    :return: query
    """
    query = BooleanQuery()
    for (field, text) in fields:
        if field.startswith("year"):
            start, end = text.split(",")
            numeric_query = NumericRangeQuery.newIntRange(
                'year', int(start), int(end), True, True)
            query.add(BooleanClause(numeric_query, BooleanClause.Occur.MUST))
        if field == 'title':
            spans = []
            for word in text.lower().split():
                spans.append(SpanTermQuery(Term(field, word)))
            query.add(BooleanClause(SpanNearQuery(spans, 2, True),
                                    BooleanClause.Occur.SHOULD))

    field_names, field_texts = zip(*fields)
    flags = [BooleanClause.Occur.MUST] * len(field_names)
    query_parser_query = MultiFieldQueryParser.parse(
        Version.LUCENE_CURRENT, field_texts, field_names, flags,
        StandardAnalyzer(Version.LUCENE_CURRENT))
    query.add(BooleanClause(query_parser_query, BooleanClause.Occur.MUST))

    # Append "~1" to every term to allow an edit distance of one (fuzzy match).
    fuzzify = lambda s: (s + " ").replace(" ", "~1 ")
    fuzzy_field_texts = map(fuzzify, field_texts)
    fuzzy_query_parser_query = MultiFieldQueryParser.parse(
        Version.LUCENE_CURRENT, fuzzy_field_texts, field_names, flags,
        StandardAnalyzer(Version.LUCENE_CURRENT))
    query.add(BooleanClause(fuzzy_query_parser_query, BooleanClause.Occur.MUST))

    # Boost documents by their (scaled) vote count via a function query.
    boostQuery = FunctionQuery(
        LinearFloatFunction(
            PowFloatFunction(
                DoubleConstValueSource(0.0001),
                ScaleFloatFunction(IntFieldSource("imdb_votes_boost"), 0.0, 1.0)),
            -1.0, 1.0))
    query = CustomScoreQuery(query, boostQuery)
    return query
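# A minimal usage sketch for _create_query (hypothetical values; assumes an
# instance `engine` of the class above and an open IndexSearcher `searcher`
# over an index with "title", "year" and "imdb_votes_boost" fields):
fields = [("title", "the godfather"), ("year", "1970,1975")]
query = engine._create_query(fields)
top_docs = searcher.search(query, 10)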
def search(searcher, analyzer, directory, query2):
    print
    print "Empty to quit."
    # command = raw_input("Query: ")  # raw_input for an interactive query
    command = query2
    if command == '':
        return
    print
    print "Searching for", command
    # `fields` is expected to be defined at module level.
    parserVar = MultiFieldQueryParser(fields, analyzer)
    parserVar.setDefaultOperator(QueryParserBase.AND_OPERATOR)
    query = MultiFieldQueryParser.parse(parserVar, command)
    # The second argument caps the number of matching documents returned.
    scoreDocs = searcher.search(query, 10).scoreDocs
    print "total matching documents: " + str(len(scoreDocs))
    results = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        print "@" + doc.get("u_name") + ": " + doc.get("tweet") + \
              " Score:" + str(scoreDoc.score)
        docData = {}
        docData['u_name'] = doc.get("u_name")
        docData['tweet'] = doc.get("tweet")
        docData['score'] = str(scoreDoc.score)
        results.append(docData)
    print
    print "\n------------------------------------------------------"
    return results
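# Note on the recurring MultiFieldQueryParser.parse(parser, text) idiom used
# throughout these snippets: the Java class defines static parse() overloads
# that shadow the instance parse(String) inherited from QueryParserBase, so
# calling parser.parse(text) from PyLucene fails to resolve. Passing the
# parser instance explicitly as the first argument selects the inherited
# single-string overload (see the stackoverflow links cited further below).
# A sketch, assuming an analyzer is available; older Lucene versions also
# take a Version constant as the first constructor argument:
parser = MultiFieldQueryParser(["u_name", "tweet"], analyzer)
parser.setDefaultOperator(QueryParserBase.AND_OPERATOR)
q = MultiFieldQueryParser.parse(parser, "lucene AND pylucene")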
def searchForClass(self, inst, pred):
    classUris = list()
    fields = [FreyaConstants.INST_FEATURE_LKB, FreyaConstants.PROPERTY_FEATURE_LKB]
    flags = [BooleanClause.Occur.MUST, BooleanClause.Occur.MUST]
    queries = ["\"" + QueryParser.escape(inst) + "\"",
               "\"" + QueryParser.escape(pred) + "\""]
    try:
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT, queries,
                                            fields, flags, analyzer)
        result = self._searcher.search(query, 1)
        logging.info("For " + str(query) + " : " + str(result.totalHits))
        freq = result.totalHits
        if freq > 0:
            # Re-run the search asking for all hits.
            result = self._searcher.search(query, freq)
            hits = pyJava.JArray2List(result.scoreDocs)
            for hit in hits:
                doc = self._searcher.doc(hit.doc)
                classUris.append(doc.get(FreyaConstants.CLASS_FEATURE_LKB))
    except Exception as e:
        print e.message
        logging.error("Error searching for class URIs")
    return classUris
def findLiteral(self, instanceUri, propertyURI):
    labels = list()
    try:
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        fields = [FreyaConstants.INST_FEATURE_LKB, FreyaConstants.PROPERTY_FEATURE_LKB]
        flags = [BooleanClause.Occur.MUST, BooleanClause.Occur.MUST]
        labelOrTitleUris = "\"" + propertyURI + "\""
        queries = ["\"" + QueryParser.escape(instanceUri) + "\"",
                   QueryParser.escape(labelOrTitleUris)]
        query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT, queries,
                                            fields, flags, analyzer)
        result = self._searcher.search(query, 1)
        logging.debug("For " + str(query) + " : " + str(result.totalHits))
        freq = result.totalHits
        if freq > 0:
            # Re-run the search asking for all hits.
            result = self._searcher.search(query, freq)
            hits = pyJava.JArray2List(result.scoreDocs)
            for hit in hits:
                doc = self._searcher.doc(hit.doc)
                labels.append(doc.get(FreyaConstants.FIELD_EXACT_CONTENT))
    except Exception as e:
        print e.message
        logging.error("Error retrieving literals")
    return labels
def retrieve(indexdir, queries):
    lucene.initVM()
    f = open("results_lucene.txt", "w")
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(indexdir)))
    searcher = IndexSearcher(reader)

    fields = ["title", "abstract", "authors"]
    st = PorterStemmer()
    for id, q in queries.iteritems():
        # Tokenize and stem the query so it matches the stemmed index terms.
        tokenizer = RegexpTokenizer(r'\w+')
        qwords = tokenizer.tokenize(q)
        qwords_k = [st.stem(w) for w in qwords]
        query = " ".join(qwords_k)

        parser = MultiFieldQueryParser(Version.LUCENE_4_10_1, fields, analyzer)
        parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)
        query = MultiFieldQueryParser.parse(parser, query)

        MAX = 1000
        hits = searcher.search(query, MAX)
        # Write results in TREC format: query_id Q0 doc_id rank score run_tag
        for i, hit in enumerate(hits.scoreDocs):
            f.write("%s Q0 %s %s %s G17R3\n" % (id, hit.doc + 1, i + 1, hit.score))
    f.close()
def findSubClasses(self, classUri):
    # TODO: resolve the MultiFieldQueryParser documentation problem.
    propertyURI = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
    subClasses = list()
    try:
        analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        fields = [FreyaConstants.CLASS_FEATURE_LKB, FreyaConstants.PROPERTY_FEATURE_LKB]
        flags = [BooleanClause.Occur.MUST, BooleanClause.Occur.MUST]
        subClassUri = "\"" + QueryParser.escape(propertyURI) + "\""
        queries = ["\"" + QueryParser.escape(classUri) + "\"", subClassUri]
        query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT, queries,
                                            fields, flags, analyzer)
        result = self._searcher.search(query, 1)
        logging.debug("For " + str(query) + " : " + str(result.totalHits))
        freq = result.totalHits
        if freq > 0:
            # Re-run the search asking for all hits.
            result = self._searcher.search(query, freq)
            hits = pyJava.JArray2List(result.scoreDocs)
            for hit in hits:
                doc = self._searcher.doc(hit.doc)
                subClasses.append(doc.get(FreyaConstants.INST_FEATURE_LKB))
    except Exception as e:
        print e.message
        logging.error("Error retrieving subclasses")
    return subClasses
def search(self, input_query=None, max_answers=10):
    '''Search the given query in the index.'''
    if input_query is None:
        return None

    base_dir = '.'
    directory = SimpleFSDirectory(File(os.path.join(base_dir, self.index_dir)))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    parser = MultiFieldQueryParser(Version.LUCENE_CURRENT,
                                   (self._posts_fields + self._answer_fields),
                                   analyzer)
    query = MultiFieldQueryParser.parse(parser, input_query)

    scoreDocs = searcher.search(query, max_answers).scoreDocs
    print "%s total matching documents." % len(scoreDocs)

    docs = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        doc_dict = dict((field.name(), field.stringValue())
                        for field in doc.getFields())
        docs.append(doc_dict)
    return docs
def SearchQuery(queryString, fields, classification):
    location = web.__path__[0] + "/static/web/files/index/index.articles"
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)

    # Multi-field query:
    # http://stackoverflow.com/questions/2005084/how-to-specify-two-fields-in-lucene-queryparser
    query = MultiFieldQueryParser(Version.LUCENE_4_10_1, fields, analyzer)
    query = MultiFieldQueryParser.parse(query, queryString)

    MAX = 10000
    hits = searcher.search(query, MAX)
    print "Found %d document(s) that matched query '%s':" % (hits.totalHits, query)

    pmcids = []
    documentDict = {}
    for hit in hits.scoreDocs:
        doc = searcher.doc(hit.doc)
        pmcids.append(doc.get("pmcid"))
        docDict = {"title": doc.get("title")}  # any other stored field can be added here
        documentDict[doc.get("pmcid")] = docDict

    # Fetch the images for all the pmcids in the given classification.
    images = get_image_pmcid(pmcids, classification)

    # Build a dictionary of images keyed by pmcid.
    imagesDict = {}
    for img in images:
        img_pmcid = img.get("pmcid")
        if img_pmcid in imagesDict:
            imagesDict[img_pmcid].append(img.get("filepath") + "/" + img.get("figureid"))
        else:
            imagesDict[img_pmcid] = [img.get("filepath") + "/" + img.get("figureid")]

    # Assign one image (or a placeholder) to each pmcid for the search results.
    for pmcid in pmcids:
        docDict = documentDict[pmcid]
        if pmcid in imagesDict:
            docDict["imgURL"] = imagesDict[pmcid][0]
        else:
            docDict["imgURL"] = "images/NoImageAvailable.jpg"
        documentDict[pmcid] = docDict

    return documentDict
def search_lucene(fields_, terms_, requirements_, searcher, index=0):
    terms = []
    fields = []
    requirements = []
    for (i, x) in enumerate(terms_):
        terms.append(x[index])
        fields.append(fields_[i][index])
        requirements.append(requirements_[i][index])
    sys.stdout.write(
        "Running query %s: (\"%s\") in fields (%s) with requirements (%s)\n"
        % (sym2name[index], "\",\"".join(terms), ",".join(fields),
           ",".join([sym2name[str(x)] for x in requirements])))
    query = MultiFieldQueryParser.parse(Version.LUCENE_4_10_1, terms, fields,
                                        requirements, analyzer2)
    return (terms, fields, requirements, searcher.search(query, NUM_TO_RETRIEVE))
def multiFieldsSearch(self, query, sim):
    lucene.getVMEnv().attachCurrentThread()
    parser = MultiFieldQueryParser(
        ["content_section", "title_section", "title_article"], self.analyzer)
    parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)
    query = MultiFieldQueryParser.parse(parser, QueryParser.escape(query))
    self.searcher.setSimilarity(sim)
    hits = self.searcher.search(query, 6).scoreDocs
    return hits
def explain(self, query, fields, doc):
    if not self.searcher:
        self.open_searcher()
    query = QueryParser.escape(query)
    parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields, self.analyzer)
    query = MultiFieldQueryParser.parse(parser, query)
    return self.searcher.explain(query, doc)
def parse_query(self, query_string, order_matters=True):
    query_parser = MultiFieldQueryParser(Version.LUCENE_CURRENT,
                                         ["title", "qbody"], self.analyzer)
    if order_matters:
        # Take the order of the query terms into account.
        base_query = getSpanNearQuery(self.analyzer, query_string)
    else:
        # Treat the query keywords as a bag of words.
        base_query = query_parser.parse(query_string)
    # Boost documents by view count:
    # http://shaierera.blogspot.com/2013/09/boosting-documents-in-lucene.html
    boost_query = FunctionQuery(LongFieldSource("view_count"))
    self.query = CustomScoreQuery(base_query, boost_query)
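# getSpanNearQuery is referenced above but not defined in this snippet. A
# possible implementation (a sketch, modeled on the span construction in
# _create_query near the top of this section; the default field name and
# slop are assumptions):
def getSpanNearQuery(analyzer, query_string, field="qbody", slop=2):
    clauses = [SpanTermQuery(Term(field, word))
               for word in query_string.lower().split()]
    # slop = max positions allowed between terms; True = require in-order match
    return SpanNearQuery(clauses, slop, True)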
def scents_search(former, mid, last):
    query = ''.join(former) + ' ' + ''.join(mid) + ' ' + ''.join(last)
    fields = ["former_scents", "mid_scents", "last_scents"]
    parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields, analyzer)
    parser.setDefaultOperator(QueryParserBase.AND_OPERATOR)
    query = MultiFieldQueryParser.parse(parser, query)
    return query
def brand_scent_search(brand, scent):
    query = brand + ' ' + ''.join(scent)
    fields = ["name", "scents"]
    parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields, analyzer)
    parser.setDefaultOperator(QueryParserBase.AND_OPERATOR)
    query = MultiFieldQueryParser.parse(parser, query)
    return query
def multiFieldsPairSearch(self, pair, sim):
    """Search using the content_section and title_article fields."""
    # Now search the index:
    title = pair[0].replace('_', ' ')
    content = pair[1]
    parser = MultiFieldQueryParser(["content_section", "title_article"],
                                   self.analyzer)
    parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)
    query1 = MultiFieldQueryParser.parse(parser, QueryParser.escape(title))
    query2 = MultiFieldQueryParser.parse(parser, QueryParser.escape(content))
    # The title must match (FILTER: required but not scored); the content
    # clause contributes to the score.
    bq = BooleanQuery.Builder()
    bq.add(query1, BooleanClause.Occur.FILTER)
    bq.add(query2, BooleanClause.Occur.SHOULD)
    self.searcher.setSimilarity(sim)
    hits = self.searcher.search(bq.build(), 6).scoreDocs
    return hits
def search_samples_lucene(sample_map, sampleq, sample_set, ra,
                          stream_sample_metadata=False):
    (fields, queries, booleans) = lucene_sample_query_parse(sampleq)
    query = MultiFieldQueryParser.parse(Version.LUCENE_4_10_1, queries, fields,
                                        booleans, snapconf.LUCENE_ANALYZER)
    hits = searcher.search(query, snapconf.LUCENE_MAX_SAMPLE_HITS)
    # If we get nothing, try again with the backup analyzer.
    if hits.totalHits == 0:
        query = MultiFieldQueryParser.parse(Version.LUCENE_4_10_1, queries,
                                            fields, booleans,
                                            snapconf.LUCENE_BACKUP_ANALYZER)
        hits = searcher.search(query, snapconf.LUCENE_MAX_SAMPLE_HITS)
    if DEBUG_MODE:
        sys.stderr.write("Found %d document(s) that matched query '%s':\n"
                         % (hits.totalHits, sampleq))
    if stream_sample_metadata:
        sys.stdout.write("DataSource:Type\tLucene TF-IDF Score\t%s\n"
                         % (snapconf.SAMPLE_HEADER))
    for hit in hits.scoreDocs:
        doc = searcher.doc(hit.doc)
        sid = doc.get(snapconf.SAMPLE_ID_FIELD_NAME)
        # Track the sample ids if asked to.
        if sid is not None and len(sid) >= 1:
            if sample_set is not None:
                sample_set.add(sid)
            # Stream back the full sample metadata record from the in-memory map.
            if stream_sample_metadata:
                sys.stdout.write("%s:S\t%s\t%s\n"
                                 % (snapconf.DATA_SOURCE, str(hit.score),
                                    sample_map[sid]))
def getTrainingData(searcher, analyzer, Sno, keyterm):
    query = str(Sno) + ' ' + keyterm
    escaped_string = MultiFieldQueryParser.escape(query)
    parsed_query = MultiFieldQueryParser.parse(
        escaped_string,
        ["Sno", "keyterm"],
        [BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD],
        analyzer)
    start = datetime.now()
    scoreDocs = searcher.search(parsed_query, 1).scoreDocs
    duration = datetime.now() - start
    result = ''
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        result = str(doc.get("text"))
    return result
def search(self, query):
    lucene.initVM()
    luceneDirectory = "/index/"
    path = str(os.path.abspath(os.getcwd()) + luceneDirectory)
    directory = FSDirectory.open(Paths.get(path))
    reader = DirectoryReader.open(directory)
    searcher = IndexSearcher(reader)
    analyzer = StandardAnalyzer()

    print("Searching for '" + query + "'")

    fields_to_search = ["text", "page title", "date"]
    filter_date = 'date:"May 25"'
    # Restrict results to the given date in addition to the user query.
    filtered_query = filter_date + " AND " + query

    parser = MultiFieldQueryParser(fields_to_search, analyzer)
    updated_query = MultiFieldQueryParser.parse(parser, filtered_query)
    scored_documents = searcher.search(updated_query, 10).scoreDocs

    print("Found " + str(len(scored_documents)) + " matches in the collection.")

    results = []
    for doc in scored_documents:
        scoredTweet = dict()
        scoredTweet['score'] = doc.score
        result = searcher.doc(doc.doc)
        scoredTweet['username'] = result.get("username")
        scoredTweet['tweet_body'] = result.get("text")
        scoredTweet['date'] = result.get("date")
        results.append(scoredTweet)
        print(scoredTweet)

    return results
def multiFieldsSearch(self, query, sim):
    """Search using the content_section and title_article fields."""
    # Now search the index:
    lucene.getVMEnv().attachCurrentThread()
    parser = MultiFieldQueryParser(["content_section", "title_article"],
                                   self.analyzer)
    parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)
    query = MultiFieldQueryParser.parse(parser, QueryParser.escape(query))
    self.searcher.setSimilarity(sim)
    hits = self.searcher.search(query, 6).scoreDocs
    return hits
def get_sorted_results(self, query):
    SHOULD = BooleanClause.Occur.SHOULD
    parsed_query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT, query,
                                               ['docno', 'content'],
                                               [SHOULD, SHOULD], self.analyzer)
    reader = IndexReader.open(self.directory)
    searcher = IndexSearcher(reader)
    searcher.setSimilarity(BM25Similarity())

    topDocs = searcher.search(parsed_query, 10)
    for j, i in enumerate(topDocs.scoreDocs):
        d = searcher.doc(i.doc)
        print 'No. %02d: ' % (j + 1) + d['docno'] + ' ' + str(i.score)
def test_searchDocumentsWithMultiField(self):
    """Test searching with MultiFieldQueryParser."""
    self.test_indexDocument()
    store = self.openStore()
    searcher = None
    try:
        searcher = self.getSearcher(store)
        SHOULD = BooleanClause.Occur.SHOULD
        query = MultiFieldQueryParser.parse("value", ["title", "docid"],
                                            [SHOULD, SHOULD],
                                            self.getAnalyzer())
        topDocs = searcher.search(query, 50)
        self.assertEqual(1, topDocs.totalHits)
    finally:
        self.closeStore(store)
def search_samples_lucene(sample_map, sampleq, sample_set,
                          stream_sample_metadata=False):
    (fields, queries, booleans) = lucene_sample_query_parse(sampleq)
    query = MultiFieldQueryParser.parse(Version.LUCENE_4_10_1, queries, fields,
                                        booleans, analyzer)
    # query = MultiFieldQueryParser.parse(Version.LUCENE_4_10_1,
    #     ['human AND adult AND brain'], ['description_t'],
    #     [BooleanClause.Occur.MUST], analyzer)
    hits = searcher.search(query, snapconf.LUCENE_MAX_SAMPLE_HITS)
    if DEBUG_MODE:
        sys.stderr.write("Found %d document(s) that matched query '%s':\n"
                         % (hits.totalHits, sampleq))
    if stream_sample_metadata:
        sys.stdout.write("DataSource:Type\t%s\n" % (snapconf.SAMPLE_HEADER))
    for hit in hits.scoreDocs:
        doc = searcher.doc(hit.doc)
        sid = doc.get("intropolis_sample_id_i")
        # Track the sample ids if asked to.
        if sample_set is not None:
            sample_set.add(sid)
        # Stream back the full sample metadata record from the in-memory map.
        if stream_sample_metadata:
            sys.stdout.write("%s:S\t%s\n" % (snapconf.DATA_SOURCE, sample_map[sid]))
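# lucene_sample_query_parse is not shown in these snippets. Judging from its
# call sites (parallel lists of per-field query strings, field names and
# occurrence flags, as in the commented-out example above), a minimal sketch
# might look like this, assuming a "field:text" clause syntax joined by AND:
def lucene_sample_query_parse(sampleq):
    fields = []
    queries = []
    booleans = []
    for clause in sampleq.split(' AND '):
        field, _, text = clause.partition(':')
        fields.append(field)
        queries.append(text)
        booleans.append(BooleanClause.Occur.MUST)
    return (fields, queries, booleans)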
def search(self, terms, n_hits=5):
    """Run a search query."""
    # TODO: support date range queries.
    # Build the query. Terms are combined with OR unless they carry modifiers.
    parser = MultiFieldQueryParser(['fullpath', 'body'], self.analyzer)
    # parser.setDefaultOperator(QueryParser.Operator.AND)
    query = MultiFieldQueryParser.parse(parser, terms)
    # https://stackoverflow.com/a/26853987/130164

    # Create a highlighter that marks matched fragments with '*'.
    highlighter = Highlighter(SimpleHTMLFormatter('*', '*'), QueryScorer(query))

    # Execute the search and post-process the top N hits.
    return [
        self._process_search_result(result, highlighter)
        for result in self.searcher.search(query, n_hits).scoreDocs
    ]
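# _process_search_result is left undefined above. A sketch of what it might
# do (the return shape and field handling are assumptions; it uses the
# "fullpath" and "body" fields named in the parser above):
def _process_search_result(self, result, highlighter):
    doc = self.searcher.doc(result.doc)
    body = doc.get("body") or ""
    # Highlight the best matching fragment of the stored body text.
    snippet = highlighter.getBestFragment(self.analyzer, "body", body)
    return {"path": doc.get("fullpath"),
            "score": result.score,
            "snippet": snippet}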
def search(self, query):
    SHOULD = BooleanClause.Occur.SHOULD
    q = MultiFieldQueryParser.parse(query, FIELDS, [SHOULD, SHOULD],
                                    StandardAnalyzer())
    topHits = 100
    scores = self._indexSearcher.search(q, topHits).scoreDocs
    results = []
    # Collect at most the top 10 hits as (rank, doc id, filename, contents).
    for i in range(min(10, len(scores))):
        doc = self._indexSearcher.doc(scores[i].doc)
        results.append((i + 1, scores[i].doc, doc.get("filename"),
                        doc.get("contents")))
    return results
def preprocess_query(self, query, fields, mode="ANY"):
    '''
    Fix the query according to the provided mode. If the value is not
    supported, the query remains unchanged.
    '''
    terms = query.lower().strip().split()
    if mode == "ANY":
        query = " OR ".join(terms)
    elif mode == "ALL":
        query = " AND ".join(terms)
    else:
        print "Invalid mode parameter '%s'." % mode
    query = QueryParser.escape(query)
    parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields, self.analyzer)
    query = MultiFieldQueryParser.parse(parser, query)
    return query
def func_cross(former, mid, last):
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index_tb_new"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    query = former + ' ' + mid + ' ' + last
    fields = ["former", "mid", "last"]
    parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields, analyzer)
    parser.setDefaultOperator(QueryParserBase.AND_OPERATOR)
    query = MultiFieldQueryParser.parse(parser, query)

    scoreDocs = searcher.search(query, 200).scoreDocs
    results = process(scoreDocs, searcher)
    return results
def doc_search(self, field, keywords, numHits):
    if field != 'All':
        analyzer = StandardAnalyzer()
        parser = QueryParser(field, analyzer)
        query = parser.parse(keywords)
    else:
        analyzer = WhitespaceAnalyzer()
        parser = MultiFieldQueryParser(['Title', 'Body'], analyzer)
        query = MultiFieldQueryParser.parse(parser, keywords)

    hits = None
    try:
        collector = TopScoreDocCollector.create(numHits)
        self.lSearcher.search(query, collector)
        hits = collector.topDocs().scoreDocs
    except RuntimeError:
        print "Document scoring failed"

    self.hits = hits
    self.field = field
    return hits
def query(q):
    """
    :param q: query string
    :return: search result, type list, e.g. [{'name', 'path'}...]
    """
    lucene.initVM()
    index_store_dir = current_app.config['INDEX_STORE_DIR']
    directory = SimpleFSDirectory(File(index_store_dir))
    print 'directory', directory
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    parser = MultiFieldQueryParser(Version.LUCENE_CURRENT,
                                   ['name', 'title', 'content'], analyzer)
    parsed_query = MultiFieldQueryParser.parse(parser, q)

    scoreDocs = searcher.search(parsed_query, 50).scoreDocs
    print "%s total matching documents." % len(scoreDocs)

    result = []
    for scoreDoc in scoreDocs:
        doc = searcher.doc(scoreDoc.doc)
        item = dict(date=doc.get('date'),
                    name=doc.get('name'),
                    title=doc.get('title'),
                    summary=doc.get('summary'))
        result.append(item)
    return result
def searchGivenHallmarks(self, query, hallmarksList, hallmarksField,
                         maxReturnLimit):
    # The text query and every hallmark are all required (MUST) clauses.
    qList = [query]
    qList.extend(hallmarksList)
    fList = ["text"]
    fList.extend([hallmarksField] * len(hallmarksList))
    flagList = [BooleanClause.Occur.MUST]
    flagList.extend([BooleanClause.Occur.MUST] * len(hallmarksList))

    qp = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT, qList, fList,
                                     flagList, self.analyzer)
    hits = self.searcher.search(qp, maxReturnLimit)

    result = []
    for hit in hits.scoreDocs:
        record = dict()
        doc = self.searcher.doc(hit.doc)
        record["id"] = doc.get("id")
        record["pos"] = doc.get("pos")
        record["hallmarks"] = doc.get("hallmarks").split()
        # record["hallmarks-exp"] = doc.get("hallmarks-exp").split()
        record["text"] = doc.get("text")
        result.append(record)
    return result
def getIntersectionCount(self, query, countTermString, sfield, cfield):
    # Require both the query (in sfield) and the count term (in cfield),
    # then count the matching documents without collecting them.
    qp = MultiFieldQueryParser.parse(
        Version.LUCENE_CURRENT,
        [query, countTermString],
        [sfield, cfield],
        [BooleanClause.Occur.MUST, BooleanClause.Occur.MUST],
        self.analyzer)
    collector = TotalHitCountCollector()
    self.searcher.search(qp, collector)
    return collector.getTotalHits()
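# Example use of getIntersectionCount (a sketch; the instance name and field
# values are illustrative, not from the original): count documents matching
# a text query that are also tagged with a given hallmark.
n_joint = engine.getIntersectionCount("apoptosis", "proliferation",
                                      "text", "hallmarks")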
def getRelatedArticles(pmcid):
    import tools.retriever as retriever
    location = web.__path__[0] + "/static/web/files/index/index.articles"
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(location)))
    searcher = IndexSearcher(reader)

    # Look up the document for the given pmcid.
    q = MultiFieldQueryParser(Version.LUCENE_4_10_1, ["pmcid"], analyzer)
    q = MultiFieldQueryParser.parse(q, pmcid)
    MAX = 10000
    hits = searcher.search(q, MAX)
    id = hits.scoreDocs[0].doc

    # Build a "more like this" query over several fields, excluding the
    # source document itself.
    result = BooleanQuery()
    result.add(BooleanClause(q, BooleanClause.Occur.MUST_NOT))

    titlemlt = getSimilarityGenerator("title", 1, 1, 2)
    abstractmlt = getSimilarityGenerator("abstract", 2, 5, 2)
    citationmlt = getSimilarityGenerator("citation", 2, 5, 2)
    fulltextmlt = getSimilarityGenerator("fulltext", 2, 5, 2)
    keywordmlt = getSimilarityGenerator("keyword", 1, 1, 1)

    titleQ = titlemlt.like(id)
    titleQ.setBoost(0.2)
    abstractQ = abstractmlt.like(id)
    abstractQ.setBoost(0.1)
    # Do we even want to include a query for similar citations?
    citationQ = citationmlt.like(id)
    citationQ.setBoost(0.0)
    fulltextQ = fulltextmlt.like(id)
    fulltextQ.setBoost(0.0)
    keywordQ = keywordmlt.like(id)
    keywordQ.setBoost(0.0)

    result.add(BooleanClause(titleQ, BooleanClause.Occur.SHOULD))
    result.add(BooleanClause(abstractQ, BooleanClause.Occur.SHOULD))
    result.add(BooleanClause(citationQ, BooleanClause.Occur.SHOULD))
    result.add(BooleanClause(fulltextQ, BooleanClause.Occur.SHOULD))
    result.add(BooleanClause(keywordQ, BooleanClause.Occur.SHOULD))

    hits = searcher.search(result, 5)
    print "Found %d document(s) that matched query '%s':" % (hits.totalHits, result)

    pmcids = []
    documentDict = {}
    for hit in hits.scoreDocs:
        doc = searcher.doc(hit.doc)
        pmcids.append(doc.get("pmcid"))
        docDict = {"title": doc.get("title")}  # any other stored field can be added here
        documentDict[doc.get("pmcid")] = docDict

    # Fetch the images for all the pmcids.
    images = retriever.get_image_pmcid(pmcids, "all")

    # Build a dictionary of images keyed by pmcid.
    imagesDict = {}
    for img in images:
        img_pmcid = img.get("pmcid")
        if img_pmcid in imagesDict:
            imagesDict[img_pmcid].append(img.get("filepath") + "/" + img.get("figureid"))
        else:
            imagesDict[img_pmcid] = [img.get("filepath") + "/" + img.get("figureid")]

    # Assign one image (or a placeholder) to each pmcid for the search results.
    for pmcid in pmcids:
        docDict = documentDict[pmcid]
        if pmcid in imagesDict:
            docDict["imgURL"] = imagesDict[pmcid][0]
        else:
            docDict["imgURL"] = "images/NoImageAvailable.jpg"
        documentDict[pmcid] = docDict

    return documentDict
class Searcher():
    """A simple interface to search articles.

    In this class `MultiFieldQueryParser` and `DuplicateFilter` are used to
    meet our application's needs: the query should apply to multiple fields,
    and duplicates should be avoided.
    """
    def __init__(self,
                 index_dir,
                 search_fields=['canonical_url', 'title', 'meta', 'content'],
                 unique_field='uq_id_str',
                 boost=dict(canonical_url=4.0, title=8.0, meta=2.0,
                            content=1.0),
                 date_format='%Y-%m-%dT%H:%M:%S'):
        """Constructor of Searcher.

        Parameters
        ----------
        index_dir : string
            The location of the lucene index.
        search_fields : list
            A list of field names indicating fields to search on.
        unique_field : string
            The field name on which duplicates should be avoided.
        boost : dict
            This dict controls the per-field weight when computing the score.
        date_format : string
            Used to convert strings into datetimes. Should be consistent with
            the indexing side.
        """
        self.index_dir = index_dir
        self.search_fields = search_fields
        self.sort_by_recent = Sort(
            SortField('date_published', SortField.Type.STRING, True))
        self.store = FSDirectory.open(File(index_dir))
        self.reader = DirectoryReader.open(self.store)
        self.isearcher = IndexSearcher(self.reader)
        self.analyzer = StandardAnalyzer()
        self.dup_filter = DuplicateFilter(unique_field)
        self.boost_map = HashMap()
        for k, v in boost.iteritems():
            self.boost_map.put(k, Float(v))
        self.mul_parser = MultiFieldQueryParser(search_fields, self.analyzer,
                                                self.boost_map)
        self.date_format = date_format

    def prepare_chained_filter(self, dt1, dt2):
        """Return a chained filter: deduplicate, then restrict by date range."""
        return ChainedFilter(
            [self.dup_filter,
             TermRangeFilter('date_published',
                             BytesRef(dt1.strftime(self.date_format)),
                             BytesRef(dt2.strftime(self.date_format)),
                             True, True)],
            [ChainedFilter.AND, ChainedFilter.AND])

    def refresh(self):
        """Refresh the searcher, if the index has changed."""
        nireader = DirectoryReader.openIfChanged(self.reader)
        if nireader:
            self.reader.close()
            self.reader = nireader
            self.isearcher = IndexSearcher(self.reader)
            logger.debug('Index file changed, refreshed')
        else:
            logger.debug('Index file did not change.')

    def fetch_one_doc(self, score_doc):
        """Fetch one document from the scored doc results."""
        doc = self.isearcher.doc(score_doc.doc)
        return (
            doc.getField("group_id").numericValue().intValue(),
            doc.get("canonical_url"),
            doc.get("title"),
            doc.get("date_published"),
            doc.get("domain"),
            doc.get("site_type"),
            score_doc.score,
        )

    def search(self, query, n1=100, n2=100000, sort_by='relevant',
               use_lucene_syntax=False, min_score_of_recent_sorting=0.4,
               min_date_published=None):
        """Return the matched articles from lucene.

        Parameters
        ----------
        query : string
            The query string.
        n1 : int
            How many results are finally returned.
        n2 : int
            How many search results are returned when sorting by recent.
        sort_by : string
            {'relevant', 'recent'}, the sorting order for lucene searching.
        min_score_of_recent_sorting : float
            The min score when sorting by 'recent'.
        min_date_published : datetime
            The min date_published when filtering lucene searching results.

        Returns
        -------
        tuple
            (total_hits, df), where total_hits is the total number of hits
            and df is a pandas.DataFrame with columns ['id', 'canonical_url',
            'title', 'date_published', 'domain', 'site_type', 'score'].
        """
        if min_date_published is not None:
            dt2 = datetime.utcnow()
            if isinstance(min_date_published, datetime):
                dt1 = min_date_published
            elif isinstance(min_date_published, basestring):
                dt1 = utc_from_str(min_date_published)
            sf = self.prepare_chained_filter(dt1, dt2)
        else:
            sf = self.dup_filter
        try:
            if use_lucene_syntax is False:
                query = clean_query(query)
            q = self.mul_parser.parse(self.mul_parser, query)
            logger.debug('Parsed query: %s', q)
        except Exception as e:
            logger.error(e)
            if use_lucene_syntax is True:
                raise APIParseError("""Error when parsing the query string! \
You are querying with lucene syntax, be careful with your query string!""")
            else:
                raise APIParseError('Error when parsing the query string!')
        cnames = ['id', 'canonical_url', 'title', 'date_published', 'domain',
                  'site_type', 'score']
        if sort_by == 'relevant':
            top_docs = self.isearcher.search(q, sf, n1)
            score_docs = top_docs.scoreDocs
            total_hits = top_docs.totalHits
            if total_hits == 0:
                df = pd.DataFrame()
            else:
                records = [self.fetch_one_doc(sd) for sd in score_docs]
                df = pd.DataFrame(records, columns=cnames)
                df['date_published'] = pd.to_datetime(df['date_published'])
            return total_hits, df
        elif sort_by == 'recent':
            counter = 0
            records = []
            top_field_docs = self.isearcher.search(q, sf, n2,
                                                   self.sort_by_recent,
                                                   True, True)
            if top_field_docs.maxScore >= min_score_of_recent_sorting:
                for sd in top_field_docs.scoreDocs:
                    if sd.score >= min_score_of_recent_sorting:
                        records.append(self.fetch_one_doc(sd))
                        counter += 1
                        if counter == n1:
                            break
            if counter == 0:
                df = pd.DataFrame()
            else:
                df = pd.DataFrame(records, columns=cnames)
                df['date_published'] = pd.to_datetime(df['date_published'])
            return counter, df
index = SimpleFSDirectory(indexDir)
reader = IndexReader.open(index)
n_docs = reader.numDocs()
print("Index contains %d documents." % n_docs)

# 2. parse the query from the command line
a = {"typed_method_call": KeywordAnalyzer(), "extends": KeywordAnalyzer()}
wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a)
query_string = "lucene get similar documents to the current one"
query_parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, ["title"],
                                     wrapper_analyzer)
# base_query = getSpanNearQuery(analyzer, query_string)
base_query = query_parser.parse(query_string)

# Boost documents by view count:
# http://shaierera.blogspot.com/2013/09/boosting-documents-in-lucene.html
boost_query = FunctionQuery(LongFieldSource("view_count"))
query = CustomScoreQuery(base_query, boost_query)

# 3. search the index for the query
# We retrieve and sort all documents that match the query.
# In a real application, use a TopScoreDocCollector to sort the hits.
searcher = IndexSearcher(reader)
hits = searcher.search(query, 10).scoreDocs

# 4. display results
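# The original snippet ends at the "display results" comment. A sketch of
# that step (the "title" field is the one named above; the output format is
# an assumption):
for hit in hits:
    doc = searcher.doc(hit.doc)
    print("%.3f  %s" % (hit.score, doc.get("title")))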
class Searcher():
    """A simple interface to search articles.

    In this class `MultiFieldQueryParser` and `DuplicateFilter` are used to
    meet our application's needs: the query should apply to multiple fields,
    and duplicates should be avoided.
    """
    def __init__(self,
                 index_dir,
                 search_fields=['canonical_url', 'title', 'meta', 'content'],
                 unique_field='uq_id_str',
                 boost=dict(canonical_url=4.0, title=8.0, meta=2.0,
                            content=1.0),
                 date_format='%Y-%m-%dT%H:%M:%S'):
        """Constructor of Searcher.

        Parameters
        ----------
        index_dir : string
            The location of the lucene index.
        search_fields : list
            A list of field names indicating fields to search on.
        unique_field : string
            The field name on which duplicates should be avoided.
        boost : dict
            This dict controls the per-field weight when computing the score.
        date_format : string
            Used to convert strings into datetimes. Should be consistent with
            the indexing side.
        """
        self.index_dir = index_dir
        self.search_fields = search_fields
        self.sort_by_recent = Sort(
            SortField('date_published', SortField.Type.STRING, True))
        self.store = FSDirectory.open(Paths.get(index_dir))
        self.reader = DirectoryReader.open(self.store)
        self.isearcher = IndexSearcher(self.reader)
        self.analyzer = StandardAnalyzer()
        self.boost_map = HashMap()
        for k, v in boost.items():
            self.boost_map.put(k, Float(v))
        self.mul_parser = MultiFieldQueryParser(search_fields, self.analyzer,
                                                self.boost_map)
        self.date_format = date_format

    def query_between_dates(self, dt1, dt2, original_query=None):
        '''Return a query that only allows records between dt1 and dt2.'''
        return TermRangeQuery(
            'date_published',                           # field
            BytesRef(dt1.strftime(self.date_format)),   # lower bound
            BytesRef(dt2.strftime(self.date_format)),   # upper bound
            True,                                       # include lower bound
            True)                                       # include upper bound

    def refresh(self):
        """Refresh the searcher, if the index has changed."""
        nireader = DirectoryReader.openIfChanged(self.reader)
        if nireader:
            self.reader.close()
            self.reader = nireader
            self.isearcher = IndexSearcher(self.reader)
            logger.debug('Index file changed, refreshed')
        else:
            logger.debug('Index file did not change.')

    def fetch_one_doc(self, score_doc):
        """Fetch one document from the scored doc results."""
        doc = self.isearcher.doc(score_doc.doc)
        return (
            doc.getField("group_id").numericValue().intValue(),
            doc.get("canonical_url"),
            doc.get("title"),
            doc.get("date_published"),
            doc.get("domain"),
            doc.get("site_type"),
            score_doc.score,
        )

    def search(self, query, n1=100, n2=100000, sort_by='relevant',
               use_lucene_syntax=False, min_score_of_recent_sorting=0.4,
               min_date_published=None):
        """Return the matched articles from lucene.

        Parameters
        ----------
        query : string
            The query string.
        n1 : int
            How many results are finally returned.
        n2 : int
            How many search results are returned when sorting by recent.
        sort_by : string
            {'relevant', 'recent'}, the sorting order for lucene searching.
        min_score_of_recent_sorting : float
            The min score when sorting by 'recent'.
        min_date_published : datetime
            The min date_published when filtering lucene searching results.

        Returns
        -------
        tuple
            (total_hits, df), where total_hits is the total number of hits
            and df is a pandas.DataFrame with columns ['id', 'canonical_url',
            'title', 'date_published', 'domain', 'site_type', 'score'].
        """
        if min_date_published is not None:
            dt2 = datetime.utcnow()
            if isinstance(min_date_published, datetime):
                dt1 = min_date_published
            elif isinstance(min_date_published, str):
                dt1 = utc_from_str(min_date_published)
            q_dates = self.query_between_dates(dt1, dt2)
        try:
            if use_lucene_syntax is False:
                query = clean_query(query)
            q = self.mul_parser.parse(self.mul_parser, query)
            logger.warning(q)
            if 'date_published:' in query:
                # Strip the date_published clause from the main query ...
                end = query.find('AND date_published')
                q_without_date_published = query[:end]
                logger.warning(q_without_date_published)
                q = self.mul_parser.parse(self.mul_parser,
                                          q_without_date_published)
                # ... then parse the date range and add it back as a
                # TermRangeQuery.
                date_published_splits = query.split('date_published:[')
                date_range = date_published_splits[len(date_published_splits) - 1]
                date_range = date_range[:-1]  # drop the closing ']'
                logger.warning(date_range)
                if 'TO' in date_range:
                    date_range_splits = date_range.split('TO')
                    dt1_string = date_range_splits[0]
                    # Handle a trailing wildcard, if present.
                    if '*' in dt1_string:
                        date1_end = dt1_string.find('*') - 1
                        dt1_string = dt1_string[:date1_end]
                    logger.warning(dt1_string)
                    dt1 = utc_from_str(dt1_string)
                    dt2_string = date_range_splits[1]
                    if '*' in dt2_string:
                        date2_end = dt2_string.find('*') - 1
                        dt2_string = dt2_string[:date2_end]
                    logger.warning(dt2_string)
                    dt2 = utc_from_str(dt2_string)
                    query_dates = self.query_between_dates(dt1, dt2)
                    q = combine_queries(q, query_dates)
            if min_date_published is not None:
                q = combine_queries(q, q_dates)
            logger.warning('Parsed query: %s', q)
        except Exception as e:
            logger.error(e)
            if use_lucene_syntax is True:
                raise APIParseError("""Error when parsing the query string! \
You are querying with lucene syntax, be careful with your query string!""")
            else:
                raise APIParseError('Error when parsing the query string!')
        cnames = ['id', 'canonical_url', 'title', 'date_published', 'domain',
                  'site_type', 'score']
        if sort_by == 'relevant':
            top_docs = self.isearcher.search(q, n1)
            score_docs = top_docs.scoreDocs
            total_hits = top_docs.totalHits
            if total_hits == 0:
                df = pd.DataFrame()
            else:
                records = [self.fetch_one_doc(sd) for sd in score_docs]
                # Deduplicate on (canonical_url, title) while preserving order.
                canonical_url, title = 1, 2
                unique_docs = dict()
                for record in records:
                    key = (record[canonical_url], record[title])
                    if key not in unique_docs:
                        unique_docs[key] = record
                records = list(unique_docs.values())
                df = pd.DataFrame(records, columns=cnames)
                df['date_published'] = pd.to_datetime(df['date_published'])
            return total_hits, df
        elif sort_by == 'recent':
            counter = 0
            records = []
            top_field_docs = self.isearcher.search(q, n2, self.sort_by_recent,
                                                   True, True)
            if top_field_docs.maxScore >= min_score_of_recent_sorting:
                for sd in top_field_docs.scoreDocs:
                    if sd.score >= min_score_of_recent_sorting:
                        records.append(self.fetch_one_doc(sd))
                        counter += 1
                        if counter == n1:
                            break
            if counter == 0:
                df = pd.DataFrame()
            else:
                df = pd.DataFrame(records, columns=cnames)
                df['date_published'] = pd.to_datetime(df['date_published'])
            return counter, df
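# combine_queries is referenced above but not defined here. A minimal sketch
# using the BooleanQuery.Builder pattern seen earlier in this section (both
# sub-queries must match):
def combine_queries(q1, q2):
    builder = BooleanQuery.Builder()
    builder.add(q1, BooleanClause.Occur.MUST)
    builder.add(q2, BooleanClause.Occur.MUST)
    return builder.build()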
def search(self, command, num, use_clf):
    print("log1", command, num, use_clf)
    self.vm.attachCurrentThread()
    searcher = self.searcher
    print("command", command)
    if not self.reT.search(command):
        if use_clf:
            print("sentence fed to the classifier", command)
            probs = self.classifier.classify(command)
            command = self.text.seg(command)
            command = self.text.remove_stop_word(command)
            # Rank fields by classifier probability and keep the likely ones.
            key = sorted(range(len(self.keys)), key=lambda i: probs[i],
                         reverse=True)
            key_use = [key[0]]
            for i in key[1:]:
                if probs[i] > 0.3 or probs[i] - probs[key[0]] > -0.1:
                    key_use.append(i)
            # Build a field-restricted query string such as
            # "Title:(...) OR PubDate:(...)".
            command_final = self.keys[key_use[0]] + ":(" + command + ")"
            for i in key_use[1:]:
                command_final = "%s OR %s:(%s)" % (command_final,
                                                   self.keys[i], command)
            command = command_final
            print(command)
            query = QueryParser("PubDate", WhitespaceAnalyzer()).parse(command)
            scoreDocs = searcher.search(query, num).scoreDocs
            results = []
            for scoreDoc in scoreDocs:
                doc = searcher.doc(scoreDoc.doc)
                result = dict()
                for i in self.keys:
                    result[i] = doc.get(i)
                result['id'] = doc.get('id')
                results.append(result)
            probs = " ".join("%s:%2f" % (key, prob)
                             for key, prob in zip(self.keys, probs))
            key_use = " ".join(self.keys[i] for i in key_use)
            return results, probs, key_use
        else:
            command = self.text.seg(command)
            command = self.text.remove_stop_word(command)
            # Search every field; any of them may match.
            fields = self.keys
            flags = [BooleanClause.Occur.SHOULD] * len(fields)
            query = MultiFieldQueryParser.parse(command, fields, flags,
                                                WhitespaceAnalyzer())
            print(query)
            scoreDocs = searcher.search(query, num).scoreDocs
            results = []
            for scoreDoc in scoreDocs:
                doc = searcher.doc(scoreDoc.doc)
                result = dict()
                for i in self.keys:
                    result[i] = doc.get(i)
                result['id'] = doc.get('id')
                results.append(result)
            return results, [None] * len(self.keys), self.keys
    else:
        # The command already contains field:value patterns; extract them
        # and build the query string directly.
        print('command', command)
        ps = self.reT.findall(command)
        print(ps)
        rem = self.reT.sub(' ', command)  # text left over after the patterns
        print(rem)
        q_t = []
        key_use = []
        for i in ps:
            f = i[1]
            data = i[4]
            rela = i[5]
            key_use.append(f)
            q_t.append(f)
            q_t.append(':')
            seg_t = self.text.seg(data)
            seg_t = self.text.remove_stop_word(seg_t)
            dash_t = self.text.replace_white_space_with_dash(seg_t)
            q_t.append(dash_t)
            if rela:
                q_t.append(" %s " % rela)
        print('extracted pattern', q_t)
        q_f = "".join(q_t)
        print("final q", q_f)
        query = QueryParser("PubDate", SimpleAnalyzer()).parse(q_f)
        print("query", query)
        scoreDocs = searcher.search(query, num).scoreDocs
        results = []
        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            result = dict()
            for i in self.keys:
                result[i] = doc.get(i)
            result['id'] = doc.get('id')
            results.append(result)
        return results, [None] * len(key_use), key_use