def retrieve(indexdir, queries):
    """Run each query against the Lucene index and write TREC-format results.

    Parameters
    ----------
    indexdir : str
        Path to the Lucene index directory.
    queries : dict
        Mapping of query id -> query string.

    Writes one line per hit to ``results_lucene.txt`` in the form
    ``<qid> Q0 <docno> <rank> <score> G17R3``.
    """
    lucene.initVM()
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    reader = IndexReader.open(SimpleFSDirectory(File(indexdir)))
    searcher = IndexSearcher(reader)
    fields = ["title", "abstract", "authors"]
    st = PorterStemmer()
    # Hoisted out of the loop: the tokenizer is query-independent.
    tokenizer = RegexpTokenizer(r'\w+')
    MAX = 1000
    # "with" guarantees the results file is closed even if a query raises
    # (the original leaked the handle on any exception).
    with open("results_lucene.txt", "w") as f:
        for id, q in queries.iteritems():
            # Stem the query terms so they match the stemmed index terms.
            qwords = tokenizer.tokenize(q)
            query = " ".join(st.stem(w) for w in qwords)
            parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields, analyzer)
            parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)
            query = MultiFieldQueryParser.parse(parser, query)
            hits = searcher.search(query, MAX)
            for i, hit in enumerate(hits.scoreDocs):
                # Lucene doc ids are 0-based; the run file numbers from 1.
                f.write("%s Q0 %s %s %s G17R3\n" % (id, hit.doc + 1, i + 1, hit.score))
def search(searcher, analyzer, directory, query2): print print "Empty to quit." # command = raw_input("Query: ") #raw_input for query command = query2 if command == '': loopVar = False return print print "Searching for ", command parserVar = MultiFieldQueryParser(fields, analyzer) parserVar.setDefaultOperator(QueryParserBase.AND_OPERATOR) query = MultiFieldQueryParser.parse(parserVar, command) scoreDocs = searcher.search( query, 10).scoreDocs #number is max number of matching documents print "total matching documents in: " + str((len(scoreDocs))) counter = 0 for scoreDoc in scoreDocs: #dont really know what this is either doc = searcher.doc(scoreDoc.doc) print "@" + doc.get("u_name") + ": " + doc.get( "tweet") + " Score:" + str(scoreDocs[counter].score) docData = {} docData['u_name'] = doc.get("u_name") docData['tweet'] = doc.get("tweet") docData['score'] = str(scoreDocs[counter].score) results.append(docData) counter = counter + 1 print print "\n------------------------------------------------------" return results
def scents_search(former, mid, last):
    """Build a Lucene query over the three scent-note fields.

    former/mid/last : str (or iterable of str fragments) for the
    former/mid/last scent fields. Requires the module-level `analyzer`.
    Returns the parsed Query (AND as the default operator).
    """
    query = ''.join(former) + ' ' + ''.join(mid) + ' ' + ''.join(last)
    fields = ["former_scents", "mid_scents", "last_scents"]
    # NOTE: the original built a BooleanClause `clauses` list that was
    # never used; it has been removed.
    parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields, analyzer)
    parser.setDefaultOperator(QueryParserBase.AND_OPERATOR)
    return MultiFieldQueryParser.parse(parser, query)
def brand_scent_search(brand, scent):
    """Build a Lucene query over the brand name and scents fields.

    brand : str brand name. scent : str (or iterable of str fragments).
    Requires the module-level `analyzer`. Returns the parsed Query
    (AND as the default operator).
    """
    # BUG FIX: the original referenced the undefined name `scents`
    # (NameError at call time); the parameter is `scent`.
    query = brand + ' ' + ''.join(scent)
    fields = ["name", "scents"]
    # NOTE: the original's unused `clauses` list (which also had three
    # entries for two fields) has been removed.
    parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields, analyzer)
    parser.setDefaultOperator(QueryParserBase.AND_OPERATOR)
    return MultiFieldQueryParser.parse(parser, query)
def multiFieldsSearch(self, query, sim):
    """Search content_section, title_section and title_article for *query*.

    sim : the Lucene Similarity used for ranking.
    Returns the top 6 scoreDocs.
    """
    lucene.getVMEnv().attachCurrentThread()
    search_fields = ["content_section", "title_section", "title_article"]
    mf_parser = MultiFieldQueryParser(search_fields, self.analyzer)
    mf_parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)
    # Escape user input so Lucene syntax characters are matched literally.
    escaped = QueryParser.escape(query)
    parsed = MultiFieldQueryParser.parse(mf_parser, escaped)
    self.searcher.setSimilarity(sim)
    return self.searcher.search(parsed, 6).scoreDocs
def explain(self, query, fields, doc):
    """Return the searcher's score explanation for *doc* against *query*.

    fields : list of field names to parse the query over.
    Lazily opens the searcher if it is not open yet.
    """
    if not self.searcher:
        self.open_searcher()
    escaped = QueryParser.escape(query)
    field_parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields,
                                         self.analyzer)
    parsed = MultiFieldQueryParser.parse(field_parser, escaped)
    return self.searcher.explain(parsed, doc)
def parse_query(self, query_string, order_matters=True):
    """Build ``self.query`` for *query_string*, boosted by view_count.

    order_matters=True builds a span-near query that takes term order
    into account; otherwise the terms are treated as a bag of words.
    """
    mf_parser = MultiFieldQueryParser(Version.LUCENE_CURRENT,
                                      ["title", "qbody"], self.analyzer)
    if order_matters:
        # Take into account order of query terms
        base_query = getSpanNearQuery(self.analyzer, query_string)
    else:
        # Considers query keywords as bag of words
        base_query = mf_parser.parse(query_string)
    # Boost by document popularity:
    # http://shaierera.blogspot.com/2013/09/boosting-documents-in-lucene.html
    boost_query = FunctionQuery(LongFieldSource("view_count"))
    self.query = CustomScoreQuery(base_query, boost_query)
def search(self, query):
    """Search the tweet index for *query*, restricted to date "May 25".

    Opens the index under ``<cwd>/index/``, runs the query over the
    text / page title / date fields, and returns a list of dicts with keys
    score/username/tweet_body/date for the top 10 hits, printing each one.
    """
    lucene.initVM()
    luceneDirectory = "/index/"
    path = str(os.path.abspath(os.getcwd()) + luceneDirectory)
    directory = FSDirectory.open(Paths.get(path))
    reader = DirectoryReader.open(directory)
    searcher = IndexSearcher(reader)
    analyzer = StandardAnalyzer()

    print("Searching for '" + query + "'")

    fields_to_search = ["text", "page title", "date"]
    filter_date = 'date:"May 25"'
    # BUG FIX: the original concatenated filter_date + "AND " with no
    # separating space, producing 'date:"May 25"AND ...' which the query
    # parser mis-reads as a single token run.
    filtered_query = filter_date + " AND " + query

    parser = MultiFieldQueryParser(fields_to_search, analyzer)
    updated_query = MultiFieldQueryParser.parse(parser, filtered_query)
    # Top 10 matching documents.
    scored_documents = searcher.search(updated_query, 10).scoreDocs

    print("Found " + str((len(scored_documents))) + " matches in the collection.")

    results = []
    for doc in scored_documents:
        scoredTweet = dict()
        scoredTweet['score'] = doc.score
        result = searcher.doc(doc.doc)
        scoredTweet['username'] = result.get("username")
        scoredTweet['tweet_body'] = result.get("text")
        scoredTweet['date'] = result.get("date")
        results.append(scoredTweet)
        print(scoredTweet)
    return results
def multiFieldsSearch(self, query, sim):
    """
    Method that searches through documents using content_section and
    title_article Fields

    searchDir : the path to the folder that contains the index.
    """
    # Attach this thread to the running JVM before touching Lucene.
    lucene.getVMEnv().attachCurrentThread()
    mf_parser = MultiFieldQueryParser(["content_section", "title_article"],
                                      self.analyzer)
    mf_parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)
    parsed = MultiFieldQueryParser.parse(mf_parser, QueryParser.escape(query))
    self.searcher.setSimilarity(sim)
    return self.searcher.search(parsed, 6).scoreDocs
def preprocess_query(self, query, fields, mode="ANY"): ''' Fix query according to provided mode. If the value is not supported, the query remains unchanged ''' terms = query.lower().strip().split() if mode == "ANY": query = " OR ".join(terms) elif mode == "ALL": query = " AND ".join(terms) else: print "Invalid mode parameter '%s'." % mode query = QueryParser.escape(query) parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields, self.analyzer) query = MultiFieldQueryParser.parse(parser, query) return query
def search(self, terms, n_hits=5):
    """ Run search query. """
    # TODO: support date range queries
    # Build the query over both the path and body fields; the default
    # operator stays OR unless the terms carry explicit modifiers.
    mf_parser = MultiFieldQueryParser(['fullpath', 'body'], self.analyzer)
    # https://stackoverflow.com/a/26853987/130164
    parsed = MultiFieldQueryParser.parse(mf_parser, terms)
    # Highlight matched fragments between '*' markers.
    marker = Highlighter(SimpleHTMLFormatter('*', '*'), QueryScorer(parsed))
    # Execute the search for the top N hits and post-process each result.
    processed = []
    for hit in self.searcher.search(parsed, n_hits).scoreDocs:
        processed.append(self._process_search_result(hit, marker))
    return processed
def func_cross(former, mid, last):
    """Search the index_tb_new index across the former/mid/last fields.

    Each argument is a query fragment; all three fields are searched with
    AND as the default operator. Returns the output of process() over the
    top 200 hits.
    """
    vm_env = lucene.getVMEnv()
    vm_env.attachCurrentThread()
    STORE_DIR = "index_tb_new"
    directory = SimpleFSDirectory(File(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    # Join the three fragments with single spaces (the original inserted a
    # doubled space, which the query parser tokenizer ignores anyway).
    query = ' '.join((former, mid, last))
    fields = ["former", "mid", "last"]
    # NOTE: the original built a BooleanClause `clauses` list that was
    # never used; it has been removed.
    parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields, analyzer)
    parser.setDefaultOperator(QueryParserBase.AND_OPERATOR)
    query = MultiFieldQueryParser.parse(parser, query)
    scoreDocs = searcher.search(query, 200).scoreDocs
    return process(scoreDocs, searcher)
def __init__(self, index_dir, search_fields=None, unique_field='uq_id_str',
             boost=None, date_format='%Y-%m-%dT%H:%M:%S'):
    """Constructor of Searcher.

    Parameters
    ----------
    index_dir : string
        The location of lucene index.
    search_fields : list
        A list of field names indicating fields to search on. Defaults to
        ['canonical_url', 'title', 'meta', 'content'].
    unique_field : string
        The field name, on which the duplication should avoid.
    boost : dict
        This dict control the weight when computing score. Defaults to
        dict(canonical_url=4.0, title=8.0, meta=2.0, content=1.0).
    date_format : string
        Convert the string into datetime. Should consistent with the index
        part.
    """
    # BUG FIX: the original used mutable default arguments (a list and a
    # dict), which are shared across all calls; use None sentinels instead.
    if search_fields is None:
        search_fields = ['canonical_url', 'title', 'meta', 'content']
    if boost is None:
        boost = dict(canonical_url=4.0, title=8.0, meta=2.0, content=1.0)
    self.index_dir = index_dir
    self.search_fields = search_fields
    # Sort newest-first on the string-typed publication date.
    self.sort_by_recent = Sort(
        SortField('date_published', SortField.Type.STRING, True))
    self.store = FSDirectory.open(File(index_dir))
    self.reader = DirectoryReader.open(self.store)
    self.isearcher = IndexSearcher(self.reader)
    self.analyzer = StandardAnalyzer()
    self.dup_filter = DuplicateFilter(unique_field)
    # Copy the per-field boosts into a Java HashMap for the parser.
    self.boost_map = HashMap()
    for k, v in boost.iteritems():
        self.boost_map.put(k, Float(v))
    self.mul_parser = MultiFieldQueryParser(search_fields, self.analyzer,
                                            self.boost_map)
    self.date_format = date_format
def multiFieldsPairSearch(self, pair, sim):
    """
    Method that searches for a (title, content) pair: the title clause
    must match (filter), the content clause contributes to the score.

    searchDir : the path to the folder that contains the index.
    """
    # Underscores in the title are treated as spaces.
    heading = pair[0].replace('_', ' ')
    body = pair[1]
    mf_parser = MultiFieldQueryParser(["content_section", "title_article"],
                                      self.analyzer)
    mf_parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)
    title_query = MultiFieldQueryParser.parse(mf_parser,
                                              QueryParser.escape(heading))
    content_query = MultiFieldQueryParser.parse(mf_parser,
                                                QueryParser.escape(body))
    # FILTER: must match but does not affect the score;
    # SHOULD: optional match that does affect the score.
    builder = BooleanQuery.Builder()
    builder.add(title_query, BooleanClause.Occur.FILTER)
    builder.add(content_query, BooleanClause.Occur.SHOULD)
    self.searcher.setSimilarity(sim)
    return self.searcher.search(builder.build(), 6).scoreDocs
def doc_search(self, field, keywords, numHits): if field != 'All': analyzer = StandardAnalyzer() parser = QueryParser(field, analyzer) query = parser.parse(keywords) # self.lReader.getDocCount("title"); try: collector = TopScoreDocCollector.create(numHits) self.lSearcher.search(query, collector) hits = collector.topDocs().scoreDocs except RuntimeError: print "Score docoment run fail" self.hits = hits self.field = field return hits else: analyzer = WhitespaceAnalyzer() parser = MultiFieldQueryParser(['Title', 'Body'], analyzer) query = MultiFieldQueryParser.parse(parser, keywords) # self.lReader.getDocCount("title"); try: collector = TopScoreDocCollector.create(numHits) self.lSearcher.search(query, collector) hits = collector.topDocs().scoreDocs except RuntimeError: print "Score docoment run fail" self.hits = hits self.field = field return hits self.hits = hits self.field = field return hits
indexDir = File("/tmp/github") # 1. open the index analyzer = KeywordAnalyzer() index = SimpleFSDirectory(indexDir) reader = IndexReader.open(index) n_docs = reader.numDocs() print("Index contains %d documents." % n_docs) # 2. parse the query from the command line # a = {"typed_method_call": WhitespaceAnalyzer()} # wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a) query_string = "HttpURLConnection.disconnect Exception.printStackTrace BufferedReader.close HttpURLConnection.setRequestProperty HttpURLConnection.setRequestMethod DataOutputStream.writeBytes HttpURLConnection.getInputStream DataOutputStream.close HttpURLConnection.setUseCaches StringBuffer.append URL.openConnection HttpURLConnection.getOutputStream Integer.toString String.getBytes StringBuffer.toString HttpURLConnection.setDoOutput BufferedReader.readLine DataOutputStream.flush HttpURLConnection.setDoInput" query_parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, ["typed_method_call"], analyzer) #base_query = getSpanNearQuery(analyzer, query_string) base_query = query_parser.parse(query_string) #http://shaierera.blogspot.com/2013/09/boosting-documents-in-lucene.html # boost_query = FunctionQuery( LongFieldSource("view_count")) #query = CustomScoreQuery(base_query, boost_query) # queryparser = QueryParser(Version.LUCENE_CURRENT, "title", analyzer) # query = queryparser.parse(query_string) # 3. search the index for the query # We retrieve and sort all documents that match the query. # In a real application, use a TopScoreDocCollector to sort the hits.
indexDir = File("/tmp/stackoverflow") # 1. open the index analyzer = PorterAnalyzer(StandardAnalyzer(Version.LUCENE_CURRENT)) index = SimpleFSDirectory(indexDir) reader = IndexReader.open(index) n_docs = reader.numDocs() print("Index contains %d documents." % n_docs) # 2. parse the query from the command line a = {"typed_method_call": KeywordAnalyzer(), "extends": KeywordAnalyzer()} wrapper_analyzer = PerFieldAnalyzerWrapper(analyzer, a) query_string = "lucene get similar documents to the current one" query_parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, ["title"], wrapper_analyzer) #base_query = getSpanNearQuery(analyzer, query_string) base_query = query_parser.parse(query_string) #http://shaierera.blogspot.com/2013/09/boosting-documents-in-lucene.html boost_query = FunctionQuery(LongFieldSource("view_count")) query = CustomScoreQuery(base_query, boost_query) # queryparser = QueryParser(Version.LUCENE_CURRENT, "title", analyzer) # query = queryparser.parse(query_string) # 3. search the index for the query # We retrieve and sort all documents that match the query. # In a real application, use a TopScoreDocCollector to sort the hits.