def retrieve(indexdir, queries):
    """Run every query against a Lucene index and write the ranked hits to a file.

    Each query text is split into runs of word characters, Porter-stemmed,
    and parsed as an OR query over the ``title``, ``abstract`` and
    ``authors`` fields.  Up to 1000 hits per query are written to
    ``results_lucene.txt`` in the current working directory (the file is
    overwritten on every call), one line per hit::

        <query id> Q0 <doc number + 1> <rank> <score> G17R3

    The line layout appears to follow the TREC run-file convention, with
    ``G17R3`` as the run tag -- confirm against the evaluation tooling.

    Args:
        indexdir: Path of the directory that holds the Lucene index.
        queries: Mapping of query id to raw query text.
    """
    lucene.initVM()

    max_hits = 1000
    fields = ["title", "abstract", "authors"]
    analyzer = StandardAnalyzer(Version.LUCENE_4_10_1)
    stemmer = PorterStemmer()
    tokenizer = RegexpTokenizer(r'\w+')

    # ``with`` / ``finally`` make sure the output file and the index reader
    # are released even when parsing or searching raises part-way through.
    with open("results_lucene.txt", "w") as out:
        reader = IndexReader.open(SimpleFSDirectory(File(indexdir)))
        try:
            searcher = IndexSearcher(reader)

            # The parser does not depend on the query, so build it once.
            parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields,
                                           analyzer)
            parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)

            for query_id, text in queries.items():
                stems = [stemmer.stem(word)
                         for word in tokenizer.tokenize(text)]
                # ``parse`` is called through the class with the parser
                # passed explicitly, as in the rest of this file.
                query = MultiFieldQueryParser.parse(parser, " ".join(stems))
                hits = searcher.search(query, max_hits)

                # Ranks are 1-based.  Lucene document numbers start at 0 and
                # are shifted by one here -- presumably to match 1-based ids
                # in the collection; verify against the indexer.
                for rank, hit in enumerate(hits.scoreDocs, 1):
                    out.write("%s Q0 %s %s %s G17R3\n"
                              % (query_id, hit.doc + 1, rank, hit.score))
        finally:
            reader.close()
def search(searcher, analyzer, directory, query2):
    """Run one AND query and collect the matching tweets.

    The query is parsed over the field list held in ``fields`` and each hit
    is appended, as a dict with the keys ``u_name``, ``tweet`` and ``score``
    (score as a string), to the ``results`` list.  Both ``fields`` and
    ``results`` are defined outside this function; because ``results`` is
    not reset here, hits accumulate across calls unless the caller clears it.

    Args:
        searcher: Index searcher used to run the query and load documents.
        analyzer: Analyzer handed to the query parser.
        directory: Not used by this function.
        query2: Query text; an empty string means "nothing to search".

    Returns:
        The shared ``results`` list, or ``None`` when ``query2`` is empty.
    """
    print("")
    print("Empty to quit.")
    command = query2
    if command == '':
        return None

    print("")
    # Two spaces on purpose: reproduces the output of the former
    # ``print "Searching for ", command`` statement.
    print("Searching for  %s" % (command,))

    parser = MultiFieldQueryParser(fields, analyzer)
    parser.setDefaultOperator(QueryParserBase.AND_OPERATOR)
    query = MultiFieldQueryParser.parse(parser, command)

    # 10 is the maximum number of matching documents requested.
    score_docs = searcher.search(query, 10).scoreDocs
    print("total matching documents in: " + str(len(score_docs)))

    for score_doc in score_docs:
        doc = searcher.doc(score_doc.doc)
        user = doc.get("u_name")
        tweet = doc.get("tweet")
        score = str(score_doc.score)
        print("@" + user + ": " + tweet + " Score:" + score)
        results.append({'u_name': user, 'tweet': tweet, 'score': score})

    print("")
    print("\n------------------------------------------------------")
    return results
def brand_scent_search(brand, scent):
    """Build an AND query for a brand and scent text over ``name`` and ``scents``.

    The query text is ``brand``, a space, then ``scent``.  It is parsed with
    a multi-field parser that uses an ``analyzer`` defined outside this
    function, so every term must match in one of the two fields.

    Args:
        brand: Brand text.
        scent: Scent text.  ``''.join`` leaves a plain string unchanged.
            NOTE(review): a list of several scents would be fused together
            without separators -- confirm callers pass a single string.

    Returns:
        The parsed Lucene query.
    """
    # The body used to read an undefined name ``scents``; the parameter is
    # ``scent``.
    text = brand + ' ' + ''.join(scent)
    fields = ["name", "scents"]

    parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields, analyzer)
    parser.setDefaultOperator(QueryParserBase.AND_OPERATOR)
    return MultiFieldQueryParser.parse(parser, text)
def scents_search(former, mid, last):
    """Build an AND query over the three scent-note fields.

    The three arguments are joined with single spaces into one query text
    and parsed against ``former_scents``, ``mid_scents`` and ``last_scents``
    using an ``analyzer`` defined outside this function.

    Args:
        former: Text for the first group of scents.
        mid: Text for the middle group of scents.
        last: Text for the last group of scents.
            ``''.join`` leaves a plain string unchanged.  NOTE(review): a
            list of several words would be fused without separators --
            confirm callers pass strings.

    Returns:
        The parsed Lucene query.
    """
    text = ' '.join([''.join(former), ''.join(mid), ''.join(last)])
    fields = ["former_scents", "mid_scents", "last_scents"]

    parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields, analyzer)
    parser.setDefaultOperator(QueryParserBase.AND_OPERATOR)
    return MultiFieldQueryParser.parse(parser, text)
def multiFieldsSearch(self, query, sim):
    """Search section content, section titles and article titles.

    The raw query text is escaped, parsed as an OR query over the
    ``content_section``, ``title_section`` and ``title_article`` fields and
    run with the given similarity.

    Args:
        query: Raw query text.
        sim: Similarity installed on ``self.searcher`` before searching.

    Returns:
        The ``scoreDocs`` of the search, limited to 6 hits.
    """
    # Attach the calling thread to the JVM before using any Lucene object.
    lucene.getVMEnv().attachCurrentThread()

    search_fields = ["content_section", "title_section", 'title_article']
    multi_parser = MultiFieldQueryParser(search_fields, self.analyzer)
    multi_parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)

    escaped_text = QueryParser.escape(query)
    parsed_query = MultiFieldQueryParser.parse(multi_parser, escaped_text)

    self.searcher.setSimilarity(sim)
    top_docs = self.searcher.search(parsed_query, 6)
    return top_docs.scoreDocs
def multiFieldsSearch(self, query, sim):
    """
    Method that searches through documents using the content_section and
    title_article Fields.

    query : raw query text; it is escaped before being parsed.
    sim   : similarity set on the searcher before the search runs.

    Returns the scoreDocs of the (at most 6) best hits.
    """
    env = lucene.getVMEnv()
    env.attachCurrentThread()

    # OR semantics: a document may match on any term in either field.
    field_parser = MultiFieldQueryParser(
        ["content_section", "title_article"], self.analyzer)
    field_parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)
    lucene_query = MultiFieldQueryParser.parse(
        field_parser, QueryParser.escape(query))

    index_searcher = self.searcher
    index_searcher.setSimilarity(sim)
    return index_searcher.search(lucene_query, 6).scoreDocs
def func_cross(former, mid, last):
    """Search the ``index_tb_new`` index for the three given texts.

    The arguments are concatenated into one query text, parsed as an AND
    query over the ``former``, ``mid`` and ``last`` fields, and the top 200
    hits are handed to ``process`` (defined outside this function) together
    with the searcher.

    Args:
        former: Text for the ``former`` part of the query.
        mid: Text for the ``mid`` part of the query.
        last: Text for the ``last`` part of the query.

    Returns:
        Whatever ``process(scoreDocs, searcher)`` returns.
    """
    # Attach the calling thread to the JVM before using any Lucene object.
    lucene.getVMEnv().attachCurrentThread()

    store_dir = "index_tb_new"
    directory = SimpleFSDirectory(File(store_dir))
    # NOTE(review): this reader is opened on every call and never closed.
    # It is left open here because ``process`` receives the searcher and it
    # is not visible whether its result still needs the reader afterwards.
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)

    # Same text as before, including the double space after ``former``.
    text = former + '  ' + mid + ' ' + last
    fields = ["former", "mid", "last"]

    parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields, analyzer)
    parser.setDefaultOperator(QueryParserBase.AND_OPERATOR)
    query = MultiFieldQueryParser.parse(parser, text)

    score_docs = searcher.search(query, 200).scoreDocs
    return process(score_docs, searcher)
def multiFieldsPairSearch(self, pair, sim):
    """
    Method that searches through documents with a (title, content) pair,
    using the content_section and title_article Fields.

    pair : sequence whose first item is a title (underscores are replaced
           by spaces) and whose second item is the content text.
    sim  : similarity set on the searcher before the search runs.

    Returns the scoreDocs of the (at most 6) best hits.
    """
    title_text = pair[0].replace('_', ' ')
    content_text = pair[1]

    field_parser = MultiFieldQueryParser(
        ["content_section", "title_article"], self.analyzer)
    field_parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)

    def parse_escaped(text):
        # Escape the text, then parse it over both fields.
        return MultiFieldQueryParser.parse(field_parser,
                                           QueryParser.escape(text))

    title_query = parse_escaped(title_text)
    content_query = parse_escaped(content_text)

    # The title query is added as a FILTER clause, the content query as a
    # SHOULD clause.
    builder = BooleanQuery.Builder()
    builder.add(title_query, BooleanClause.Occur.FILTER)
    builder.add(content_query, BooleanClause.Occur.SHOULD)

    self.searcher.setSimilarity(sim)
    top_docs = self.searcher.search(builder.build(), 6)
    return top_docs.scoreDocs