def form_new_query_from_rf(self, relevant_doc_ids):
    """Build a relevance-feedback query string from the given doc ids.

    For each relevant document, the title is cleaned of punctuation,
    run through the analyzer via QueryParser, and split back into
    analyzed terms.  The terms kept are the intersection across all
    documents; they are returned joined by single spaces (empty string
    when `relevant_doc_ids` is empty).
    """
    shared_terms = None  # intersection accumulator; None until first doc seen
    for doc_id in relevant_doc_ids:
        hit_doc = self.searcher.doc(doc_id)
        # Strip punctuation that would confuse the query parser.
        cleaned_title = re.sub('[/\*&^%$#@?\'`":()<>]', " ", hit_doc.get("title")).strip()
        parsed = QueryParser(Version.LUCENE_CURRENT, "contents", self.analyzer).parse(cleaned_title)
        # The parsed query prints as "contents:t1 contents:t2 ..."; splitting
        # on the field prefix recovers the analyzed terms.
        terms = set()
        for token in parsed.toString().split("contents:"):
            if token.strip() != "":
                terms.add(token)
        shared_terms = terms if shared_terms is None else shared_terms & terms
    if shared_terms is None:
        shared_terms = set()
    return " ".join(shared_terms)
def main(): global lucene_vm_init if not lucene_vm_init: lucene.initVM(vmargs=['-Djava.awt.headless=true']) lucene_vm_init = True is_index_Exist = os.path.exists(LUCENE_INDEX_DIR) # specify index path index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR)) # configure search engine analyzer = StandardAnalyzer() config = IndexWriterConfig(analyzer) # load index to search engine reader = DirectoryReader.open(index_mm) searcher = IndexSearcher(reader) searcher.setSimilarity(BM25Similarity()) # read query read_query() # initialize mongodb client mongoObj = Mongo_Object('localhost', 27017) # initialize word2vec print 'load word2vec model' w2vmodel = gensim.models.Word2Vec.load_word2vec_format( "F:\\modified_w2v\\w2v_wiki_trigram_phrase_20170101\\wiki.en.text.vector.binary", binary=True) print 'finish loading word2vec model' # search global hitsPerPage fields = ['name', 'value'] #parser=MultiFieldQueryParser(fields,analyzer) #parser.setDefaultOperator(QueryParserBase.AND_OPERATOR) rec_result = open('pylucene.runs', 'w') for i in range(len(queries)): query = queries[i] print 'processing query ' + str(i) + ':' + query[0] querystr = remove_duplicate(stemSentence(query[1])) #q_lucene=MultiFieldQueryParser.parse(parser,querystr) q_lucene = QueryParser("all_text", analyzer).parse(querystr) print "q_lucene: " + q_lucene.toString() collector = TopScoreDocCollector.create(hitsPerPage) searcher.search(q_lucene, collector) hits = collector.topDocs().scoreDocs # build query object for computeScore #queryObj=Query_Object(query,mongoObj,w2vmodel) # initialize duplicate remover docDup = set() # find candidate results after 1st round filter candidates = PriorityQueue() for j in xrange(len(hits)): docID = hits[j].doc d = searcher.doc(docID) name = cleanSentence(d['title'].strip()) if name in docDup: continue docDup.add(name) # build entity object entityObj = Entity_Object(d, mongoObj, w2vmodel) #score = computeScore(queryObj,entityObj,mongoObj,w2vmodel) score = hits[j].score 
candidates.put((-score, j)) # output results from priority queue larger score first rank = 0 while candidates.empty() == False and rank < 100: rank = rank + 1 item = candidates.get() score = -item[0] j = item[1] # index of hits[] docID = hits[j].doc d = searcher.doc(docID) title = '<dbpedia:' + d.get('title') + '>' res_line = query[0] + '\t' + 'Q0' + '\t' + title + '\t' + str( rank) + '\t' + str(score) + '\t' + 'pylucene_multifield' rec_result.writelines(res_line + '\n') rec_result.close()