示例#1
0
 def form_new_query_from_rf(self, relevant_doc_ids):
     """Build a new query string from relevance-feedback documents.

     For every document id, the stored "title" field is cleaned of
     punctuation, parsed with a ``QueryParser`` on the "contents" field,
     and the parsed query's string form is split on ``"contents:"`` to
     obtain that document's keywords. The result is the intersection of
     the keyword sets of all given documents, joined with single spaces
     (order is not defined, as it comes from a set).

     Args:
         relevant_doc_ids: iterable of document ids accepted by
             ``self.searcher.doc``.

     Returns:
         A space-separated string of the keywords common to every
         document, or ``""`` when no ids are given or nothing is shared.
     """
     # Same character class as before, written as a raw string and
     # compiled once instead of on every iteration.
     title_cleaner = re.compile(r'''[/\*&^%$#@?'`":()<>]''')

     # None means "no document processed yet". The previous boolean flag
     # was only cleared inside the else-branch, which was never reached,
     # so the intersection never ran and the last document always won.
     common_keywords = None
     for doc_id in relevant_doc_ids:
         doc = self.searcher.doc(doc_id)
         contents = title_cleaner.sub(" ", doc.get("title")).strip()
         query = QueryParser(Version.LUCENE_CURRENT, "contents", self.analyzer).parse(contents)

         keywords = set()
         for term in query.toString().split("contents:"):
             # Strip before storing: the pieces keep the separating
             # whitespace, so the same term would otherwise fail to match
             # across documents when intersecting.
             term = term.strip()
             if term:
                 keywords.add(term)

         if common_keywords is None:
             common_keywords = keywords
         else:
             common_keywords &= keywords

     if common_keywords is None:
         return ""
     return " ".join(common_keywords)
示例#2
0
def main():
    """Run every loaded query against the Lucene index and write a run file.

    Steps, in order: start the Lucene JVM once (guarded by the module-level
    ``lucene_vm_init`` flag), open the index at ``LUCENE_INDEX_DIR`` with a
    BM25 searcher, call ``read_query()``, create the MongoDB helper and load
    the word2vec model, then for each entry of the module-level ``queries``
    search the "all_text" field and write up to 100 de-duplicated hits to
    ``pylucene.runs``.

    Each output line is tab-separated:
    ``query[0]  Q0  <dbpedia:TITLE>  rank  score  pylucene_multifield``.

    The index reader and the output file are released even if a query
    fails part-way through.
    """
    global lucene_vm_init
    global hitsPerPage
    if not lucene_vm_init:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        lucene_vm_init = True

    # At most this many result lines are written per query.
    max_rank = 100

    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))

    # configure search engine
    analyzer = StandardAnalyzer()

    # load index to search engine
    reader = DirectoryReader.open(index_mm)
    try:
        searcher = IndexSearcher(reader)
        searcher.setSimilarity(BM25Similarity())

        # read query
        # NOTE(review): presumably this populates the module-level
        # ``queries`` iterated below -- confirm against read_query().
        read_query()

        # initialize mongodb client
        mongoObj = Mongo_Object('localhost', 27017)

        # initialize word2vec
        print('load word2vec model')
        w2vmodel = gensim.models.Word2Vec.load_word2vec_format(
            "F:\\modified_w2v\\w2v_wiki_trigram_phrase_20170101\\wiki.en.text.vector.binary",
            binary=True)
        print('finish loading word2vec model')

        # search
        with open('pylucene.runs', 'w') as rec_result:
            for i, query in enumerate(queries):
                print('processing query ' + str(i) + ':' + query[0])
                querystr = remove_duplicate(stemSentence(query[1]))
                q_lucene = QueryParser("all_text", analyzer).parse(querystr)
                print("q_lucene: " + q_lucene.toString())
                collector = TopScoreDocCollector.create(hitsPerPage)
                searcher.search(q_lucene, collector)
                hits = collector.topDocs().scoreDocs

                # Titles already seen for this query, after cleaning; later
                # hits with the same cleaned title are dropped.
                seen_titles = set()

                # find candidate results after 1st round filter
                candidates = []
                for j, hit in enumerate(hits):
                    d = searcher.doc(hit.doc)
                    name = cleanSentence(d['title'].strip())
                    if name in seen_titles:
                        continue
                    seen_titles.add(name)
                    # NOTE(review): the entity object is not used while the
                    # custom computeScore re-ranking is disabled; it is still
                    # constructed because its constructor's side effects are
                    # not visible from here -- confirm before removing.
                    entityObj = Entity_Object(d, mongoObj, w2vmodel)
                    # Negated score so that ascending order puts the best
                    # hit first; j breaks ties and indexes back into hits.
                    candidates.append((-hit.score, j))

                # output results, larger score first
                candidates.sort()
                for rank, (neg_score, j) in enumerate(candidates[:max_rank], 1):
                    d = searcher.doc(hits[j].doc)
                    title = '<dbpedia:' + d.get('title') + '>'
                    res_line = '\t'.join([
                        query[0],
                        'Q0',
                        title,
                        str(rank),
                        str(-neg_score),
                        'pylucene_multifield',
                    ])
                    rec_result.write(res_line + '\n')
    finally:
        reader.close()