Example #1
	def getIndexSearcher(self):

		indexSearcher = IndexSearcher(self.mIndexReader)
		if self.mSimilarity is not None:
			indexSearcher.setSimilarity(self.mSimilarity)
		
		return indexSearcher	
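For reference, a minimal standalone sketch of the pattern this helper wraps, assuming PyLucene is installed and an index already exists at the hypothetical path "./index":

import lucene
from java.nio.file import Paths
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.search.similarities import BM25Similarity
from org.apache.lucene.store import SimpleFSDirectory

lucene.initVM()
reader = DirectoryReader.open(SimpleFSDirectory(Paths.get("./index")))  # hypothetical index path
searcher = IndexSearcher(reader)
searcher.setSimilarity(BM25Similarity())  # override the default similarity only when needed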
Example #2
class SSQA_S_Searcher:
    def __init__(self, indexDir, analyzer):
        lucene.initVM()
        self.reader = DirectoryReader.open(indexDir)
        self.searcher = IndexSearcher(self.reader)
        self.searcher.setSimilarity(mySimilarity())
        self.analyzer = analyzer
        logger.debug("Search similarity func: {}".format(
            self.searcher.getSimilarity()))

    def search(self, query_text, top_n):
        query_text = query_text.strip()
        query = QueryParser("content", self.analyzer).parse(
            QueryParser.escape(query_text.strip()))
        scoreDocs = self.searcher.search(query, top_n).scoreDocs
        count = 0
        out_list = []
        for scoreDoc in tqdm(scoreDocs):
            docIndex = scoreDoc.doc
            doc = self.searcher.doc(docIndex)
            log_debug(doc, logger)
            log_debug(self.searcher.explain(query, docIndex), logger)

            out_list.append(doc['content'])
            count += 1
        logger.info("Added {} sentences".format(count))
        return out_list

    def close(self):
        self.reader.close()
Example #3
def process_q_test(q, out_q):
    lucene.initVM()
    lucene.getVMEnv().attachCurrentThread()

    index = DirectoryReader.open(SimpleFSDirectory(
        Paths.get(robust_index_dir)))
    searcher = IndexSearcher(index)
    searcher.setSimilarity(BM25Similarity())
    analyzer = EnglishAnalyzer()
    qparser = QueryParser("contents", analyzer)
    preprocessor = Preprocess()

    while not exitFlag:
        qid, query = q.get()
        tname = multiprocessing.current_process().name
        # print(tname, qid, query)
        if query == "DONE":
            break

        try:
            # dids, scores = get_lm_matched_docs(query, searcher, qparser, 2000)
            # if len(dids) >= 10:
            #     out_q.put((qid, dids, scores))
            dids_text = get_lm_doc_snippets(query, searcher, qparser, analyzer,
                                            preprocessor)
            out_q.put((qid, dids_text))
        except:
            print('%s exception %s, %s' % (tname, qid, query))
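A hedged sketch of how a queue-based worker like process_q_test above is typically driven; process_q_test itself, robust_index_dir, exitFlag and the helper functions it calls are assumed to come from the surrounding project:

import multiprocessing

if __name__ == "__main__":
    in_q, out_q = multiprocessing.Queue(), multiprocessing.Queue()
    workers = [multiprocessing.Process(target=process_q_test, args=(in_q, out_q))
               for _ in range(4)]
    for w in workers:
        w.start()
    queries = [("301", "international organized crime")]  # (qid, query) pairs
    for item in queries:
        in_q.put(item)
    for _ in workers:
        in_q.put((None, "DONE"))  # one shutdown sentinel per worker
    results = [out_q.get() for _ in queries]  # each result is (qid, dids_text)
    for w in workers:
        w.join()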
Example #4
class IndexAndTaxonomy(object):

    def __init__(self, settings, indexDirectory=None, taxoDirectory=None):
        self._settings = settings
        self._similarity = settings.similarity
        self._numberOfConcurrentTasks = settings.numberOfConcurrentTasks
        self._reader = DirectoryReader.open(indexDirectory)
        self.taxoReader = DirectoryTaxonomyReader(taxoDirectory)
        self._readerSettingsWrapper = ReaderSettingsWrapper()
        self._readerSettingsWrapper.get = lambda: {"similarity": self.searcher.getSimilarity().toString(), "numberOfConcurrentTasks": self._numberOfConcurrentTasks}
        self._readerSettingsWrapper.set = self._setReadSettings
        self._searcher = None
        self._executor = None
        self._reopenSearcher = True

    def reopen(self):
        reader = DirectoryReader.openIfChanged(self._reader)
        if reader is None:
            return
        self._reader.close()
        self._reader = reader
        self._reopenSearcher = True
        taxoReader = DirectoryTaxonomyReader.openIfChanged(self.taxoReader)
        if taxoReader is None:
            return
        self.taxoReader.close()
        self.taxoReader = taxoReader

    @property
    def searcher(self):
        if not self._reopenSearcher:
            return self._searcher

        if self._settings.multithreaded:
            if self._executor:
                self._executor.shutdown()
            self._executor = Executors.newFixedThreadPool(self._numberOfConcurrentTasks)
            self._searcher = SuperIndexSearcher(self._reader, self._executor, self._numberOfConcurrentTasks)
        else:
            self._searcher = IndexSearcher(self._reader)
        self._searcher.setSimilarity(self._similarity)
        self._reopenSearcher = False
        return self._searcher

    def _setReadSettings(self, similarity=None, numberOfConcurrentTasks=None):
        # This method must be thread-safe
        if similarity is None:
            self._similarity = self._settings.similarity
        else:
            self._similarity = BM25Similarity(similarity["k1"], similarity["b"])

        if numberOfConcurrentTasks is None:
            self._numberOfConcurrentTasks = self._settings.numberOfConcurrentTasks
        else:
            self._numberOfConcurrentTasks = numberOfConcurrentTasks
        self._reopenSearcher = True

    def close(self):
        self.taxoReader.close()
        self._reader.close()
Example #5
def config():
    base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    directory = SimpleFSDirectory(File(os.path.join(base_dir, INDEX_DIR)))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    bm25Sim = BM25Similarity(2.0, 0.75)  # BM25 with k1 = 2.0, b = 0.75 (the Lucene defaults are k1 = 1.2, b = 0.75)
    searcher.setSimilarity(bm25Sim)
    analyzer = SmartChineseAnalyzer(Version.LUCENE_CURRENT)
    return searcher,analyzer
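A hedged usage sketch for config() above, assuming INDEX_DIR, QueryParser and Version are available in the same module and that "contents" is the indexed field name (hypothetical):

searcher, analyzer = config()
query = QueryParser(Version.LUCENE_CURRENT, "contents", analyzer).parse(QueryParser.escape("example query"))
for scoreDoc in searcher.search(query, 10).scoreDocs:
    print('%s %.4f' % (searcher.doc(scoreDoc.doc), scoreDoc.score))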
Example #6
def lucene_retrieval_multifield(q_string, q_class, feature_type, use_BM25=False):
    """
    multifield: a different query string for each field,
    rather than the same words applied to every field
    :param q_string:
    :param feature_type:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        return sorted document+score by score
        :param hists:
        """
        def doc_score(hists):
            """
            return doc_name & score
            :param hists:
            """
            for h in hists:
                # docID = h.doc
                # doc = searcher.doc(docID)
                # file_name = doc.get("corpus_name")
                # doc_name = doc.get("doc_name")
                # text = doc.get("text")
                score = h.score
                # yield (file_name, doc_name, score, text)
                yield score
        doc_score_list = list(doc_score(hists))
        return map(lambda f: f(doc_score_list), feature_type)  # feature_type is a list of function

    text_query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))
    subject_query = QueryParser(version, 'corpus_name', analyzer).parse(QueryParser.escape(q_class))
    query = BooleanQuery()

    # BooleanClause.Occur
    # MUST implies that the keyword must occur
    #  SHOULD implies that the keyword SHOULD occur
    query.add(text_query, BooleanClause.Occur.SHOULD)
    query.add(subject_query, BooleanClause.Occur.SHOULD)

    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)

    if use_BM25:
        searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75))  # todo: BM25 parameters

    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hists

    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
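The feature_type argument above is only described in a comment; a hedged illustration of what such a list of functions might look like (the call itself is left commented out because it depends on module-level globals such as set_lucene_index, version, analyzer and hitsPerPage):

feature_funcs = [
    lambda scores: max(scores) if scores else 0.0,                # best hit score
    lambda scores: sum(scores) / len(scores) if scores else 0.0,  # mean hit score
    len,                                                          # number of hits
]
# scores = lucene_retrieval_multifield(question_text, question_class, feature_funcs, use_BM25=True)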
Example #7
 def run(self):
     print("Starting " + self.name)
     lucene.getVMEnv().attachCurrentThread()
     index = DirectoryReader.open(
         SimpleFSDirectory(Paths.get(robust_index_dir)))
     searcher = IndexSearcher(index)
     searcher.setSimilarity(BM25Similarity())
     analyzer = EnglishAnalyzer()
     qparser = QueryParser("contents", analyzer)
     # process_query(self.name, self.q, self.out_q, searcher, qparser)
     print("Exiting " + self.name)
Example #8
def lucene_retrieval(q_string, feature_type, use_BM25=False):
    """

    :param q_string:
    :param feature_type:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        return sorted document+score by score
        :param hists:
        """
        def doc_score(hists):
            """
            return doc_name & score
            :param hists:
            """
            for h in hists:
                # docID = h.doc
                # doc = searcher.doc(docID)
                # file_name = doc.get("corpus_name")
                # doc_name = doc.get("doc_name")
                # text = doc.get("text")
                score = h.score
                # yield (file_name, doc_name, score, text)
                yield score

        doc_score_list = list(doc_score(hists))
        return map(lambda f: f(doc_score_list),
                   feature_type) if len(doc_score_list) != 0 else [0] * len(
                       feature_type)  # feature_type is a list of function

    # escape special characters via escape function
    query = QueryParser(version, 'text',
                        analyzer).parse(QueryParser.escape(q_string))

    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)

    if use_BM25:
        searcher.setSimilarity(BM25Similarity(k1=1.5,
                                              b=0.75))  # todo: BM25 parameters

    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hists

    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
Example #9
class DocRepo:
    def __init__(self):
        # self.analyzer = StandardAnalyzer()
        # self.analyzer = PersianAnalyzer(StopFilter.makeStopSet(sw))
        # self.analyzer = PersianAnalyzer()
        self.analyzer = StopAnalyzer(Paths.get(Config.stop_words_address))
        self.config = IndexWriterConfig(self.analyzer)
        self.index = RAMDirectory()
        self.w = IndexWriter(self.index, self.config)

    def addDocument(self, id):
        global answers_train
        preA = answers_train[id]
        doc = Document()
        doc.add(TextField("pa", preA, Field.Store.YES))
        doc.add(StringField("id", str(id), Field.Store.YES))
        self.w.addDocument(doc)
        self.w.commit()

    def __del__(self):
        self.w.close()

    def get_most_similar(self, sentence, do_log=False):
        # print('query string is',string)
        # q = QueryParser('pa', self.analyzer).parse(sentence)
        query_builder = BooleanQuery.Builder()
        for token in sentence.split(' '):
            if token not in sw:
                qtq = TermQuery(Term("pa", token))
                query_builder.add(
                    BooleanClause(qtq, BooleanClause.Occur.SHOULD))
        q = query_builder.build()
        hitsPerPage = 2
        reader = DirectoryReader.open(self.w)
        self.searcher = IndexSearcher(reader)
        simi = BM25Similarity(Config.k1, Config.b)
        # simi = ClassicSimilarity()
        self.searcher.setSimilarity(simi)

        docs = self.searcher.search(q, hitsPerPage)
        hits = docs.scoreDocs

        # print("Found " + str(len(hits)) + " hits.")
        if len(hits) > 0:
            mate = self.searcher.doc(hits[0].doc).get("id")
            if do_log:
                print("found something. mate: ", mate, "- score : ",
                      hits[0].score)
            return hits[0], int(mate)
        else:
            return None, -1
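A hedged usage sketch for DocRepo above; answers_train, Config and sw are assumed to be provided by the surrounding module, and the JVM is assumed to be running already (lucene.initVM()):

repo = DocRepo()
for answer_id in range(len(answers_train)):
    repo.addDocument(answer_id)  # index every pre-processed answer
hit, best_id = repo.get_most_similar("how do I reset my password", do_log=True)
if best_id != -1:
    print(answers_train[best_id], hit.score)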
Example #10
def lucene_retrieval(q_string, feature_type, use_BM25=False):
    """

    :param q_string:
    :param feature_type:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        return sorted document+score by score
        :param hists:
        """
        def doc_score(hists):
            """
            return doc_name & score
            :param hists:
            """
            for h in hists:
                # docID = h.doc
                # doc = searcher.doc(docID)
                # file_name = doc.get("corpus_name")
                # doc_name = doc.get("doc_name")
                # text = doc.get("text")
                score = h.score
                # yield (file_name, doc_name, score, text)
                yield score
        doc_score_list = list(doc_score(hists))
        return map(lambda f: f(doc_score_list), feature_type) if len(doc_score_list) != 0 else [0]*len(feature_type) # feature_type is a list of function

    # escape special characters via escape function
    query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))

    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)

    if use_BM25:
        searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75))  # todo: BM25 parameters

    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hists

    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
Example #11
    def __init__(self, index_dir, index_file, rawQuery):
        self.indexFile = os.path.join(index_dir, index_file)

#         lucene.initVM(vmargs=['-Djava.awt.headless=true']) # uncomment when run Retrieve separately
        directory = SimpleFSDirectory(File(self.indexFile))
        searcher = IndexSearcher(DirectoryReader.open(directory))
        searcher.setSimilarity(BM25Similarity(1.2, 0.75))  # set BM25 as the similarity metric, k=1.2, b=0.75
        if 'Standard' in self.indexFile:
            print "Use the StandardAnalyzer"
            analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)  # build a standard analyzer with default stop words
        if 'Porter' in self.indexFile:
            print "Use the PorterStemmer analyzer"
            analyzer = PorterStemmerAnalyzer()
        self.run(searcher, analyzer, rawQuery)
        del searcher
Example #12
def main():
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])

    queries = makeQueryList(args["queryFile"])
    print 'lucene', lucene.VERSION
    print "\n"

    directory = SimpleFSDirectory(Paths.get(os.getcwd(), INDEX_DIR))
    print directory.getDirectory()
    searcher = IndexSearcher(DirectoryReader.open(directory))
    searcher.setSimilarity(ClassicSimilarity())
    analyzer = StandardAnalyzer()

    run(searcher, analyzer, queries)
    del searcher
Example #13
 def find(self, query):
     transformer = StringTransformer()
     analyzer = EnglishAnalyzer(Version.LUCENE_CURRENT)
     reader = IndexReader.open(SimpleFSDirectory(File("index/")))
     searcher = IndexSearcher(reader)
     searcher.setSimilarity(BM25Similarity())
     processed_query = ' '.join(
         self._preprocessor(transformer.transform(query)))
     query = QueryParser(Version.LUCENE_CURRENT, "content",
                         analyzer).parse(processed_query)
     hits = searcher.search(query, 10)
     result_list = []
     for hit in hits.scoreDocs:
         doc = searcher.doc(hit.doc)
         result_list.append(doc.get("path").encode("utf-8"))
     return result_list
Example #14
def lucene_retrieval(q_string, use_BM25=False):
    """

    :param q_string:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def doc_text(hists):
        """
        return doc_name & score
        :param hists:
        """
        text = '_NONE_'
        for h in hists:
            docID = h.doc
            doc = searcher.doc(docID)
            # file_name = doc.get("corpus_name")
            # doc_name = doc.get("doc_name")
            text = doc.get("text")
            #score = h.score
            # yield (file_name, doc_name, score, text)
        return text

    result = '_NONE_'

    # escape special characters via escape function
    if q_string and q_string.strip():   # when pre-process answers, `none of the above` -> '' cause error here
        #print(q_string)
        query = QueryParser(version, 'text', analyzer).parse(QueryParser.escape(q_string))

        # search
        reader = IndexReader.open(index)
        searcher = IndexSearcher(reader)

        if use_BM25:
            searcher.setSimilarity(BM25Similarity(k1=1.5, b=0.75))  # todo: BM25 parameters

        collector = TopScoreDocCollector.create(hitsPerPage, True)
        searcher.search(query, collector)
        hs = collector.topDocs().scoreDocs  # hists
        result = doc_text(hs)

        # reader.close()
    return result  # text: also nodes
Example #15
    def get_sorted_results(self, query):
        SHOULD = BooleanClause.Occur.SHOULD
        parsed_query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT, query, ['docno', 'content'], [SHOULD, SHOULD], self.analyzer)

        reader = IndexReader.open(self.directory)
        searcher = IndexSearcher(reader)

        searcher.setSimilarity(BM25Similarity())
        topDocs = searcher.search(parsed_query, 10)

        j = 0
        for i in topDocs.scoreDocs:
            d = searcher.doc(i.doc)

            print 'No. %02d: ' % (j + 1) + d['docno'] + ' ' + str(i.score)

            j += 1
Example #16
class ParagSearcher:
    def __init__(self, Lid, db_path=config.DB_SSQA):
        lucene.initVM()
        self.db = SSQA_DB(db_path)

        lesson_str = self.db.get_lesson_str(Lid)
        parags = str_lesson2parags(lesson_str)

        # Index a Lesson
        myIndexer = _ChineseRamIndexer()
        myIndexer.index_lesson(parags)
        myIndexer.close()

        self.reader = DirectoryReader.open(myIndexer.indexDir)
        self.searcher = IndexSearcher(self.reader)
        self.searcher.setSimilarity(mySimilarity())
        self.analyzer = SmartChineseAnalyzer()
        logger.debug('search similarity:{}'.format(
            self.searcher.getSimilarity()))

    def __exit__(self, *args):
        self.close()

    def search(self, query_text, top_n=1):
        query_text = query_text.strip()
        # query = QueryParser("content", self.analyzer).parse(QueryParser.escape(query_text.strip()))
        query = QueryParser("content", self.analyzer).parse(query_text)
        scoreDocs = self.searcher.search(query, top_n).scoreDocs

        out_list = []
        for scoreDoc in scoreDocs:
            docIndex = scoreDoc.doc
            doc = self.searcher.doc(docIndex)
            log_debug(doc, logger)
            log_debug(self.searcher.explain(query, docIndex), logger)

            out_list.append((doc['pid'], doc['content'], scoreDoc.score))
        return out_list

    def close(self):
        self.db.close()
        self.reader.close()
Example #17
    def get_sorted_results(self, query):
        SHOULD = BooleanClause.Occur.SHOULD
        parsed_query = MultiFieldQueryParser.parse(Version.LUCENE_CURRENT,
                                                   query, ['docno', 'content'],
                                                   [SHOULD, SHOULD],
                                                   self.analyzer)

        reader = IndexReader.open(self.directory)
        searcher = IndexSearcher(reader)

        searcher.setSimilarity(BM25Similarity())
        topDocs = searcher.search(parsed_query, 10)

        j = 0
        for i in topDocs.scoreDocs:
            d = searcher.doc(i.doc)

            print 'No. %02d: ' % (j + 1) + d['docno'] + ' ' + str(i.score)

            j += 1
Example #18
class CosQASearcher:
    def __init__(self, lang):
        lucene.initVM()

        if lang == 'zh':
            indexDir = SimpleFSDirectory(Paths.get(str(config.IDX_COS_ZH)))
            analyzer = SmartChineseAnalyzer()
        elif lang == 'en':
            indexDir = SimpleFSDirectory(Paths.get(str(config.IDX_COS_EN)))
            analyzer = EnglishAnalyzer()
        else:
            raise ValueError(
                'lang should be "zh" or "en", {} is invalid!'.format(lang))

        self.reader = DirectoryReader.open(indexDir)
        self.searcher = IndexSearcher(self.reader)
        self.searcher.setSimilarity(mySimilarity())
        self.analyzer = analyzer
        logger.debug('search similarity func: {}'.format(
            self.searcher.getSimilarity()))

    def search(self, query_text, top_n=1):
        query_text = query_text.strip()
        query = QueryParser("content", self.analyzer).parse(
            QueryParser.escape(query_text.strip()))
        #         query = QueryParser("content", self.analyzer).parse(query_text)
        scoreDocs = self.searcher.search(query, top_n).scoreDocs

        out_list = []
        for scoreDoc in scoreDocs:
            docIndex = scoreDoc.doc
            doc = self.searcher.doc(docIndex)
            log_debug(doc, logger)
            log_debug(self.searcher.explain(query, docIndex), logger)

            out_list.append(
                (doc['did'], doc['title_en'], doc['content'], scoreDoc.score))
        return out_list

    def close(self):
        self.reader.close()
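A hedged usage sketch for CosQASearcher above, assuming config.IDX_COS_EN points at an existing English index built by the surrounding project:

searcher = CosQASearcher('en')
for did, title, content, score in searcher.search('how to read a file line by line', top_n=3):
    print(did, title, score)
searcher.close()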
Example #19
class QuestionLuceneSearch():

    def __init__(self):

        self.env = lucene.initVM(initialheap='6g', maxheap='6g', vmargs=['-Djava.awt.headless=true'])
        self.vocab = None

        BooleanQuery.setMaxClauseCount(2048)

        if not os.path.exists(prm.index_folder):
            print('Creating index at', prm.index_folder)
            if prm.docs_path == prm.docs_path_term:
                add_terms = True
            else:
                add_terms = False
            self.create_index(prm.index_folder, prm.docs_path, add_terms)

        if prm.local_index_folder:
            print('copying index from', prm.index_folder, 'to', prm.local_index_folder)
            if os.path.exists(prm.local_index_folder):
                print('Folder', prm.local_index_folder, 'already exists! Doing nothing.')
            else:
                shutil.copytree(prm.index_folder, prm.local_index_folder)
            self.index_folder = prm.local_index_folder
        else:
            self.index_folder = prm.index_folder

        fsDir = MMapDirectory(Paths.get(prm.index_folder))
        self.searcher = IndexSearcher(DirectoryReader.open(fsDir))

        self.searcher.setSimilarity(BM25Similarity())

        if prm.docs_path != prm.docs_path_term:
            if not os.path.exists(prm.index_folder_term):
                print('Creating index at', prm.index_folder_term)
                self.create_index(prm.index_folder_term, prm.docs_path_term, add_terms=True)

            if prm.local_index_folder_term:
                print('copying index from', prm.index_folder_term, 'to', prm.local_index_folder_term)
                if os.path.exists(prm.local_index_folder_term):
                    print('Folder', prm.local_index_folder_term, 'already exists! Doing nothing.')
                else:
                    shutil.copytree(prm.index_folder_term, prm.local_index_folder_term)
                self.index_folder_term = prm.local_index_folder_term
            else:
                self.index_folder_term = prm.index_folder_term
            fsDir_term = MMapDirectory(Paths.get(prm.index_folder_term))
            self.searcher_term = IndexSearcher(DirectoryReader.open(fsDir_term))

        self.analyzer = StandardAnalyzer()
        self.pool = ThreadPool(processes=prm.n_threads)
        self.cache = {}
        
        print('Loading Text-ID mapping...')
        self.text_id_map, self.id_text_map = self.get_text_id_map()

    def get_text_id_map(self):

        # get number of docs
        n_docs = self.searcher.getIndexReader().numDocs()

        text_id = {}
        id_text = {}
        query = MatchAllDocsQuery()
        hits = self.searcher.search(query, n_docs)
        for hit in hits.scoreDocs:
            doc = self.searcher.doc(hit.doc)
            idd = int(doc['id'])
            text = doc['text']
            text_id[text] = idd
            id_text[idd] = text

        return text_id, id_text


    # def add_doc(self, doc_id, title, txt, add_terms):
    def add_doc(self, doc_id, txt, add_terms):

        doc = Document()
        txt = utils.clean(txt)

        if add_terms:
            txt_ = txt.lower()
            words_idx, words = utils.text2idx2([txt_], self.vocab, prm.max_terms_per_doc)
            words_idx = words_idx[0]
            words = words[0]

        doc.add(Field("id", str(doc_id), self.t1))
        # doc.add(Field("title", title, self.t1))
        doc.add(Field("text", txt, self.t2))
        if add_terms:
            doc.add(Field("word_idx", ' '.join(map(str,words_idx)), self.t3))
            doc.add(Field("word", '<&>'.join(words), self.t3))
        self.writer.addDocument(doc)


    def create_index(self, index_folder, docs_path, add_terms=False):

        print('Loading Vocab...')
        if not self.vocab:
            self.vocab = utils.load_vocab(prm.vocab_path, prm.n_words)

        os.mkdir(index_folder)

        self.t1 = FieldType()
        self.t1.setStored(True)
        self.t1.setIndexOptions(IndexOptions.DOCS)

        self.t2 = FieldType()
        self.t2.setStored(False)
        self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        self.t3 = FieldType()
        self.t3.setStored(True)
        self.t3.setIndexOptions(IndexOptions.NONE)
       
        fsDir = MMapDirectory(Paths.get(index_folder))
        writerConfig = IndexWriterConfig(StandardAnalyzer())
        self.writer = IndexWriter(fsDir, writerConfig)
        print("%d docs in index" % self.writer.numDocs())
        print("Indexing documents...")


        # import corpus_hdf5
        # corpus = corpus_hdf5.MSMARCOCorpusHDF5(docs_path)
        import pickle
        with open(docs_path, "rb") as read_file:
            corpus = pickle.load(read_file)
        idx_cnt = 0
        # for doc_id, txt in zip(corpus.get_id_iter(), corpus.get_text_iter()):
        # for doc_id, txt in corpus.items():
        for txt in corpus:
            self.add_doc(idx_cnt, txt, add_terms)  # not lowered
            if idx_cnt % 1000 == 0:
                print('indexing doc', idx_cnt)
            idx_cnt += 1
        print("Index of %d docs..." % self.writer.numDocs())
        self.writer.close()


    def search_multithread(self, qs, max_cand, max_full_cand, searcher):

        self.max_cand = max_cand
        self.max_full_cand = max_full_cand
        self.curr_searcher = searcher
        out = self.pool.map(self.search_multithread_part, qs)
 
        return out


    def search_multithread_part(self, q):

        if not self.env.isCurrentThreadAttached():
            self.env.attachCurrentThread()
    
        if q in self.cache:
            return self.cache[q]
        else:

            try:
                q = q.replace('AND','\\AND').replace('OR','\\OR').replace('NOT','\\NOT')
                query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
            except:
                print('Unexpected error when processing query:', str(q))
                print('Using query "dummy".')
                q = 'dummy'
                query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))

            c = OrderedDict()
            hits = self.curr_searcher.search(query, self.max_cand)

            for i, hit in enumerate(hits.scoreDocs):
                doc = self.curr_searcher.doc(hit.doc)
                if i < self.max_full_cand:
                    word_idx = list(map(int, doc['word_idx'].split(' ')))
                    word = doc['word'].split('<&>')
                else:
                    word_idx = []
                    word = []
                # c[int(doc['id'])] = [word_idx, word]
                c[int(doc['id'])] = [word_idx, word, hit.score]
            # print(c)
            return c

    
    def search_singlethread(self, qs, max_cand, max_full_cand, curr_searcher):

        out = []
        for q in qs:
            if q in self.cache:
                out.append(self.cache[q])
            else:
                try:
                    q = q.replace('AND','\\AND').replace('OR','\\OR').replace('NOT','\\NOT')
                    query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
                except:
                    print('Unexpected error when processing query:', str(q))
                    print('Using query "dummy".')
                    query = QueryParser("text", self.analyzer).parse(QueryParser.escape('dummy'))

                c = OrderedDict()
                hits = curr_searcher.search(query, max_cand)

                for i, hit in enumerate(hits.scoreDocs):
                    doc = curr_searcher.doc(hit.doc)
                    if i < max_full_cand:
                        word_idx = list(map(int, doc['word_idx'].split(' ')))
                        word = doc['word'].split('<&>')
                    else:
                        word_idx = []
                        word = []
                    # c[int(doc['id'])] = [word_idx, word]
                    c[int(doc['id'])] = [word_idx, word, hit.score]
                out.append(c)

        return out


    def get_candidates(self, qs, max_cand, max_full_cand=None, save_cache=False, extra_terms=True):
        if not max_full_cand:
            max_full_cand = max_cand

        if prm.docs_path != prm.docs_path_term:
            max_cand2 = 0
        else:
            max_cand2 = max_full_cand
        if prm.n_threads > 1:
            out = self.search_multithread(qs, max_cand, max_cand2, self.searcher)
            if (prm.docs_path != prm.docs_path_term) and extra_terms:
                terms = self.search_multithread(qs, max_full_cand, max_full_cand, self.searcher_term)
        else:
            out = self.search_singlethread(qs, max_cand, max_cand2, self.searcher)
            if (prm.docs_path != prm.docs_path_term) and extra_terms:
                terms = self.search_singlethread(qs, max_full_cand, max_full_cand, self.searcher_term)

        if (prm.docs_path != prm.docs_path_term) and extra_terms:
            for outt, termss in zip(out, terms):                
                for cand_id, term in zip(list(outt.keys())[:max_full_cand], list(termss.values())):
                    outt[cand_id] = term
  
        if save_cache:
            for q, c in zip(qs, out):
                if q not in self.cache:
                    self.cache[q] = c

        return out



    def get_pair_scores(self, q, doc_int, save_cache=False, extra_terms=True):

        # if prm.n_threads > 1:
        #     out = self.search_pair_score_multithread(qs_trailing_doc, self.searcher)
        #     if (prm.docs_path != prm.docs_path_term) and extra_terms:
        #         terms = self.search_pair_score_multithread(qs_trailing_doc, self.searcher_term)
        # else:
        # out = self.search_pair_score_singlethread(qs_trailing_doc, self.searcher)
        # if (prm.docs_path != prm.docs_path_term) and extra_terms:
        #     terms = self.search_pair_score_singlethread(qs_trailing_doc, self.searcher_term)
        out = []

        try:
            q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
            # query = QueryParser("text", StandardAnalyzer()).parse(QueryParser.escape(q))
        except:
            print('Unexpected error when processing query:', str(q))
            print('Using query "dummy".')
            q = 'dummy'
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
            # query = QueryParser("text", StandardAnalyzer()).parse(QueryParser.escape(q))

        c = OrderedDict()
        exp = self.searcher.explain(query, doc_int)
        c[1] = exp
        out.append(c)

        return out

    def search_pair_score_singlethread(self, q, doc_int, searcher):

        out = []

        try:
            q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
            # query = QueryParser("text", StandardAnalyzer()).parse(QueryParser.escape(q))
        except:
            print('Unexpected error when processing query:', str(q))
            print('Using query "dummy".')
            q = 'dummy'
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
            # query = QueryParser("text", StandardAnalyzer()).parse(QueryParser.escape(q))


        c = OrderedDict()
        exp = searcher.explain(query, doc_int)
        c[1] = exp

        out.append(c)

        return out

    def search_pair_score_multithread(self, qs_trailing_doc, searcher):

        self.curr_searcher = searcher
        # out = self.pool.map(self.search_pair_score_multithread_part, product(qs,doc_int))
        out = self.pool.map(self.search_pair_score_multithread_part, qs_trailing_doc)

        return out

    def search_pair_score_multithread_part(self, q_doc_int):

        # print(q_doc_int)
        spl=q_doc_int.split('<|endoftext|>')
        q = spl[0]
        print(q)
        doc_int = int(spl[1])
        print(doc_int)

        if not self.env.isCurrentThreadAttached():
            self.env.attachCurrentThread()

        try:
            q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
        except:
            print('Unexpected error when processing query:', str(q))
            print('Using query "dummy".')
            q = 'dummy'
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))


        c = OrderedDict()
        exp = self.curr_searcher.explain(query, doc_int)
        c[1] = exp

        return c
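A hedged usage sketch for QuestionLuceneSearch above, assuming the prm configuration module it reads is importable and points at an existing index:

engine = QuestionLuceneSearch()
candidates = engine.get_candidates(['who wrote the odyssey'], max_cand=100, max_full_cand=10)
for doc_id, info in candidates[0].items():
    print(doc_id, info[-1])  # the last element of each entry is the hit score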
Example #20
            total_recall += recall
            total_precision += precision
            total_FB += FB

            print '%3s Recall: %.6f  Precision: %.6f  FB: %.6f' % (qid, recall, precision, FB)

        query_data_length = len(query_data)
        avg_recall = total_recall/query_data_length
        avg_precision = total_precision/query_data_length
        avg_FB = total_FB/query_data_length

        print 'Avg Recall: %.6f  Avg Precision: %.6f Avg FB: %.6f' % (avg_recall, avg_precision, avg_FB)

if __name__ == '__main__':
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', lucene.VERSION
    base_dir = os.path.dirname(os.path.abspath(sys.argv[0]))
    directory = SimpleFSDirectory(File(os.path.join(base_dir, INDEX_DIR)))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    
    searcher.setSimilarity(similarities.BM25Similarity())
    #Available similarity: BM25Similarity, MultiSimilarity, PerFieldSimilarityWrapper, SimilarityBase, TFIDFSimilarity
    # analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
    analyzer = MyAnalyzer(Version.LUCENE_CURRENT)
    fs = FileSearcher(searcher, analyzer)
    if len(sys.argv) < 2:
        fs.perform_user_query(searcher, analyzer)
    else:
        fs.results_comparison(searcher, analyzer, sys.argv[1])
    del searcher
Example #21
    def __recs_query(self, positive_rated_document_list, scores, recs_number,
                     items_directory, candidate_list: List) -> pd.DataFrame:
        """
        Builds a query using the contents that the user liked. The terms relative to the contents that
        the user liked are boosted by the rating he/she gave. A filter clause is added to the query to
        consider only candidate items
        Args:
            positive_rated_document_list: List of contents that the user liked
            scores: Ratings given by the user
            recs_number: How many items must be recommended. You can only specify the number, not
            a specific item for which to compute the prediction
            items_directory: Directory where the items are stored

        Returns:
            score_frame (pd.DataFrame): DataFrame containing the recommendations for the user
        """
        BooleanQuery.setMaxClauseCount(2000000)
        searcher = IndexSearcher(
            DirectoryReader.open(SimpleFSDirectory(
                Paths.get(items_directory))))
        if self.__classic_similarity:
            searcher.setSimilarity(ClassicSimilarity())

        field_list = searcher.doc(positive_rated_document_list[0]).getFields()
        user_fields = {}
        field_parsers = {}
        analyzer = SimpleAnalyzer()
        for field in field_list:
            if field.name() == 'content_id':
                continue
            user_fields[field.name()] = field.stringValue()
            field_parsers[field.name()] = QueryParser(field.name(), analyzer)

        positive_rated_document_list.remove(positive_rated_document_list[0])

        for _ in positive_rated_document_list:
            for field in field_list:
                if field.name() == 'content_id':
                    continue
                user_fields[field.name()] += field.stringValue()

        logger.info("Building query")

        query_builder = BooleanQuery.Builder()
        for score in scores:
            for field_name in user_fields.keys():
                if field_name == 'content_id':
                    continue
                field_parsers[field_name].setDefaultOperator(
                    QueryParser.Operator.OR)

                field_query = field_parsers[field_name].escape(
                    user_fields[field_name])
                field_query = field_parsers[field_name].parse(field_query)
                field_query = BoostQuery(field_query, score)
                query_builder.add(field_query, BooleanClause.Occur.SHOULD)

        if candidate_list is not None:
            id_query_string = ' OR '.join("content_id:\"" + content_id + "\""
                                          for content_id in candidate_list)
            id_query = QueryParser("testo_libero",
                                   KeywordAnalyzer()).parse(id_query_string)
            query_builder.add(id_query, BooleanClause.Occur.MUST)

        query = query_builder.build()
        docs_to_search = len(positive_rated_document_list) + recs_number
        scoreDocs = searcher.search(query, docs_to_search).scoreDocs

        logger.info("Building score frame to return")

        recorded_items = 0
        columns = ['to_id', 'rating']
        score_frame = pd.DataFrame(columns=columns)
        for scoreDoc in scoreDocs:
            if recorded_items >= recs_number:
                break
            if scoreDoc.doc not in positive_rated_document_list:
                doc = searcher.doc(scoreDoc.doc)
                item_id = doc.getField("content_id").stringValue()
                recorded_items += 1

                score_frame = pd.concat([
                    score_frame,
                    pd.DataFrame.from_records([(item_id, scoreDoc.score)],
                                              columns=columns)
                ])

        return score_frame
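The core of the query built above is a per-field parsed query boosted by the user's rating, with the boosted clauses OR-ed together; a standalone hedged sketch of that pattern (field names and texts are hypothetical, and the JVM is assumed to be initialized):

from org.apache.lucene.analysis.core import SimpleAnalyzer
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import BooleanClause, BooleanQuery, BoostQuery

builder = BooleanQuery.Builder()
analyzer = SimpleAnalyzer()
for field_name, liked_text, rating in [("plot", "space exploration adventure", 4.0),
                                       ("genre", "science fiction", 5.0)]:
    parsed = QueryParser(field_name, analyzer).parse(QueryParser.escape(liked_text))
    builder.add(BoostQuery(parsed, rating), BooleanClause.Occur.SHOULD)
boosted_query = builder.build()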
Example #22
class IndexAndTaxonomy(object):
    def __init__(self, settings, indexDirectory=None, taxoDirectory=None):
        self._settings = settings
        self._similarity = settings.similarity
        self._numberOfConcurrentTasks = settings.numberOfConcurrentTasks
        self._reader = DirectoryReader.open(indexDirectory)
        self.taxoReader = DirectoryTaxonomyReader(taxoDirectory)
        self._readerSettingsWrapper = ReaderSettingsWrapper()
        self._readerSettingsWrapper.get = lambda: {
            "similarity": self.searcher.getSimilarity().toString(),
            "numberOfConcurrentTasks": self._numberOfConcurrentTasks
        }
        self._readerSettingsWrapper.set = self._setReadSettings
        self._searcher = None
        self._executor = None
        self._reopenSearcher = True

    def reopen(self):
        reader = DirectoryReader.openIfChanged(self._reader)
        if reader is None:
            return
        self._reader.close()
        self._reader = reader
        self._reopenSearcher = True
        taxoReader = DirectoryTaxonomyReader.openIfChanged(self.taxoReader)
        if taxoReader is None:
            return
        self.taxoReader.close()
        self.taxoReader = taxoReader

    @property
    def searcher(self):
        if not self._reopenSearcher:
            return self._searcher

        if self._settings.multithreaded:
            if self._executor:
                self._executor.shutdown()
            self._executor = Executors.newFixedThreadPool(
                self._numberOfConcurrentTasks)
            self._searcher = SuperIndexSearcher(self._reader, self._executor,
                                                self._numberOfConcurrentTasks)
        else:
            self._searcher = IndexSearcher(self._reader)
        self._searcher.setSimilarity(self._similarity)
        self._reopenSearcher = False
        return self._searcher

    def _setReadSettings(self, similarity=None, numberOfConcurrentTasks=None):
        # This method must be thread-safe
        if similarity is None:
            self._similarity = self._settings.similarity
        else:
            self._similarity = BM25Similarity(similarity["k1"],
                                              similarity["b"])

        if numberOfConcurrentTasks is None:
            self._numberOfConcurrentTasks = self._settings.numberOfConcurrentTasks
        else:
            self._numberOfConcurrentTasks = numberOfConcurrentTasks
        self._reopenSearcher = True

    def close(self):
        self.taxoReader.close()
        self._reader.close()
Example #23
def lucene_retrieval_multifield(q_string,
                                q_class,
                                feature_type,
                                use_BM25=False):
    """
    multifield: a different query string for each field,
    rather than the same words applied to every field
    :param q_string:
    :param feature_type:
    :param use_BM25:
    :return: retrieval_scores for each question-answer pair
    """
    index = set_lucene_index['ind']  # nonlocal variable index

    def retrieval_scores(hists):
        """
        return sorted document+score by score
        :param hists:
        """
        def doc_score(hists):
            """
            return doc_name & score
            :param hists:
            """
            for h in hists:
                # docID = h.doc
                # doc = searcher.doc(docID)
                # file_name = doc.get("corpus_name")
                # doc_name = doc.get("doc_name")
                # text = doc.get("text")
                score = h.score
                # yield (file_name, doc_name, score, text)
                yield score

        doc_score_list = list(doc_score(hists))
        return map(lambda f: f(doc_score_list),
                   feature_type)  # feature_type is a list of function

    text_query = QueryParser(version, 'text',
                             analyzer).parse(QueryParser.escape(q_string))
    subject_query = QueryParser(version, 'corpus_name',
                                analyzer).parse(QueryParser.escape(q_class))
    query = BooleanQuery()

    # BooleanClause.Occur
    # MUST implies that the keyword must occur
    #  SHOULD implies that the keyword SHOULD occur
    query.add(text_query, BooleanClause.Occur.SHOULD)
    query.add(subject_query, BooleanClause.Occur.SHOULD)

    # search
    reader = IndexReader.open(index)
    searcher = IndexSearcher(reader)

    if use_BM25:
        searcher.setSimilarity(BM25Similarity(k1=1.5,
                                              b=0.75))  # todo: BM25 parameters

    collector = TopScoreDocCollector.create(hitsPerPage, True)
    searcher.search(query, collector)
    hs = collector.topDocs().scoreDocs  # hists

    results = retrieval_scores(hs)
    # reader.close()
    return results  # retrieval_scores for each question-answer pair
Example #24
class Searcher(object):  # searcher class
    def __init__(self, indexDir,
                 computeLengthNorm=True):  # initialize: indexDir - index file directory; computeLengthNorm - whether to apply SIM (True - do not apply, False - apply)
        #         if not jpype.isJVMStarted():
        #         lucene.initVM()
        lucene.getVMEnv().attachCurrentThread()
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)  # standard analyzer; for English it tokenizes on delimiters
        self.path = os.path.join(INDEX_PATH, indexDir)  # storage path
        self.store = SimpleFSDirectory(File(self.path))  # storage directory
        # self.reader = DirectoryReader.open(self.store)
        self.reader = IndexReader.open(self.store)
        self.numDocs = self.reader.maxDoc()
        self.searcher = IndexSearcher(self.reader)  # IndexSearcher instance
        sim = CustomSimilarity()  # added by zmq
        if not computeLengthNorm:  # SIM
            sim = CustomSimilarity()
            self.searcher.setSimilarity(sim)
        self.mlt = MoreLikeThis(self.reader, sim)  # mlt?
        self.mlt.setAnalyzer(self.analyzer)
        self.mlt.setMinTermFreq(1)
        self.mlt.setMinDocFreq(1)
        # debug
        self.mlt.setMinWordLen(1)
        self.mlt.setMaxNumTokensParsed(100000000)
        BooleanQuery.setMaxClauseCount(1024 * 1024)  # raise the max query clause count (workaround for the TooManyClauses limit on long queries)
        # debug

    def searchKeyWords(self, key_value, max_num):  # find documents whose text field matches the given string
        key_value = str(key_value.encode('utf-8'))
        if type(key_value) != type('') or len(key_value) == 0:
            raise Exception('Please provide a string.')
        # term = ('text',key_value)
        # termquery = TermQuery(Term(*term))
        query = QueryParser(Version.LUCENE_CURRENT, "title", self.analyzer).parse(key_value)
        #         query = FuzzyQuery(Version.LUCENE_CURRENT, "title",self.analyzer).parse(key_value)  ### fuzzy query

        self.query = query
        results = self.searcher.search(query, max_num)
        result_list = []
        # print len(results.scoreDocs),results.scoreDocs
        for each_result in results.scoreDocs:
            docid = each_result.doc
            result_list.append(self.searcher.doc(docid)['key'])
        return result_list

    def getDocID(self, dictID):  # find the doc id hit by a term; dictID is a one-pair dict (e.g. key 'title', value seller_id)
        if len(dictID) != 1:
            raise Exception('Please provide a dict with one pair of field and value.')
        term = dictID.items()[0]
        termquery = TermQuery(Term(*term))
        self.query = termquery
        results = self.searcher.search(termquery, 10)
        try:
            if len(results.scoreDocs) < 1:
                return None
            docid = results.scoreDocs[0].doc
            print 'id:', docid
            return docid
        except Exception, e:
            logger.error('Doc not found: %s', str(dictID))
            raise Exception('Doc not found: %s', str(dictID))
Example #25
class SearchEngine(object):
    def __init__(self, root, storedir, isindexing=False, isBM25=True):

        if not os.path.exists(storedir):
            os.mkdir(storedir)

        self.analyzer = LimitTokenCountAnalyzer(StandardAnalyzer(), 1048576)

        if isindexing:
            store = SimpleFSDirectory(Paths.get(storedir))
            config = IndexWriterConfig(self.analyzer)
            # TODO BM25 parameter tuning
            if isBM25:
                config.setSimilarity(BM25Similarity())
            config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
            writer = IndexWriter(store, config)

            self.indexer(root, writer)
            ticker = Ticker()
            print('commit index')
            threading.Thread(target=ticker.run).start()
            writer.commit()
            writer.close()
            ticker.tick = False
            print('done')

        search_dir = SimpleFSDirectory(Paths.get(storedir))
        self.searcher = IndexSearcher(DirectoryReader.open(search_dir))
        if isBM25:
            self.searcher.setSimilarity(BM25Similarity())

    def indexer(self, root, writer):

        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(True)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        def replacer(text):
            chars = '\\`*_{}[]()>#+-.!$‘'
            for c in chars:
                if c in text:
                    text = text.replace(c, ' ')
            return text

        for root, dirnames, filenames in os.walk(root):
            i = 0
            for filename in filenames:
                i += 1
                with open(os.path.join(root, filename)) as f:
                    for line in f.readlines():
                        line = line.split(' ', 2)
                        docname = line[0] + ' ' + line[1]
                        name = replacer(line[0])
                        contents = line[2]
                        doc = Document()
                        doc.add(Field('docname', docname, t1))
                        doc.add(Field('name', name, t1))
                        doc.add(Field('contents', contents, t1))
                        writer.addDocument(doc)
                print('File %d done indexing' % i)

    def search(self, query, topk=10):

        qp = PythonMultiFieldQueryParser(['name', 'contents'], self.analyzer)
        query = qp.parse(
            query, ['name', 'contents'],
            [BooleanClause.Occur.SHOULD, BooleanClause.Occur.SHOULD],
            self.analyzer)
        # print(query)
        scores = self.searcher.search(query, topk).scoreDocs
        # print('%s total matching documents.' % len(scores))

        docnames = []
        doccontents = []
        for score in scores:
            doc = self.searcher.doc(score.doc)
            docnames.append(doc.get('docname'))
            doccontents.append(doc.get('contents'))

        return docnames, doccontents

    def retrieve(self, term, sid):

        query = term + ' ' + str(sid)
        query = QueryParser.escape(query)
        query = QueryParser('docname', self.analyzer).parse(query)
        score = self.searcher.search(query, 1).scoreDocs

        doc = self.searcher.doc(score[0].doc)
        return doc.get('docname'), doc.get('contents')
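A hedged usage sketch for SearchEngine above (the corpus and index paths are hypothetical):

import lucene

lucene.initVM(vmargs=['-Djava.awt.headless=true'])
engine = SearchEngine(root='./corpus', storedir='./bm25_index', isindexing=True, isBM25=True)
docnames, doccontents = engine.search('query likelihood language model', topk=5)
for name in docnames:
    print(name)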
Example #26
class SearchIndex:

  def __init__(self, indexPath):
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', lucene.VERSION

    #initialize the index
    self.INDEX_DIR = indexPath  #"Clue_Index"
    self.results = None
    self.searcher = IndexSearcher(DirectoryReader.open(
        SimpleFSDirectory(File(self.INDEX_DIR))))

    self.searcher.setSimilarity(BM25Similarity())

  def initializeAnalyzer(self):
    #self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT,JavaSet(stopSet))
    sSet = CharArraySet(Version.LUCENE_CURRENT, 0, True)
    for entry in stopSet:
      sSet.add(entry)
    self.stopSet = sSet
    #self.analyzer = EnglishAnalyzer(Version.LUCENE_CURRENT,sSet)
    self.analyzer = EnglishAnalyzer(Version.LUCENE_CURRENT)

  def getTopDocuments(self, query, limit, sfield, dfield):
    queryObj = QueryParser(Version.LUCENE_CURRENT, sfield,
                           self.analyzer).parse(query)
    print queryObj
    scoreDocs = self.searcher.search(queryObj, limit).scoreDocs
    print '%s total matching documents.' % len(scoreDocs)
    self.results = scoreDocs
    rresults = []
    i = 0
    #reader = self.searcher.getIndexReader();
    #print type(reader)
    for scoreDoc in scoreDocs:
      doc = self.searcher.doc(scoreDoc.doc)
      rresults.append((doc.get(dfield), scoreDoc.score))
      #rresults.append(doc.get(dfield));#,scoreDoc.score))
      i += 1
      if i == limit:
        break
    return rresults
    #print 'path:', doc.get("URL"), 'name:', doc.get("id"), 'title:', doc.get("title")

  def getTopDocumentsWithExpansion(self, query, expTerms, limit, sfield, dfield):
    print expTerms
    query = query + ' ' + ' '.join('{0}^{1}'.format(x[0], round(x[1], 2))
                                   for x in expTerms)
    sSet = CharArraySet(Version.LUCENE_CURRENT, 0, True)
    for entry in expTerms:
      sSet.add(entry[0])

    analyzer = EnglishAnalyzer(Version.LUCENE_CURRENT, self.stopSet, sSet)

    queryObj = QueryParser(Version.LUCENE_CURRENT, sfield,
                           analyzer).parse(query)
    scoreDocs = self.searcher.search(queryObj, limit).scoreDocs
    print '%s total matching documents.' % len(scoreDocs), queryObj
    self.results = scoreDocs
    rresults = []
    i = 0

    for scoreDoc in scoreDocs:
      doc = self.searcher.doc(scoreDoc.doc)
      #rresults.append(doc.get(dfield));#,scoreDoc.score))
      rresults.append((doc.get(dfield), scoreDoc.score))

      i += 1
      if i == limit:
        break
    return rresults

  def getField(self, dfield, name, limit):
    toReturn = []
    i = 0
    for scoreDoc in self.results:
      doc = self.searcher.doc(scoreDoc.doc)
      toReturn.append((doc.get(dfield), doc.get(name)))
      i += 1
      if i == limit:
        break
    return toReturn

  def close(self):
    del self.searcher
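A hedged usage sketch for SearchIndex above; the index path, field names and the module-level stopSet it relies on are assumed to come from the surrounding project:

index = SearchIndex('Clue_Index')
index.initializeAnalyzer()
for doc_id, score in index.getTopDocuments('barack obama family tree', 10, 'contents', 'id'):
    print('%s %.4f' % (doc_id, score))
index.close()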
Example #27
    def __init__(self, tweets, storeDir, analyzer):

        # first, index the tweets
        if not path.exists(storeDir):
            mkdir(storeDir)

        store = SimpleFSDirectory(File(storeDir))
        analyzer = LimitTokenCountAnalyzer(analyzer, 1048576)
        config = IndexWriterConfig(Version.LUCENE_CURRENT, analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        writer = IndexWriter(store, config)

        self.index_docs(tweets, writer)
        writer.commit()
        writer.close()

        # set up IndexSearcher
        reader = IndexReader.open(store)
        n_docs = reader.numDocs()
        searcher = IndexSearcher(reader)
        searcher.setSimilarity(BM25Similarity())
        queryparser = QueryParser(Version.LUCENE_CURRENT, "contents", StandardAnalyzer(Version.LUCENE_CURRENT))

        # create document vectors
        doc_vectors = self.get_doc_vectors(reader, tweets, n_docs)

        cs_scorer = CosineSimilarityScorer(doc_vectors, reader, searcher, tweets)
        bm25_scorer = BM25Scorer(doc_vectors, searcher, queryparser)

        # find relevant tweets
        for fav_doc in (1, 26, 51):
            cs_scores = cs_scorer.get_scores(fav_doc)
            bm25_scores = bm25_scorer.get_scores(fav_doc)

            top_cs_scores = dict(sorted(cs_scores.iteritems(), key=itemgetter(1), reverse=True)[:5])
            top_bm25_scores = dict(sorted(bm25_scores.iteritems(), key=itemgetter(1), reverse=True)[:5])

            # print "top_cs_scores", top_cs_scores
            # print "top_bm25_scores", top_bm25_scores

            # calculate composite score by multiplying cs scores by 100 and keeping bm25 scores as is.
            # cs is bounded from 0.0-1.0. bm25 scores is actually idf * bm25_similarity_score so values
            # above 10.0 are not uncommon
            top_blended_scores = {}
            for key, value in top_cs_scores.iteritems():
                top_blended_scores[key] = value * 100.0

            for key, value in top_bm25_scores.iteritems():
                if key not in top_blended_scores:
                    top_blended_scores[key] = 0.0
                top_blended_scores[key] += value

            top_score = dict(sorted(top_blended_scores.iteritems(), key=itemgetter(1), reverse=True)[:1])

            # print "\n"
            # print "results for", fav_doc
            # print tweets[fav_doc]
            print searcher.doc(fav_doc).get("contents")
            print top_score

            # if the top score fails to reach 10.0, this result is probably not of high quality so onlyworthy
            # will decline to identify a relevant match
            if top_score.values()[0] < 10.0:
                print "skipping"
                continue

            # print tweets[top_score.keys()[0]]
            print searcher.doc(top_score.keys()[0]).get("contents")
            print "\n"
Example #28
# for txtName in gutenberg_list:
#   words = nltk.corpus.gutenberg.words(txtName)
#   sents = " ".join(words).split(".")
#   print(sents[:100])
# #   print("Indexing ", txtName, "...")
# #   for i in range(0, len(sents), 10):
# #     text = " ".join(sents[i:i+10])
# #     doc = Document()
# #     doc.add(Field("fieldname", text, TextField.TYPE_STORED))
# #     iwriter.addDocument(doc)
# # iwriter.close()

# now search the index
ireader = DirectoryReader.open(directory)
isearcher = IndexSearcher(ireader)

# set similarity method
bm25 = BM25Similarity()
isearcher.setSimilarity(bm25)

# parse a simple query that searches for "text"
parser = QueryParser("fieldname", analyzer)
query = parser.parse("her sister was reading")
hits = isearcher.search(query, 5).scoreDocs
print(len(hits))

for hit in hits:
    result = isearcher.doc(hit.doc)
    print("[%8.4f] %s" % (hit.score, result.get("fieldname")))
Exemplo n.º 29
0
class LuceneSearch(object):
    def __init__(self, args):

        self.env = lucene.initVM(initialheap='28g',
                                 maxheap='28g',
                                 vmargs=['-Djava.awt.headless=true'])
        self.args = args

        index_folder = os.path.join(DATA_DIR, args.index_folder)
        if not os.path.exists(index_folder):
            self.doc_db = DocDB()
            logger.info(f'Creating index at {index_folder}')
            self.create_index(index_folder)

        fsDir = MMapDirectory(Paths.get(index_folder))
        self.searcher = IndexSearcher(DirectoryReader.open(fsDir))
        self.searcher.setSimilarity(MyTFIDFSimilarity())
        self.analyzer = MySimpleAnalyzer(
            CharArraySet(collections.JavaSet(utils.STOPWORDS), True))
        self.pool = ThreadPool(processes=args.num_search_workers)

    def add_doc(self, title, text, tokens):

        doc = Document()
        doc.add(Field("title", title, self.t1))
        doc.add(Field("text", text, self.t2))
        doc.add(Field("token", tokens, self.t3))

        self.writer.addDocument(doc)

    def create_index(self, index_folder):
        os.mkdir(index_folder)

        self.t1 = FieldType()
        self.t1.setStored(True)
        self.t1.setIndexOptions(IndexOptions.DOCS)

        self.t2 = FieldType()
        self.t2.setStored(True)
        self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        self.t3 = FieldType()
        self.t3.setStored(True)
        self.t3.setIndexOptions(IndexOptions.NONE)

        fsDir = MMapDirectory(Paths.get(index_folder))
        writerConfig = IndexWriterConfig(
            MySimpleAnalyzer(
                CharArraySet(collections.JavaSet(utils.STOPWORDS), True)))
        writerConfig.setSimilarity(MyTFIDFSimilarity())
        writerConfig.setRAMBufferSizeMB(16384.0)  # 16g
        self.writer = IndexWriter(fsDir, writerConfig)
        logger.info(f"{self.writer.numDocs()} docs in index")
        logger.info("Indexing documents...")

        doc_ids = self.doc_db.get_doc_ids()
        for doc_id in tqdm(doc_ids, total=len(doc_ids)):
            text = self.doc_db.get_doc_text(doc_id)
            tokens = self.doc_db.get_doc_tokens(doc_id)
            self.add_doc(doc_id, text, tokens)

        logger.info(f"Indexed {self.writer.numDocs()} docs.")
        self.writer.forceMerge(1)  # to increase search performance
        self.writer.close()

    def search_multithread(self, qs, ranker_doc_max, searcher):
        self.ranker_doc_max = ranker_doc_max
        self.curr_searcher = searcher
        out = self.pool.map(self.search_multithread_part, qs)

        return out

    def search_multithread_part(self, q):
        if not self.env.isCurrentThreadAttached():
            self.env.attachCurrentThread()

        try:
            if self.args.ngram == 2:
                query = self._parse_query(field_name='text', query=q)
            else:
                # self.args.ngram == 1
                query = QueryParser('text',
                                    self.analyzer).parse(QueryParser.escape(q))
        except Exception as e:
            logger.warning(colored(f'{e}: {q}, use query dummy.', 'yellow'))
            if self.args.ngram == 2:
                query = self._parse_query(field_name='text', query=q)
            else:
                # self.args.ngram == 1
                query = QueryParser('text', self.analyzer).parse('dummy')

        doc_scores, doc_titles, doc_texts, doc_words = [], [], [], []
        hits = self.curr_searcher.search(query, self.ranker_doc_max)

        for i, hit in enumerate(hits.scoreDocs):
            doc = self.curr_searcher.doc(hit.doc)

            doc_score = hit.score
            doc_title = doc['title']
            doc_word = doc['token'].split('<&>')
            doc_text = doc['text']

            doc_scores.append(doc_score)
            doc_titles.append(doc_title)
            doc_words.append(doc_word)
            doc_texts.append(doc_text)

        if len(doc_scores) == 0:
            logger.warning(
                colored(
                    f'WARN: search engine returns no results for query: {q}.',
                    'yellow'))

        return doc_scores, doc_titles, doc_texts, doc_words

    def search_singlethread(self, qs, ranker_doc_max, curr_searcher):
        out = []
        for q in qs:
            try:
                if self.args.ngram == 2:
                    query = self._parse_query(field_name='text', query=q)
                else:
                    # self.args.ngram == 1
                    query = QueryParser('text', self.analyzer).parse(
                        QueryParser.escape(q))
            except Exception as e:
                logger.warning(
                    colored(f'{e}: {q}, use query dummy.', 'yellow'))
                if self.args.ngram == 2:
                    query = self._parse_query(field_name='text', query=q)
                else:
                    # self.args.ngram == 1
                    query = QueryParser('text', self.analyzer).parse('dummy')

            doc_scores, doc_titles, doc_texts, doc_words = [], [], [], []
            hits = curr_searcher.search(query, ranker_doc_max)

            for i, hit in enumerate(hits.scoreDocs):
                doc = curr_searcher.doc(hit.doc)

                doc_score = hit.score
                doc_title = doc['title']
                doc_word = doc['token'].split('<&>')
                doc_text = doc['text']

                doc_scores.append(doc_score)
                doc_titles.append(doc_title)
                doc_words.append(doc_word)
                doc_texts.append(doc_text)

            if len(doc_scores) == 0:
                logger.warning(
                    colored(
                        f'WARN: search engine returns no results for query: {q}.',
                        'yellow'))

            out.append((doc_scores, doc_titles, doc_texts, doc_words))

        return out

    def batch_closest_docs(self, qs, ranker_doc_max):

        if self.args.num_search_workers > 1:
            out = self.search_multithread(qs, ranker_doc_max, self.searcher)
        else:
            out = self.search_singlethread(qs, ranker_doc_max, self.searcher)

        return out

    def _parse_query(self, field_name, query):
        ts = self.analyzer.tokenStream("dummy", StringReader(query))
        termAtt = ts.getAttribute(CharTermAttribute.class_)
        ts.reset()
        tokens = []
        while ts.incrementToken():
            tokens.append(termAtt.toString())
        ts.end()
        ts.close()

        booleanQuery = BooleanQuery.Builder()
        for token in tokens:
            builder = PhraseQuery.Builder()
            for i, word in enumerate(token.split(' ')):
                builder.add(Term(field_name, word), i)
            pq = builder.build()
            booleanQuery.add(pq, BooleanClause.Occur.SHOULD)
        final_query = booleanQuery.build()
        return final_query
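For orientation, a minimal usage sketch of LuceneSearch under stated assumptions: args would normally come from argparse, only index_folder, num_search_workers and ngram are read by the class, the index folder is assumed to already exist under DATA_DIR (otherwise create_index and DocDB are needed), and the folder name and query string are placeholders.

from types import SimpleNamespace

args = SimpleNamespace(index_folder="wiki_index", num_search_workers=1, ngram=1)  # hypothetical values
searcher = LuceneSearch(args)
# num_search_workers=1 takes the single-threaded path in batch_closest_docs
doc_scores, doc_titles, doc_texts, doc_words = searcher.batch_closest_docs(
    ["who wrote hamlet"], ranker_doc_max=5)[0]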
Exemplo n.º 30
0
def createIndexSearcher(indexDir):
    directory = DirectoryReader.open(FSDirectory.open(Paths.get(indexDir)))
    searcher = IndexSearcher(directory)
    similarity = BM25Similarity(K1, B)
    searcher.setSimilarity(similarity)
    return searcher
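For reference, a hedged usage sketch of createIndexSearcher. It assumes the JVM is already initialized, that K1 and B are module-level constants (for example the BM25 defaults 1.2 and 0.75), and that the index has a stored field named "contents"; the index path and query are placeholders.

from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.queryparser.classic import QueryParser

searcher = createIndexSearcher("/path/to/index")
query = QueryParser("contents", StandardAnalyzer()).parse("solar energy storage")
for hit in searcher.search(query, 10).scoreDocs:
    print("%.4f  %s" % (hit.score, searcher.doc(hit.doc).get("contents")))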
Exemplo n.º 31
0
class SearchBuilder(object):
    def __init__(self,
                 index_path,
                 field,
                 similarity="boolean",
                 use_relevance_feedback=False,
                 feedback_index_path=None):
        self.reader = DirectoryReader.open(
            FSDirectory.open(Paths.get(index_path)))
        self.searcher = IndexSearcher(self.reader)
        if use_relevance_feedback and feedback_index_path is not None:
            self.feedback_reader = DirectoryReader.open(
                FSDirectory.open(Paths.get(feedback_index_path)))
            self.feedback_searcher = IndexSearcher(self.feedback_reader)
        self.similarity = similarity
        self.stopwords = stop_words()
        if similarity == "boolean":
            self.searcher.setSimilarity(BooleanSimilarity())
        elif similarity == "tf":
            self.searcher.setSimilarity(TFSimilarity())
        elif similarity == "tfidf":
            self.searcher.setSimilarity(ClassicSimilarity())
        elif similarity == "BM25":
            self.searcher.setSimilarity(BM25Similarity(1.2, 0.2))
        else:
            print("Unknown similarity, so we use BM25(1.2, 0.2) as default")
            self.searcher.setSimilarity(BM25Similarity(1.2, 0.2))
        analyzer = StandardAnalyzer()
        print(self.searcher.getSimilarity())
        self.parser = QueryParser(field, analyzer)

    def remove_stopwords(self, query_text):
        new_query_tokens = []
        query_tokens = query_text.split()
        for query_token in query_tokens:
            if query_token not in self.stopwords:
                new_query_tokens.append(query_token)
        return " ".join(new_query_tokens)

    def search_query(self,
                     query,
                     num_returns=50,
                     use_multipass_pseudo_relevance_feedback=False,
                     doc_counts=None,
                     add_nums=None):

        query_text = query["description"]
        print(query_text.lower())
        query_text = " ".join(tokenizer.tokenize(query_text))
        query_text = self.remove_stopwords(query_text.lower())
        print(query_text)
        query_search = self.parser.parse(query_text)
        if use_multipass_pseudo_relevance_feedback:
            if doc_counts is None:
                doc_counts = [5, 9]
            if add_nums is None:
                add_nums = [2, 13]
            assert len(doc_counts) == len(
                add_nums), "The number of pass is inconsistent!"
            for doc_count, add_num in zip(doc_counts, add_nums):
                final_list = []
                initial_hits = self.searcher.search(query_search,
                                                    doc_count).scoreDocs
                term_tf_idf = {}
                for initial_hit in initial_hits:
                    termVector = self.reader.getTermVector(
                        initial_hit.doc, "text")
                    terms_enum = termVector.iterator()
                    termsref = BytesRefIterator.cast_(terms_enum)
                    N_terms = 0
                    term_idf = {}
                    term_freq = {}
                    term_list = []
                    while (termsref.next()):
                        termval = TermsEnum.cast_(termsref)
                        termText = termval.term().utf8ToString()
                        if termText in self.stopwords:
                            continue
                        tc = termval.totalTermFreq()
                        if termText in term_freq:
                            term_freq[termText] += tc
                        else:
                            term_freq[termText] = tc
                        if termText in term_idf:
                            term_idf[termText] += 1
                        else:
                            term_idf[termText] = 1
                        if termText not in term_list:
                            term_list.append(termText)
                        N_terms = N_terms + 1

                    for term in term_list:
                        if term in term_tf_idf:
                            term_tf_idf[term] += term_freq[term] / N_terms * (
                                1 + math.log(doc_count / (term_idf[term] + 1)))
                        else:
                            term_tf_idf[term] = term_freq[term] / N_terms * (
                                1 + math.log(doc_count / (term_idf[term] + 1)))
                sorted_term_tf_idf = sorted(term_tf_idf.items(),
                                            key=lambda x: x[1],
                                            reverse=True)
                for each in sorted_term_tf_idf:
                    if each[0] not in self.stopwords:
                        final_list.append(each[0])
                print("added query tokens:", final_list[:add_num])
                query_text = query_text + " " + " ".join(final_list[:add_num])
                query_search = self.parser.parse(query_text)
        results = self.searcher.search(query_search, num_returns)
        hits = results.scoreDocs
        trec_results = []
        for rank, hit in enumerate(hits):
            doc = self.searcher.doc(hit.doc)
            trec_result = {
                "QueryID":
                query["Number"],
                "Q0":
                "Q0",
                "DocID":
                doc.get(".U"),
                "Rank":
                str(rank + 1),
                "Score":
                str(hit.score),
                "RunID":
                self.similarity + "-mpprf-" + str(len(doc_counts)) + "passes"
                if use_multipass_pseudo_relevance_feedback else self.similarity
            }
            trec_results.append(trec_result)
        return trec_results

    def search_query_with_relevance_feedback(self,
                                             query,
                                             feedback_qrels,
                                             num_returns=50,
                                             add_num=1):
        query_text = query["description"]
        print(query_text)
        query_text = " ".join(tokenizer.tokenize(query_text))
        query_text = self.remove_stopwords(query_text.lower())
        print(query_text)
        query_number = query["Number"]
        qrel_doc_ids = [
            qrel["docno"] for qrel in feedback_qrels
            if qrel["qid"] == query_number
        ]
        final_list = []
        term_tf_idf = {}
        doc_count = len(qrel_doc_ids)
        for qrel_doc_id in qrel_doc_ids:
            initial_hit = self.feedback_searcher.search(
                TermQuery(Term(".U", qrel_doc_id)), 1).scoreDocs
            if len(initial_hit) == 0:
                continue
            assert len(initial_hit) == 1
            termVector = self.reader.getTermVector(initial_hit[0].doc, "text")
            terms_enum = termVector.iterator()
            termsref = BytesRefIterator.cast_(terms_enum)
            N_terms = 0
            term_idf = {}
            term_freq = {}
            term_list = []
            while (termsref.next()):
                termval = TermsEnum.cast_(termsref)
                termText = termval.term().utf8ToString()
                if termText in self.stopwords:
                    continue
                tc = termval.totalTermFreq()
                if termText in term_freq:
                    term_freq[termText] += tc
                else:
                    term_freq[termText] = tc
                if termText in term_idf:
                    term_idf[termText] += 1
                else:
                    term_idf[termText] = 1
                if termText not in term_list:
                    term_list.append(termText)
                N_terms = N_terms + 1

            for term in term_list:
                if term in term_tf_idf:
                    term_tf_idf[term] += term_freq[term] / N_terms * (
                        1 + math.log(doc_count / (term_idf[term] + 1)))
                else:
                    term_tf_idf[term] = term_freq[term] / N_terms * (
                        1 + math.log(doc_count / (term_idf[term] + 1)))

        sorted_tf_idf = sorted(term_tf_idf.items(),
                               key=lambda x: x[1],
                               reverse=True)
        for each in sorted_tf_idf:
            if each[0] not in self.stopwords and not str(each[0]).isnumeric(
            ) and each[0] not in query_text.split(" "):
                final_list.append(each[0])
        print(final_list[:add_num])
        query_text = query_text + " " + " ".join(final_list[:add_num])
        query_text = " ".join(query_text.split(" "))
        print(query_text)
        query_search = self.parser.parse(query_text)
        results = self.searcher.search(query_search, num_returns)
        hits = results.scoreDocs
        trec_results = []
        for rank, hit in enumerate(hits):
            doc = self.searcher.doc(hit.doc)
            trec_result = {
                "QueryID": query["Number"],
                "Q0": "Q0",
                "DocID": doc.get(".U"),
                "Rank": str(rank + 1),
                "Score": str(hit.score),
                "RunID": self.similarity
            }
            trec_results.append(trec_result)
        return trec_results

    # def search_query_with_glove(self,  query, doc_vectors, num_returns=50, index2word_set=None):
    #     query_text = query["description"]
    #     query_text = " ".join(word_tokenize(query_text))
    #     query_text = self.remove_stopwords(query_text)
    #     query_vec = avg_feature_vector(query_text, model=glove_vectors, num_features=300, index2word_set=index2word_set)
    #     doc_similarity = {}
    #     for doc_id in tqdm(doc_vectors, desc="compute doc similarity:", total=len(doc_vectors.items())):
    #         doc_similarity[doc_id] = 1 - spatial.distance.cosine(query_vec, doc_vectors[doc_id])
    #     doc_similarity = sorted(doc_similarity.items(), key=lambda x: x[1], reverse=True)[:num_returns]
    #     trec_results = []
    #     for i, doc_id in tqdm(enumerate(doc_similarity), desc="output results:", total=len(doc_similarity)):
    #         trec_result = {"QueryID": query["Number"],
    #                        "Q0": "Q0",
    #                        "DocID": doc_id[0],
    #                        "Rank": str(i + 1),
    #                        "Score": str(doc_id[1]),
    #                        "RunID": self.similarity+"+embedding"}
    #         trec_results.append(trec_result)
    #     return trec_results
    #
    # def search_query_with_transformers(self,  query, doc_vectors, num_returns=50):
    #     query_text = query["description"]
    #     query_text = " ".join(word_tokenize(query_text))
    #     query_text = self.remove_stopwords(query_text)
    #     query_vec = distilroberta_model.encode(query_text, convert_to_tensor=True)
    #     doc_similarity = {}
    #     for doc_id in tqdm(doc_vectors, desc="compute doc similarity:", total=len(doc_vectors.items())):
    #         doc_similarity[doc_id] = util.pytorch_cos_sim(query_vec, doc_vectors[doc_id])
    #     doc_similarity = sorted(doc_similarity.items(), key=lambda x: x[1], reverse=True)[:num_returns]
    #     trec_results = []
    #     for i, doc_id in tqdm(enumerate(doc_similarity), desc="output results:", total=len(doc_similarity)):
    #         trec_result = {"QueryID": query["Number"],
    #                        "Q0": "Q0",
    #                        "DocID": doc_id[0],
    #                        "Rank": str(i + 1),
    #                        "Score": str(doc_id[1]),
    #                        "RunID": self.similarity+"+embedding"}
    #         trec_results.append(trec_result)
    #     return trec_results

    def get_results_from_queries(self,
                                 queries,
                                 num_returns=50,
                                 use_pseudo_relevance_feedback=False):
        trec_results = []
        for query in queries:
            search_results = self.search_query(query, num_returns,
                                               use_pseudo_relevance_feedback)
            trec_results = trec_results + search_results
        return trec_results

    #
    # def get_results_from_queries_with_pretrained_embedding_similariy(self, queries, doc_vectors, num_returns=50):
    #     trec_results = []
    #     for query in tqdm(queries, desc="queries", total=len(queries)):
    #         search_results = self.search_query_with_glove(query, doc_vectors, num_returns)
    #         trec_results = trec_results + search_results
    #     return trec_results
    #
    # def get_results_from_queries_with_transformers(self, queries, doc_vectors, num_returns=50):
    #     trec_results = []
    #     for query in tqdm(queries, desc="queries", total=len(queries)):
    #         search_results = self.search_query_with_transformers(query, doc_vectors, num_returns)
    #         trec_results = trec_results + search_results
    #     return trec_results

    def get_results_from_queries_with_relevance_feedback(
            self, queries, feedback_qrels, num_returns=50):
        trec_results = []
        for query in queries:
            search_results = self.search_query_with_relevance_feedback(
                query, feedback_qrels, num_returns=num_returns)
            trec_results = trec_results + search_results
        return trec_results
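A minimal usage sketch of SearchBuilder under stated assumptions: lucene.initVM must have been called (shown below for completeness), the module-level tokenizer and stop_words helpers must be importable, the index path and query text are placeholders, and the field name "text" plus the query keys "Number"/"description" follow the code above.

lucene.initVM(vmargs=['-Djava.awt.headless=true'])
builder = SearchBuilder("/path/to/index", field="text", similarity="BM25")
query = {"Number": "1", "description": "treatment options for chronic migraine"}
for result in builder.search_query(query, num_returns=10):
    print(result["Rank"], result["DocID"], result["Score"])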
Exemplo n.º 32
0
def main():
    global lucene_vm_init
    if not lucene_vm_init:
       lucene.initVM(vmargs=['-Djava.awt.headless=true'])
       lucene_vm_init = True
    
    is_index_Exist = os.path.exists(LUCENE_INDEX_DIR)
    # specify index path 
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))
    
    # configure search engine
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)
    
    # load index to search engine
    reader = DirectoryReader.open(index_mm)
    searcher1 = IndexSearcher(reader)
    searcher1.setSimilarity(BM25Similarity())
    searcher2 = IndexSearcher(reader)
    w = IndexWriter(index_mm,config)
    # read query
    read_query()
    
    # initialize mongodb client
    mongoObj=Mongo_Object('localhost',27017)
      
    # search
    docDup=set()
    finalDup={}
    
    for i in xrange(len(queries)):
        print 'process query %d' %(i)
        query = queries[i]
        querystr = stemSentence(query[3])
        # build searcher
        q_lucene = QueryParser("all_text", analyzer).parse(querystr)
        collector = TopScoreDocCollector.create(hitsPerPage)
        searcher1.search(q_lucene, collector)
        hits = collector.topDocs().scoreDocs
        
        
        # find candidate results after 1st round filter
        docDup.clear()
        for j in xrange(len(hits)):
            docID=hits[j].doc
            d=searcher1.doc(docID)
            if d['title'] in docDup:
               finalDup[d['title']]=d
               continue
            docDup.add(d['title'])
            
        docDup.clear()
        for j in xrange(len(hits)):
            docID=hits[j].doc
            d=searcher1.doc(docID)
            title=d['title']
            if d['title'] in docDup:
               continue
            docDup.add(title)
            
            item=(mongoObj.conn_me).find_one({'title':title})
            if item is None:
               continue
            entitylist=item['entitylist'].split('|')
            for en_title in entitylist:
                if title==en_title:
                   continue
                t=Term('title',en_title)
                q=TermQuery(t)
                docs=searcher2.search(q,2)
                if docs.totalHits<=1:
                   continue
                
                docID2=(docs.scoreDocs)[0].doc
                doc=searcher2.doc(docID2)
                finalDup[doc['title']]=doc

    print 'begin to clean index, there are %d dup records' %(len(finalDup))
    for title in finalDup:
        doc=finalDup[title]
        # title,name,value,category,skos_category,all_text,raw_name,raw_value,abstract
        
        name=doc['name']
        value=doc['value']
        category=doc['category']
        skos_category=doc['skos_category']
        all_text=doc['all_text']
        raw_name=doc['raw_name']
        raw_value=doc['raw_value']
        abstract=doc['abstract']
        
        print 'process '+title
        t=Term('title',title)
        q=TermQuery(t)
        w.deleteDocuments(q)
        addDoc(w,title,name,value,category,skos_category,all_text,raw_name,raw_value,abstract)
    # process remaining records
    #global batch,cnt_batch
    #if cnt_batch>0:
       #w.addDocuments(batch)
       #cnt_batch=0
       #del batch[:]
    w.close()
Exemplo n.º 33
0
class LuceneRetrieval(BaseRetrieval):
    """
        Encapsulates the Lucene retrieval engine
    """
    def __init__(self, index_path, method, logger=None, use_default_similarity=False):
        self.index_path=index_path
        directory = SimpleFSDirectory(File(self.index_path))
        self.analyzer = StandardAnalyzer(LuceneVersion.LUCENE_CURRENT)
        self.reader=DirectoryReader.open(directory)
        self.searcher = IndexSearcher(self.reader)

        # uncomment one of these lines to change the type of parser, query and weight used
        if use_default_similarity:
            self.query_parser=QueryParser
        else:
            self.query_parser=FieldAgnosticQueryParser

        if use_default_similarity:
            similarity=DefaultSimilarity()
            self.useExplainQuery=False
        else:
            similarity=FieldAgnosticSimilarity()
            self.useExplainQuery=True
        # by default, FieldAgnosticSimilarity uses coord factor, can be disabled
##        similarity.useCoord=False

        self.searcher.setSimilarity(similarity)
        self.method=method # never used?
        self.logger=logger

    def runQueryViaExplain(self,query, max_results):
        """
            Really crappy solution to make sure that explanations and searches are the same
            while I fix Lucene
        """
        results=[]

        index=0
        for index in range(self.reader.numDocs()):
            explanation=self.searcher.explain(query,index)
            score=explanation.getValue()
##            match=re.search(r"(.*?)\s=",explanation.toString(),re.IGNORECASE|re.DOTALL)
##            if match:
##                score=float(match.group(1))
            hit=namedtuple("Hit",["doc","score"])
            hit.doc=index
            hit.score=score
##            heapq.heappush(results,hit)
            results.append(hit)

        results.sort(key=lambda x:x.score,reverse=True)

        if max_results < self.reader.numDocs():
            results=results[:max_results]

        return results

    def runQuery(self, structured_query, max_results=MAX_RESULTS_RECALL):
        """
            LOTS OF SWEET LUCENE
        """
        original_query=structured_query

        if not structured_query or len(structured_query) == 0 :
            return []

        self.last_query=structured_query
        query_text=self.rewriteQuery(structured_query["structured_query"], ["text"])

        try:
            query = self.query_parser(lucene.Version.LUCENE_CURRENT, "text", self.analyzer).parse(query_text)
        except:
            print("Lucene exception:",sys.exc_info()[:2])
            return None

        structured_query["lucene_query"]=query_text

        if self.useExplainQuery:
            # this should only exist until I fix the lucene bulkScorer to give the same results
            hits=self.runQueryViaExplain(query,max_results)
        else:
            collector=TopScoreDocCollector.create(max_results, True)
            self.searcher.search(query, collector)
            hits = collector.topDocs().scoreDocs

##        print("Found %d document(s) that matched query '%s':" % (hits.totalHits, query))
        res=[]


##        if len(hits.scoreDocs) ==0:
##            print "Original query:",original_query
##            print "Query:", query

        for hit in hits:
            doc = self.searcher.doc(hit.doc)
            metadata= json.loads(doc.get("metadata"))
            res.append((hit.score,metadata))
        return res


    def formulaFromExplanation(self, query, doc_id):
        """
            Runs .explain() for one query/doc pair, generates and returns a \
            StoredFormula instance from it

            :param query: Lucene Query object
            :param doc_id: id of document to run .explain() for
            :returns:
        """
        explanation=self.searcher.explain(query,doc_id)

        formula=StoredFormula()
        formula.fromLuceneExplanation(explanation)
        return formula
Exemplo n.º 35
0
class Searcher:
    """
    Class that contains the search methods
    """
    def __init__(self, searchDir):

        self.analyzer = MyPythonEnglishAnalyzer(
            stopwords=Indexer.ENGLISH_STOP_WORDS_SET)
        self.directory = FSDirectory.open(Paths.get(searchDir))
        self.reader = DirectoryReader.open(self.directory)
        self.searcher = IndexSearcher(self.reader)

    def simpleSearch(self, query, sim):
        """
        Method that searches through documents using only the content_section Field.
        """
        # Now search the index:
        parser = QueryParser("content_section", self.analyzer)
        query = parser.parse(QueryParser.escape(query))
        self.searcher.setSimilarity(sim)
        hits = self.searcher.search(query, 6).scoreDocs
        return hits

    def simpleSearchID(self, query, sim):
        """
        Method that searches through documents using only the id_section Field.
        """
        # Now search the index:
        parser = QueryParser("id_section", self.analyzer)
        query = parser.parse(QueryParser.escape(query))
        self.searcher.setSimilarity(sim)
        hits = self.searcher.search(query, 6).scoreDocs
        return hits

    def multiFieldsSearch(self, query, sim):
        """
        Method that searches through documents using the content_section and title_article Fields.
        """
        # Now search the index:
        lucene.getVMEnv().attachCurrentThread()

        parser = MultiFieldQueryParser(["content_section", "title_article"],
                                       self.analyzer)
        parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)
        query = MultiFieldQueryParser.parse(parser, QueryParser.escape(query))

        self.searcher.setSimilarity(sim)
        hits = self.searcher.search(query, 6).scoreDocs
        return hits

    def pairSearch(self, pair, sim):
        """
        Method that searches the content_section Field with a (title, content) pair: the title is added as a FILTER clause and the content as a SHOULD clause.
        """
        # Now search the index:
        title = pair[0].replace('_', ' ')
        content = pair[1]
        parser = QueryParser("content_section", self.analyzer)
        query1 = parser.parse(QueryParser.escape(title))
        query2 = parser.parse(QueryParser.escape(content))

        bq = BooleanQuery.Builder()
        bq.add(query1, BooleanClause.Occur.FILTER)
        bq.add(query2, BooleanClause.Occur.SHOULD)

        self.searcher.setSimilarity(sim)
        hits = self.searcher.search(bq.build(), 6).scoreDocs
        return hits

    def multiFieldsPairSearch(self, pair, sim):
        """
        Method that searches the content_section and title_article Fields with a (title, content) pair: the title is added as a FILTER clause and the content as a SHOULD clause.
        """
        # Now search the index:
        title = pair[0].replace('_', ' ')
        content = pair[1]
        parser = MultiFieldQueryParser(["content_section", "title_article"],
                                       self.analyzer)
        parser.setDefaultOperator(QueryParserBase.OR_OPERATOR)
        query1 = MultiFieldQueryParser.parse(parser, QueryParser.escape(title))
        query2 = MultiFieldQueryParser.parse(parser,
                                             QueryParser.escape(content))

        bq = BooleanQuery.Builder()
        bq.add(query1, BooleanClause.Occur.FILTER)
        bq.add(query2, BooleanClause.Occur.SHOULD)

        self.searcher.setSimilarity(sim)
        hits = self.searcher.search(bq.build(), 6).scoreDocs
        return hits
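A hedged usage sketch of the Searcher class above; it assumes the JVM is already initialized, that MyPythonEnglishAnalyzer and Indexer are importable as in the constructor, and uses a placeholder index path and query. The field name "title_article" comes from the multi-field parser above.

searcher = Searcher("/path/to/index")
hits = searcher.multiFieldsSearch("effects of ocean acidification", BM25Similarity())
for hit in hits:
    doc = searcher.searcher.doc(hit.doc)
    print("[%8.4f] %s" % (hit.score, doc.get("title_article")))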
Exemplo n.º 36
0
    import argparse
    parser = argparse.ArgumentParser(
        description='Execute queries on comment body')
    parser.add_argument('user_name', type=str,
                        help="User name (profile to use)")
    parser.add_argument('index_dir', metavar='dir', type=str,
                        help="Index directory")
    parser.add_argument('--sim', type=str, nargs='?',
                        default="tfidf", help="Similarity (in [tfidf, lm, bm25])")
    parser.add_argument('--reorder', type=str, nargs='?',
                        default="no", help="Reordering (in [ups, normups])")
    parser.add_argument('--short', action='store_false',
                        help="Don't show the body of comments")
    args = parser.parse_args()


    if args.sim in ['bm25']:
        similarity = BM25Similarity()
    elif args.sim in ['lm']:
        similarity = LMDirichletSimilarity()
    else:
        similarity = ClassicSimilarity()

    # Sample query
    storeDir = SimpleFSDirectory(Paths.get(args.index_dir))
    searcher = IndexSearcher(DirectoryReader.open(storeDir))
    if similarity is not None:
        searcher.setSimilarity(similarity)
    analyzer = StandardAnalyzer()
    run(searcher, analyzer, args.user_name, reordering=args.reorder, show_bodies=not args.short)
Exemplo n.º 37
0
class LuceneCorpus(object):
    # to init a LuceneCorpus, we need the outputdir, which is passed as index_dir
    # we need filenames that contains one for more corpus we just created
    # we need a parser, this parser should implement function 'parse' which knows how to split, how to stem
    def __init__(self, index_dir, filenames, parser, similarity=None):
        """
        :param index_dir: where to store the Lucene index
        :param filenames: the corpus file(s) created previously; all corpus files share the same format
        :param parser: SimpleWordParser in Parser.py, where we can apply functions such as stemming
        :param similarity: None (the default Vector Space Model with TF-IDF is used) or an alternative such as BM25 similarity
        :return:
        """
        self._index_dir = index_dir
        self._filenames = filenames
        self._parser = parser
        self._similarity = similarity
        lucene.initVM()
        # the WhitespaceAnalyzer split the text based on whitespace
        self._analyzer = WhitespaceAnalyzer(Version.LUCENE_CURRENT)
        self._store = SimpleFSDirectory(File(self._index_dir))
        self._searcher = None

    def prp_index(self):
        '''
        Prepare the index given our "corpus" file(s)
        '''
        print '=> Preparing Lucene index %s' % self._index_dir
        writer = self._get_writer(create=True)
        print '   Currently %d docs (dir %s)' % (writer.numDocs(), self._index_dir)
        num_pages, num_sections = 0, 0
        page_name, section_name = None, None
        num_lines = 0
        for ifname,fname in enumerate(self._filenames):
            print '   Adding lines to index from file #%d: %s' % (ifname, fname)
            with open(fname,'rt') as infile:
                for text in infile:
                    if len(text)==0:
                        print 'Reached EOF'
                        break # EOF
                    # CorpusReader.PAGE_NAME_PREFIX is <Page>
                    # every corpus file was preprocessed so that each page starts with this tag
                    if text.startswith(CorpusReader.PAGE_NAME_PREFIX):
                        page_name = text[len(CorpusReader.PAGE_NAME_PREFIX):].strip()
                        section_name = None
                        num_pages += 1
                    elif text.startswith(CorpusReader.SECTION_NAME_PREFIX):
                        section_name = text[len(CorpusReader.SECTION_NAME_PREFIX):].strip()
                        num_sections += 1
                    else:
                        assert (page_name is not None) and (section_name is not None)
                        if self._parser is None:
                            luc_text = text
                        else:
                            # note that in our case we always have a SimpleWordParser
                            section_words = self._parser.parse(text, calc_weights=False) #True)
                            luc_text = ' '.join(section_words)
                        # for each section, we add the whole section to the Lucene index; the text is stored and made searchable
                        # the page is not needed here since documents are added section by section rather than page by page
                        doc = Document()
                        # there is only one field for each document, which is the text field
                        # section_name is not used as a field
                        doc.add(Field("text", luc_text, Field.Store.YES, Field.Index.ANALYZED))
                        writer.addDocument(doc)
                    num_lines += 1
                    if num_lines % 100000 == 0:
                        print '    read %d lines so far: %d pages, %d sections' % (num_lines, num_pages, num_sections)

        print '   Finished - %d docs (dir %s)' % (writer.numDocs(), self._index_dir)
        writer.close()

    def search(self, words, max_docs, weight_func=lambda n: np.ones(n), score_func=lambda s: s):
        '''
        Search the index for the given words, return total score
        '''
        searcher = self._get_searcher()
        if type(words)==str:
            search_text = words
            search_text = AsciiConvertor.convert(search_text)
            for c in '/+-&|!(){}[]^"~*?:':
                search_text = search_text.replace('%s'%c, '\%s'%c)
        else:
            search_text = ' '.join(words)
        print 'search_text: %s' % search_text
        # note that regardless of the parser passed to the constructor, the query itself is split by Lucene's QueryParser when searching
        query = QueryParser(Version.LUCENE_CURRENT, "text", self._analyzer).parse(search_text)
        hits = searcher.search(query, max_docs)

        score_sum = 0.0
        weights = weight_func(len(hits.scoreDocs))
        for hit,weight in zip(hits.scoreDocs, weights):
            score_sum += weight * score_func(hit.score)
        return score_sum

    def _get_writer(self, analyzer=None, create=False):
        config = IndexWriterConfig(Version.LUCENE_CURRENT, self._analyzer)
        if create:
            config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        if self._similarity is not None:
            config.setSimilarity(self._similarity)
        writer = IndexWriter(self._store, config)
        return writer

    def _get_searcher(self):
        if self._searcher is None:
            self._searcher = IndexSearcher(DirectoryReader.open(self._store))
            if self._similarity is not None:
                self._searcher.setSimilarity(self._similarity)
        return self._searcher
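A hedged usage sketch of LuceneCorpus: the file names are placeholders, the corpus files are assumed to follow the <Page>/<Section> layout expected by prp_index, and BM25Similarity is assumed to be imported from Lucene's similarities package.

corpus = LuceneCorpus(index_dir='lucene_idx', filenames=['corpus_part1.txt'],
                      parser=None, similarity=BM25Similarity())
corpus.prp_index()
# total weighted score of the top hits for a bag of query words
total_score = corpus.search(['photosynthesis', 'chlorophyll'], max_docs=20)
print 'total score: %.3f' % total_score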
Exemplo n.º 38
0
class Lucene(object):

    # default fieldnames for id and contents
    FIELDNAME_ID = "id"
    FIELDNAME_CONTENTS = "contents"

    # internal fieldtypes
    # used as Enum, the actual values don't matter
    FIELDTYPE_ID = "id"
    FIELDTYPE_ID_TV = "id_tv"
    FIELDTYPE_TEXT = "text"
    FIELDTYPE_TEXT_TV = "text_tv"
    FIELDTYPE_TEXT_TVP = "text_tvp"
    FIELDTYPE_TEXT_NTV = "text_ntv"
    FIELDTYPE_TEXT_NTVP = "text_ntvp"

    def __init__(self, index_dir, max_shingle_size=None):
        global lucene_vm_init

        if not lucene_vm_init:
            lucene.initVM(vmargs=['-Djava.awt.headless=true'])
            lucene_vm_init = True
        self.dir = SimpleFSDirectory(File(index_dir))
        self.max_shingle_size = max_shingle_size
        self.analyzer = None
        self.reader = None
        self.searcher = None
        self.writer = None
        self.ldf = None

    @staticmethod
    def get_version():
        """Get Lucene version."""
        return Version.LUCENE_48

    @staticmethod
    def preprocess(text):
        """Tokenize and stop the input text."""
        ts = StandardTokenizer(Lucene.get_version(),
                               StringReader(text.lower()))
        ts = StopFilter(Lucene.get_version(), ts,
                        StopAnalyzer.ENGLISH_STOP_WORDS_SET)
        string_builder = StringBuilder()
        ts.reset()
        char_term_attr = ts.addAttribute(CharTermAttribute.class_)
        while ts.incrementToken():
            if string_builder.length() > 0:
                string_builder.append(" ")
            string_builder.append(char_term_attr.toString())
        return string_builder.toString()

    def get_analyzer(self):
        """Get analyzer."""
        if self.analyzer is None:
            std_analyzer = StandardAnalyzer(Lucene.get_version())
            if self.max_shingle_size is None:
                self.analyzer = std_analyzer
            else:
                self.analyzer = ShingleAnalyzerWrapper(std_analyzer,
                                                       self.max_shingle_size)
        return self.analyzer

    def open_reader(self):
        """Open IndexReader."""
        if self.reader is None:
            self.reader = DirectoryReader.open(self.dir)

    def get_reader(self):
        return self.reader

    def close_reader(self):
        """Close IndexReader."""
        if self.reader is not None:
            self.reader.close()
            self.reader = None
        else:
            raise Exception("There is no open IndexReader to close")

    def open_searcher(self):
        """
        Open IndexSearcher. Automatically opens an IndexReader too,
        if it is not already open. There is no close method for the
        searcher.
        """
        if self.searcher is None:
            self.open_reader()
            self.searcher = IndexSearcher(self.reader)

    def get_searcher(self):
        """Returns index searcher (opens it if needed)."""
        self.open_searcher()
        return self.searcher

    def set_lm_similarity_jm(self, method="jm", smoothing_param=0.1):
        """
        Set searcher to use LM similarity.

        :param method: LM similarity ("jm" or "dirichlet")
        :param smoothing_param: smoothing parameter (lambda or mu)
        """
        if method == "jm":
            similarity = LMJelinekMercerSimilarity(smoothing_param)
        elif method == "dirichlet":
            similarity = LMDirichletSimilarity(smoothing_param)
        else:
            raise Exception("Unknown method")

        if self.searcher is None:
            raise Exception("Searcher has not been created")
        self.searcher.setSimilarity(similarity)

    def open_writer(self):
        """Open IndexWriter."""
        if self.writer is None:
            config = IndexWriterConfig(Lucene.get_version(),
                                       self.get_analyzer())
            config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
            self.writer = IndexWriter(self.dir, config)
        else:
            raise Exception("IndexWriter is already open")

    def close_writer(self):
        """Close IndexWriter."""
        if self.writer is not None:
            self.writer.close()
            self.writer = None
        else:
            raise Exception("There is no open IndexWriter to close")

    def add_document(self, contents):
        """
        Adds a Lucene document with the specified contents to the index.
        See LuceneDocument.create_document() for the explanation of contents.
        """
        if self.ldf is None:  # create a single LuceneDocument object that will be reused
            self.ldf = LuceneDocument()
        self.writer.addDocument(self.ldf.create_document(contents))

    def get_lucene_document_id(self, doc_id):
        """Loads a document from a Lucene index based on its id."""
        self.open_searcher()
        query = TermQuery(Term(self.FIELDNAME_ID, doc_id))
        tophit = self.searcher.search(query, 1).scoreDocs
        if len(tophit) == 1:
            return tophit[0].doc
        else:
            return None

    def get_document_id(self, lucene_doc_id):
        """Gets lucene document id and returns the document id."""
        self.open_reader()
        return self.reader.document(lucene_doc_id).get(self.FIELDNAME_ID)

    def print_document(self, lucene_doc_id, term_vect=False):
        """Prints document contents."""
        if lucene_doc_id is None:
            print "Document is not found in the index."
        else:
            doc = self.reader.document(lucene_doc_id)
            print "Document ID (field '" + self.FIELDNAME_ID + "'): " + doc.get(
                self.FIELDNAME_ID)

            # first collect (unique) field names
            fields = []
            for f in doc.getFields():
                if f.name() != self.FIELDNAME_ID and f.name() not in fields:
                    fields.append(f.name())

            for fname in fields:
                print fname
                for fv in doc.getValues(
                        fname):  # printing (possibly multiple) field values
                    print "\t" + fv
                # term vector
                if term_vect:
                    print "-----"
                    termfreqs = self.get_doc_termfreqs(lucene_doc_id, fname)
                    for term in termfreqs:
                        print term + " : " + str(termfreqs[term])
                    print "-----"

    def get_lucene_query(self, query, field=FIELDNAME_CONTENTS):
        """Creates Lucene query from keyword query."""
        query = query.replace("(", "").replace(")", "").replace("!", "")
        return QueryParser(Lucene.get_version(), field,
                           self.get_analyzer()).parse(query)

    def analyze_query(self, query, field=FIELDNAME_CONTENTS):
        """
        Analyses the query and returns query terms.

        :param query: query
        :param field: field name
        :return: list of query terms
        """
        qterms = []  # holds a list of analyzed query terms
        ts = self.get_analyzer().tokenStream(field, query)
        term = ts.addAttribute(CharTermAttribute.class_)
        ts.reset()
        while ts.incrementToken():
            qterms.append(term.toString())
        ts.end()
        ts.close()
        return qterms

    def get_id_lookup_query(self, id, field=None):
        """Creates Lucene query for searching by (external) document id."""
        if field is None:
            field = self.FIELDNAME_ID
        return TermQuery(Term(field, id))

    def get_and_query(self, queries):
        """Creates an AND Boolean query from multiple Lucene queries."""
        # empty boolean query with Similarity.coord() disabled
        bq = BooleanQuery(False)
        for q in queries:
            bq.add(q, BooleanClause.Occur.MUST)
        return bq

    def get_or_query(self, queries):
        """Creates an OR Boolean query from multiple Lucene queries."""
        # empty boolean query with Similarity.coord() disabled
        bq = BooleanQuery(False)
        for q in queries:
            bq.add(q, BooleanClause.Occur.SHOULD)
        return bq

    def get_phrase_query(self, query, field):
        """Creates phrase query for searching exact phrase."""
        phq = PhraseQuery()
        for t in query.split():
            phq.add(Term(field, t))
        return phq

    def get_span_query(self, terms, field, slop, ordered=True):
        """
        Creates near span query

        :param terms: list of terms
        :param field: field name
        :param slop: number of terms between the query terms
        :param ordered: If true, ordered search; otherwise unordered search
        :return: lucene span near query
        """
        span_queries = []
        for term in terms:
            span_queries.append(SpanTermQuery(Term(field, term)))
        span_near_query = SpanNearQuery(span_queries, slop, ordered)
        return span_near_query

    def get_doc_phrase_freq(self, phrase, field, slop, ordered):
        """
        Returns collection frequency for a given phrase and field.

        :param phrase: str
        :param field: field name
        :param slop: number of terms in between
        :param ordered: If true, term occurrences should be ordered
        :return: dictionary {doc: freq, ...}
        """
        # creates span near query
        span_near_query = self.get_span_query(phrase.split(" "),
                                              field,
                                              slop=slop,
                                              ordered=ordered)

        # extracts document frequency
        self.open_searcher()
        index_reader_context = self.searcher.getTopReaderContext()
        term_contexts = HashMap()
        terms = TreeSet()
        span_near_query.extractTerms(terms)
        for term in terms:
            term_contexts.put(term,
                              TermContext.build(index_reader_context, term))
        leaves = index_reader_context.leaves()
        doc_phrase_freq = {}
        # iterates over all atomic readers
        for atomic_reader_context in leaves:
            bits = atomic_reader_context.reader().getLiveDocs()
            spans = span_near_query.getSpans(atomic_reader_context, bits,
                                             term_contexts)
            while spans.next():
                lucene_doc_id = spans.doc()
                doc_id = atomic_reader_context.reader().document(
                    lucene_doc_id).get(self.FIELDNAME_ID)
                if doc_id not in doc_phrase_freq:
                    doc_phrase_freq[doc_id] = 1
                else:
                    doc_phrase_freq[doc_id] += 1
        return doc_phrase_freq

    def get_id_filter(self):
        return FieldValueFilter(self.FIELDNAME_ID)

    def __to_retrieval_results(self, scoredocs, field_id=FIELDNAME_ID):
        """Converts Lucene scoreDocs results to RetrievalResults format."""
        rr = RetrievalResults()
        if scoredocs is not None:
            for i in xrange(len(scoredocs)):
                score = scoredocs[i].score
                lucene_doc_id = scoredocs[i].doc  # internal doc_id
                doc_id = self.reader.document(lucene_doc_id).get(field_id)
                rr.append(doc_id, score, lucene_doc_id)
        return rr

    def score_query(self,
                    query,
                    field_content=FIELDNAME_CONTENTS,
                    field_id=FIELDNAME_ID,
                    num_docs=100):
        """Scores a given query and return results as a RetrievalScores object."""
        lucene_query = self.get_lucene_query(query, field_content)
        scoredocs = self.searcher.search(lucene_query, num_docs).scoreDocs
        return self.__to_retrieval_results(scoredocs, field_id)

    def num_docs(self):
        """Returns number of documents in the index."""
        self.open_reader()
        return self.reader.numDocs()

    def num_fields(self):
        """Returns number of fields in the index."""
        self.open_reader()
        atomic_reader = SlowCompositeReaderWrapper.wrap(self.reader)
        return atomic_reader.getFieldInfos().size()

    def get_fields(self):
        """Returns name of fields in the index."""
        fields = []
        self.open_reader()
        atomic_reader = SlowCompositeReaderWrapper.wrap(self.reader)
        for fieldInfo in atomic_reader.getFieldInfos().iterator():
            fields.append(fieldInfo.name)
        return fields

    def get_doc_termvector(self, lucene_doc_id, field):
        """Outputs the document term vector as a generator."""
        terms = self.reader.getTermVector(lucene_doc_id, field)
        if terms:
            termenum = terms.iterator(None)
            for bytesref in BytesRefIterator.cast_(termenum):
                yield bytesref.utf8ToString(), termenum

    def get_doc_termfreqs(self, lucene_doc_id, field):
        """
        Returns term frequencies for a given document field.

        :param lucene_doc_id: Lucene document ID
        :param field: document field
        :return dict: with terms
        """
        termfreqs = {}
        for term, termenum in self.get_doc_termvector(lucene_doc_id, field):
            termfreqs[term] = int(termenum.totalTermFreq())
        return termfreqs

    def get_doc_termfreqs_all_fields(self, lucene_doc_id):
        """
        Returns term frequency for all fields in the given document.

        :param lucene_doc_id: Lucene document ID
        :return: dictionary {field: {term: freq, ...}, ...}
        """
        doc_termfreqs = {}
        vectors = self.reader.getTermVectors(lucene_doc_id)
        if vectors:
            for field in vectors.iterator():
                doc_termfreqs[field] = {}
                terms = vectors.terms(field)
                if terms:
                    termenum = terms.iterator(None)
                    for bytesref in BytesRefIterator.cast_(termenum):
                        doc_termfreqs[field][bytesref.utf8ToString()] = int(
                            termenum.totalTermFreq())
                    print doc_termfreqs[field]
        return doc_termfreqs

    def get_coll_termvector(self, field):
        """ Returns collection term vector for the given field."""
        self.open_reader()
        fields = MultiFields.getFields(self.reader)
        if fields is not None:
            terms = fields.terms(field)
            if terms:
                termenum = terms.iterator(None)
                for bytesref in BytesRefIterator.cast_(termenum):
                    yield bytesref.utf8ToString(), termenum

    def get_coll_termfreq(self, term, field):
        """ 
        Returns collection term frequency for the given term and field.

        :param term: string
        :param field: string, document field
        :return: int
        """
        self.open_reader()
        return self.reader.totalTermFreq(Term(field, term))

    def get_doc_freq(self, term, field):
        """
        Returns document frequency for the given term and field.

        :param term: string, term
        :param field: string, document field
        :return: int
        """
        self.open_reader()
        return self.reader.docFreq(Term(field, term))

    def get_doc_count(self, field):
        """
        Returns number of documents with at least one term for the given field.

        :param field: string, field name
        :return: int
        """
        self.open_reader()
        return self.reader.getDocCount(field)

    def get_coll_length(self, field):
        """ 
        Returns length of field in the collection.

        :param field: string, field name
        :return: int
        """
        self.open_reader()
        return self.reader.getSumTotalTermFreq(field)

    def get_avg_len(self, field):
        """ 
        Returns average length of a field in the collection.

        :param field: string, field name
        """
        self.open_reader()
        n = self.reader.getDocCount(
            field)  # number of documents with at least one term for this field
        len_all = self.reader.getSumTotalTermFreq(field)
        if n == 0:
            return 0
        else:
            return len_all / float(n)
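
The statistics helpers above (num_docs, get_doc_freq, get_avg_len) are enough to derive classic term weights. A minimal sketch, assuming the wrapper is the Lucene class shown in full further below and that the index has the default "contents" field; the index path is a placeholder:

# Hedged usage sketch; the index path and the queried term are assumptions.
import math

lucene_index = Lucene("/path/to/index")                 # placeholder location
n_docs = lucene_index.num_docs()                        # documents in the index
df = lucene_index.get_doc_freq("lucene", "contents")    # document frequency of a term
idf = math.log((n_docs + 1) / float(df + 1))            # smoothed IDF weight
avdl = lucene_index.get_avg_len("contents")             # average field length (BM25-style norm)
print("N=%d df=%d idf=%.3f avdl=%.2f" % (n_docs, df, idf, avdl))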
Exemplo n.º 39
0
class Index:
    def __init__(self, folder=None, fields=[], similarity="tfidf"):

        self.jcc = lucene.initVM()

        if folder:
            self.directory = SimpleFSDirectory(File(folder))
        else:
            self.directory = RAMDirectory()

        self.fields = {}

        for field in fields:
            ft = FieldType()
            for pname, pvalue in field.props.items():
                setter = getattr(ft, "set" + pname.capitalize())
                setter(pvalue)

            ft.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS)
            # 			ft.setOmitNorms(True)

            self.fields[field.name] = ft

        self.similarity = similarity.lower()
        self.analyzer = StandardAnalyzer(Version.LUCENE_CURRENT)
        self.writer = None
        self.searcher = None

    def attach_thread(self):
        self.jcc.attachCurrentThread()

    def open_writer(self):

        config = IndexWriterConfig(Version.LUCENE_CURRENT, self.analyzer)
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)

        self.writer = IndexWriter(self.directory, config)

    def add(self, **doc):

        if not self.writer:
            self.open_writer()

        d = Document()
        for field, value in doc.items():
            # Fields must have been declared in the constructor;
            # an unknown field name raises a KeyError here.
            d.add(Field(field, value, self.fields[field]))

        self.writer.addDocument(d)

    def commit(self):
        self.writer.commit()

    def close(self):
        if self.writer:
            self.writer.close()

    def open_searcher(self):
        self.reader = DirectoryReader.open(self.directory)
        self.searcher = IndexSearcher(self.reader)
        if (self.similarity == "bm25"):
            self.searcher.setSimilarity(BM25Similarity())

    def preprocess_query(self, query, fields, mode="ANY"):
        '''
		Rewrites the query according to the provided mode. If the mode is not
		supported, the query remains unchanged.
		'''

        terms = query.lower().strip().split()
        if mode == "ANY":
            query = " OR ".join(terms)
        elif mode == "ALL":
            query = " AND ".join(terms)
        else:
            print "Invalid mode parameter '%s'." % mode

        query = QueryParser.escape(query)
        parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields,
                                       self.analyzer)
        query = MultiFieldQueryParser.parse(parser, query)
        return query

    def search(self,
               query,
               search_fields,
               return_fields,
               filter=None,
               ignore=set(),
               mode="ANY",
               return_scores=False,
               limit=1000000):
        '''
		Search documents in the index using a standard analyzer (tokenizes and 
		removes stop words). Supports two search modes: ANY and ALL
		  ANY: include documents that contain at least one term of the query.
		  ALL: include only documents that contain all terms of the query. 
		'''

        if not self.searcher:
            self.open_searcher()

        # Return empty results if query is empty (Lucene can't handle it nicely)
        if query.strip() == '':
            if return_scores:
                return [], []
            else:
                return []

        query = self.preprocess_query(query, search_fields, mode)

        # If limit is not provided, return all matched documents. A little hack is required
        # to do that. We query for one document and read the total number of matched documents.
        #		if not limit :
        #			hits = self.searcher.search(query, 1)
        #			limit = hits.totalHits

        # Fetch more than asked in case we have to remove entries from the ignore set
        if limit is not None:
            limit += len(ignore)

        hits = self.searcher.search(query, filter, limit)
        hits = hits.scoreDocs

        docs = []
        for hit in hits:
            doc = self.searcher.doc(hit.doc)
            if doc['id'] not in ignore:
                docs.append([doc[f] for f in return_fields])

        if return_scores:
            scores = [hit.score for hit in hits]
            return docs[:limit], scores[:limit]

        return docs[:limit]

    def explain(self, query, fields, doc):

        if not self.searcher:
            self.open_searcher()

        query = QueryParser.escape(query)

        parser = MultiFieldQueryParser(Version.LUCENE_CURRENT, fields,
                                       self.analyzer)
        query = MultiFieldQueryParser.parse(parser, query)

        return self.searcher.explain(query, doc)

    def get_documents(self, doc_ids, fields):

        docs = []
        for doc_id in doc_ids:
            doc = self.reader.document(doc_id)
            if isinstance(fields, basestring):
                docs.append(doc.get(fields))
            else:
                docs.append({f: doc.get(f) for f in fields})

        return docs

    def get_query_scores(self, query, fields, doc_ids, mode="ANY"):

        # Creates pre-filter to ignore all other documents
        filter = TermsFilter([Term("id", id) for id in doc_ids])

        query = self.preprocess_query(query, fields, mode)
        hits = self.searcher.search(query, filter, len(doc_ids)).scoreDocs

        # Creates scores' mapping using entity id instead of internal index id
        scores = {
            str(self.reader.document(hit.doc).get("id")): hit.score
            for hit in hits
        }

        # Normalize to 0..1 interval
        #		n = 1.0/sum(scores.values())
        #		scores

        # Adds to the mapping entries for the non-returned docs (no term found)
        for doc_id in doc_ids:
            if doc_id not in scores:
                scores[doc_id] = 0.0

        return scores
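
A minimal usage sketch for the Index class above. The FieldSpec helper is hypothetical; the constructor only expects objects exposing .name and .props, where props maps FieldType setter names (e.g. "indexed", "stored", "tokenized") to values:

# Hedged sketch, not taken from the original project.
import collections

FieldSpec = collections.namedtuple("FieldSpec", ["name", "props"])

fields = [
    FieldSpec("id", {"indexed": True, "stored": True, "tokenized": False}),
    FieldSpec("text", {"indexed": True, "stored": True, "tokenized": True}),
]
index = Index(fields=fields, similarity="bm25")   # no folder -> in-memory RAMDirectory
index.add(id="1", text="pylucene exposes the java lucene api to python")
index.add(id="2", text="bm25 is a probabilistic ranking function")
index.commit()

# ANY mode ORs the query terms; ALL mode ANDs them (see preprocess_query).
docs, scores = index.search("lucene api", ["text"], ["id", "text"],
                            mode="ALL", return_scores=True)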
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        if command == '':
            return
        translate = makeFrenchQuery.makeFrenchQuery(command)
        commande = ""
        for word in translate:
            commande += word
            commande += " "
        print
        print "Searching for:", commande
        query = QueryParser("contents", analyzer).parse(commande)
        scoreDocs = searcher.search(query, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print 'path:', doc.get("path"), 'name:', doc.get("name"), (
                'score: %f' % (scoreDoc.score))


if __name__ == '__main__':
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', lucene.VERSION
    directory = SimpleFSDirectory(Paths.get(os.getcwd(), INDEX_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    searcher.setSimilarity(ClassicSimilarity())
    analyzer = FrenchAnalyzer()
    run(searcher, analyzer)
    del searcher
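
The fragment above pins the searcher to ClassicSimilarity (plain TF-IDF). A minimal, self-contained sketch of the same setup with a switchable ranking model; the index path is a placeholder, not the example's INDEX_DIR:

# Hedged sketch; run in its own process since it initializes the JVM itself.
import lucene
from java.nio.file import Paths
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.search.similarities import BM25Similarity, ClassicSimilarity

lucene.initVM(vmargs=['-Djava.awt.headless=true'])
directory = SimpleFSDirectory(Paths.get("/path/to/index"))
searcher = IndexSearcher(DirectoryReader.open(directory))
use_bm25 = True
# ClassicSimilarity reproduces the traditional TF-IDF ranking used above;
# BM25Similarity is Lucene's default in recent versions.
searcher.setSimilarity(BM25Similarity() if use_bm25 else ClassicSimilarity())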
class Lucene(object):

    # default fieldnames for id and contents
    FIELDNAME_ID = "id"
    FIELDNAME_CONTENTS = "contents"

    # internal fieldtypes
    # used as Enum, the actual values don't matter
    FIELDTYPE_ID = "id"
    FIELDTYPE_ID_TV = "id_tv"
    FIELDTYPE_TEXT = "text"
    FIELDTYPE_TEXT_TV = "text_tv"
    FIELDTYPE_TEXT_TVP = "text_tvp"
    FIELDTYPE_TEXT_NTV = "text_ntv"
    FIELDTYPE_TEXT_NTVP = "text_ntvp"

    def __init__(self, index_dir, max_shingle_size=None):
        global lucene_vm_init

        if not lucene_vm_init:
            lucene.initVM(vmargs=['-Djava.awt.headless=true'])
            lucene_vm_init = True
        self.dir = SimpleFSDirectory(File(index_dir))
        self.max_shingle_size = max_shingle_size
        self.analyzer = None
        self.reader = None
        self.searcher = None
        self.writer = None
        self.ldf = None

    @staticmethod
    def get_version():
        """Get Lucene version."""
        return Version.LUCENE_48

    @staticmethod
    def preprocess(text):
        """Tokenize and stop the input text."""
        ts = StandardTokenizer(Lucene.get_version(), StringReader(text.lower()))
        ts = StopFilter(Lucene.get_version(), ts,  StopAnalyzer.ENGLISH_STOP_WORDS_SET)
        string_builder = StringBuilder()
        ts.reset()
        char_term_attr = ts.addAttribute(CharTermAttribute.class_)
        while ts.incrementToken():
            if string_builder.length() > 0:
                string_builder.append(" ")
            string_builder.append(char_term_attr.toString())
        return string_builder.toString()

    def get_analyzer(self):
        """Get analyzer."""
        if self.analyzer is None:
            std_analyzer = StandardAnalyzer(Lucene.get_version())
            if self.max_shingle_size is None:
                self.analyzer = std_analyzer
            else:
                self.analyzer = ShingleAnalyzerWrapper(std_analyzer, self.max_shingle_size)
        return self.analyzer

    def open_reader(self):
        """Open IndexReader."""
        if self.reader is None:
            self.reader = DirectoryReader.open(self.dir)

    def get_reader(self):
        return self.reader

    def close_reader(self):
        """Close IndexReader."""
        if self.reader is not None:
            self.reader.close()
            self.reader = None
        else:
            raise Exception("There is no open IndexReader to close")

    def open_searcher(self):
        """
        Open IndexSearcher. Automatically opens an IndexReader too,
        if it is not already open. There is no close method for the
        searcher.
        """
        if self.searcher is None:
            self.open_reader()
            self.searcher = IndexSearcher(self.reader)

    def get_searcher(self):
        """Returns index searcher (opens it if needed)."""
        self.open_searcher()
        return self.searcher

    def set_lm_similarity_jm(self, method="jm", smoothing_param=0.1):
        """
        Set searcher to use LM similarity.

        :param method: LM similarity ("jm" or "dirichlet")
        :param smoothing_param: smoothing parameter (lambda or mu)
        """
        if method == "jm":
            similarity = LMJelinekMercerSimilarity(smoothing_param)
        elif method == "dirichlet":
            similarity = LMDirichletSimilarity(smoothing_param)
        else:
            raise Exception("Unknown method")

        if self.searcher is None:
            raise Exception("Searcher has not been created")
        self.searcher.setSimilarity(similarity)

    def open_writer(self):
        """Open IndexWriter."""
        if self.writer is None:
            config = IndexWriterConfig(Lucene.get_version(), self.get_analyzer())
            config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
            self.writer = IndexWriter(self.dir, config)
        else:
            raise Exception("IndexWriter is already open")

    def close_writer(self):
        """Close IndexWriter."""
        if self.writer is not None:
            self.writer.close()
            self.writer = None
        else:
            raise Exception("There is no open IndexWriter to close")

    def add_document(self, contents):
        """
        Adds a Lucene document with the specified contents to the index.
        See LuceneDocument.create_document() for the explanation of contents.
        """
        if self.ldf is None:  # create a single LuceneDocument object that will be reused
            self.ldf = LuceneDocument()
        self.writer.addDocument(self.ldf.create_document(contents))

    def get_lucene_document_id(self, doc_id):
        """Loads a document from a Lucene index based on its id."""
        self.open_searcher()
        query = TermQuery(Term(self.FIELDNAME_ID, doc_id))
        tophit = self.searcher.search(query, 1).scoreDocs
        if len(tophit) == 1:
            return tophit[0].doc
        else:
            return None

    def get_document_id(self, lucene_doc_id):
        """Gets lucene document id and returns the document id."""
        self.open_reader()
        return self.reader.document(lucene_doc_id).get(self.FIELDNAME_ID)

    def print_document(self, lucene_doc_id, term_vect=False):
        """Prints document contents."""
        if lucene_doc_id is None:
            print "Document is not found in the index."
        else:
            doc = self.reader.document(lucene_doc_id)
            print "Document ID (field '" + self.FIELDNAME_ID + "'): " + doc.get(self.FIELDNAME_ID)

            # first collect (unique) field names
            fields = []
            for f in doc.getFields():
                if f.name() != self.FIELDNAME_ID and f.name() not in fields:
                    fields.append(f.name())

            for fname in fields:
                print fname
                for fv in doc.getValues(fname):  # printing (possibly multiple) field values
                    print "\t" + fv
                # term vector
                if term_vect:
                    print "-----"
                    termfreqs = self.get_doc_termfreqs(lucene_doc_id, fname)
                    for term in termfreqs:
                        print term + " : " + str(termfreqs[term])
                    print "-----"

    def get_lucene_query(self, query, field=FIELDNAME_CONTENTS):
        """Creates Lucene query from keyword query."""
        query = query.replace("(", "").replace(")", "").replace("!", "")
        return QueryParser(Lucene.get_version(), field,
                           self.get_analyzer()).parse(query)

    def analyze_query(self, query, field=FIELDNAME_CONTENTS):
        """
        Analyses the query and returns query terms.

        :param query: query
        :param field: field name
        :return: list of query terms
        """
        qterms = []  # holds a list of analyzed query terms
        ts = self.get_analyzer().tokenStream(field, query)
        term = ts.addAttribute(CharTermAttribute.class_)
        ts.reset()
        while ts.incrementToken():
            qterms.append(term.toString())
        ts.end()
        ts.close()
        return qterms

    def get_id_lookup_query(self, id, field=None):
        """Creates Lucene query for searching by (external) document id."""
        if field is None:
            field = self.FIELDNAME_ID
        return TermQuery(Term(field, id))

    def get_and_query(self, queries):
        """Creates an AND Boolean query from multiple Lucene queries."""
        # empty boolean query with Similarity.coord() disabled
        bq = BooleanQuery(False)
        for q in queries:
            bq.add(q, BooleanClause.Occur.MUST)
        return bq

    def get_or_query(self, queries):
        """Creates an OR Boolean query from multiple Lucene queries."""
        # empty boolean query with Similarity.coord() disabled
        bq = BooleanQuery(False)
        for q in queries:
            bq.add(q, BooleanClause.Occur.SHOULD)
        return bq

    def get_phrase_query(self, query, field):
        """Creates phrase query for searching exact phrase."""
        phq = PhraseQuery()
        for t in query.split():
            phq.add(Term(field, t))
        return phq

    def get_span_query(self, terms, field, slop, ordered=True):
        """
        Creates near span query

        :param terms: list of terms
        :param field: field name
        :param slop: number of terms between the query terms
        :param ordered: If true, ordered search; otherwise unordered search
        :return: lucene span near query
        """
        span_queries = []
        for term in terms:
            span_queries.append(SpanTermQuery(Term(field, term)))
        span_near_query = SpanNearQuery(span_queries, slop, ordered)
        return span_near_query

    def get_doc_phrase_freq(self, phrase, field, slop, ordered):
        """
        Returns per-document frequency of a given phrase in the given field.

        :param phrase: str
        :param field: field name
        :param slop: number of terms in between
        :param ordered: If true, term occurrences should be ordered
        :return: dictionary {doc: freq, ...}
        """
        # creates span near query
        span_near_query = self.get_span_query(phrase.split(" "), field, slop=slop, ordered=ordered)

        # extracts document frequency
        self.open_searcher()
        index_reader_context = self.searcher.getTopReaderContext()
        term_contexts = HashMap()
        terms = TreeSet()
        span_near_query.extractTerms(terms)
        for term in terms:
            term_contexts.put(term, TermContext.build(index_reader_context, term))
        leaves = index_reader_context.leaves()
        doc_phrase_freq = {}
        # iterates over all atomic readers
        for atomic_reader_context in leaves:
            bits = atomic_reader_context.reader().getLiveDocs()
            spans = span_near_query.getSpans(atomic_reader_context, bits, term_contexts)
            while spans.next():
                lucene_doc_id = spans.doc()
                doc_id = atomic_reader_context.reader().document(lucene_doc_id).get(self.FIELDNAME_ID)
                if doc_id not in doc_phrase_freq:
                    doc_phrase_freq[doc_id] = 1
                else:
                    doc_phrase_freq[doc_id] += 1
        return doc_phrase_freq

    def get_id_filter(self):
        return FieldValueFilter(self.FIELDNAME_ID)

    def __to_retrieval_results(self, scoredocs, field_id=FIELDNAME_ID):
        """Converts Lucene scoreDocs results to RetrievalResults format."""
        rr = RetrievalResults()
        if scoredocs is not None:
            for i in xrange(len(scoredocs)):
                score = scoredocs[i].score
                lucene_doc_id = scoredocs[i].doc  # internal doc_id
                doc_id = self.reader.document(lucene_doc_id).get(field_id)
                rr.append(doc_id, score, lucene_doc_id)
        return rr

    def score_query(self, query, field_content=FIELDNAME_CONTENTS, field_id=FIELDNAME_ID, num_docs=100):
        """Scores a given query and return results as a RetrievalScores object."""
        lucene_query = self.get_lucene_query(query, field_content)
        scoredocs = self.searcher.search(lucene_query, num_docs).scoreDocs
        return self.__to_retrieval_results(scoredocs, field_id)

    def num_docs(self):
        """Returns number of documents in the index."""
        self.open_reader()
        return self.reader.numDocs()

    def num_fields(self):
        """Returns number of fields in the index."""
        self.open_reader()
        atomic_reader = SlowCompositeReaderWrapper.wrap(self.reader)
        return atomic_reader.getFieldInfos().size()

    def get_fields(self):
        """Returns name of fields in the index."""
        fields = []
        self.open_reader()
        atomic_reader = SlowCompositeReaderWrapper.wrap(self.reader)
        for fieldInfo in atomic_reader.getFieldInfos().iterator():
            fields.append(fieldInfo.name)
        return fields

    def get_doc_termvector(self, lucene_doc_id, field):
        """Outputs the document term vector as a generator."""
        terms = self.reader.getTermVector(lucene_doc_id, field)
        if terms:
            termenum = terms.iterator(None)
            for bytesref in BytesRefIterator.cast_(termenum):
                yield bytesref.utf8ToString(), termenum

    def get_doc_termfreqs(self, lucene_doc_id, field):
        """
        Returns term frequencies for a given document field.

        :param lucene_doc_id: Lucene document ID
        :param field: document field
        :return: dict of term frequencies {term: freq, ...}
        """
        termfreqs = {}
        for term, termenum in self.get_doc_termvector(lucene_doc_id, field):
            termfreqs[term] = int(termenum.totalTermFreq())
        return termfreqs

    def get_doc_termfreqs_all_fields(self, lucene_doc_id):
        """
        Returns term frequency for all fields in the given document.

        :param lucene_doc_id: Lucene document ID
        :return: dictionary {field: {term: freq, ...}, ...}
        """
        doc_termfreqs = {}
        vectors = self.reader.getTermVectors(lucene_doc_id)
        if vectors:
            for field in vectors.iterator():
                doc_termfreqs[field] = {}
                terms = vectors.terms(field)
                if terms:
                    termenum = terms.iterator(None)
                    for bytesref in BytesRefIterator.cast_(termenum):
                        doc_termfreqs[field][bytesref.utf8ToString()] = int(termenum.totalTermFreq())
                    print doc_termfreqs[field]
        return doc_termfreqs

    def get_coll_termvector(self, field):
        """ Returns collection term vector for the given field."""
        self.open_reader()
        fields = MultiFields.getFields(self.reader)
        if fields is not None:
            terms = fields.terms(field)
            if terms:
                termenum = terms.iterator(None)
                for bytesref in BytesRefIterator.cast_(termenum):
                    yield bytesref.utf8ToString(), termenum

    def get_coll_termfreq(self, term, field):
        """ 
        Returns collection term frequency for the given term and field.

        :param term: string
        :param field: string, document field
        :return: int
        """
        self.open_reader()
        return self.reader.totalTermFreq(Term(field, term))

    def get_doc_freq(self, term, field):
        """
        Returns document frequency for the given term and field.

        :param term: string, term
        :param field: string, document field
        :return: int
        """
        self.open_reader()
        return self.reader.docFreq(Term(field, term))

    def get_doc_count(self, field):
        """
        Returns number of documents with at least one term for the given field.

        :param field: string, field name
        :return: int
        """
        self.open_reader()
        return self.reader.getDocCount(field)

    def get_coll_length(self, field):
        """ 
        Returns length of field in the collection.

        :param field: string, field name
        :return: int
        """
        self.open_reader()
        return self.reader.getSumTotalTermFreq(field)

    def get_avg_len(self, field):
        """ 
        Returns average length of a field in the collection.

        :param field: string, field name
        """
        self.open_reader()
        n = self.reader.getDocCount(field)  # number of documents with at least one term for this field
        len_all = self.reader.getSumTotalTermFreq(field)
        if n == 0:
            return 0
        else:
            return len_all / float(n)
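
A minimal retrieval sketch for the wrapper class above, assuming an already-built Lucene 4.8 index with the default "id" and "contents" fields; the path is a placeholder:

# Hedged usage of the Lucene wrapper defined above.
lucene_wrapper = Lucene("/path/to/index")               # placeholder index directory
lucene_wrapper.open_searcher()                          # also opens the IndexReader
lucene_wrapper.set_lm_similarity_jm("dirichlet", 2000)  # LM ranking with mu=2000
results = lucene_wrapper.score_query("information retrieval", num_docs=10)
# score_query returns a RetrievalResults object built from the top scoreDocs.
lucene_wrapper.close_reader()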
Exemplo n.º 42
0

def run(searcher, analyzer):
    while True:
        print
        print "Hit enter with no input to quit."
        command = raw_input("Query:")
        if command == '':
            return

        print
        print "Searching for:", command
        query = QueryParser("contents", analyzer).parse(command)
        scoreDocs = searcher.search(query, 50).scoreDocs
        print "%s total matching documents." % len(scoreDocs)

        for scoreDoc in scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            print 'path:', doc.get("path"), 'name:', doc.get("name")


if __name__ == '__main__':
    STORE_DIR = "/usr/src/pylucene/aclImdb/index"
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    print 'lucene', lucene.VERSION
    directory = SimpleFSDirectory(Paths.get(STORE_DIR))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    analyzer = StandardAnalyzer()
    searcher.setSimilarity(BM25Similarity())
    run(searcher, analyzer)
    del searcher
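
For reference, a hedged batch variant of the interactive run() loop above; it reuses the same searcher and analyzer, and the query list and top_k parameter are assumptions:

def run_batch(searcher, analyzer, queries, top_k=50):
    # Non-interactive counterpart of run(): maps each query string to a
    # list of (path, name, score) tuples for its top_k BM25 hits.
    results = {}
    for command in queries:
        query = QueryParser("contents", analyzer).parse(QueryParser.escape(command))
        for scoreDoc in searcher.search(query, top_k).scoreDocs:
            doc = searcher.doc(scoreDoc.doc)
            results.setdefault(command, []).append(
                (doc.get("path"), doc.get("name"), scoreDoc.score))
    return results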
Exemplo n.º 43
0
def main():
    global lucene_vm_init
    if not lucene_vm_init:
        lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        lucene_vm_init = True

    is_index_Exist = os.path.exists(LUCENE_INDEX_DIR)
    # specify index path
    index_mm = MMapDirectory(Paths.get(LUCENE_INDEX_DIR))

    # configure search engine
    analyzer = StandardAnalyzer()
    config = IndexWriterConfig(analyzer)

    # load index to search engine
    reader = DirectoryReader.open(index_mm)
    searcher = IndexSearcher(reader)
    searcher.setSimilarity(BM25Similarity())

    # read query
    read_query()

    # initialize mongodb client
    mongoObj = Mongo_Object('localhost', 27017)

    # initialize word2vec
    print 'load word2vec model'
    w2vmodel = gensim.models.Word2Vec.load_word2vec_format(
        "F:\\modified_w2v\\w2v_wiki_trigram_phrase_20170101\\wiki.en.text.vector.binary",
        binary=True)
    print 'finish loading word2vec model'

    # search
    global hitsPerPage
    fields = ['name', 'value']
    #parser=MultiFieldQueryParser(fields,analyzer)
    #parser.setDefaultOperator(QueryParserBase.AND_OPERATOR)
    rec_result = open('pylucene.runs', 'w')

    for i in range(len(queries)):
        query = queries[i]
        print 'processing query ' + str(i) + ':' + query[0]
        querystr = remove_duplicate(stemSentence(query[1]))
        #q_lucene=MultiFieldQueryParser.parse(parser,querystr)
        q_lucene = QueryParser("all_text", analyzer).parse(querystr)
        print "q_lucene: " + q_lucene.toString()
        collector = TopScoreDocCollector.create(hitsPerPage)
        searcher.search(q_lucene, collector)
        hits = collector.topDocs().scoreDocs

        # build query object for computeScore
        #queryObj=Query_Object(query,mongoObj,w2vmodel)

        # initialize duplicate remover
        docDup = set()

        # find candidate results after 1st round filter
        candidates = PriorityQueue()
        for j in xrange(len(hits)):
            docID = hits[j].doc
            d = searcher.doc(docID)
            name = cleanSentence(d['title'].strip())
            if name in docDup:
                continue
            docDup.add(name)
            # build entity object
            entityObj = Entity_Object(d, mongoObj, w2vmodel)
            #score = computeScore(queryObj,entityObj,mongoObj,w2vmodel)
            score = hits[j].score
            candidates.put((-score, j))

        # output results from priority queue larger score first
        rank = 0
        while not candidates.empty() and rank < 100:
            rank = rank + 1
            item = candidates.get()
            score = -item[0]
            j = item[1]  # index of hits[]
            docID = hits[j].doc
            d = searcher.doc(docID)
            title = '<dbpedia:' + d.get('title') + '>'
            res_line = query[0] + '\t' + 'Q0' + '\t' + title + '\t' + str(
                rank) + '\t' + str(score) + '\t' + 'pylucene_multifield'
            rec_result.writelines(res_line + '\n')
    rec_result.close()
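
The dedup-and-rerank step in main() can be read as the following standalone helper (a sketch: it keeps the 'title' field and the negated-score priority-queue trick, but drops the project-specific cleanSentence/entity scoring):

import heapq

def rerank_hits(searcher, hits, max_rank=100):
    # Deduplicate hits by title and return (rank, title, score) tuples,
    # highest score first, mirroring the PriorityQueue logic above.
    seen = set()
    heap = []
    for j, hit in enumerate(hits):
        title = searcher.doc(hit.doc).get('title').strip()
        if title in seen:
            continue
        seen.add(title)
        heapq.heappush(heap, (-hit.score, j, title))
    ranked = []
    while heap and len(ranked) < max_rank:
        neg_score, _, title = heapq.heappop(heap)
        ranked.append((len(ranked) + 1, title, -neg_score))
    return ranked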