def query(self, data):
    if self.fil.exists():
        searcher = IndexSearcher(DirectoryReader.open(self.d))
        query = QueryParser(Version.LUCENE_30, "id", self.analyzer).parse(data['query'])
        hits = searcher.search(query, 100000)

        results = {}
        results['totalHits'] = hits.totalHits
        results['hits'] = {}

        for hit in hits.scoreDocs:
            record = {}
            doc = searcher.doc(hit.doc)
            fields = doc.getFields()
            record['score'] = hit.score
            for field in fields:
                if field.name() != "id":
                    record[field.name()] = field.stringValue()
            results['hits'][doc.get('id')] = record

        searcher.getIndexReader().close()
        return results

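# A minimal, self-contained sketch of the same open-search-collect pattern as a
# standalone function, using the current QueryParser(field, analyzer) constructor
# rather than the Lucene 3.0 Version-based one above. The index path, default
# field, and the query string in the __main__ block are assumptions for
# illustration only; lucene.initVM() must be called before touching the index.
import lucene
from java.nio.file import Paths
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.store import SimpleFSDirectory


def run_query(index_dir, query_string, max_hits=100):
    """Hedged example: search index_dir and return the stored fields of each hit."""
    reader = DirectoryReader.open(SimpleFSDirectory(Paths.get(index_dir)))
    searcher = IndexSearcher(reader)
    query = QueryParser("id", StandardAnalyzer()).parse(query_string)
    hits = searcher.search(query, max_hits)

    results = {'totalHits': hits.totalHits, 'hits': {}}
    for hit in hits.scoreDocs:
        doc = searcher.doc(hit.doc)
        record = {'score': hit.score}
        for field in doc.getFields():
            if field.name() != "id":
                record[field.name()] = field.stringValue()
        results['hits'][doc.get('id')] = record

    reader.close()
    return results


if __name__ == '__main__':
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    # hypothetical index path and field name, for illustration only
    print(run_query('/tmp/example_index', 'body:lucene'))
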
def main(args):
    global verbose
    verbose = args.verbose

    if verbose:
        logger.info(f'Read {args.dir_index}')
    directory = SimpleFSDirectory.open(Paths.get(args.dir_index))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    reader = searcher.getIndexReader()

    if verbose:
        logger.info(f'Write to {args.path_output}')

    with open(args.path_output, 'w') as f:
        for idx in trange(reader.maxDoc()):
            doc = reader.document(idx)

            babelnet_id = doc.get('ID')
            synset_id = doc.get('SYNSET_ID')
            pos = doc.get('POS')
            synset_type = doc.get('TYPE')
            main_sense = doc.get('MAIN_SENSE')
            categories = list(doc.getValues('CATEGORY'))
            translation_mappings = list(doc.getValues('TRANSLATION_MAPPING'))
            images = list(doc.getValues('IMAGE'))

            lemmas = doc.getValues('LEMMA')
            forms = []
            for i in range(len(lemmas)):
                forms.append({
                    'lemma': lemmas[i],
                    'source': doc.getValues('LEMMA_SOURCE')[i],
                    'lang': doc.getValues('LEMMA_LANGUAGE')[i],
                    'weight': doc.getValues('LEMMA_WEIGHT')[i],
                    'sense_key': doc.getValues('LEMMA_SENSEKEY')[i],
                })

            entry = {
                'id': babelnet_id,
                'synset': synset_id,
                'pos': pos,
                'type': synset_type,
                'main_sense': main_sense,
                'categories': categories,
                'translation_mappings': translation_mappings,
                'images': images,
                'forms': forms,
            }
            f.write(json.dumps(entry, ensure_ascii=False) + '\n')

    return 0

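# Hedged sketch of a command-line entry point that could drive main() above. The
# argument names mirror the attributes main() reads (dir_index, path_output,
# verbose); the help strings and the argparse wiring itself are assumptions.
import argparse
import sys


def parse_args():
    parser = argparse.ArgumentParser(
        description='Dump a BabelNet Lucene index to a JSON-lines file.')
    parser.add_argument('dir_index', help='path to the Lucene index directory')
    parser.add_argument('path_output', help='path of the output .jsonl file')
    parser.add_argument('-v', '--verbose', action='store_true')
    return parser.parse_args()


if __name__ == '__main__':
    # lucene.initVM() must have been called before main() opens the index.
    sys.exit(main(parse_args()))
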
def get_tf_idf(self, field_name: str, content_id: str):
    """
    Calculates the tf-idf of the words contained in the given field of the content
    whose id is content_id.

    Args:
        field_name (str): Name of the field containing the words for which to
            calculate the tf-idf
        content_id (str): Id of the content that contains the specified field

    Returns:
        words_bag (Dict[str, float]): Dictionary whose keys are the words contained
            in the field and whose values are the corresponding tf-idf scores.
    """
    searcher = IndexSearcher(
        DirectoryReader.open(SimpleFSDirectory(Paths.get(self.directory))))
    query = QueryParser("testo_libero", KeywordAnalyzer()).parse(
        "content_id:\"" + content_id + "\"")
    score_docs = searcher.search(query, 1).scoreDocs

    document_offset = -1
    for score_doc in score_docs:
        document_offset = score_doc.doc

    reader = searcher.getIndexReader()
    words_bag = {}
    term_vector = reader.getTermVector(document_offset, field_name)
    term_enum = term_vector.iterator()
    for term in BytesRefIterator.cast_(term_enum):
        term_text = term.utf8ToString()
        postings = term_enum.postings(None)
        postings.nextDoc()
        term_frequency = 1 + math.log10(postings.freq())  # log-scaled term frequency
        inverse_document_frequency = math.log10(
            reader.maxDoc() / reader.docFreq(Term(field_name, term)))
        tf_idf = term_frequency * inverse_document_frequency
        words_bag[term_text] = tf_idf

    reader.close()
    return words_bag

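# The weighting above is (1 + log10(tf)) * log10(N / df). A small pure-Python
# sketch of the same scheme on toy counts, useful as a sanity check; the counts
# in the __main__ block are made up for illustration.
import math


def tf_idf(term_freq, doc_freq, n_docs):
    """Log-scaled term frequency times inverse document frequency."""
    return (1 + math.log10(term_freq)) * math.log10(n_docs / doc_freq)


if __name__ == '__main__':
    # e.g. a term occurring 3 times in this document and in 10 of 1000 documents
    print(round(tf_idf(3, 10, 1000), 4))  # ~2.9542
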
class LuceneSearch():

    def __init__(self):
        self.env = lucene.initVM(initialheap='28g', maxheap='28g',
                                 vmargs=['-Djava.awt.headless=true'])
        self.vocab = None

        BooleanQuery.setMaxClauseCount(2048)

        if not os.path.exists(prm.index_folder):
            print 'Creating index at', prm.index_folder
            if prm.docs_path == prm.docs_path_term:
                add_terms = True
            else:
                add_terms = False
            self.create_index(prm.index_folder, prm.docs_path, add_terms)

        if prm.local_index_folder:
            print 'copying index from', prm.index_folder, 'to', prm.local_index_folder
            if os.path.exists(prm.local_index_folder):
                print 'Folder', prm.local_index_folder, 'already exists! Doing nothing.'
            else:
                shutil.copytree(prm.index_folder, prm.local_index_folder)
            self.index_folder = prm.local_index_folder
        else:
            self.index_folder = prm.index_folder

        fsDir = MMapDirectory(Paths.get(prm.index_folder))
        self.searcher = IndexSearcher(DirectoryReader.open(fsDir))

        if prm.docs_path != prm.docs_path_term:
            if not os.path.exists(prm.index_folder_term):
                print 'Creating index at', prm.index_folder_term
                self.create_index(prm.index_folder_term, prm.docs_path_term, add_terms=True)

            if prm.local_index_folder_term:
                print 'copying index from', prm.index_folder_term, 'to', prm.local_index_folder_term
                if os.path.exists(prm.local_index_folder_term):
                    print 'Folder', prm.local_index_folder_term, 'already exists! Doing nothing.'
                else:
                    shutil.copytree(prm.index_folder_term, prm.local_index_folder_term)
                self.index_folder_term = prm.local_index_folder_term
            else:
                self.index_folder_term = prm.index_folder_term

            fsDir_term = MMapDirectory(Paths.get(prm.index_folder_term))
            self.searcher_term = IndexSearcher(DirectoryReader.open(fsDir_term))

        self.analyzer = StandardAnalyzer()
        self.pool = ThreadPool(processes=prm.n_threads)
        self.cache = {}

        print 'Loading Title-ID mapping...'
        self.title_id_map, self.id_title_map = self.get_title_id_map()

    def get_title_id_map(self):
        # get number of docs
        n_docs = self.searcher.getIndexReader().numDocs()

        title_id = {}
        id_title = {}
        query = MatchAllDocsQuery()
        hits = self.searcher.search(query, n_docs)
        for hit in hits.scoreDocs:
            doc = self.searcher.doc(hit.doc)
            idd = int(doc['id'])
            title = doc['title']
            title_id[title] = idd
            id_title[idd] = title

        return title_id, id_title

    def add_doc(self, doc_id, title, txt, add_terms):
        doc = Document()
        txt = utils.clean(txt)

        if add_terms:
            txt_ = txt.lower()
            words_idx, words = utils.text2idx2([txt_], self.vocab, prm.max_terms_per_doc)
            words_idx = words_idx[0]
            words = words[0]

        doc.add(Field("id", str(doc_id), self.t1))
        doc.add(Field("title", title, self.t1))
        doc.add(Field("text", txt, self.t2))
        if add_terms:
            doc.add(Field("word_idx", ' '.join(map(str, words_idx)), self.t3))
            doc.add(Field("word", '<&>'.join(words), self.t3))

        self.writer.addDocument(doc)

    def create_index(self, index_folder, docs_path, add_terms=False):
        print 'Loading Vocab...'
        if not self.vocab:
            self.vocab = utils.load_vocab(prm.vocab_path, prm.n_words)

        os.mkdir(index_folder)

        self.t1 = FieldType()
        self.t1.setStored(True)
        self.t1.setIndexOptions(IndexOptions.DOCS)

        self.t2 = FieldType()
        self.t2.setStored(False)
        self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        self.t3 = FieldType()
        self.t3.setStored(True)
        self.t3.setIndexOptions(IndexOptions.NONE)

        fsDir = MMapDirectory(Paths.get(index_folder))
        writerConfig = IndexWriterConfig(StandardAnalyzer())
        self.writer = IndexWriter(fsDir, writerConfig)
        print "%d docs in index" % self.writer.numDocs()
        print "Indexing documents..."

        doc_id = 0
        import corpus_hdf5
        corpus = corpus_hdf5.CorpusHDF5(docs_path)
        for txt in corpus.get_text_iter():
            title = corpus.get_article_title(doc_id)
            self.add_doc(doc_id, title, txt, add_terms)
            if doc_id % 1000 == 0:
                print 'indexing doc', doc_id
            doc_id += 1

        print "Index of %d docs..." % self.writer.numDocs()
        self.writer.close()

    def search_multithread(self, qs, max_cand, max_full_cand, searcher):
        self.max_cand = max_cand
        self.max_full_cand = max_full_cand
        self.curr_searcher = searcher
        out = self.pool.map(self.search_multithread_part, qs)
        return out

    def search_multithread_part(self, q):
        if not self.env.isCurrentThreadAttached():
            self.env.attachCurrentThread()

        if q in self.cache:
            return self.cache[q]
        else:
            try:
                q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
                query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
            except:
                print 'Unexpected error when processing query:', str(q)
                print 'Using query "dummy".'
                q = 'dummy'
                query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))

            c = OrderedDict()
            hits = self.curr_searcher.search(query, self.max_cand)

            for i, hit in enumerate(hits.scoreDocs):
                doc = self.curr_searcher.doc(hit.doc)
                if i < self.max_full_cand:
                    word_idx = map(int, doc['word_idx'].split(' '))
                    word = doc['word'].split('<&>')
                else:
                    word_idx = []
                    word = []
                c[int(doc['id'])] = [word_idx, word]

            return c

    def search_singlethread(self, qs, max_cand, max_full_cand, curr_searcher):
        out = []
        for q in qs:
            if q in self.cache:
                out.append(self.cache[q])
            else:
                try:
                    q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
                    query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
                except:
                    print 'Unexpected error when processing query:', str(q)
                    print 'Using query "dummy".'
                    query = QueryParser("text", self.analyzer).parse(QueryParser.escape('dummy'))

                c = OrderedDict()
                hits = curr_searcher.search(query, max_cand)

                for i, hit in enumerate(hits.scoreDocs):
                    doc = curr_searcher.doc(hit.doc)
                    if i < max_full_cand:
                        word_idx = map(int, doc['word_idx'].split(' '))
                        word = doc['word'].split('<&>')
                    else:
                        word_idx = []
                        word = []
                    c[int(doc['id'])] = [word_idx, word]

                out.append(c)

        return out

    def get_candidates(self, qs, max_cand, max_full_cand=None, save_cache=False, extra_terms=True):
        if not max_full_cand:
            max_full_cand = max_cand

        if prm.docs_path != prm.docs_path_term:
            max_cand2 = 0
        else:
            max_cand2 = max_full_cand

        if prm.n_threads > 1:
            out = self.search_multithread(qs, max_cand, max_cand2, self.searcher)
            if (prm.docs_path != prm.docs_path_term) and extra_terms:
                terms = self.search_multithread(qs, max_full_cand, max_full_cand, self.searcher_term)
        else:
            out = self.search_singlethread(qs, max_cand, max_cand2, self.searcher)
            if (prm.docs_path != prm.docs_path_term) and extra_terms:
                terms = self.search_singlethread(qs, max_full_cand, max_full_cand, self.searcher_term)

        if (prm.docs_path != prm.docs_path_term) and extra_terms:
            for outt, termss in itertools.izip(out, terms):
                for cand_id, term in itertools.izip(outt.keys()[:max_full_cand], termss.values()):
                    outt[cand_id] = term

        if save_cache:
            for q, c in itertools.izip(qs, out):
                if q not in self.cache:
                    self.cache[q] = c

        return out

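# Hedged usage sketch for the class above. It assumes a `prm` config module that
# defines index_folder, docs_path, vocab_path, n_threads, etc., exactly as the
# class expects; the query strings are made up for illustration.
if __name__ == '__main__':
    searcher = LuceneSearch()  # initializes the JVM and opens (or builds) the index
    queries = ['neural networks', 'information retrieval']
    # Up to 10 candidate doc ids per query; word/word_idx lists are filled only
    # for the top max_full_cand hits of each query.
    candidates = searcher.get_candidates(queries, max_cand=10, max_full_cand=3)
    for q, cands in zip(queries, candidates):
        print('%s -> %s' % (q, list(cands.keys())))
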
class QuestionLuceneSearch():

    def __init__(self):
        self.env = lucene.initVM(initialheap='6g', maxheap='6g',
                                 vmargs=['-Djava.awt.headless=true'])
        self.vocab = None

        BooleanQuery.setMaxClauseCount(2048)

        if not os.path.exists(prm.index_folder):
            print('Creating index at', prm.index_folder)
            if prm.docs_path == prm.docs_path_term:
                add_terms = True
            else:
                add_terms = False
            self.create_index(prm.index_folder, prm.docs_path, add_terms)

        if prm.local_index_folder:
            print('copying index from', prm.index_folder, 'to', prm.local_index_folder)
            if os.path.exists(prm.local_index_folder):
                print('Folder', prm.local_index_folder, 'already exists! Doing nothing.')
            else:
                shutil.copytree(prm.index_folder, prm.local_index_folder)
            self.index_folder = prm.local_index_folder
        else:
            self.index_folder = prm.index_folder

        fsDir = MMapDirectory(Paths.get(prm.index_folder))
        self.searcher = IndexSearcher(DirectoryReader.open(fsDir))
        self.searcher.setSimilarity(BM25Similarity())

        if prm.docs_path != prm.docs_path_term:
            if not os.path.exists(prm.index_folder_term):
                print('Creating index at', prm.index_folder_term)
                self.create_index(prm.index_folder_term, prm.docs_path_term, add_terms=True)

            if prm.local_index_folder_term:
                print('copying index from', prm.index_folder_term, 'to', prm.local_index_folder_term)
                if os.path.exists(prm.local_index_folder_term):
                    print('Folder', prm.local_index_folder_term, 'already exists! Doing nothing.')
                else:
                    shutil.copytree(prm.index_folder_term, prm.local_index_folder_term)
                self.index_folder_term = prm.local_index_folder_term
            else:
                self.index_folder_term = prm.index_folder_term

            fsDir_term = MMapDirectory(Paths.get(prm.index_folder_term))
            self.searcher_term = IndexSearcher(DirectoryReader.open(fsDir_term))

        self.analyzer = StandardAnalyzer()
        self.pool = ThreadPool(processes=prm.n_threads)
        self.cache = {}

        print('Loading Text-ID mapping...')
        self.text_id_map, self.id_text_map = self.get_text_id_map()

    def get_text_id_map(self):
        # get number of docs
        n_docs = self.searcher.getIndexReader().numDocs()

        text_id = {}
        id_text = {}
        query = MatchAllDocsQuery()
        hits = self.searcher.search(query, n_docs)
        for hit in hits.scoreDocs:
            doc = self.searcher.doc(hit.doc)
            idd = int(doc['id'])
            text = doc['text']
            text_id[text] = idd
            id_text[idd] = text

        return text_id, id_text

    # def add_doc(self, doc_id, title, txt, add_terms):
    def add_doc(self, doc_id, txt, add_terms):
        doc = Document()
        txt = utils.clean(txt)

        if add_terms:
            txt_ = txt.lower()
            words_idx, words = utils.text2idx2([txt_], self.vocab, prm.max_terms_per_doc)
            words_idx = words_idx[0]
            words = words[0]

        doc.add(Field("id", str(doc_id), self.t1))
        # doc.add(Field("title", title, self.t1))
        doc.add(Field("text", txt, self.t2))
        if add_terms:
            doc.add(Field("word_idx", ' '.join(map(str, words_idx)), self.t3))
            doc.add(Field("word", '<&>'.join(words), self.t3))

        self.writer.addDocument(doc)

    def create_index(self, index_folder, docs_path, add_terms=False):
        print('Loading Vocab...')
        if not self.vocab:
            self.vocab = utils.load_vocab(prm.vocab_path, prm.n_words)

        os.mkdir(index_folder)

        self.t1 = FieldType()
        self.t1.setStored(True)
        self.t1.setIndexOptions(IndexOptions.DOCS)

        self.t2 = FieldType()
        self.t2.setStored(False)
        self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        self.t3 = FieldType()
        self.t3.setStored(True)
        self.t3.setIndexOptions(IndexOptions.NONE)

        fsDir = MMapDirectory(Paths.get(index_folder))
        writerConfig = IndexWriterConfig(StandardAnalyzer())
        self.writer = IndexWriter(fsDir, writerConfig)
        print("%d docs in index" % self.writer.numDocs())
        print("Indexing documents...")

        # import corpus_hdf5
        # corpus = corpus_hdf5.MSMARCOCorpusHDF5(docs_path)
        import pickle
        with open(docs_path, "rb") as read_file:
            corpus = pickle.load(read_file)

        idx_cnt = 0
        # for doc_id, txt in zip(corpus.get_id_iter(), corpus.get_text_iter()):
        # for doc_id, txt in corpus.items():
        for txt in corpus:
            self.add_doc(idx_cnt, txt, add_terms)  # not lowered
            if idx_cnt % 1000 == 0:
                print('indexing doc', idx_cnt)
            idx_cnt += 1

        print("Index of %d docs..." % self.writer.numDocs())
        self.writer.close()

    def search_multithread(self, qs, max_cand, max_full_cand, searcher):
        self.max_cand = max_cand
        self.max_full_cand = max_full_cand
        self.curr_searcher = searcher
        out = self.pool.map(self.search_multithread_part, qs)
        return out

    def search_multithread_part(self, q):
        if not self.env.isCurrentThreadAttached():
            self.env.attachCurrentThread()

        if q in self.cache:
            return self.cache[q]
        else:
            try:
                q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
                query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
            except:
                print('Unexpected error when processing query:', str(q))
                print('Using query "dummy".')
                q = 'dummy'
                query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))

            c = OrderedDict()
            hits = self.curr_searcher.search(query, self.max_cand)

            for i, hit in enumerate(hits.scoreDocs):
                doc = self.curr_searcher.doc(hit.doc)
                if i < self.max_full_cand:
                    word_idx = list(map(int, doc['word_idx'].split(' ')))
                    word = doc['word'].split('<&>')
                else:
                    word_idx = []
                    word = []
                # c[int(doc['id'])] = [word_idx, word]
                c[int(doc['id'])] = [word_idx, word, hit.score]

            # print(c)
            return c

    def search_singlethread(self, qs, max_cand, max_full_cand, curr_searcher):
        out = []
        for q in qs:
            if q in self.cache:
                out.append(self.cache[q])
            else:
                try:
                    q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
                    query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
                except:
                    print('Unexpected error when processing query:', str(q))
                    print('Using query "dummy".')
                    query = QueryParser("text", self.analyzer).parse(QueryParser.escape('dummy'))

                c = OrderedDict()
                hits = curr_searcher.search(query, max_cand)

                for i, hit in enumerate(hits.scoreDocs):
                    doc = curr_searcher.doc(hit.doc)
                    if i < max_full_cand:
                        word_idx = list(map(int, doc['word_idx'].split(' ')))
                        word = doc['word'].split('<&>')
                    else:
                        word_idx = []
                        word = []
                    # c[int(doc['id'])] = [word_idx, word]
                    c[int(doc['id'])] = [word_idx, word, hit.score]

                out.append(c)

        return out

    def get_candidates(self, qs, max_cand, max_full_cand=None, save_cache=False, extra_terms=True):
        if not max_full_cand:
            max_full_cand = max_cand

        if prm.docs_path != prm.docs_path_term:
            max_cand2 = 0
        else:
            max_cand2 = max_full_cand

        if prm.n_threads > 1:
            out = self.search_multithread(qs, max_cand, max_cand2, self.searcher)
            if (prm.docs_path != prm.docs_path_term) and extra_terms:
                terms = self.search_multithread(qs, max_full_cand, max_full_cand, self.searcher_term)
        else:
            out = self.search_singlethread(qs, max_cand, max_cand2, self.searcher)
            if (prm.docs_path != prm.docs_path_term) and extra_terms:
                terms = self.search_singlethread(qs, max_full_cand, max_full_cand, self.searcher_term)

        if (prm.docs_path != prm.docs_path_term) and extra_terms:
            for outt, termss in zip(out, terms):
                for cand_id, term in zip(list(outt.keys())[:max_full_cand], list(termss.values())):
                    outt[cand_id] = term

        if save_cache:
            for q, c in zip(qs, out):
                if q not in self.cache:
                    self.cache[q] = c

        return out

    def get_pair_scores(self, q, doc_int, save_cache=False, extra_terms=True):
        # if prm.n_threads > 1:
        #     out = self.search_pair_score_multithread(qs_trailing_doc, self.searcher)
        #     if (prm.docs_path != prm.docs_path_term) and extra_terms:
        #         terms = self.search_pair_score_multithread(qs_trailing_doc, self.searcher_term)
        # else:
        #     out = self.search_pair_score_singlethread(qs_trailing_doc, self.searcher)
        #     if (prm.docs_path != prm.docs_path_term) and extra_terms:
        #         terms = self.search_pair_score_singlethread(qs_trailing_doc, self.searcher_term)
        out = []
        try:
            q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
            # query = QueryParser("text", StandardAnalyzer()).parse(QueryParser.escape(q))
        except:
            print('Unexpected error when processing query:', str(q))
            print('Using query "dummy".')
            q = 'dummy'
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
            # query = QueryParser("text", StandardAnalyzer()).parse(QueryParser.escape(q))

        c = OrderedDict()
        exp = self.searcher.explain(query, doc_int)
        c[1] = exp
        out.append(c)
        return out

    def search_pair_score_singlethread(self, q, doc_int, searcher):
        out = []
        try:
            q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
            # query = QueryParser("text", StandardAnalyzer()).parse(QueryParser.escape(q))
        except:
            print('Unexpected error when processing query:', str(q))
            print('Using query "dummy".')
            q = 'dummy'
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
            # query = QueryParser("text", StandardAnalyzer()).parse(QueryParser.escape(q))

        c = OrderedDict()
        exp = searcher.explain(query, doc_int)
        c[1] = exp
        out.append(c)
        return out

    def search_pair_score_multithread(self, qs_trailing_doc, searcher):
        self.curr_searcher = searcher
        # out = self.pool.map(self.search_pair_score_multithread_part, product(qs, doc_int))
        out = self.pool.map(self.search_pair_score_multithread_part, qs_trailing_doc)
        return out

    def search_pair_score_multithread_part(self, q_doc_int):
        # print(q_doc_int)
        spl = q_doc_int.split('<|endoftext|>')
        q = spl[0]
        print(q)
        doc_int = int(spl[1])
        print(doc_int)

        if not self.env.isCurrentThreadAttached():
            self.env.attachCurrentThread()

        try:
            q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
        except:
            print('Unexpected error when processing query:', str(q))
            print('Using query "dummy".')
            q = 'dummy'
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))

        c = OrderedDict()
        exp = self.curr_searcher.explain(query, doc_int)
        c[1] = exp
        return c

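# Hedged usage sketch of the explain-based scoring path above: score one
# (query, internal doc id) pair with the BM25 searcher. The query string and the
# doc id 42 are assumptions for illustration; Explanation.getValue() returns the
# score and toString() the full scoring breakdown.
if __name__ == '__main__':
    engine = QuestionLuceneSearch()                      # JVM init + BM25 searcher
    pair_scores = engine.get_pair_scores('what is lucene', doc_int=42)
    explanation = pair_scores[0][1]                      # org.apache.lucene.search.Explanation
    print(explanation.getValue())
    print(explanation.toString())
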
# Variant of LuceneSearch that additionally builds/loads an IDF dictionary and can
# index ClueWeb09 WARC files as well as HDF5 corpora.
class LuceneSearch():

    def __init__(self):
        self.env = lucene.initVM(initialheap='28g', maxheap='28g',
                                 vmargs=['-Djava.awt.headless=true'])
        self.vocab = None

        BooleanQuery.setMaxClauseCount(2048)

        if not os.path.exists(prm.index_folder):
            print 'Creating index at', prm.index_folder
            if prm.docs_path == prm.docs_path_term:
                add_terms = True
            else:
                add_terms = False
            self.create_index(prm.index_folder, prm.docs_path, add_terms)

        if prm.local_index_folder:
            print 'copying index from', prm.index_folder, 'to', prm.local_index_folder
            if os.path.exists(prm.local_index_folder):
                print 'Folder', prm.local_index_folder, 'already exists! Doing nothing.'
            else:
                shutil.copytree(prm.index_folder, prm.local_index_folder)
            self.index_folder = prm.local_index_folder
        else:
            self.index_folder = prm.index_folder

        fsDir = MMapDirectory(Paths.get(prm.index_folder))
        self.searcher = IndexSearcher(DirectoryReader.open(fsDir))

        if prm.docs_path != prm.docs_path_term:
            if not os.path.exists(prm.index_folder_term):
                print 'Creating index at', prm.index_folder_term
                self.create_index(prm.index_folder_term, prm.docs_path_term, add_terms=True)

            if prm.local_index_folder_term:
                print 'copying index from', prm.index_folder_term, 'to', prm.local_index_folder_term
                if os.path.exists(prm.local_index_folder_term):
                    print 'Folder', prm.local_index_folder_term, 'already exists! Doing nothing.'
                else:
                    shutil.copytree(prm.index_folder_term, prm.local_index_folder_term)
                self.index_folder_term = prm.local_index_folder_term
            else:
                self.index_folder_term = prm.index_folder_term

            fsDir_term = MMapDirectory(Paths.get(prm.index_folder_term))
            self.searcher_term = IndexSearcher(DirectoryReader.open(fsDir_term))

        self.analyzer = StandardAnalyzer()
        self.pool = ThreadPool(processes=prm.n_threads)
        self.cache = {}

        print 'Loading Title-ID mapping...'
        self.title_id_map, self.id_title_map = self.get_title_id_map()

        if prm.idf_path:
            print 'Loading IDF dictionary...'
            self.idf = pkl.load(open(prm.idf_path))

    def get_title_id_map(self):
        # get number of docs
        n_docs = self.searcher.getIndexReader().numDocs()

        title_id = {}
        id_title = {}
        query = MatchAllDocsQuery()
        hits = self.searcher.search(query, n_docs)
        for hit in hits.scoreDocs:
            doc = self.searcher.doc(hit.doc)
            idd = int(doc['id'])
            title = doc['title']
            title_id[title] = idd
            id_title[idd] = title

        return title_id, id_title

    def add_idf(self, txt):
        # count each distinct word once per document
        txt = utils.clean(txt)
        txt = txt.lower()
        df = set()
        for word in wordpunct_tokenize(txt):
            if word not in df:
                df.add(word)
                self.idf[word] += 1.

    def add_doc(self, doc_id, title, txt, add_terms):
        doc = Document()
        txt = utils.clean(txt)

        if add_terms:
            if prm.top_tfidf > 0:
                words_idx = []
                words, _ = utils.top_tfidf(txt.lower(), self.idf, prm.top_tfidf, prm.min_term_freq)

                if len(words) == 0:
                    words.append('unk')

                for w in words:
                    if w in self.vocab:
                        words_idx.append(self.vocab[w])
                    else:
                        words_idx.append(-1)  # unknown words.
            else:
                txt_ = txt.lower()
                words_idx, words = utils.text2idx2([txt_], self.vocab, prm.max_terms_per_doc)
                words_idx = words_idx[0]
                words = words[0]

        doc.add(Field("id", str(doc_id), self.t1))
        doc.add(Field("title", title, self.t1))
        doc.add(Field("text", txt, self.t2))
        if add_terms:
            doc.add(Field("word_idx", ' '.join(map(str, words_idx)), self.t3))
            doc.add(Field("word", '<&>'.join(words), self.t3))

        self.writer.addDocument(doc)

    def create_index(self, index_folder, docs_path, add_terms=False):
        print 'Loading Vocab...'
        if not self.vocab:
            self.vocab = utils.load_vocab(prm.vocab_path, prm.n_words)

        os.mkdir(index_folder)

        self.t1 = FieldType()
        self.t1.setStored(True)
        self.t1.setIndexOptions(IndexOptions.DOCS)

        self.t2 = FieldType()
        self.t2.setStored(False)
        self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        self.t3 = FieldType()
        self.t3.setStored(True)
        self.t3.setIndexOptions(IndexOptions.NONE)

        if add_terms:
            if prm.top_tfidf > 0 or prm.idf_path:
                print 'Creating IDF dictionary...'
                self.idf = defaultdict(int)
                doc_id = 0
                if docs_path.lower().endswith('.hdf5'):
                    import corpus_hdf5
                    corpus = corpus_hdf5.CorpusHDF5(docs_path)
                    for txt in corpus.get_text_iter():
                        self.add_idf(txt)
                        if doc_id % 1000 == 0:
                            print 'Creating IDF, doc', doc_id
                        doc_id += 1
                else:
                    # ClueWeb09
                    import warc
                    import gzip
                    from bs4 import BeautifulSoup

                    # list all files in the folder.
                    paths = []
                    for root, directories, filenames in os.walk(docs_path):
                        for filename in filenames:
                            paths.append(os.path.join(root, filename))

                    for path in paths:
                        with gzip.open(path, mode='rb') as gzf:
                            for record in warc.WARCFile(fileobj=gzf):
                                # remove html tags
                                txt = BeautifulSoup(record.payload[:1000 * 1000], "lxml").get_text()
                                # remove WARC headers.
                                txt = '\n'.join(txt.split('\n')[10:])
                                self.add_idf(txt)
                                if doc_id % 1000 == 0:
                                    print 'Creating IDF, doc', doc_id
                                doc_id += 1

                for key, val in self.idf.items():
                    self.idf[key] = math.log(float(doc_id) / val)

                pkl.dump(self.idf, open(prm.idf_path, 'wb'))

        fsDir = MMapDirectory(Paths.get(index_folder))
        writerConfig = IndexWriterConfig(StandardAnalyzer())
        self.writer = IndexWriter(fsDir, writerConfig)
        print "%d docs in index" % self.writer.numDocs()
        print "Indexing documents..."

        doc_id = 0
        if docs_path.lower().endswith('.hdf5'):
            import corpus_hdf5
            corpus = corpus_hdf5.CorpusHDF5(docs_path)
            for txt in corpus.get_text_iter():
                title = corpus.get_article_title(doc_id)
                self.add_doc(doc_id, title, txt, add_terms)
                if doc_id % 1000 == 0:
                    print 'indexing doc', doc_id
                doc_id += 1
        else:
            # ClueWeb09
            import warc
            import gzip
            from bs4 import BeautifulSoup

            # list all files in the folder.
            paths = []
            for root, directories, filenames in os.walk(docs_path):
                for filename in filenames:
                    paths.append(os.path.join(root, filename))

            for path in paths:
                with gzip.open(path, mode='rb') as gzf:
                    for record in warc.WARCFile(fileobj=gzf):
                        if 'warc-trec-id' in record:
                            title = record['warc-trec-id']
                        else:
                            title = record['warc-record-id']
                        # remove html tags
                        # txt = BeautifulSoup(record.payload[:1000*1000], "lxml").get_text()
                        txt = record.payload[:1000 * 1000]
                        # remove WARC headers.
                        txt = '\n'.join(txt.split('\n')[10:])
                        self.add_doc(doc_id, title, txt, add_terms)
                        if doc_id % 1000 == 0:
                            print 'indexing doc', doc_id
                        doc_id += 1

        print "Index of %d docs..." % self.writer.numDocs()
        self.writer.close()

    def search_multithread(self, qs, max_cand, max_full_cand, searcher):
        self.max_cand = max_cand
        self.max_full_cand = max_full_cand
        self.curr_searcher = searcher
        out = self.pool.map(self.search_multithread_part, qs)
        return out

    def search_multithread_part(self, q):
        if not self.env.isCurrentThreadAttached():
            self.env.attachCurrentThread()

        if q in self.cache:
            return self.cache[q]
        else:
            try:
                q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
                query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
            except:
                print 'Unexpected error when processing query:', str(q)
                print 'Using query "dummy".'
                q = 'dummy'
                query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))

            c = OrderedDict()
            hits = self.curr_searcher.search(query, self.max_cand)

            for i, hit in enumerate(hits.scoreDocs):
                doc = self.curr_searcher.doc(hit.doc)
                if i < self.max_full_cand:
                    word_idx = map(int, doc['word_idx'].split(' '))
                    word = doc['word'].split('<&>')
                else:
                    word_idx = []
                    word = []
                c[int(doc['id'])] = [word_idx, word]

            return c

    def search_singlethread(self, qs, max_cand, max_full_cand, curr_searcher):
        out = []
        for q in qs:
            if q in self.cache:
                out.append(self.cache[q])
            else:
                try:
                    q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
                    query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
                except:
                    print 'Unexpected error when processing query:', str(q)
                    print 'Using query "dummy".'
                    query = QueryParser("text", self.analyzer).parse(QueryParser.escape('dummy'))

                c = OrderedDict()
                hits = curr_searcher.search(query, max_cand)

                for i, hit in enumerate(hits.scoreDocs):
                    doc = curr_searcher.doc(hit.doc)
                    if i < max_full_cand:
                        word_idx = map(int, doc['word_idx'].split(' '))
                        word = doc['word'].split('<&>')
                    else:
                        word_idx = []
                        word = []
                    c[int(doc['id'])] = [word_idx, word]

                out.append(c)

        return out

    def get_candidates(self, qs, max_cand, max_full_cand=None, save_cache=False, extra_terms=True):
        if not max_full_cand:
            max_full_cand = max_cand

        if prm.docs_path != prm.docs_path_term:
            max_cand2 = 0
        else:
            max_cand2 = max_full_cand

        if prm.n_threads > 1:
            out = self.search_multithread(qs, max_cand, max_cand2, self.searcher)
            if (prm.docs_path != prm.docs_path_term) and extra_terms:
                terms = self.search_multithread(qs, max_full_cand, max_full_cand, self.searcher_term)
        else:
            out = self.search_singlethread(qs, max_cand, max_cand2, self.searcher)
            if (prm.docs_path != prm.docs_path_term) and extra_terms:
                terms = self.search_singlethread(qs, max_full_cand, max_full_cand, self.searcher_term)

        if (prm.docs_path != prm.docs_path_term) and extra_terms:
            for outt, termss in itertools.izip(out, terms):
                for cand_id, term in itertools.izip(outt.keys()[:max_full_cand], termss.values()):
                    outt[cand_id] = term

        if save_cache:
            for q, c in itertools.izip(qs, out):
                if q not in self.cache:
                    self.cache[q] = c

        return out