Example #1
    def query(self, data):
        if self.fil.exists():
            searcher = IndexSearcher(DirectoryReader.open(self.d))
            # legacy Lucene 3.x-style QueryParser signature (Version as first argument)
            query = QueryParser(Version.LUCENE_30, "id",
                                self.analyzer).parse(data['query'])
            hits = searcher.search(query, 100000)

            results = {}

            results['totalHits'] = hits.totalHits
            results['hits'] = {}

            for hit in hits.scoreDocs:
                record = {}
                doc = searcher.doc(hit.doc)
                fields = doc.getFields()
                record['score'] = hit.score
                for field in fields:
                    if field.name() != "id":
                        record[field.name()] = field.stringValue()
                results['hits'][doc.get('id')] = record

            searcher.getIndexReader().close()
            return results
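A minimal sketch of how a method like this might be driven. The `Indexer` wrapper below is hypothetical, standing in for whatever class exposes the `fil`, `d`, and `analyzer` attributes used above; PyLucene's JVM must be started once per process before any Lucene call:

import lucene

lucene.initVM(vmargs=['-Djava.awt.headless=true'])  # start the JVM exactly once

indexer = Indexer('/tmp/my_index')             # hypothetical wrapper holding fil, d, analyzer
results = indexer.query({'query': 'id:doc42'})
if results is not None:                        # query() returns None when no index exists yet
    print(results['totalHits'])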
Example #2
def main(args):
    global verbose
    verbose = args.verbose

    if verbose:
        logger.info(f'Read {args.dir_index}')
    directory = SimpleFSDirectory.open(Paths.get(args.dir_index))
    searcher = IndexSearcher(DirectoryReader.open(directory))
    reader = searcher.getIndexReader()

    if verbose:
        logger.info(f'Write to {args.path_output}')
    with open(args.path_output, 'w') as f:
        for idx in trange(reader.maxDoc()):
            doc = reader.document(idx)
            babelnet_id = doc.get('ID')
            synset_id = doc.get('SYNSET_ID')
            pos = doc.get('POS')
            synset_type = doc.get('TYPE')
            main_sense = doc.get('MAIN_SENSE')
            categories = list(doc.getValues('CATEGORY'))
            translation_mappings = list(doc.getValues('TRANSLATION_MAPPING'))
            images = list(doc.getValues('IMAGE'))
            lemmas = doc.getValues('LEMMA')
            # fetch the parallel per-lemma fields once, instead of once per iteration
            sources = doc.getValues('LEMMA_SOURCE')
            languages = doc.getValues('LEMMA_LANGUAGE')
            weights = doc.getValues('LEMMA_WEIGHT')
            sense_keys = doc.getValues('LEMMA_SENSEKEY')
            forms = [{
                'lemma': lemma,
                'source': source,
                'lang': lang,
                'weight': weight,
                'sense_key': sense_key,
            } for lemma, source, lang, weight, sense_key in zip(
                lemmas, sources, languages, weights, sense_keys)]
            entry = {
                'id': babelnet_id,
                'synset': synset_id,
                'pos': pos,
                'type': synset_type,
                'main_sense': main_sense,
                'categories': categories,
                'translation_mappings': translation_mappings,
                'images': images,
                'forms': forms
            }
            f.write(json.dumps(entry, ensure_ascii=False) + '\n')

    return 0
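A hedged driver sketch for this exporter; the flag names (`dir_index`, `path_output`, `verbose`) are assumptions read off the attribute accesses above, and the JVM must be started before main touches Lucene:

import argparse
import sys

import lucene

if __name__ == '__main__':
    lucene.initVM(vmargs=['-Djava.awt.headless=true'])
    parser = argparse.ArgumentParser(description='Dump a Lucene index to JSON lines')
    parser.add_argument('dir_index', help='path to the Lucene index directory')
    parser.add_argument('path_output', help='output .jsonl file, one entry per line')
    parser.add_argument('-v', '--verbose', action='store_true')
    sys.exit(main(parser.parse_args()))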
Example #3
    def get_tf_idf(self, field_name: str, content_id: str):
        """
        Calculates the tf-idf for the words contained in the field of the content whose id
        is content_id

        Args:
            field_name (str): Name of the field containing the words for which calculate the tf-idf
            content_id (str): Id of the content that contains the specified field

        Returns:
             words_bag (Dict <str, float>):
             Dictionary whose keys are the words contained in the field,
             and the corresponding values are the tf-idf values.
        """
        searcher = IndexSearcher(
            DirectoryReader.open(SimpleFSDirectory(Paths.get(self.directory))))
        query = QueryParser("testo_libero",
                            KeywordAnalyzer()).parse("content_id:\"" +
                                                     content_id + "\"")
        score_docs = searcher.search(query, 1).scoreDocs
        document_offset = -1
        for score_doc in score_docs:
            document_offset = score_doc.doc

        reader = searcher.getIndexReader()
        words_bag = {}
        term_vector = reader.getTermVector(document_offset, field_name)
        term_enum = term_vector.iterator()
        for term in BytesRefIterator.cast_(term_enum):
            term_text = term.utf8ToString()
            postings = term_enum.postings(None)
            postings.nextDoc()
            term_frequency = 1 + math.log10(
                postings.freq())  # normalized term frequency
            inverse_document_frequency = math.log10(
                reader.maxDoc() / reader.docFreq(Term(field_name, term)))
            tf_idf = term_frequency * inverse_document_frequency
            words_bag[term_text] = tf_idf

        reader.close()
        return words_bag
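For reference, the weighting computed above is the log-normalised scheme tf = 1 + log10(f) and idf = log10(N / df). A self-contained numeric check, assuming a term that occurs 3 times in the matched document, an index of 1000 documents, and a document frequency of 10:

import math

tf = 1 + math.log10(3)        # ≈ 1.4771, normalized term frequency
idf = math.log10(1000 / 10)   # = 2.0, inverse document frequency
print(tf * idf)               # ≈ 2.9542, the value stored in words_bag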
Example #4
class LuceneSearch():
    def __init__(self):

        self.env = lucene.initVM(initialheap='28g',
                                 maxheap='28g',
                                 vmargs=['-Djava.awt.headless=true'])
        self.vocab = None

        BooleanQuery.setMaxClauseCount(2048)

        if not os.path.exists(prm.index_folder):
            print('Creating index at', prm.index_folder)
            add_terms = prm.docs_path == prm.docs_path_term
            self.create_index(prm.index_folder, prm.docs_path, add_terms)

        if prm.local_index_folder:
            print('copying index from', prm.index_folder, 'to', prm.local_index_folder)
            if os.path.exists(prm.local_index_folder):
                print('Folder', prm.local_index_folder, 'already exists! Doing nothing.')
            else:
                shutil.copytree(prm.index_folder, prm.local_index_folder)
            self.index_folder = prm.local_index_folder
        else:
            self.index_folder = prm.index_folder

        fsDir = MMapDirectory(Paths.get(prm.index_folder))
        self.searcher = IndexSearcher(DirectoryReader.open(fsDir))

        if prm.docs_path != prm.docs_path_term:
            if not os.path.exists(prm.index_folder_term):
                print('Creating index at', prm.index_folder_term)
                self.create_index(prm.index_folder_term,
                                  prm.docs_path_term,
                                  add_terms=True)

            if prm.local_index_folder_term:
                print('copying index from', prm.index_folder_term, 'to', prm.local_index_folder_term)
                if os.path.exists(prm.local_index_folder_term):
                    print('Folder', prm.local_index_folder_term, 'already exists! Doing nothing.')
                else:
                    shutil.copytree(prm.index_folder_term,
                                    prm.local_index_folder_term)
                self.index_folder_term = prm.local_index_folder_term
            else:
                self.index_folder_term = prm.index_folder_term
            fsDir_term = MMapDirectory(Paths.get(prm.index_folder_term))
            self.searcher_term = IndexSearcher(
                DirectoryReader.open(fsDir_term))

        self.analyzer = StandardAnalyzer()
        self.pool = ThreadPool(processes=prm.n_threads)
        self.cache = {}

        print('Loading Title-ID mapping...')
        self.title_id_map, self.id_title_map = self.get_title_id_map()

    def get_title_id_map(self):

        # get number of docs
        n_docs = self.searcher.getIndexReader().numDocs()

        title_id = {}
        id_title = {}
        query = MatchAllDocsQuery()
        hits = self.searcher.search(query, n_docs)
        for hit in hits.scoreDocs:
            doc = self.searcher.doc(hit.doc)
            idd = int(doc['id'])
            title = doc['title']
            title_id[title] = idd
            id_title[idd] = title

        return title_id, id_title

    def add_doc(self, doc_id, title, txt, add_terms):

        doc = Document()
        txt = utils.clean(txt)

        if add_terms:
            txt_ = txt.lower()
            words_idx, words = utils.text2idx2([txt_], self.vocab,
                                               prm.max_terms_per_doc)
            words_idx = words_idx[0]
            words = words[0]

        doc.add(Field("id", str(doc_id), self.t1))
        doc.add(Field("title", title, self.t1))
        doc.add(Field("text", txt, self.t2))
        if add_terms:
            doc.add(Field("word_idx", ' '.join(map(str, words_idx)), self.t3))
            doc.add(Field("word", '<&>'.join(words), self.t3))
        self.writer.addDocument(doc)

    def create_index(self, index_folder, docs_path, add_terms=False):

        print('Loading Vocab...')
        if not self.vocab:
            self.vocab = utils.load_vocab(prm.vocab_path, prm.n_words)

        os.mkdir(index_folder)

        self.t1 = FieldType()
        self.t1.setStored(True)
        self.t1.setIndexOptions(IndexOptions.DOCS)

        self.t2 = FieldType()
        self.t2.setStored(False)
        self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        self.t3 = FieldType()
        self.t3.setStored(True)
        self.t3.setIndexOptions(IndexOptions.NONE)

        fsDir = MMapDirectory(Paths.get(index_folder))
        writerConfig = IndexWriterConfig(StandardAnalyzer())
        self.writer = IndexWriter(fsDir, writerConfig)
        print "%d docs in index" % self.writer.numDocs()
        print "Indexing documents..."

        doc_id = 0

        import corpus_hdf5
        corpus = corpus_hdf5.CorpusHDF5(docs_path)
        for txt in corpus.get_text_iter():
            title = corpus.get_article_title(doc_id)
            self.add_doc(doc_id, title, txt, add_terms)
            if doc_id % 1000 == 0:
                print('indexing doc', doc_id)
            doc_id += 1

        print "Index of %d docs..." % self.writer.numDocs()
        self.writer.close()

    def search_multithread(self, qs, max_cand, max_full_cand, searcher):

        self.max_cand = max_cand
        self.max_full_cand = max_full_cand
        self.curr_searcher = searcher
        out = self.pool.map(self.search_multithread_part, qs)

        return out

    def search_multithread_part(self, q):

        if not self.env.isCurrentThreadAttached():
            self.env.attachCurrentThread()

        if q in self.cache:
            return self.cache[q]
        else:

            try:
                q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
                query = QueryParser("text",
                                    self.analyzer).parse(QueryParser.escape(q))
            except Exception:
                print('Unexpected error when processing query:', str(q))
                print('Using query "dummy".')
                q = 'dummy'
                query = QueryParser("text",
                                    self.analyzer).parse(QueryParser.escape(q))

            c = OrderedDict()
            hits = self.curr_searcher.search(query, self.max_cand)

            for i, hit in enumerate(hits.scoreDocs):
                doc = self.curr_searcher.doc(hit.doc)
                if i < self.max_full_cand:
                    word_idx = list(map(int, doc['word_idx'].split(' ')))
                    word = doc['word'].split('<&>')
                else:
                    word_idx = []
                    word = []
                c[int(doc['id'])] = [word_idx, word]

            return c

    def search_singlethread(self, qs, max_cand, max_full_cand, curr_searcher):

        out = []
        for q in qs:
            if q in self.cache:
                out.append(self.cache[q])
            else:
                try:
                    q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
                    query = QueryParser("text", self.analyzer).parse(
                        QueryParser.escape(q))
                except Exception:
                    print('Unexpected error when processing query:', str(q))
                    print('Using query "dummy".')
                    query = QueryParser("text", self.analyzer).parse(
                        QueryParser.escape('dummy'))

                c = OrderedDict()
                hits = curr_searcher.search(query, max_cand)

                for i, hit in enumerate(hits.scoreDocs):
                    doc = curr_searcher.doc(hit.doc)
                    if i < max_full_cand:
                        word_idx = list(map(int, doc['word_idx'].split(' ')))
                        word = doc['word'].split('<&>')
                    else:
                        word_idx = []
                        word = []
                    c[int(doc['id'])] = [word_idx, word]

                out.append(c)

        return out

    def get_candidates(self,
                       qs,
                       max_cand,
                       max_full_cand=None,
                       save_cache=False,
                       extra_terms=True):
        if not max_full_cand:
            max_full_cand = max_cand

        if prm.docs_path != prm.docs_path_term:
            max_cand2 = 0
        else:
            max_cand2 = max_full_cand
        if prm.n_threads > 1:
            out = self.search_multithread(qs, max_cand, max_cand2,
                                          self.searcher)
            if (prm.docs_path != prm.docs_path_term) and extra_terms:
                terms = self.search_multithread(qs, max_full_cand,
                                                max_full_cand,
                                                self.searcher_term)
        else:
            out = self.search_singlethread(qs, max_cand, max_cand2,
                                           self.searcher)
            if (prm.docs_path != prm.docs_path_term) and extra_terms:
                terms = self.search_singlethread(qs, max_full_cand,
                                                 max_full_cand,
                                                 self.searcher_term)

        if (prm.docs_path != prm.docs_path_term) and extra_terms:
            for outt, termss in zip(out, terms):
                for cand_id, term in zip(
                        list(outt.keys())[:max_full_cand], termss.values()):
                    outt[cand_id] = term

        if save_cache:
            for q, c in zip(qs, out):
                if q not in self.cache:
                    self.cache[q] = c

        return out
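A hedged usage sketch for this class, assuming the `prm` configuration module (index paths, vocab path, n_threads, and so on) is set up as the constructor expects:

ls = LuceneSearch()  # builds the index on first run, then memory-maps it
queries = ['who wrote hamlet', 'capital of france']
# up to 100 candidate doc ids per query; term lists are filled for the first 10
candidates = ls.get_candidates(queries, max_cand=100, max_full_cand=10)
for q, cands in zip(queries, candidates):
    print(q, '->', list(cands.keys())[:5])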
Example #5
class QuestionLuceneSearch():

    def __init__(self):

        self.env = lucene.initVM(initialheap='6g', maxheap='6g', vmargs=['-Djava.awt.headless=true'])
        self.vocab = None

        BooleanQuery.setMaxClauseCount(2048)

        if not os.path.exists(prm.index_folder):
            print('Creating index at', prm.index_folder)
            add_terms = prm.docs_path == prm.docs_path_term
            self.create_index(prm.index_folder, prm.docs_path, add_terms)

        if prm.local_index_folder:
            print('copying index from', prm.index_folder, 'to', prm.local_index_folder)
            if os.path.exists(prm.local_index_folder):
                print('Folder', prm.local_index_folder, 'already exists! Doing nothing.')
            else:
                shutil.copytree(prm.index_folder, prm.local_index_folder)
            self.index_folder = prm.local_index_folder
        else:
            self.index_folder = prm.index_folder

        fsDir = MMapDirectory(Paths.get(prm.index_folder))
        self.searcher = IndexSearcher(DirectoryReader.open(fsDir))

        self.searcher.setSimilarity(BM25Similarity())

        if prm.docs_path != prm.docs_path_term:
            if not os.path.exists(prm.index_folder_term):
                print('Creating index at', prm.index_folder_term)
                self.create_index(prm.index_folder_term, prm.docs_path_term, add_terms=True)

            if prm.local_index_folder_term:
                print('copying index from', prm.index_folder_term, 'to', prm.local_index_folder_term)
                if os.path.exists(prm.local_index_folder_term):
                    print('Folder', prm.local_index_folder_term, 'already exists! Doing nothing.')
                else:
                    shutil.copytree(prm.index_folder_term, prm.local_index_folder_term)
                self.index_folder_term = prm.local_index_folder_term
            else:
                self.index_folder_term = prm.index_folder_term
            fsDir_term = MMapDirectory(Paths.get(prm.index_folder_term))
            self.searcher_term = IndexSearcher(DirectoryReader.open(fsDir_term))

        self.analyzer = StandardAnalyzer()
        self.pool = ThreadPool(processes=prm.n_threads)
        self.cache = {}
        
        print('Loading Text-ID mapping...')
        self.text_id_map, self.id_text_map = self.get_text_id_map()

    def get_text_id_map(self):

        # get number of docs
        n_docs = self.searcher.getIndexReader().numDocs()

        text_id = {}
        id_text = {}
        query = MatchAllDocsQuery()
        hits = self.searcher.search(query, n_docs)
        for hit in hits.scoreDocs:
            doc = self.searcher.doc(hit.doc)
            idd = int(doc['id'])
            text = doc['text']
            text_id[text] = idd
            id_text[idd] = text

        return text_id, id_text


    def add_doc(self, doc_id, txt, add_terms):

        doc = Document()
        txt = utils.clean(txt)

        if add_terms:
            txt_ = txt.lower()
            words_idx, words = utils.text2idx2([txt_], self.vocab, prm.max_terms_per_doc)
            words_idx = words_idx[0]
            words = words[0]

        doc.add(Field("id", str(doc_id), self.t1))
        # doc.add(Field("title", title, self.t1))
        doc.add(Field("text", txt, self.t2))
        if add_terms:
            doc.add(Field("word_idx", ' '.join(map(str,words_idx)), self.t3))
            doc.add(Field("word", '<&>'.join(words), self.t3))
        self.writer.addDocument(doc)


    def create_index(self, index_folder, docs_path, add_terms=False):

        print('Loading Vocab...')
        if not self.vocab:
            self.vocab = utils.load_vocab(prm.vocab_path, prm.n_words)

        os.mkdir(index_folder)

        self.t1 = FieldType()
        self.t1.setStored(True)
        self.t1.setIndexOptions(IndexOptions.DOCS)

        self.t2 = FieldType()
        self.t2.setStored(False)
        self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        self.t3 = FieldType()
        self.t3.setStored(True)
        self.t3.setIndexOptions(IndexOptions.NONE)
       
        fsDir = MMapDirectory(Paths.get(index_folder))
        writerConfig = IndexWriterConfig(StandardAnalyzer())
        self.writer = IndexWriter(fsDir, writerConfig)
        print("%d docs in index" % self.writer.numDocs())
        print("Indexing documents...")


        # the corpus here is a pickled list of document texts
        import pickle
        with open(docs_path, "rb") as read_file:
            corpus = pickle.load(read_file)
        idx_cnt = 0
        for txt in corpus:
            self.add_doc(idx_cnt, txt, add_terms)  # not lowered
            if idx_cnt % 1000 == 0:
                print('indexing doc', idx_cnt)
            idx_cnt += 1
        print("Index of %d docs..." % self.writer.numDocs())
        self.writer.close()


    def search_multithread(self, qs, max_cand, max_full_cand, searcher):

        self.max_cand = max_cand
        self.max_full_cand = max_full_cand
        self.curr_searcher = searcher
        out = self.pool.map(self.search_multithread_part, qs)
 
        return out


    def search_multithread_part(self, q):

        if not self.env.isCurrentThreadAttached():
            self.env.attachCurrentThread()
    
        if q in self.cache:
            return self.cache[q]
        else:

            try:
                q = q.replace('AND','\\AND').replace('OR','\\OR').replace('NOT','\\NOT')
                query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
            except Exception:
                print('Unexpected error when processing query:', str(q))
                print('Using query "dummy".')
                q = 'dummy'
                query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))

            c = OrderedDict()
            hits = self.curr_searcher.search(query, self.max_cand)

            for i, hit in enumerate(hits.scoreDocs):
                doc = self.curr_searcher.doc(hit.doc)
                if i < self.max_full_cand:
                    word_idx = list(map(int, doc['word_idx'].split(' ')))
                    word = doc['word'].split('<&>')
                else:
                    word_idx = []
                    word = []
                c[int(doc['id'])] = [word_idx, word, hit.score]
            return c

    
    def search_singlethread(self, qs, max_cand, max_full_cand, curr_searcher):

        out = []
        for q in qs:
            if q in self.cache:
                out.append(self.cache[q])
            else:
                try:
                    q = q.replace('AND','\\AND').replace('OR','\\OR').replace('NOT','\\NOT')
                    query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
                except Exception:
                    print('Unexpected error when processing query:', str(q))
                    print('Using query "dummy".')
                    query = QueryParser("text", self.analyzer).parse(QueryParser.escape('dummy'))

                c = OrderedDict()
                hits = curr_searcher.search(query, max_cand)

                for i, hit in enumerate(hits.scoreDocs):
                    doc = curr_searcher.doc(hit.doc)
                    if i < max_full_cand:
                        word_idx = list(map(int, doc['word_idx'].split(' ')))
                        word = doc['word'].split('<&>')
                    else:
                        word_idx = []
                        word = []
                    c[int(doc['id'])] = [word_idx, word, hit.score]
                out.append(c)

        return out


    def get_candidates(self, qs, max_cand, max_full_cand=None, save_cache=False, extra_terms=True):
        if not max_full_cand:
            max_full_cand = max_cand

        if prm.docs_path != prm.docs_path_term:
            max_cand2 = 0
        else:
            max_cand2 = max_full_cand
        if prm.n_threads > 1:
            out = self.search_multithread(qs, max_cand, max_cand2, self.searcher)
            if (prm.docs_path != prm.docs_path_term) and extra_terms:
                terms = self.search_multithread(qs, max_full_cand, max_full_cand, self.searcher_term)
        else:
            out = self.search_singlethread(qs, max_cand, max_cand2, self.searcher)
            if (prm.docs_path != prm.docs_path_term) and extra_terms:
                terms = self.search_singlethread(qs, max_full_cand, max_full_cand, self.searcher_term)

        if (prm.docs_path != prm.docs_path_term) and extra_terms:
            for outt, termss in zip(out, terms):                
                for cand_id, term in zip(list(outt.keys())[:max_full_cand], list(termss.values())):
                    outt[cand_id] = term
  
        if save_cache:
            for q, c in zip(qs, out):
                if q not in self.cache:
                    self.cache[q] = c

        return out



    def get_pair_scores(self, q, doc_int, save_cache=False, extra_terms=True):

        out = []

        try:
            q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
            # query = QueryParser("text", StandardAnalyzer()).parse(QueryParser.escape(q))
        except:
            print('Unexpected error when processing query:', str(q))
            print('Using query "dummy".')
            q = 'dummy'
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
            # query = QueryParser("text", StandardAnalyzer()).parse(QueryParser.escape(q))

        c = OrderedDict()
        exp = self.searcher.explain(query, doc_int)
        c[1] = exp
        out.append(c)

        return out

    def search_pair_score_singlethread(self, q, doc_int, searcher):

        out = []

        try:
            q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
        except Exception:
            print('Unexpected error when processing query:', str(q))
            print('Using query "dummy".')
            q = 'dummy'
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))


        c = OrderedDict()
        exp = searcher.explain(query, doc_int)
        c[1] = exp

        out.append(c)

        return out

    def search_pair_score_multithread(self, qs_trailing_doc, searcher):

        self.curr_searcher = searcher
        out = self.pool.map(self.search_pair_score_multithread_part, qs_trailing_doc)

        return out

    def search_pair_score_multithread_part(self, q_doc_int):

        # the query and the internal doc number arrive joined by a separator token
        spl = q_doc_int.split('<|endoftext|>')
        q = spl[0]
        doc_int = int(spl[1])

        if not self.env.isCurrentThreadAttached():
            self.env.attachCurrentThread()

        try:
            q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))
        except Exception:
            print('Unexpected error when processing query:', str(q))
            print('Using query "dummy".')
            q = 'dummy'
            query = QueryParser("text", self.analyzer).parse(QueryParser.escape(q))


        c = OrderedDict()
        exp = self.curr_searcher.explain(query, doc_int)
        c[1] = exp

        return c
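A sketch of scoring one query-document pair through the explain path above (same `prm` assumptions as before). `doc_int` is Lucene's internal document number, and the returned object is a Lucene Explanation, so the score and the scoring breakdown are available via getValue() and toString():

qls = QuestionLuceneSearch()
exp = qls.get_pair_scores('what is a lucene index', doc_int=12)[0][1]
print(exp.getValue())  # the BM25 score of doc 12 for this query
print(exp.toString())  # the full scoring breakdown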
Example #6
class LuceneSearch():
    def __init__(self):

        self.env = lucene.initVM(initialheap='28g',
                                 maxheap='28g',
                                 vmargs=['-Djava.awt.headless=true'])
        self.vocab = None

        BooleanQuery.setMaxClauseCount(2048)

        if not os.path.exists(prm.index_folder):
            print('Creating index at', prm.index_folder)
            add_terms = prm.docs_path == prm.docs_path_term
            self.create_index(prm.index_folder, prm.docs_path, add_terms)

        if prm.local_index_folder:
            print('copying index from', prm.index_folder, 'to', prm.local_index_folder)
            if os.path.exists(prm.local_index_folder):
                print('Folder', prm.local_index_folder, 'already exists! Doing nothing.')
            else:
                shutil.copytree(prm.index_folder, prm.local_index_folder)
            self.index_folder = prm.local_index_folder
        else:
            self.index_folder = prm.index_folder

        fsDir = MMapDirectory(Paths.get(prm.index_folder))
        self.searcher = IndexSearcher(DirectoryReader.open(fsDir))

        if prm.docs_path != prm.docs_path_term:
            if not os.path.exists(prm.index_folder_term):
                print('Creating index at', prm.index_folder_term)
                self.create_index(prm.index_folder_term,
                                  prm.docs_path_term,
                                  add_terms=True)

            if prm.local_index_folder_term:
                print('copying index from', prm.index_folder_term, 'to', prm.local_index_folder_term)
                if os.path.exists(prm.local_index_folder_term):
                    print('Folder', prm.local_index_folder_term, 'already exists! Doing nothing.')
                else:
                    shutil.copytree(prm.index_folder_term,
                                    prm.local_index_folder_term)
                self.index_folder_term = prm.local_index_folder_term
            else:
                self.index_folder_term = prm.index_folder_term
            fsDir_term = MMapDirectory(Paths.get(prm.index_folder_term))
            self.searcher_term = IndexSearcher(
                DirectoryReader.open(fsDir_term))

        self.analyzer = StandardAnalyzer()
        self.pool = ThreadPool(processes=prm.n_threads)
        self.cache = {}

        print('Loading Title-ID mapping...')
        self.title_id_map, self.id_title_map = self.get_title_id_map()

        if prm.idf_path:
            print('Loading IDF dictionary...')
            with open(prm.idf_path, 'rb') as f:  # pickle files must be opened in binary mode
                self.idf = pkl.load(f)

    def get_title_id_map(self):

        # get number of docs
        n_docs = self.searcher.getIndexReader().numDocs()

        title_id = {}
        id_title = {}
        query = MatchAllDocsQuery()
        hits = self.searcher.search(query, n_docs)
        for hit in hits.scoreDocs:
            doc = self.searcher.doc(hit.doc)
            idd = int(doc['id'])
            title = doc['title']
            title_id[title] = idd
            id_title[idd] = title

        return title_id, id_title

    def add_idf(self, txt):
        txt = utils.clean(txt)
        txt = txt.lower()
        df = set()
        for word in wordpunct_tokenize(txt):
            if word not in df:
                df.add(word)
                self.idf[word] += 1.

    def add_doc(self, doc_id, title, txt, add_terms):

        doc = Document()
        txt = utils.clean(txt)

        if add_terms:
            if prm.top_tfidf > 0:
                words_idx = []
                words, _ = utils.top_tfidf(txt.lower(), self.idf,
                                           prm.top_tfidf, prm.min_term_freq)

                if len(words) == 0:
                    words.append('unk')

                for w in words:
                    if w in self.vocab:
                        words_idx.append(self.vocab[w])
                    else:
                        words_idx.append(-1)  # unknown words.

            else:
                txt_ = txt.lower()
                words_idx, words = utils.text2idx2([txt_], self.vocab,
                                                   prm.max_terms_per_doc)
                words_idx = words_idx[0]
                words = words[0]

        doc.add(Field("id", str(doc_id), self.t1))
        doc.add(Field("title", title, self.t1))
        doc.add(Field("text", txt, self.t2))
        if add_terms:
            doc.add(Field("word_idx", ' '.join(map(str, words_idx)), self.t3))
            doc.add(Field("word", '<&>'.join(words), self.t3))
        self.writer.addDocument(doc)

    def create_index(self, index_folder, docs_path, add_terms=False):

        print('Loading Vocab...')
        if not self.vocab:
            self.vocab = utils.load_vocab(prm.vocab_path, prm.n_words)

        os.mkdir(index_folder)

        self.t1 = FieldType()
        self.t1.setStored(True)
        self.t1.setIndexOptions(IndexOptions.DOCS)

        self.t2 = FieldType()
        self.t2.setStored(False)
        self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS)

        self.t3 = FieldType()
        self.t3.setStored(True)
        self.t3.setIndexOptions(IndexOptions.NONE)

        if add_terms:
            if prm.top_tfidf > 0 or prm.idf_path:
                print('Creating IDF dictionary...')
                self.idf = defaultdict(int)
                doc_id = 0
                if docs_path.lower().endswith('.hdf5'):
                    import corpus_hdf5
                    corpus = corpus_hdf5.CorpusHDF5(docs_path)
                    for txt in corpus.get_text_iter():
                        self.add_idf(txt)

                        if doc_id % 1000 == 0:
                            print('Creating IDF, doc', doc_id)
                        doc_id += 1

                else:
                    # ClueWeb09
                    import warc
                    import gzip
                    from bs4 import BeautifulSoup
                    # list all files in the folder.
                    paths = []
                    for root, directories, filenames in os.walk(docs_path):
                        for filename in filenames:
                            paths.append(os.path.join(root, filename))

                    for path in paths:
                        with gzip.open(path, mode='rb') as gzf:
                            for record in warc.WARCFile(fileobj=gzf):
                                # remove html tags
                                txt = BeautifulSoup(
                                    record.payload[:1000 * 1000],
                                    "lxml").get_text()
                                # remove WARC headers.
                                txt = '\n'.join(txt.split('\n')[10:])

                                self.add_idf(txt)

                                if doc_id % 1000 == 0:
                                    print('Creating IDF, doc', doc_id)
                                doc_id += 1

                for key, val in self.idf.items():
                    self.idf[key] = math.log(float(doc_id) / val)

                with open(prm.idf_path, 'wb') as f:
                    pkl.dump(self.idf, f)

        fsDir = MMapDirectory(Paths.get(index_folder))
        writerConfig = IndexWriterConfig(StandardAnalyzer())
        self.writer = IndexWriter(fsDir, writerConfig)
        print "%d docs in index" % self.writer.numDocs()
        print "Indexing documents..."

        doc_id = 0
        if docs_path.lower().endswith('.hdf5'):
            import corpus_hdf5
            corpus = corpus_hdf5.CorpusHDF5(docs_path)
            for txt in corpus.get_text_iter():
                title = corpus.get_article_title(doc_id)
                self.add_doc(doc_id, title, txt, add_terms)
                if doc_id % 1000 == 0:
                    print('indexing doc', doc_id)
                doc_id += 1
        else:
            # ClueWeb09
            import warc
            import gzip
            from bs4 import BeautifulSoup

            # list all files in the folder.
            paths = []
            for root, directories, filenames in os.walk(docs_path):
                for filename in filenames:
                    paths.append(os.path.join(root, filename))

            for path in paths:
                with gzip.open(path, mode='rb') as gzf:
                    for record in warc.WARCFile(fileobj=gzf):
                        if 'warc-trec-id' in record:
                            title = record['warc-trec-id']
                        else:
                            title = record['warc-record-id']
                        # HTML stripping via BeautifulSoup is disabled here; index the raw payload
                        txt = record.payload[:1000 * 1000]
                        # remove WARC headers.
                        txt = '\n'.join(txt.split('\n')[10:])

                        self.add_doc(doc_id, title, txt, add_terms)
                        if doc_id % 1000 == 0:
                            print('indexing doc', doc_id)
                        doc_id += 1

        print "Index of %d docs..." % self.writer.numDocs()
        self.writer.close()

    def search_multithread(self, qs, max_cand, max_full_cand, searcher):

        self.max_cand = max_cand
        self.max_full_cand = max_full_cand
        self.curr_searcher = searcher
        out = self.pool.map(self.search_multithread_part, qs)

        return out

    def search_multithread_part(self, q):

        if not self.env.isCurrentThreadAttached():
            self.env.attachCurrentThread()

        if q in self.cache:
            return self.cache[q]
        else:

            try:
                q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
                query = QueryParser("text",
                                    self.analyzer).parse(QueryParser.escape(q))
            except Exception:
                print('Unexpected error when processing query:', str(q))
                print('Using query "dummy".')
                q = 'dummy'
                query = QueryParser("text",
                                    self.analyzer).parse(QueryParser.escape(q))

            c = OrderedDict()
            hits = self.curr_searcher.search(query, self.max_cand)

            for i, hit in enumerate(hits.scoreDocs):
                doc = self.curr_searcher.doc(hit.doc)
                if i < self.max_full_cand:
                    word_idx = list(map(int, doc['word_idx'].split(' ')))
                    word = doc['word'].split('<&>')
                else:
                    word_idx = []
                    word = []
                c[int(doc['id'])] = [word_idx, word]

            return c

    def search_singlethread(self, qs, max_cand, max_full_cand, curr_searcher):

        out = []
        for q in qs:
            if q in self.cache:
                out.append(self.cache[q])
            else:
                try:
                    q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
                    query = QueryParser("text", self.analyzer).parse(
                        QueryParser.escape(q))
                except Exception:
                    print('Unexpected error when processing query:', str(q))
                    print('Using query "dummy".')
                    query = QueryParser("text", self.analyzer).parse(
                        QueryParser.escape('dummy'))

                c = OrderedDict()
                hits = curr_searcher.search(query, max_cand)

                for i, hit in enumerate(hits.scoreDocs):
                    doc = curr_searcher.doc(hit.doc)
                    if i < max_full_cand:
                        word_idx = list(map(int, doc['word_idx'].split(' ')))
                        word = doc['word'].split('<&>')
                    else:
                        word_idx = []
                        word = []
                    c[int(doc['id'])] = [word_idx, word]

                out.append(c)

        return out

    def get_candidates(self,
                       qs,
                       max_cand,
                       max_full_cand=None,
                       save_cache=False,
                       extra_terms=True):
        if not max_full_cand:
            max_full_cand = max_cand

        if prm.docs_path != prm.docs_path_term:
            max_cand2 = 0
        else:
            max_cand2 = max_full_cand
        if prm.n_threads > 1:
            out = self.search_multithread(qs, max_cand, max_cand2,
                                          self.searcher)
            if (prm.docs_path != prm.docs_path_term) and extra_terms:
                terms = self.search_multithread(qs, max_full_cand,
                                                max_full_cand,
                                                self.searcher_term)
        else:
            out = self.search_singlethread(qs, max_cand, max_cand2,
                                           self.searcher)
            if (prm.docs_path != prm.docs_path_term) and extra_terms:
                terms = self.search_singlethread(qs, max_full_cand,
                                                 max_full_cand,
                                                 self.searcher_term)

        if (prm.docs_path != prm.docs_path_term) and extra_terms:
            for outt, termss in zip(out, terms):
                for cand_id, term in zip(
                        list(outt.keys())[:max_full_cand], termss.values()):
                    outt[cand_id] = term

        if save_cache:
            for q, c in zip(qs, out):
                if q not in self.cache:
                    self.cache[q] = c

        return out
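The query sanitisation repeated in all three classes can be factored into a single helper. A sketch under the same PyLucene assumptions: QueryParser.escape escapes Lucene's special syntax characters but leaves the bare boolean operators AND, OR and NOT alone, hence the manual backslash-escaping first:

from org.apache.lucene.queryparser.classic import QueryParser

def sanitize_query(q):
    # neutralize bare boolean operators, which QueryParser.escape does not touch
    q = q.replace('AND', '\\AND').replace('OR', '\\OR').replace('NOT', '\\NOT')
    # escape the remaining special characters (+ - ! ( ) { } [ ] ^ " ~ * ? : \ ...)
    return QueryParser.escape(q)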