Example 1
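The listing below uses several names without showing its imports. A plausible import preamble is sketched here: the PyLucene/Java paths are standard, while DocDB and logger are project-specific and their origins are assumed.

# Sketch of the imports this class needs; the project-specific pieces
# (DocDB, logger) are assumptions based on the identifiers used.
import os
import logging
from multiprocessing.pool import ThreadPool
from typing import Dict, List, Union

import lucene
import prettytable
from java.nio.file import Paths
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field, FieldType
from org.apache.lucene.index import (DirectoryReader, IndexOptions,
                                     IndexWriter, IndexWriterConfig)
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.store import MMapDirectory
from termcolor import colored
from tqdm import tqdm

# from drqa.retriever import DocDB  # project-specific; import path assumed
logger = logging.getLogger(__name__)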
class LuceneSearch:
    """Index and search docs.

    Parameters
    ----------
    index_dir : str
        Directory of the Lucene index built over the documents.
    db_path : str, optional
        File path of the SQLite database containing articles of the Wikipedia dump (from DrQA).
    num_search_workers : int, optional, default=8
        Number of workers used to parallelize searching.
    """
    def __init__(self,
                 index_dir: str,
                 db_path: str = None,
                 num_search_workers: int = 8) -> None:

        self.env = lucene.getVMEnv()  # pylint: disable=no-member
        if not self.env:
            self.env = lucene.initVM(
                initialheap='28g',  # pylint: disable=no-member
                maxheap='28g',
                vmargs=['-Djava.awt.headless=true'])

        self.num_search_workers = num_search_workers

        if not os.path.exists(index_dir):
            self.doc_db = DocDB(db_path=db_path)
            logger.info('Creating index at %s', index_dir)
            self._create_index(index_dir)

        fs_dir = MMapDirectory(Paths.get(index_dir))
        self.searcher = IndexSearcher(DirectoryReader.open(fs_dir))
        self.analyzer = StandardAnalyzer()
        self.pool = ThreadPool(processes=num_search_workers)

    def _create_index(self, index_dir: str) -> None:
        """Index documents

        Parameters
        ----------
        index_dir : str
            Directory in which to store the index.
        """
        os.mkdir(index_dir)

        TITLE_FIELD = FieldType()  # pylint: disable=invalid-name
        TITLE_FIELD.setStored(True)
        TITLE_FIELD.setIndexOptions(IndexOptions.DOCS)

        TEXT_FIELD = FieldType()  # pylint: disable=invalid-name
        TEXT_FIELD.setStored(True)
        TEXT_FIELD.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        fs_dir = MMapDirectory(Paths.get(index_dir))
        writer_config = IndexWriterConfig(StandardAnalyzer())
        writer_config.setRAMBufferSizeMB(16384.0)  # 16 GB
        self.writer = IndexWriter(fs_dir, writer_config)
        logger.info("%d docs in index", self.writer.numDocs())
        logger.info("Indexing documents...")

        doc_ids = self.doc_db.get_doc_ids()
        for doc_id in tqdm(doc_ids, total=len(doc_ids)):
            text = self.doc_db.get_doc_text(doc_id)

            doc = Document()
            doc.add(Field("title", doc_id, TITLE_FIELD))
            doc.add(Field("text", text, TEXT_FIELD))

            self.writer.addDocument(doc)

        logger.info("Indexed %d docs.", self.writer.numDocs())
        self.writer.forceMerge(1)  # to increase search performance
        self.writer.close()

    def _search_multithread(
            self, queries: List[str],
            doc_max: int) -> List[List[Dict[str, Union[float, str]]]]:
        args = [(query, doc_max) for query in queries]
        queries_results = self.pool.starmap(self._search_multithread_part,
                                            args)
        return queries_results

    def _search_multithread_part(
            self, query: str,
            doc_max: int) -> List[Dict[str, Union[float, str]]]:
        if not self.env.isCurrentThreadAttached():
            self.env.attachCurrentThread()

        try:
            query = QueryParser('text',
                                self.analyzer).parse(QueryParser.escape(query))
        except Exception as exception:  # pylint: disable=broad-except
            logger.warning(
                colored(f'{exception}: {query}, using dummy query.', 'yellow'))
            query = QueryParser('text', self.analyzer).parse('dummy')

        query_results = []
        hits = self.searcher.search(query, doc_max)

        for hit in hits.scoreDocs:
            doc = self.searcher.doc(hit.doc)

            query_results.append({
                'score': hit.score,
                'title': doc['title'],
                'text': doc['text']
            })

        if not query_results:
            logger.warning(
                colored(
                    f'WARN: search engine returns no results for query: {query}.',
                    'yellow'))

        return query_results

    def _search_singlethread(
            self, queries: List[str],
            doc_max: int) -> List[List[Dict[str, Union[float, str]]]]:
        queries_result = []
        for query in queries:
            try:
                query = QueryParser('text', self.analyzer).parse(
                    QueryParser.escape(query))
            except Exception as exception:  # pylint: disable=broad-except
                logger.warning(
                    colored(f'{exception}: {query}, using dummy query.',
                            'yellow'))
                query = QueryParser('text', self.analyzer).parse('dummy')

            query_results = []
            hits = self.searcher.search(query, doc_max)

            for hit in hits.scoreDocs:
                doc = self.searcher.doc(hit.doc)

                query_results.append({
                    'score': hit.score,
                    'title': doc['title'],
                    'text': doc['text']
                })

            if not query_results:
                logger.warning(
                    colored(
                        f'WARN: search engine returns no results for query: {query}.',
                        'yellow'))

            queries_result.append(query_results)

        return queries_result

    def search(self,
               query: str,
               doc_max: int = 20) -> List[Dict[str, Union[float, str]]]:
        """Search a given query.

        Parameters
        ----------
        query : str
            Anything you want to search
        doc_max : int, optional, default=20
            Maximum number of results to return.

        Returns
        -------
        List[Dict[str, Union[float, str]]]
            Search results for the query.
        """
        return self.batch_search([query], doc_max=doc_max)[0]

    def batch_search(
            self,
            queries: List[str],
            doc_max: int = 20) -> List[List[Dict[str, Union[float, str]]]]:
        """
        Search a list of queries.

        Parameters
        ----------
        queries : List[str]
            List of queries to search.
        doc_max : int, optional, default=20
            Maximum number of docs returned by the search engine.

        Returns
        -------
        List[List[Dict[str, Union[float, str]]]]
            Results for each query, as returned by the search engine.
        """
        if self.num_search_workers > 1:
            result = self._search_multithread(queries, doc_max)
        else:
            result = self._search_singlethread(queries, doc_max)

        return result

    @staticmethod
    def pprint(search_result: List[Dict[str, Union[float, str]]]) -> None:
        """Print the results returned by the doc searcher.

        Parameters
        ----------
        search_result : List[Dict[str, Union[float, str]]]
            Results returned by the ranker.
        """

        headers = ['Rank', 'Title', 'Text', 'Score']
        table = prettytable.PrettyTable(headers)
        for i, result in enumerate(search_result):
            text, title = result['text'], result['title']
            text = text[:100] + ' ...' if len(text) > 100 else text
            title = title[:30] + ' ...' if len(title) > 30 else title
            table.add_row([i, title, text, '%.5g' % result['score']])
        print('Top Results:')
        print(table)
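A minimal usage sketch of this class follows; the index and database paths are placeholders, not taken from the original listing, and a running JVM with the heap settings above is assumed.

# Minimal usage sketch; index_dir and db_path are placeholder paths.
engine = LuceneSearch(index_dir='data/lucene_index',
                      db_path='data/wikipedia/docs.db',
                      num_search_workers=8)
hits = engine.search('capital of France', doc_max=5)
LuceneSearch.pprint(hits)

# Batched variant: one result list per query.
batch_hits = engine.batch_search(['capital of France', 'who wrote Hamlet'],
                                 doc_max=5)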
Example 2
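This variant depends on a few imports beyond those in Example 1. The Java/Lucene paths below are standard for a recent Lucene version (CharArraySet has moved packages between major versions), while MyTFIDFSimilarity, MySimpleAnalyzer, utils.STOPWORDS, DATA_DIR and DocDB are project-specific helpers assumed to come from the surrounding repository.

# Additional imports used by this variant, on top of those in Example 1.
from java.io import StringReader
from org.apache.lucene.analysis import CharArraySet
from org.apache.lucene.analysis.tokenattributes import CharTermAttribute
from org.apache.lucene.index import Term
from org.apache.lucene.search import BooleanClause, BooleanQuery, PhraseQuery
from lucene import collections  # PyLucene helper exposing JavaSet (path assumed)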
class LuceneSearch(object):
    def __init__(self, args):

        self.env = lucene.initVM(initialheap='28g',
                                 maxheap='28g',
                                 vmargs=['-Djava.awt.headless=true'])
        self.args = args

        index_folder = os.path.join(DATA_DIR, args.index_folder)
        if not os.path.exists(index_folder):
            self.doc_db = DocDB()
            logger.info(f'Creating index at {index_folder}')
            self.create_index(index_folder)

        fsDir = MMapDirectory(Paths.get(index_folder))
        self.searcher = IndexSearcher(DirectoryReader.open(fsDir))
        self.searcher.setSimilarity(MyTFIDFSimilarity())
        self.analyzer = MySimpleAnalyzer(
            CharArraySet(collections.JavaSet(utils.STOPWORDS), True))
        self.pool = ThreadPool(processes=args.num_search_workers)

    def add_doc(self, title, text, tokens):

        doc = Document()
        doc.add(Field("title", title, self.t1))
        doc.add(Field("text", text, self.t2))
        doc.add(Field("token", tokens, self.t3))

        self.writer.addDocument(doc)

    def create_index(self, index_folder):
        os.mkdir(index_folder)

        self.t1 = FieldType()
        self.t1.setStored(True)
        self.t1.setIndexOptions(IndexOptions.DOCS)

        self.t2 = FieldType()
        self.t2.setStored(True)
        self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        self.t3 = FieldType()
        self.t3.setStored(True)
        self.t3.setIndexOptions(IndexOptions.NONE)

        fsDir = MMapDirectory(Paths.get(index_folder))
        writerConfig = IndexWriterConfig(
            MySimpleAnalyzer(
                CharArraySet(collections.JavaSet(utils.STOPWORDS), True)))
        writerConfig.setSimilarity(MyTFIDFSimilarity())
        writerConfig.setRAMBufferSizeMB(16384.0)  # 16 GB
        self.writer = IndexWriter(fsDir, writerConfig)
        logger.info(f"{self.writer.numDocs()} docs in index")
        logger.info("Indexing documents...")

        doc_ids = self.doc_db.get_doc_ids()
        for doc_id in tqdm(doc_ids, total=len(doc_ids)):
            text = self.doc_db.get_doc_text(doc_id)
            tokens = self.doc_db.get_doc_tokens(doc_id)
            self.add_doc(doc_id, text, tokens)

        logger.info(f"Indexed {self.writer.numDocs()} docs.")
        self.writer.forceMerge(1)  # to increase search performance
        self.writer.close()

    def search_multithread(self, qs, ranker_doc_max, searcher):
        self.ranker_doc_max = ranker_doc_max
        self.curr_searcher = searcher
        out = self.pool.map(self.search_multithread_part, qs)

        return out

    def search_multithread_part(self, q):
        if not self.env.isCurrentThreadAttached():
            self.env.attachCurrentThread()

        try:
            if self.args.ngram == 2:
                query = self._parse_query(field_name='text', query=q)
            else:
                # self.args.ngram == 1
                query = QueryParser('text',
                                    self.analyzer).parse(QueryParser.escape(q))
        except Exception as e:
            logger.warning(colored(f'{e}: {q}, using dummy query.', 'yellow'))
            if self.args.ngram == 2:
                query = self._parse_query(field_name='text', query=q)
            else:
                # self.args.ngram == 1
                query = QueryParser('text', self.analyzer).parse('dummy')

        doc_scores, doc_titles, doc_texts, doc_words = [], [], [], []
        hits = self.curr_searcher.search(query, self.ranker_doc_max)

        for hit in hits.scoreDocs:
            doc = self.curr_searcher.doc(hit.doc)

            doc_score = hit.score
            doc_title = doc['title']
            doc_word = doc['token'].split('<&>')
            doc_text = doc['text']

            doc_scores.append(doc_score)
            doc_titles.append(doc_title)
            doc_words.append(doc_word)
            doc_texts.append(doc_text)

        if len(doc_scores) == 0:
            logger.warning(
                colored(
                    f'WARN: search engine returns no results for query: {q}.',
                    'yellow'))

        return doc_scores, doc_titles, doc_texts, doc_words

    def search_singlethread(self, qs, ranker_doc_max, curr_searcher):
        out = []
        for q in qs:
            try:
                if self.args.ngram == 2:
                    query = self._parse_query(field_name='text', query=q)
                else:
                    # self.args.ngram == 1
                    query = QueryParser('text', self.analyzer).parse(
                        QueryParser.escape(q))
            except Exception as e:
                logger.warning(colored(f'{e}: {q}, using dummy query.',
                                       'yellow'))
                if self.args.ngram == 2:
                    query = self._parse_query(field_name='text', query=q)
                else:
                    # self.args.ngram == 1
                    query = QueryParser('text', self.analyzer).parse('dummy')

            doc_scores, doc_titles, doc_texts, doc_words = [], [], [], []
            hits = curr_searcher.search(query, ranker_doc_max)

            for hit in hits.scoreDocs:
                doc = curr_searcher.doc(hit.doc)

                doc_score = hit.score
                doc_title = doc['title']
                doc_word = doc['token'].split('<&>')
                doc_text = doc['text']

                doc_scores.append(doc_score)
                doc_titles.append(doc_title)
                doc_words.append(doc_word)
                doc_texts.append(doc_text)

            if len(doc_scores) == 0:
                logger.warning(
                    colored(
                        f'WARN: search engine returns no results for query: {q}.',
                        'yellow'))

            out.append((doc_scores, doc_titles, doc_texts, doc_words))

        return out

    def batch_closest_docs(self, qs, ranker_doc_max):

        if self.args.num_search_workers > 1:
            out = self.search_multithread(qs, ranker_doc_max, self.searcher)
        else:
            out = self.search_singlethread(qs, ranker_doc_max, self.searcher)

        return out

    def _parse_query(self, field_name, query):
        # Run the query through the same analyzer used at index time; with
        # args.ngram == 2 this typically yields word shingles such as "new york".
        ts = self.analyzer.tokenStream("dummy", StringReader(query))
        termAtt = ts.getAttribute(CharTermAttribute.class_)
        ts.reset()
        tokens = []
        while ts.incrementToken():
            tokens.append(termAtt.toString())
        ts.end()
        ts.close()

        # OR together one PhraseQuery per analyzed token, so a document that
        # matches any of the shingles becomes a candidate hit.
        booleanQuery = BooleanQuery.Builder()
        for token in tokens:
            builder = PhraseQuery.Builder()
            for i, word in enumerate(token.split(' ')):
                builder.add(Term(field_name, word), i)
            pq = builder.build()
            booleanQuery.add(pq, BooleanClause.Occur.SHOULD)
        final_query = booleanQuery.build()
        return final_query
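A rough usage sketch for this variant follows; the args fields are inferred from the attributes the class reads (index_folder, num_search_workers, ngram) and the values are placeholders.

# Rough usage sketch; the argparse-style fields are inferred from self.args usages.
from argparse import Namespace

args = Namespace(index_folder='lucene_index', num_search_workers=8, ngram=2)
engine = LuceneSearch(args)
# batch_closest_docs returns, per query: (doc_scores, doc_titles, doc_texts, doc_words)
scores, titles, texts, words = engine.batch_closest_docs(['capital of France'],
                                                         ranker_doc_max=5)[0]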