import logging
import os
from multiprocessing.pool import ThreadPool
from typing import Dict, List, Union

import lucene
import prettytable
from java.nio.file import Paths
from org.apache.lucene.analysis.standard import StandardAnalyzer
from org.apache.lucene.document import Document, Field, FieldType
from org.apache.lucene.index import (DirectoryReader, IndexOptions,
                                     IndexWriter, IndexWriterConfig)
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.store import MMapDirectory
from termcolor import colored
from tqdm import tqdm

# DocDB wraps the DrQA SQLite article dump; this import path is an
# assumption and may need adjusting to your project layout.
from doc_db import DocDB

logger = logging.getLogger(__name__)


class LuceneSearch:
    """Index and search docs.

    Parameters
    ----------
    index_dir : str
        Directory of the Lucene index over the documents.
    db_path : str
        File path of the SQLite database containing articles of the
        Wikipedia dump (from DrQA).
    num_search_workers : int, optional, default=8
        Number of worker threads used to parallelize searching.
    """

    def __init__(self,
                 index_dir: str,
                 db_path: str = None,
                 num_search_workers: int = 8) -> None:
        self.env = lucene.getVMEnv()  # pylint: disable=no-member
        if not self.env:
            self.env = lucene.initVM(
                initialheap='28g',  # pylint: disable=no-member
                maxheap='28g',
                vmargs=['-Djava.awt.headless=true'])
        self.num_search_workers = num_search_workers

        if not os.path.exists(index_dir):
            self.doc_db = DocDB(db_path=db_path)
            logger.info('Creating index at %s', index_dir)
            self._create_index(index_dir)

        fs_dir = MMapDirectory(Paths.get(index_dir))
        self.searcher = IndexSearcher(DirectoryReader.open(fs_dir))
        self.analyzer = StandardAnalyzer()
        self.pool = ThreadPool(processes=num_search_workers)

    def _create_index(self, index_dir: str) -> None:
        """Index documents.

        Parameters
        ----------
        index_dir : str
            The directory in which to store the index.
        """
        os.mkdir(index_dir)

        TITLE_FIELD = FieldType()  # pylint: disable=invalid-name
        TITLE_FIELD.setStored(True)
        TITLE_FIELD.setIndexOptions(IndexOptions.DOCS)

        TEXT_FIELD = FieldType()  # pylint: disable=invalid-name
        TEXT_FIELD.setStored(True)
        TEXT_FIELD.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        fs_dir = MMapDirectory(Paths.get(index_dir))
        writer_config = IndexWriterConfig(StandardAnalyzer())
        writer_config.setRAMBufferSizeMB(16384.0)  # 16g
        self.writer = IndexWriter(fs_dir, writer_config)
        logger.info("%d docs in index", self.writer.numDocs())
        logger.info("Indexing documents...")

        doc_ids = self.doc_db.get_doc_ids()
        for doc_id in tqdm(doc_ids, total=len(doc_ids)):
            text = self.doc_db.get_doc_text(doc_id)
            doc = Document()
            doc.add(Field("title", doc_id, TITLE_FIELD))
            doc.add(Field("text", text, TEXT_FIELD))
            self.writer.addDocument(doc)

        logger.info("Indexed %d docs.", self.writer.numDocs())
        self.writer.forceMerge(1)  # to increase search performance
        self.writer.close()

    def _search_multithread(
            self, queries: List[str],
            doc_max: int) -> List[List[Dict[str, Union[float, str]]]]:
        args = [(query, doc_max) for query in queries]
        queries_results = self.pool.starmap(self._search_multithread_part, args)
        return queries_results

    def _search_multithread_part(
            self, query: str,
            doc_max: int) -> List[Dict[str, Union[float, str]]]:
        # Pool threads must be attached to the JVM before any Lucene call.
        if not self.env.isCurrentThreadAttached():
            self.env.attachCurrentThread()

        try:
            query = QueryParser('text',
                                self.analyzer).parse(QueryParser.escape(query))
        except Exception as exception:  # pylint: disable=broad-except
            logger.warning(
                colored(f'{exception}: {query}, use query dummy.', 'yellow'))
            query = QueryParser('text', self.analyzer).parse('dummy')

        query_results = []
        hits = self.searcher.search(query, doc_max)
        for hit in hits.scoreDocs:
            doc = self.searcher.doc(hit.doc)
            query_results.append({
                'score': hit.score,
                'title': doc['title'],
                'text': doc['text']
            })
        if not query_results:
            logger.warning(
                colored(
                    f'WARN: search engine returns no results for query: {query}.',
                    'yellow'))
        return query_results

    def _search_singlethread(
            self, queries: List[str],
            doc_max: int) -> List[List[Dict[str, Union[float, str]]]]:
        queries_result = []
        for query in queries:
            try:
                query = QueryParser('text', self.analyzer).parse(
                    QueryParser.escape(query))
            except Exception as exception:  # pylint: disable=broad-except
                logger.warning(
                    colored(f'{exception}: {query}, use query dummy.', 'yellow'))
                query = QueryParser('text', self.analyzer).parse('dummy')

            query_results = []
            hits = self.searcher.search(query, doc_max)
            for hit in hits.scoreDocs:
                doc = self.searcher.doc(hit.doc)
                query_results.append({
                    'score': hit.score,
                    'title': doc['title'],
                    'text': doc['text']
                })
            if not query_results:
                logger.warning(
                    colored(
                        f'WARN: search engine returns no results for query: {query}.',
                        'yellow'))
            queries_result.append(query_results)
        return queries_result

    def search(self,
               query: str,
               doc_max: int = 20) -> List[Dict[str, Union[float, str]]]:
        """Search a given query.

        Parameters
        ----------
        query : str
            Anything you want to search.
        doc_max : int, optional, default=20
            Maximum number of results to return.

        Returns
        -------
        List[Dict[str, Union[float, str]]]
            Search results.
        """
        return self.batch_search([query], doc_max=doc_max)[0]

    def batch_search(
            self,
            queries: List[str],
            doc_max: int = 20) -> List[List[Dict[str, Union[float, str]]]]:
        """Search a list of queries.

        Parameters
        ----------
        queries : List[str]
            List of queries.
        doc_max : int, optional, default=20
            Maximum number of docs returned by the search engine per query.

        Returns
        -------
        List[List[Dict[str, Union[float, str]]]]
            One result list per query.
        """
        if self.num_search_workers > 1:
            result = self._search_multithread(queries, doc_max)
        else:
            result = self._search_singlethread(queries, doc_max)
        return result

    @staticmethod
    def pprint(search_result: List[Dict[str, Union[float, str]]]) -> None:
        """Print the results returned by the doc searcher.

        Parameters
        ----------
        search_result : List[Dict[str, Union[float, str]]]
            Results returned from the ranker.
        """
        headers = ['Rank', 'Title', 'Text', 'Score']
        table = prettytable.PrettyTable(headers)
        for i, result in enumerate(search_result):
            text, title = result['text'], result['title']
            text = text[:100] + ' ...' if len(text) > 100 else text
            title = title[:30] + ' ...' if len(title) > 30 else title
            table.add_row([i, title, text, '%.5g' % result['score']])
        print('Top Results:')
        print(table)
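
# A minimal usage sketch, not part of the original module. It assumes the
# index_dir-based LuceneSearch defined above is the one in scope; the index
# and database paths are hypothetical placeholders for your own Lucene index
# directory and DrQA SQLite dump.
def _demo_lucene_search() -> None:
    searcher = LuceneSearch(index_dir='/path/to/lucene_index',  # hypothetical
                            db_path='/path/to/docs.db',         # hypothetical
                            num_search_workers=8)
    # search() returns a list of {'score', 'title', 'text'} dicts for one query.
    results = searcher.search('capital of France', doc_max=5)
    LuceneSearch.pprint(results)
    # batch_search() fans queries out over the thread pool when
    # num_search_workers > 1 and returns one result list per query.
    batched = searcher.batch_search(['first query', 'second query'], doc_max=10)
    print(f'{len(batched)} result lists returned')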
# Additional imports for the args-driven variant below. MyTFIDFSimilarity,
# MySimpleAnalyzer, utils.STOPWORDS, DATA_DIR, and DocDB are project-specific
# and are assumed to be importable from the surrounding package.
from java.io import StringReader
from lucene import collections
from org.apache.lucene.analysis import CharArraySet
from org.apache.lucene.analysis.tokenattributes import CharTermAttribute
from org.apache.lucene.index import Term
from org.apache.lucene.search import BooleanClause, BooleanQuery, PhraseQuery


class LuceneSearch(object):

    def __init__(self, args):
        self.env = lucene.initVM(initialheap='28g',
                                 maxheap='28g',
                                 vmargs=['-Djava.awt.headless=true'])
        self.args = args

        index_folder = os.path.join(DATA_DIR, args.index_folder)
        if not os.path.exists(index_folder):
            self.doc_db = DocDB()
            logger.info(f'Creating index at {index_folder}')
            self.create_index(index_folder)

        fsDir = MMapDirectory(Paths.get(index_folder))
        self.searcher = IndexSearcher(DirectoryReader.open(fsDir))
        self.searcher.setSimilarity(MyTFIDFSimilarity())
        self.analyzer = MySimpleAnalyzer(
            CharArraySet(collections.JavaSet(utils.STOPWORDS), True))
        self.pool = ThreadPool(processes=args.num_search_workers)

    def add_doc(self, title, text, tokens):
        doc = Document()
        doc.add(Field("title", title, self.t1))
        doc.add(Field("text", text, self.t2))
        doc.add(Field("token", tokens, self.t3))
        self.writer.addDocument(doc)

    def create_index(self, index_folder):
        os.mkdir(index_folder)

        self.t1 = FieldType()  # title: stored, indexed without frequencies
        self.t1.setStored(True)
        self.t1.setIndexOptions(IndexOptions.DOCS)

        self.t2 = FieldType()  # text: stored, indexed with positions
        self.t2.setStored(True)
        self.t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

        self.t3 = FieldType()  # tokens: stored only, not searchable
        self.t3.setStored(True)
        self.t3.setIndexOptions(IndexOptions.NONE)

        fsDir = MMapDirectory(Paths.get(index_folder))
        writerConfig = IndexWriterConfig(
            MySimpleAnalyzer(
                CharArraySet(collections.JavaSet(utils.STOPWORDS), True)))
        writerConfig.setSimilarity(MyTFIDFSimilarity())
        writerConfig.setRAMBufferSizeMB(16384.0)  # 16g
        self.writer = IndexWriter(fsDir, writerConfig)
        logger.info(f"{self.writer.numDocs()} docs in index")
        logger.info("Indexing documents...")

        doc_ids = self.doc_db.get_doc_ids()
        for doc_id in tqdm(doc_ids, total=len(doc_ids)):
            text = self.doc_db.get_doc_text(doc_id)
            tokens = self.doc_db.get_doc_tokens(doc_id)
            self.add_doc(doc_id, text, tokens)

        logger.info(f"Indexed {self.writer.numDocs()} docs.")
        self.writer.forceMerge(1)  # to increase search performance
        self.writer.close()

    def search_multithread(self, qs, ranker_doc_max, searcher):
        self.ranker_doc_max = ranker_doc_max
        self.curr_searcher = searcher
        out = self.pool.map(self.search_multithread_part, qs)
        return out

    def search_multithread_part(self, q):
        # Pool threads must be attached to the JVM before any Lucene call.
        if not self.env.isCurrentThreadAttached():
            self.env.attachCurrentThread()
        try:
            if self.args.ngram == 2:
                query = self._parse_query(field_name='text', query=q)
            else:  # self.args.ngram == 1
                query = QueryParser('text',
                                    self.analyzer).parse(QueryParser.escape(q))
        except Exception as e:
            logger.warning(colored(f'{e}: {q}, use query dummy.', 'yellow'))
            if self.args.ngram == 2:
                # Note: this retries the parse that just failed and will
                # re-raise if _parse_query itself threw the exception.
                query = self._parse_query(field_name='text', query=q)
            else:  # self.args.ngram == 1
                query = QueryParser('text', self.analyzer).parse('dummy')

        doc_scores, doc_titles, doc_texts, doc_words = [], [], [], []
        hits = self.curr_searcher.search(query, self.ranker_doc_max)
        for hit in hits.scoreDocs:
            doc = self.curr_searcher.doc(hit.doc)
            doc_scores.append(hit.score)
            doc_titles.append(doc['title'])
            doc_words.append(doc['token'].split('<&>'))
            doc_texts.append(doc['text'])
        if len(doc_scores) == 0:
            logger.warning(
                colored(f'WARN: search engine returns no results for query: {q}.',
                        'yellow'))
        return doc_scores, doc_titles, doc_texts, doc_words

    def search_singlethread(self, qs, ranker_doc_max, curr_searcher):
        out = []
        for q in qs:
            try:
                if self.args.ngram == 2:
                    query = self._parse_query(field_name='text', query=q)
                else:  # self.args.ngram == 1
                    query = QueryParser('text', self.analyzer).parse(
                        QueryParser.escape(q))
            except Exception as e:
                logger.warning(colored(f'{e}: {q}, use query dummy.', 'yellow'))
                if self.args.ngram == 2:
                    query = self._parse_query(field_name='text', query=q)
                else:  # self.args.ngram == 1
                    query = QueryParser('text', self.analyzer).parse('dummy')

            doc_scores, doc_titles, doc_texts, doc_words = [], [], [], []
            hits = curr_searcher.search(query, ranker_doc_max)
            for hit in hits.scoreDocs:
                doc = curr_searcher.doc(hit.doc)
                doc_scores.append(hit.score)
                doc_titles.append(doc['title'])
                doc_words.append(doc['token'].split('<&>'))
                doc_texts.append(doc['text'])
            if len(doc_scores) == 0:
                logger.warning(
                    colored(
                        f'WARN: search engine returns no results for query: {q}.',
                        'yellow'))
            out.append((doc_scores, doc_titles, doc_texts, doc_words))
        return out

    def batch_closest_docs(self, qs, ranker_doc_max):
        if self.args.num_search_workers > 1:
            out = self.search_multithread(qs, ranker_doc_max, self.searcher)
        else:
            out = self.search_singlethread(qs, ranker_doc_max, self.searcher)
        return out

    def _parse_query(self, field_name, query):
        # Tokenize the query with the analyzer, then OR together one phrase
        # query per (possibly multi-word) token.
        ts = self.analyzer.tokenStream("dummy", StringReader(query))
        termAtt = ts.getAttribute(CharTermAttribute.class_)
        ts.reset()
        tokens = []
        while ts.incrementToken():
            tokens.append(termAtt.toString())
        ts.end()
        ts.close()

        booleanQuery = BooleanQuery.Builder()
        for token in tokens:
            builder = PhraseQuery.Builder()
            for i, word in enumerate(token.split(' ')):
                builder.add(Term(field_name, word), i)
            pq = builder.build()
            booleanQuery.add(pq, BooleanClause.Occur.SHOULD)
        final_query = booleanQuery.build()
        return final_query
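
# A minimal usage sketch for the args-driven variant above, not part of the
# original module. `SearchArgs` is a hypothetical stand-in for the argparse
# namespace the class expects; it carries only the fields the code reads.
from dataclasses import dataclass


@dataclass
class SearchArgs:
    index_folder: str = 'lucene_index'  # joined onto DATA_DIR by __init__
    num_search_workers: int = 8
    ngram: int = 1  # 2 switches query parsing to _parse_query's phrase queries


def _demo_batch_closest_docs() -> None:
    ranker = LuceneSearch(SearchArgs())
    # batch_closest_docs() returns one (scores, titles, texts, tokens) tuple
    # per query; tokens come from the stored 'token' field split on '<&>'.
    out = ranker.batch_closest_docs(['capital of France'], ranker_doc_max=5)
    doc_scores, doc_titles, doc_texts, doc_words = out[0]
    print(doc_titles[:3])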