def createIndexWriter(self, actual_dir, max_field_length=20000000):
    """
    Create and return an IndexWriter over the directory ``actual_dir``.

    The writer is opened in CREATE mode, so any existing index content
    in the directory is discarded.

    :param actual_dir: path to the index directory; created if missing
    :param max_field_length: unused — retained for backward compatibility
        with the legacy ``IndexWriter(..., MaxFieldLength(n))`` API
    :return: an open IndexWriter positioned on an empty index
    """
    ensureDirExists(actual_dir)
    index = SimpleFSDirectory(File(actual_dir))
    analyzer = StandardAnalyzer(LuceneVersion.LUCENE_CURRENT)
    writerConfig = IndexWriterConfig(LuceneVersion.LUCENE_CURRENT, analyzer)
    similarity = FieldAgnosticSimilarity()
    writerConfig.setSimilarity(similarity)
    # CREATE mode replaces any pre-existing index in the directory.
    writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    res = IndexWriter(index, writerConfig)
    # Redundant with OpenMode.CREATE, kept as an explicit guarantee
    # that the returned writer starts from an empty index.
    res.deleteAll()
    return res
def createIndexWriter(self, actual_dir, max_field_length=20000000):
    """
    Create and return an IndexWriter over the directory ``actual_dir``.

    Opens the index in CREATE mode, discarding any existing content.

    NOTE(review): this definition duplicates an identical
    ``createIndexWriter`` earlier in the file; one of the two should
    eventually be removed.

    :param actual_dir: path to the index directory; created if missing
    :param max_field_length: unused — retained for backward compatibility
        with the legacy ``IndexWriter(..., MaxFieldLength(n))`` API
    :return: an open IndexWriter positioned on an empty index
    """
    ensureDirExists(actual_dir)
    index = SimpleFSDirectory(File(actual_dir))
    analyzer = StandardAnalyzer(LuceneVersion.LUCENE_CURRENT)
    writerConfig = IndexWriterConfig(LuceneVersion.LUCENE_CURRENT, analyzer)
    similarity = FieldAgnosticSimilarity()
    writerConfig.setSimilarity(similarity)
    # CREATE mode replaces any pre-existing index in the directory.
    writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
    res = IndexWriter(index, writerConfig)
    # Redundant with OpenMode.CREATE, kept as an explicit guarantee
    # that the returned writer starts from an empty index.
    res.deleteAll()
    return res
class LuceneManager(object):
    """
    Context-manager wrapper around a Lucene index: owns the IndexWriter,
    a DirectoryReader kept near-real-time via the writer, and an
    IndexSearcher. Use as ``with LuceneManager(root) as mgr: ...``.

    Documents are keyed by their 'key' field; insert/update/delete all
    address documents through ``Term('key', ...)``.
    """

    def __init__(self, index_root_loc, index_subdir_name='.siftindex/index'):
        # index_root_loc: parent directory under which the index lives.
        # index_subdir_name: relative subpath of the actual index directory.
        self.index_root_loc = index_root_loc
        self.index_subdir_name = index_subdir_name

    def __enter__(self):
        """
        Used by "with" statement. Like an "open" / "init" method.
        """
        # Start the JVM once per process; repeated initVM calls would fail.
        if lucene.getVMEnv() is None:
            lucene.initVM(vmargs=['-Djava.awt.headless=true'])
        index_path = Path(self.index_root_loc).joinpath('%s/' % self.index_subdir_name)
        index_path.mkdir(parents=True, exist_ok=True)
        store = SimpleFSDirectory(Paths.get(str(index_path)))
        self.analyzer = StandardAnalyzer()
        config = IndexWriterConfig(self.analyzer)
        # CREATE_OR_APPEND: reuse an existing index, create it if absent.
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        # IndexWriter
        self.writer = IndexWriter(store, config)
        # IndexReader opened from the writer: a near-real-time reader that
        # can be refreshed via openIfChanged (see commit()).
        self.reader = DirectoryReader.open(self.writer)
        # IndexSearcher
        self.searcher = IndexSearcher(self.reader)
        return self

    def insert(self, document):
        # Add the document and echo back its 'key' field as a handle.
        self.writer.addDocument(document)
        return document['key']

    def delete(self, key):
        # Delete every document whose 'key' field matches exactly.
        self.writer.deleteDocuments(Term('key', key))
        return key

    def delete_all(self):
        # Remove all documents from the index (visible after commit()).
        self.writer.deleteAll()

    def num_docs(self):
        # Count as seen by the current reader snapshot — does not reflect
        # writes made since the last commit()/reader refresh.
        return self.reader.numDocs()

    def update(self, key, document):
        # atomic delete and add
        self.writer.updateDocument(Term('key', key), document)
        return key

    def exists(self, key):
        # Exact-term membership test against the current searcher snapshot.
        boolean_query = BooleanQuery.Builder()
        boolean_query.add(TermQuery(Term('key', key)), BooleanClause.Occur.MUST)
        results = self.searcher.search(boolean_query.build(), 1)
        return results.totalHits > 0

    def commit(self):
        """Flush pending writes and refresh the reader/searcher pair."""
        self.writer.commit()
        # make IndexReader reflect index updates
        # TODO: try IndexReader.isCurrent()
        # openIfChanged returns None when the index is unchanged, in which
        # case the existing reader/searcher stay valid.
        new_reader = DirectoryReader.openIfChanged(self.reader)
        if new_reader is not None:
            self.reader.close()  # note: not thread safe, may need to revisit
            self.reader = new_reader
            self.searcher = IndexSearcher(self.reader)

    def _process_search_result(self, result, highlighter=None):
        """
        Convert one ScoreDoc into a plain result dict; when a highlighter
        is supplied, include an excerpt of the 'body' field.
        """
        docid = result.doc  # this is not a stable identifier
        # obtain document through an IndexReader
        doc = self.searcher.doc(docid)
        # doc.getFields() -> field.name(), field.stringValue()
        # use highlighter to extract relevant part of body
        highlighted_text = ''
        if highlighter:
            contents = doc['body']
            token_stream = self.analyzer.tokenStream('body', contents)
            n_fragments = 3
            fragment_separator = '...'
            highlighted_text = highlighter.getBestFragments(
                token_stream, contents, n_fragments, fragment_separator)
        return {
            'fullpath': doc['fullpath'],
            'last_modified_time': doc['last_modified_time'],
            'score': result.score,
            'excerpt': highlighted_text
        }

    def search(self, terms, n_hits=5):
        """
        Run search query.

        Queries the 'fullpath' and 'body' fields; returns up to n_hits
        result dicts (see _process_search_result) with '*'-delimited
        excerpt highlighting.
        """
        # TODO: support date range queries
        # build query
        parser = MultiFieldQueryParser(['fullpath', 'body'], self.analyzer)
        #parser.setDefaultOperator(QueryParser.Operator.AND) # defaults to OR unless terms have modifier
        query = MultiFieldQueryParser.parse(parser, terms)  # https://stackoverflow.com/a/26853987/130164
        # create a highlighter
        highlighter = Highlighter(SimpleHTMLFormatter('*', '*'), QueryScorer(query))
        # execute search for top N hits
        return [
            self._process_search_result(result, highlighter)
            for result in self.searcher.search(query, n_hits).scoreDocs
        ]

    def get_all_docs(self, n_hits=1000):
        # debug method
        # Dump up to n_hits documents without highlighting.
        return [
            self._process_search_result(result)
            for result in self.searcher.search(MatchAllDocsQuery(), n_hits).scoreDocs
        ]

    def __exit__(self, type, value, traceback):
        """
        Used by the "with" statement. Handles close. TODO: error handling
        """
        self.writer.close()
        self.reader.close()

    def debug_analyzer(self, text):
        """
        Debug what StandardAnalyzer will give on this text.
        Ref: https://lucene.apache.org/core/7_6_0/core/org/apache/lucene/analysis/package-summary.html
        Ref: pylucene tests --> test_Analyzers.py, BaseTokenStreamTestCase.py

        Returns the list of token strings the analyzer produces for `text`.
        """
        token_stream = self.analyzer.tokenStream('field', text)
        termAtt = token_stream.getAttribute(CharTermAttribute.class_)
        # TokenStream contract: reset() before incrementToken(), then
        # end() and close() when done.
        token_stream.reset()
        tokens = []
        while token_stream.incrementToken():
            #tokens.append(token_stream.reflectAsString(True))
            tokens.append(termAtt.toString())
        token_stream.end()
        token_stream.close()
        return tokens
# Build a fresh question/answer index from spreadsheet rows.
# Relies on names defined elsewhere in the file: index_store, sheet0,
# sheet1, question_field, answer_field.
analyzer = PersianAnalyzer()
config = IndexWriterConfig(analyzer)
# CREATE: discard any existing index content in index_store.
config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
writer = IndexWriter(index_store, config)
TokenizeFields = True
# Question field type
qft = FieldType()
# qft.setIndexed(True) # todo
qft.setStored(True)
qft.setTokenized(TokenizeFields)
qft.setIndexOptions(IndexOptions.DOCS_AND_FREQS)
# Answer field type
aft = FieldType()
# aft.setIndexed(False) # todo
aft.setStored(True)  # answers are stored for retrieval only, not tokenized
writer.deleteAll()
# Row 0 is skipped — presumably a header row; TODO confirm.
for row in range(1, sheet1.nrows):
    doc = Document()
    # NOTE(review): questions are read from sheet1 but answers from
    # sheet0, while the loop bound is sheet1.nrows — verify this
    # cross-sheet pairing is intentional and that sheet0 has at least
    # as many rows as sheet1.
    row_q = str(sheet1.cell(row, 0).value)
    row_a = str(sheet0.cell(row, 1).value)
    doc.add(Field(question_field, row_q, qft))
    doc.add(Field(answer_field, row_a, aft))
    writer.addDocument(doc)
writer.commit()
writer.close()
print('indexing completed')