class IndexBuilder(object):
    """Write record dicts into a Lucene index through a single IndexWriter.

    The writer is opened in ``__init__`` and released by ``index_docs``, so
    an instance is good for exactly one ``index_docs`` call.
    """

    def __init__(self, index_path, update=False):
        """Open an IndexWriter on the directory at ``index_path``.

        Args:
            index_path: Filesystem path of the index directory.
            update: If true the writer is opened with
                ``OpenMode.CREATE_OR_APPEND``; otherwise with
                ``OpenMode.CREATE``.
        """
        directory = FSDirectory.open(Paths.get(index_path))
        config = IndexWriterConfig(StandardAnalyzer())
        if update:
            config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        else:
            config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self.writer = IndexWriter(directory, config)

    @staticmethod
    def _build_text(document):
        """Return the tokenized, space-joined searchable text of a record.

        The text is ".M" (if present), ".T" (required) and ".W" (if
        present), lowercased and separated by single spaces so that the last
        word of one part cannot fuse with the first word of the next.
        """
        parts = []
        if ".M" in document:
            parts.append(document[".M"].lower())
        parts.append(document[".T"].lower())
        if ".W" in document:
            parts.append(document[".W"].lower())
        return " ".join(tokenizer.tokenize(" ".join(parts)))

    def index_docs(self, input_documents):
        """Index every record in ``input_documents``, then close the writer.

        Each record is a mapping with string values under the required keys
        ".I", ".U" and ".T" and the optional keys ".M" and ".W".
        NOTE(review): these look like OHSUMED-style record tags -- confirm.

        ".I" and ".U" are stored lowercased as exact-match string fields; the
        combined text goes into a stored, tokenized "text" field with term
        vectors, positions and offsets.

        In ``CREATE`` mode records are appended; in any other mode a record
        replaces existing documents that carry the same ".U" value.

        Args:
            input_documents: Sized collection of record mappings (``len()``
                is used for the progress bar total).

        Raises:
            KeyError: If a record lacks ".I", ".U" or ".T". The writer is
                rolled back before any exception propagates.
        """
        # The field configuration is identical for every record, so build it
        # once instead of once per document.
        text_field_type = FieldType()
        text_field_type.setIndexOptions(
            IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
        text_field_type.setStored(True)
        text_field_type.setStoreTermVectors(True)
        text_field_type.setTokenized(True)

        create_only = (self.writer.getConfig().getOpenMode()
                       == IndexWriterConfig.OpenMode.CREATE)

        try:
            for document in tqdm(input_documents, total=len(input_documents)):
                seq_id = document[".I"].lower()
                doc_uid = document[".U"].lower()

                doc = Document()
                doc.add(StringField(".I", seq_id, Field.Store.YES))
                doc.add(StringField(".U", doc_uid, Field.Store.YES))
                doc.add(Field("text", self._build_text(document),
                              text_field_type))

                if create_only:
                    self.writer.addDocument(doc)
                else:
                    # The delete term must be the exact value that was
                    # indexed (lowercased); StringField is not analyzed, so
                    # the raw value would miss and leave duplicates behind.
                    self.writer.updateDocument(Term(".U", doc_uid), doc)
        except BaseException:
            # Release the writer (and its lock on the index directory)
            # without committing a half-built index, then let the error
            # propagate unchanged.
            self.writer.rollback()
            raise
        self.writer.close()