예제 #1
0
class IndexBuilder(object):
    """Write dict-shaped documents into a Lucene index.

    Each input document is a mapping that must provide the keys ".I", ".U"
    and ".T", and may additionally provide ".M" and ".W".
    NOTE(review): the key names look like OHSUMED/MEDLINE record tags
    (id, MEDLINE id, title, MeSH terms, abstract) -- confirm with the caller.
    """

    def __init__(self, index_path, update=False):
        """Open an ``IndexWriter`` over the directory at *index_path*.

        Args:
            index_path: Filesystem path of the index directory.
            update: When true the writer is opened with
                ``OpenMode.CREATE_OR_APPEND``; otherwise with
                ``OpenMode.CREATE``.
        """
        directory = FSDirectory.open(Paths.get(index_path))
        config = IndexWriterConfig(StandardAnalyzer())
        if update:
            config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
        else:
            config.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
        self.writer = IndexWriter(directory, config)

    @staticmethod
    def _compose_text(document):
        """Return the lower-cased ".M", ".T" and ".W" values joined by spaces.

        ".M" and ".W" are optional and skipped when absent; ".T" is required
        (a missing ".T" raises ``KeyError``).
        """
        parts = []
        if ".M" in document:
            parts.append(document[".M"])
        parts.append(document[".T"])
        if ".W" in document:
            parts.append(document[".W"])
        # A separator between every part keeps the last word of one part from
        # fusing with the first word of the next before tokenization.
        return " ".join(part.lower() for part in parts)

    def index_docs(self, input_documents):
        """Index every document in *input_documents*, then close the writer.

        For each document the lower-cased ".I" and ".U" values are stored as
        ``StringField``s, and the tokenized text built from ".M"/".T"/".W" is
        stored in a "text" field with positions, offsets and term vectors.
        When the writer was opened in ``CREATE`` mode documents are added;
        otherwise any existing document with the same ".U" is replaced.

        The writer is closed on success, so this method can be called only
        once per instance. If indexing fails, the writer is rolled back (which
        also closes it) and the exception is re-raised.

        Args:
            input_documents: A sized collection (``len()`` is called on it)
                of document mappings.

        Raises:
            KeyError: If a document lacks ".I", ".U" or ".T".
        """
        try:
            # The field type is identical for every document, so build it once.
            text_field_type = FieldType()
            text_field_type.setIndexOptions(
                IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
            text_field_type.setStored(True)
            text_field_type.setStoreTermVectors(True)
            text_field_type.setTokenized(True)

            for document in tqdm(input_documents, total=len(input_documents)):
                doc = Document()
                doc.add(StringField(".I", document[".I"].lower(),
                                    Field.Store.YES))
                doc_key = document[".U"].lower()
                doc.add(StringField(".U", doc_key, Field.Store.YES))

                # `tokenizer` is a module-level object defined outside this
                # class; its tokens are re-joined with single spaces.
                tokens = tokenizer.tokenize(self._compose_text(document))
                doc.add(Field("text", " ".join(tokens), text_field_type))

                open_mode = self.writer.getConfig().getOpenMode()
                if open_mode == IndexWriterConfig.OpenMode.CREATE:
                    self.writer.addDocument(doc)
                else:
                    # The ".U" field is indexed lower-cased and untokenized,
                    # so the delete term must use the same lower-cased value
                    # or the old document is never matched.
                    self.writer.updateDocument(Term(".U", doc_key), doc)
        except Exception:
            # Release the index lock without committing a half-built index.
            self.writer.rollback()
            raise
        self.writer.close()