Example #1
def create_index(directory, analyzer, documents_to_index):
    config = index.IndexWriterConfig(analyzer)
    index_writer = index.IndexWriter(directory, config)
    for doc in documents_to_index:
        index_writer.addDocument(doc)

    index_writer.close()


def create_index_for_wiki_sentence(filename, path, firstTime=False):
    logging.info('Start create wiki_sentence!')
    wiki_dict = get_wiki_data(path)

    logging.info('Start creating index!')
    filename = '_wiki_sentence'
    analyzer = analysis.standard.StandardAnalyzer()

    # Store the index on disk (SimpleFSDirectory) under HOMEPATH:
    base_dir = HOMEPATH
    INDEX_DIR = "IndexFiles" + filename + ".index"
    storeDir = os.path.join(base_dir, INDEX_DIR)
    if not os.path.exists(storeDir):
        os.mkdir(storeDir)
    directory = SimpleFSDirectory(Paths.get(storeDir))
    if firstTime:
        config = index.IndexWriterConfig(analyzer)
        iwriter = index.IndexWriter(directory, config)
        for cnt, key in enumerate(wiki_dict.keys()):
            if cnt % 1000 == 0:
                logging.info(
                    'I have preprocessed {} index in creating index by document!'
                    .format(str(cnt)))
            org_title = key[0]
            preprocessed_title = key[1]
            doc_id = key[2]
            sentence = wiki_dict[key]
            doc = create_document_by_document_sentence(org_title,
                                                       preprocessed_title,
                                                       doc_id, sentence)
            iwriter.addDocument(doc)
        iwriter.close()
    logging.info('Finish creating index wiki_sentence!')
    return directory
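
The example above omits its imports and a few project-specific helpers (HOMEPATH, get_wiki_data, create_document_by_document_sentence). A minimal sketch of the PyLucene setup it appears to rely on, with a placeholder path and an illustrative search at the end:

import os
import logging
import lucene
from java.nio.file import Paths
from org.apache.lucene import analysis, index
from org.apache.lucene.store import SimpleFSDirectory
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.search import IndexSearcher

lucene.initVM()  # the JVM must be started before any Lucene class is used

# Build (or reopen) the sentence index; the path argument here is a placeholder.
directory = create_index_for_wiki_sentence('_wiki_sentence', 'wiki_data_path', firstTime=True)
searcher = IndexSearcher(DirectoryReader.open(directory))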
Example #3
File: indexers.py Project: napoler/lupyne
 def __init__(self,
              directory=None,
              mode: str = 'a',
              analyzer=None,
              version=None,
              **attrs):
     self.shared = closing()
     config = index.IndexWriterConfig(
     ) if analyzer is None else index.IndexWriterConfig(
         self.shared.analyzer(analyzer))
     config.openMode = index.IndexWriterConfig.OpenMode.values()[
         'wra'.index(mode)]
     for name, value in attrs.items():
         setattr(config, name, value)
     self.policy = config.indexDeletionPolicy = index.SnapshotDeletionPolicy(
         config.indexDeletionPolicy)
     super().__init__(self.shared.directory(directory), config)
     self.fields = {}  # type: dict
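
In this lupyne constructor, the single-character mode is mapped onto Lucene's OpenMode by position. Assuming the declaration order of IndexWriterConfig.OpenMode is CREATE, APPEND, CREATE_OR_APPEND, the 'wra'.index(mode) expression above works out roughly as sketched here (illustration only, not lupyne's public API):

from org.apache.lucene import index

open_modes = index.IndexWriterConfig.OpenMode.values()
# 'w' -> open_modes[0] == OpenMode.CREATE            (overwrite any existing index)
# 'r' -> open_modes[1] == OpenMode.APPEND            (an index must already exist)
# 'a' -> open_modes[2] == OpenMode.CREATE_OR_APPEND  (create if missing, else append)
config = index.IndexWriterConfig()
config.openMode = open_modes['wra'.index('a')]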
    def createIndex_Stem_Lemma_SpacyIndex(self):
        print("In create index method")
        spacy_file = self.directory+"wiki_spacy_lemma_pos.json"
        my_analyzer = analysis.en.EnglishAnalyzer()
        my_config = index.IndexWriterConfig(my_analyzer)
        my_config.setSimilarity(ClassicSimilarity())
        my_writer = index.IndexWriter(self.in_directory_English_lemma, my_config)
        # Setting up Title field for content we want tokenized
        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(True)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)  # index frequencies and positions for ranking
        # Setting up Body field for content we want tokenized
        t2 = FieldType()
        t2.setStored(True)
        t2.setTokenized(True)
        t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)  # frequencies and positions are needed to generate the ranking
        # Setting up Categories field for content we want tokenized
        t3 = FieldType()
        t3.setStored(True)
        t3.setTokenized(True)
        t3.setIndexOptions(IndexOptions.DOCS)  # DOCS is enough; category matches need no frequencies or positions
        # Setting up Body POS field for content we want tokenized
        t4 = FieldType()
        t4.setStored(True)
        t4.setTokenized(True)
        t4.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)  # frequencies and positions are needed to generate the ranking

        nDocsAdded = 0
        docs = self.readJSONFromDisk(spacy_file)
        print("Len of file is", len(docs))
        for doc in docs:
            title = doc[0]
            lemma = doc[1]
            category = doc[2]
            pos = doc[3]
            doc = Document()

            doc.add(Field(self.TITLE, title, t1))
            doc.add(Field(self.TEXT, lemma, t2))
            doc.add(Field("Categories", category, t3))
            doc.add(Field("POS", pos, t4))
            my_writer.addDocument(doc)
            nDocsAdded +=1
        # commit and close so all added documents are flushed to the index directory
        my_writer.commit()
        my_writer.close()
        print("Indexed %d documents with spacy." % nDocsAdded)
        pass
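
readJSONFromDisk is referenced but not shown in this example. Judging from how each entry is unpacked above (title, lemma text, categories, POS tags), a hypothetical minimal version could look like this:

    def readJSONFromDisk(self, json_file):
        # Hypothetical helper (not part of the original example): assumed to
        # return a list of [title, lemma_text, categories_string, pos_string] entries.
        import json
        with open(json_file, 'r', encoding='utf8') as f:
            return json.load(f)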
    def createIndex_simple(self,input_files):
        # open file and read lines
        docs = []
        cur_title = ""
        cur_body = ""
        cur_category = []
        file_counter = 0
        ip_file_counter = 1
        # Initialize Standard analyzer & Index writer
        my_analyzer = analysis.standard.StandardAnalyzer()
        my_config = index.IndexWriterConfig(my_analyzer)
        # Set ClassicSimilarity for tf-idf
        #my_config.setSimilarity(ClassicSimilarity())
        my_writer = index.IndexWriter(self.in_directory, my_config)

        # Setting up Title field for content we want tokenized
        t1 = FieldType()
        t1.setStored(True)
        t1.setTokenized(True)
        t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)  # index frequencies and positions for ranking
        # Setting up Body field for content we want tokenized
        t2 = FieldType()
        t2.setStored(True)
        t2.setTokenized(True)
        t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)  # frequencies and positions are needed to generate the ranking
        # Setting up Categories field for content we want tokenized
        t3 = FieldType()
        t3.setStored(True)
        t3.setTokenized(True)
        t3.setIndexOptions(IndexOptions.DOCS)  # DOCS is enough; category matches need no frequencies or positions
        nDocsAdded = 0
        print("List of input files is",input_files)
        for input_file in input_files:
            with open(input_file, 'r', encoding='utf8') as f1:
                # Assumes each input file holds multiple documents, each beginning with a title line of the form [[Title]]
                my_line = f1.readline()
                while my_line:
                    if my_line.startswith("[[") and my_line.rstrip().endswith("]]"):
                        if cur_title != "":
                            # a new title starts, so flush the previous document to the index first
                            doc = Document()
                            doc.add(Field(self.TITLE, cur_title, t1))
                            doc.add(Field(self.TEXT, cur_body, t2))
                            doc.add(Field("Categories", self.listToString(cur_category), t3))
                            my_writer.addDocument(doc)
                            # increment counters and reset document variables
                            nDocsAdded += 1
                            cur_title = ""
                            cur_body = ""
                            cur_category = []
                            file_counter += 1
                        # store current title
                        cur_title = my_line[2:-3]
                    # store categories as a string
                    elif my_line.startswith("CATEGORIES:"):
                        # categories are in a line that starts with CATEGORIES and each category is separated by ", "
                        cur_category = my_line[11:].strip().split(", ")
                    # store body of document
                    else:
                        cur_body += my_line
                    #read next line
                    my_line = f1.readline()
                file_counter += 1
                print("File counter",file_counter) # ,"cur category",listToString(cur_category)
                # on EOF save document to index
                doc = Document()
                doc.add(Field(self.TITLE, cur_title, t1))
                doc.add(Field(self.TEXT, cur_body, t2))
                doc.add(Field("Categories", self.listToString(cur_category), t3))
                my_writer.addDocument(doc)
                nDocsAdded += 1
                cur_title = ""
                cur_body = ""
                cur_category = []
            ip_file_counter += 1
        # commit and close so all added documents are flushed to the index directory
        my_writer.commit()
        my_writer.close()
        print("Indexed %d documents." % nDocsAdded)
        pass
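
createIndex_simple above walks input files in which every document begins with a [[Title]] line, an optional CATEGORIES: line follows, and everything else is body text (the stemmed variant below parses the same format). A hypothetical two-document input file, followed by a sketch of the listToString helper that the code calls but does not show:

[[Anarchism]]
CATEGORIES: Political ideologies, Anti-capitalism
Anarchism is a political philosophy that questions the legitimacy of authority.
[[Autism]]
CATEGORIES: Neurological disorders
Autism is a developmental condition that affects communication and behaviour.

    def listToString(self, items):
        # Hypothetical helper: the Categories field only needs the list joined back into one string.
        return ", ".join(items)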
 def createIndex_Stem(self,input_files):
     cur_title = ""
     cur_body = ""
     cur_category = []
     file_counter = 0
     ip_file_counter = 1
     # Initialize PorterStemmer analyzer & Index writer
     my_analyzer = analysis.en.EnglishAnalyzer()
     my_config = index.IndexWriterConfig(my_analyzer)
     my_config.setSimilarity(ClassicSimilarity())
     my_writer = index.IndexWriter(self.in_directory_English, my_config)
     # Setting up Title field for content we want tokenized
     t1 = FieldType()
     t1.setStored(True)
     t1.setTokenized(True)
     t1.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)
     # Setting up Body field for content we want tokenized
     t2 = FieldType()
     t2.setStored(True)
     t2.setTokenized(True)
      t2.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)  # frequencies and positions are needed to generate the ranking
     # Setting up Categories field for content we want tokenized
     t3 = FieldType()
     t3.setStored(True)
     t3.setTokenized(True)
     t3.setIndexOptions(IndexOptions.DOCS)
     nDocsAdded = 0
     print("List of input files is",input_files)
     for input_file in input_files:
         with open(input_file, 'r', encoding='utf8') as f1:
              # Assumes each input file holds multiple documents, each beginning with a title line of the form [[Title]]
             my_line = f1.readline()
             while my_line:
                 if my_line.startswith("[[") and my_line.rstrip().endswith("]]"):
                     if cur_title != "":
                         doc = Document()
                         doc.add(Field(self.TITLE, cur_title, t1))
                         doc.add(Field(self.TEXT, cur_body, t2))
                         doc.add(Field("Categories", self.listToString(cur_category), t3))
                         my_writer.addDocument(doc)
                         nDocsAdded += 1
                         cur_body = ""
                         cur_category = []
                         file_counter += 1
                     cur_title = my_line[2:-3]
                 elif my_line.startswith("CATEGORIES:"):
                     cur_category = my_line[11:].strip().split(", ")
                 else:
                     cur_body += my_line
                 my_line = f1.readline()
             file_counter += 1
              doc = Document()
              doc.add(Field(self.TITLE, cur_title, t1))
              doc.add(Field(self.TEXT, cur_body, t2))
              doc.add(Field("Categories", self.listToString(cur_category), t3))
              my_writer.addDocument(doc)
              nDocsAdded += 1
              cur_title = ""
              cur_body = ""
              cur_category = []
         ip_file_counter += 1
     my_writer.commit()
     my_writer.close()
     print("Indexed %d documents." % nDocsAdded)
     pass
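
Since createIndex_Stem builds its index with EnglishAnalyzer and ClassicSimilarity, a query against it would normally mirror both choices so tokenization and tf-idf scoring stay consistent. A minimal search sketch, assuming the field names and directory attribute from the class above (self refers to the same indexer instance):

from org.apache.lucene.analysis.en import EnglishAnalyzer
from org.apache.lucene.index import DirectoryReader
from org.apache.lucene.queryparser.classic import QueryParser
from org.apache.lucene.search import IndexSearcher
from org.apache.lucene.search.similarities import ClassicSimilarity

reader = DirectoryReader.open(self.in_directory_English)
searcher = IndexSearcher(reader)
searcher.setSimilarity(ClassicSimilarity())  # match the similarity used at indexing time
query = QueryParser(self.TEXT, EnglishAnalyzer()).parse("information retrieval")
hits = searcher.search(query, 10).scoreDocs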