예제 #1
0
 def load_docs(self):
     """
     Load every processed document of the collection into ``self.documents``.

     A pickle cache at ``pickle/<self.name>_docs.p`` (relative to the current
     working directory) is tried first. If that file does not exist, the
     corpus sub-directories ``0`` .. ``9`` under ``self.path_to_corpus`` are
     scanned, one ``Document`` is built and processed per listed file, each
     one is appended to ``self.documents``, and the resulting list is written
     to the cache for the next call.

     Side effects: sets ``self.documents`` (replaced on a cache hit, appended
     to on a cache miss) and ``self.number_of_docs``; may create the
     ``pickle/`` directory and the cache file; prints one progress line per
     directory when rebuilding.

     NOTE: unpickling executes arbitrary code, so the cache file must only
     ever come from a trusted source (i.e. a previous run of this method).
     """
     pickle_path = f"pickle/{self.name}_docs.p"
     try:
         # Context manager guarantees the cache file handle is closed,
         # even if unpickling fails.
         with open(pickle_path, "rb") as cache_file:
             self.documents = load(cache_file)
     except FileNotFoundError:
         # No cache yet: build the collection from the raw corpus.
         number_document_loaded = 0
         for id_directory in range(10):
             print(f"Loading directory {id_directory}")
             # path_to_corpus is concatenated as-is, so it is expected to
             # already end with a path separator.
             path_directory = self.path_to_corpus + str(id_directory)
             for text_file in listdir(path_directory):
                 # create a document instance
                 document = Document(
                     id_doc=number_document_loaded,
                     id_folder=id_directory,
                     address=text_file,
                 )
                 # load data and process documents (filter, remove stopwords and lemmatize)
                 document.get_content(self.path_to_corpus)
                 document.process_document(stopwords_list=self.stopwords,
                                           lemmatizer=self.lemmatizer)
                 self.documents.append(document)
                 number_document_loaded += 1
         makedirs(path.dirname(pickle_path), exist_ok=True)
         # Close (and therefore flush) the cache file deterministically.
         with open(pickle_path, "wb") as cache_file:
             dump(self.documents, cache_file)
         self.number_of_docs = number_document_loaded
     else:
         self.number_of_docs = len(self.documents)