def _read_document(self):
    """Read the document file and set self._metadata and self._content.

    The file at self._document_path is read as UTF-8 text (undecodable
    bytes are ignored) and split into a metadata part and a content part.
    The metadata part is converted to a dict and stored in self._metadata.
    The content part is passed through every callable in self._filters,
    in order, and the final result is stored in self._content.
    """
    with io.open(self._document_path, 'r', encoding='utf-8',
                 errors='ignore') as document_file:
        raw_text = document_file.read()

    header, body = basic_tools.seperate_metadata_and_content(raw_text)
    self._metadata = basic_tools.metadata_to_dict(header)

    # Each filter receives the current content and returns its replacement.
    for apply_filter in self._filters:
        body = apply_filter(body)
    self._content = body
def __init__(self, dataset_directory, is_recursive=True):
    """
    Build a dataset from a directory on disk.

    dataset_directory -- a relative or absolute file path to the directory
        that contains the 'documents' directory and the
        'dataset_metadata.txt' file.
    is_recursive -- find documents recursively in the documents directory;
        by default documents will be found recursively.

    The dataset metadata is read from 'dataset_metadata.txt'. Document
    metadata types are inferred from the document metadata; any document
    for which that fails is printed as a bad document and removed from
    the list of documents.
    """
    # Frequently used directory and file paths.
    self._dataset_directory = dataset_directory
    self._abs_dataset_directory = os.path.abspath(dataset_directory)
    self._metadata_file = os.path.join(self._abs_dataset_directory,
                                       'dataset_metadata.txt')
    self._documents_directory = os.path.join(self._abs_dataset_directory,
                                             'documents')
    self.is_recursive = is_recursive
    self._filters = []
    self.metadata = {}

    # Dataset-level metadata: only the metadata part of the file is used.
    with io.open(self._metadata_file, 'r', encoding='utf-8',
                 errors='ignore') as metadata_handle:
        raw_text = metadata_handle.read()
    header, _unused_body = basic_tools.seperate_metadata_and_content(raw_text)
    self.metadata = basic_tools.metadata_to_dict(header)
    self.metadata_types = {}
    basic_tools.collect_types(self.metadata_types, self.metadata)

    # Prefer an explicit readable name; otherwise derive one from the
    # directory path exactly as it was passed in.
    if 'readable_name' in self.metadata:
        label = self.metadata['readable_name']
    else:
        spaced = self._dataset_directory.replace('_', ' ').replace('/', ' ')
        label = spaced.title()
    self.name = basic_tools.remove_punctuation(label)

    # Find every document path and keep the list sorted.
    self._list_of_documents = basic_tools.get_all_files_from_directory(
        self._documents_directory, self.is_recursive)
    self._list_of_documents.sort()

    # Collect document metadata types, remembering documents that fail.
    self.document_metadata_types = {}
    failed_positions = []
    for position, document in enumerate(self):
        try:
            basic_tools.collect_types(self.document_metadata_types,
                                      document.metadata)
        except Exception:
            print("Bad document: ", self._list_of_documents[position])
            failed_positions.append(position)

    # Delete from the highest index down so that earlier deletions do not
    # shift the positions still waiting to be removed.
    for position in reversed(failed_positions):
        del self._list_of_documents[position]
def __init__(self, dataset_directory, is_recursive=True):
    """
    Build a dataset from a directory on disk.

    dataset_directory -- a relative or absolute file path to the directory
        that contains the 'documents' directory and the
        'dataset_metadata.txt' file.
    is_recursive -- find documents recursively in the documents directory;
        by default documents will be found recursively.

    The dataset metadata is read from 'dataset_metadata.txt'. Document
    metadata types are inferred from the document metadata; any document
    for which that fails is printed as a bad document and removed from
    the list of documents.
    """
    # Frequently used directory and file paths.
    self._dataset_directory = dataset_directory
    self._abs_dataset_directory = os.path.abspath(dataset_directory)
    self._metadata_file = os.path.join(self._abs_dataset_directory,
                                       'dataset_metadata.txt')
    self._documents_directory = os.path.join(self._abs_dataset_directory,
                                             'documents')
    self.is_recursive = is_recursive
    self._filters = []
    self.metadata = {}

    # Dataset-level metadata: only the metadata part of the file is used.
    with io.open(self._metadata_file, 'r', encoding='utf-8',
                 errors='ignore') as metadata_handle:
        raw_text = metadata_handle.read()
    header, _unused_body = basic_tools.seperate_metadata_and_content(raw_text)
    self.metadata = basic_tools.metadata_to_dict(header)
    self.metadata_types = {}
    basic_tools.collect_types(self.metadata_types, self.metadata)

    # Prefer an explicit readable name; otherwise derive one from the
    # directory path exactly as it was passed in.
    if 'readable_name' in self.metadata:
        label = self.metadata['readable_name']
    else:
        spaced = self._dataset_directory.replace('_', ' ').replace('/', ' ')
        label = spaced.title()
    self.name = basic_tools.remove_punctuation(label)

    # Find every document path and keep the list sorted.
    self._list_of_documents = basic_tools.get_all_files_from_directory(
        self._documents_directory, self.is_recursive)
    self._list_of_documents.sort()

    # Collect document metadata types, remembering documents that fail.
    self.document_metadata_types = {}
    failed_positions = []
    for position, document in enumerate(self):
        try:
            basic_tools.collect_types(self.document_metadata_types,
                                      document.metadata)
        except Exception:
            print("Bad document: ", self._list_of_documents[position])
            failed_positions.append(position)

    # Delete from the highest index down so that earlier deletions do not
    # shift the positions still waiting to be removed.
    for position in reversed(failed_positions):
        del self._list_of_documents[position]