def __init__(self, dataset_directory, is_recursive=True):
    """Load a dataset rooted at *dataset_directory*.

    dataset_directory -- relative or absolute path to the directory that
        contains the ``documents`` directory and the
        ``dataset_metadata.txt`` file.
    is_recursive -- when True (the default) documents are found
        recursively under the ``documents`` directory.

    Document metadata types are inferred from each document's metadata.
    Documents whose metadata cannot be parsed are reported and dropped
    from the document list.
    """
    # Commonly used directory and file paths.
    self._dataset_directory = dataset_directory
    self._abs_dataset_directory = os.path.abspath(dataset_directory)
    self._metadata_file = os.path.join(self._abs_dataset_directory,
                                       'dataset_metadata.txt')
    self._documents_directory = os.path.join(self._abs_dataset_directory,
                                             'documents')
    self.is_recursive = is_recursive
    self._filters = []
    self.metadata = {}

    # Load the dataset-level metadata; errors='ignore' tolerates stray
    # non-UTF-8 bytes in the metadata file.
    with io.open(self._metadata_file, 'r', encoding='utf-8',
                 errors='ignore') as meta_file:
        content = meta_file.read()
    # NOTE: "seperate" is the helper's actual (misspelled) name in
    # basic_tools; do not "fix" it here without renaming the helper.
    metadata, __ = basic_tools.seperate_metadata_and_content(content)
    self.metadata = basic_tools.metadata_to_dict(metadata)
    self.metadata_types = {}
    basic_tools.collect_types(self.metadata_types, self.metadata)

    # Prefer an explicit readable_name; otherwise derive a title-cased
    # identifier from the directory path.
    if 'readable_name' in self.metadata:
        identifier = self.metadata['readable_name']
    else:
        identifier = self._dataset_directory.replace('_', ' ').replace(
            '/', ' ').title()
    self.name = basic_tools.remove_punctuation(identifier)

    # Find and sort all document file paths.
    self._list_of_documents = basic_tools.get_all_files_from_directory(
        self._documents_directory, self.is_recursive)
    self._list_of_documents.sort()

    # Collect per-document metadata types; remember indices of documents
    # whose metadata cannot be processed.  Iterating over self assumes
    # the enclosing class defines __iter__ over the document list.
    self.document_metadata_types = {}
    bad_doc_indices = []
    for doc_index, doc in enumerate(self):
        try:
            basic_tools.collect_types(self.document_metadata_types,
                                      doc.metadata)
        except Exception:
            print("Bad document: ", self._list_of_documents[doc_index])
            bad_doc_indices.append(doc_index)
    # Delete in reverse so earlier indices remain valid after each del.
    for remove_index in reversed(bad_doc_indices):
        del self._list_of_documents[remove_index]
def __init__(self, dataset_directory, is_recursive=True):
    """Load a dataset rooted at *dataset_directory*.

    dataset_directory -- relative or absolute path to the directory that
        contains the ``documents`` directory and the
        ``dataset_metadata.txt`` file.
    is_recursive -- when True (the default) documents are found
        recursively under the ``documents`` directory.

    Document metadata types are inferred from each document's metadata;
    documents with unparseable metadata are reported and removed.
    """
    # Commonly used directory and file paths.
    self._dataset_directory = dataset_directory
    self._abs_dataset_directory = os.path.abspath(dataset_directory)
    self._metadata_file = os.path.join(self._abs_dataset_directory,
                                       'dataset_metadata.txt')
    self._documents_directory = os.path.join(self._abs_dataset_directory,
                                             'documents')
    self.is_recursive = is_recursive
    self._filters = []
    self.metadata = {}

    # Load dataset metadata, tolerating stray non-UTF-8 bytes.
    with io.open(self._metadata_file, 'r', encoding='utf-8',
                 errors='ignore') as meta_file:
        content = meta_file.read()
    # NOTE: "seperate" matches the helper's misspelled name in basic_tools.
    metadata, __ = basic_tools.seperate_metadata_and_content(content)
    self.metadata = basic_tools.metadata_to_dict(metadata)
    self.metadata_types = {}
    basic_tools.collect_types(self.metadata_types, self.metadata)

    # Use the declared readable_name if present, else derive one from
    # the directory path.
    if 'readable_name' in self.metadata:
        identifier = self.metadata['readable_name']
    else:
        identifier = self._dataset_directory.replace('_', ' ').replace(
            '/', ' ').title()
    self.name = basic_tools.remove_punctuation(identifier)

    # Find and sort all document file paths.
    self._list_of_documents = basic_tools.get_all_files_from_directory(
        self._documents_directory, self.is_recursive)
    self._list_of_documents.sort()

    # Infer document metadata types, tracking documents that fail.
    # Iterating over self assumes the enclosing class defines __iter__.
    self.document_metadata_types = {}
    bad_doc_indices = []
    for doc_index, doc in enumerate(self):
        try:
            basic_tools.collect_types(self.document_metadata_types,
                                      doc.metadata)
        except Exception:
            print("Bad document: ", self._list_of_documents[doc_index])
            bad_doc_indices.append(doc_index)
    # Remove bad documents from highest index to lowest so each del
    # does not shift the indices still to be removed.
    for remove_index in reversed(bad_doc_indices):
        del self._list_of_documents[remove_index]
def __init__(self, dataset_directory, **kwargs):
    """Initialize a synthetic dataset from ``settings.json``.

    dataset_directory -- path to the directory that may contain a
        ``settings.json`` file with a top-level ``"settings"`` object.
        Missing settings fall back to built-in defaults.
    """
    self._dataset_directory = dataset_directory
    self._abs_dataset_directory = os.path.abspath(dataset_directory)
    self._settings_file = os.path.join(self._abs_dataset_directory,
                                       'settings.json')

    # Read the settings file if one exists; otherwise start empty.
    if os.path.exists(self._settings_file):
        with io.open(self._settings_file, 'r', encoding='utf-8') as settings_file:
            self._settings = json.load(settings_file)['settings']
    else:
        self._settings = {}

    # The settings double as the dataset metadata.
    self.metadata = self._settings
    self.metadata_types = {}
    basic_tools.collect_types(self.metadata_types, self.metadata)
    self.document_metadata_types = {'meta': 'text'}

    # Pull out the generation parameters, writing defaults back into
    # the settings dict for any that were absent.
    self.name = self._settings.setdefault('name', 'random')
    self.number_of_documents = self._settings.setdefault(
        'number_of_documents', 1000)
    self.document_length = self._settings.setdefault('document_length', 1000)
    self.seed = self._settings.setdefault('seed', 0)
def __init__(self, dataset_directory, **kwargs):
    """Initialize a synthetic dataset from ``settings.json``.

    dataset_directory -- path to a directory that may contain a
        ``settings.json`` file whose top-level ``"settings"`` object
        configures generation; absent keys get built-in defaults.
    """
    self._dataset_directory = dataset_directory
    self._abs_dataset_directory = os.path.abspath(dataset_directory)
    self._settings_file = os.path.join(self._abs_dataset_directory,
                                       'settings.json')

    # Load settings when the file exists; otherwise begin with none.
    self._settings = {}
    if os.path.exists(self._settings_file):
        with io.open(self._settings_file, 'r', encoding='utf-8') as fh:
            self._settings = json.loads(fh.read())['settings']

    # Settings serve directly as the dataset metadata.
    self.metadata = self._settings
    self.metadata_types = {}
    basic_tools.collect_types(self.metadata_types, self.metadata)
    self.document_metadata_types = {'meta': 'text'}

    # Fill in defaults for any missing generation parameters, then
    # expose each one as an attribute.
    for key, default in (('name', 'random'),
                         ('number_of_documents', 1000),
                         ('document_length', 1000),
                         ('seed', 0)):
        self._settings.setdefault(key, default)
    self.name = self._settings['name']
    self.number_of_documents = self._settings['number_of_documents']
    self.document_length = self._settings['document_length']
    self.seed = self._settings['seed']
def metadata_types(self):
    """Return a fresh dict mapping metadata fields to their collected types."""
    collected = {}
    basic_tools.collect_types(collected, self.metadata)
    return collected