def get_database_configurations(file_path):
    """
    Get the database configurations; an error is raised if the file cannot be
    read or is not found.  If the contents don't make sense, an empty
    dictionary is returned.
    Relative file names are resolved against the working directory.
    file_path -- file specifying the database configurations in
                 key: value pairs
    """
    database_config = {}
    key_names = ['ENGINE', 'NAME', 'HOST', 'OPTIONS', 'PASSWORD', 'PORT', 'USER']
    # read in the database configurations
    with open(file_path, 'r') as f:
        database_config = basic_tools.metadata_to_dict(f.read())
    # make sure the key names are upper case
    for key in key_names:
        if key.lower() in database_config:
            database_config[key] = database_config[key.lower()]
            del database_config[key.lower()]
    # make sure that a relative path is resolved relative to the working directory
    if 'NAME' in database_config and not isdir(database_config['NAME']):
        if not isabs(database_config['NAME']):
            # create an absolute path if a relative one is specified
            topical_guide_dir = os.path.abspath(os.path.dirname(__file__))
            database_config['NAME'] = os.path.join(topical_guide_dir, 'working',
                                                   database_config['NAME'])
    return database_config
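
# A minimal usage sketch for get_database_configurations.  The file name
# 'database_config.txt' and its contents are illustrative assumptions: the file
# is expected to hold "key: value" lines (e.g. "engine: django.db.backends.sqlite3"
# and "name: default.sqlite3") that basic_tools.metadata_to_dict can parse.
# Keys are normalized to upper case and a relative NAME is resolved against the
# module's 'working' directory.
if __name__ == '__main__':
    example_config = get_database_configurations('database_config.txt')
    print(example_config.get('ENGINE'), example_config.get('NAME'))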
def _read_document(self):
    """Must set the self._content and self._metadata variables."""
    with io.open(self._document_path, 'r', encoding='utf-8', errors='ignore') as f:
        text = f.read()
    metadata, content = basic_tools.seperate_metadata_and_content(text)
    self._metadata = basic_tools.metadata_to_dict(metadata)
    # apply each registered filter to the document content, in order
    for doc_filter in self._filters:
        content = doc_filter(content)
    self._content = content
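
# Sketch of a content filter for _read_document above.  Assumption: each entry
# in self._filters is a plain callable that takes the document text and returns
# the transformed text, which is how the loop above applies them.
def lowercase_filter(content):
    """Example filter: normalize document content to lower case."""
    return content.lower()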
def __init__(self, dataset_directory, is_recursive=True):
    """
    dataset_directory -- a relative or absolute path to the directory that
                         contains the documents directory and the
                         dataset_metadata.txt file
    is_recursive -- find documents recursively in the documents directory;
                    documents are found recursively by default
    Document metadata types are inferred from the document metadata.
    """
    # create commonly used directory and file paths
    self._dataset_directory = dataset_directory
    self._abs_dataset_directory = os.path.abspath(dataset_directory)
    self._metadata_file = os.path.join(self._abs_dataset_directory,
                                       'dataset_metadata.txt')
    self._documents_directory = os.path.join(self._abs_dataset_directory,
                                             'documents')
    self.is_recursive = is_recursive
    self._filters = []
    self.metadata = {}

    # load the dataset metadata
    with io.open(self._metadata_file, 'r', encoding='utf-8', errors='ignore') as meta_file:
        content = meta_file.read()
    metadata, __ = basic_tools.seperate_metadata_and_content(content)
    self.metadata = basic_tools.metadata_to_dict(metadata)
    self.metadata_types = {}
    basic_tools.collect_types(self.metadata_types, self.metadata)

    if 'readable_name' not in self.metadata:
        identifier = self._dataset_directory.replace('_', ' ').replace('/', ' ').title()
    else:
        identifier = self.metadata['readable_name']
    self.name = basic_tools.remove_punctuation(identifier)

    # find and sort all file paths
    self._list_of_documents = basic_tools.get_all_files_from_directory(
        self._documents_directory, self.is_recursive)
    self._list_of_documents.sort()

    # find any bad documents and collect document metadata types
    self.document_metadata_types = {}
    bad_doc_indices = []
    for doc_index, doc in enumerate(self):
        try:
            basic_tools.collect_types(self.document_metadata_types, doc.metadata)
        except Exception:
            print("Bad document: ", self._list_of_documents[doc_index])
            bad_doc_indices.append(doc_index)
    # remove bad documents from the end so earlier indices stay valid
    while bad_doc_indices:
        remove_index = bad_doc_indices.pop()
        del self._list_of_documents[remove_index]
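
# Usage sketch for the importer class whose __init__ appears above.  The class
# name 'GenericDataset' and the dataset path are assumptions for illustration;
# the directory must contain a 'documents' folder and a dataset_metadata.txt file.
#
#     dataset = GenericDataset('datasets/conference_talks', is_recursive=True)
#     print(dataset.name, len(dataset._list_of_documents))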
def test_metadata_to_dict():
    meta = """KEY WITH Spaces: Value with: Semi-colon
YEAR: 1983
MONTH: Nov.
SPEAKER: Marvin J. Ashton
CALLING: Of the Quorum of the Twelve Apostles
TOPIC: commitment"""
    meta_dict = basic_tools.metadata_to_dict(meta)
    actual_meta_dict = {'key_with_spaces': 'Value with: Semi-colon',
                        'year': '1983',
                        'month': 'Nov.',
                        'speaker': 'Marvin J. Ashton',
                        'topic': 'commitment'}
    for key in actual_meta_dict:
        assert key in meta_dict
        assert meta_dict[key] == actual_meta_dict[key]