def setUp(self):
    self.dataset_names = [
        'flla',
        'elastic_tensor_2015',
        'piezoelectric_tensor',
        'dielectric_constant'
    ]

    self.dataset_attributes = [
        'file_type',
        'url',
        'hash',
        'reference',
        'description',
        'columns',
        'bibtex_refs',
        'num_entries'
    ]

    self.dataset_dict = _load_dataset_dict()

    # current directory, for storing and discarding test_dataset
    current_dir = os.path.dirname(os.path.abspath(__file__))

    # directory where in-use datasets should be stored,
    # either at MATMINER_DATA env var or under matminer/datasets/
    self.dataset_dir = os.environ.get(
        "MATMINER_DATA",
        os.path.abspath(os.path.join(current_dir, os.pardir))
    )

    # Shared set up for test_validate_dataset & test_fetch_external_dataset
    self._path = os.path.join(current_dir, "test_dataset.csv")
    self._url = "https://ndownloader.figshare.com/files/13039562"
    self._hash = "c487f59ce0d48505c36633b4b202027" \
                 "d0c915474b081e8fb0bde8d5474ee59a1"
def setUp(self):
    self.dataset_names = [
        'flla', 'elastic_tensor_2015', 'piezoelectric_tensor',
        'dielectric_constant', 'castelli_perovskites', 'boltztrap_mp',
        'phonon_dielectric_mp', 'glass_ternary_hipt',
        'double_perovskites_gap', 'double_perovskites_gap_lumo',
        'mp_all', 'mp_nostruct', 'glass_ternary_landolt',
        'citrine_thermal_conductivity', 'wolverton_oxides',
        'heusler_magnetic', 'steel_strength', 'jarvis_ml_dft_training',
        'jarvis_dft_2d', 'jarvis_dft_3d', 'glass_binary', 'm2ax',
        'expt_gap', 'expt_formation_enthalpy'
    ]

    self.dataset_attributes = [
        'file_type',
        'url',
        'hash',
        'reference',
        'description',
        'columns',
        'bibtex_refs',
        'num_entries'
    ]

    self.dataset_dict = _load_dataset_dict()

    # current directory, for storing and discarding test_dataset
    current_dir = os.path.dirname(os.path.abspath(__file__))

    # directory where in-use datasets should be stored,
    # either at MATMINER_DATA env var or under matminer/datasets/
    self.dataset_dir = os.environ.get(
        "MATMINER_DATA",
        os.path.abspath(os.path.join(current_dir, os.pardir))
    )

    # Shared set up for test_validate_dataset & test_fetch_external_dataset
    self._path = os.path.join(current_dir, "test_dataset.csv")
    self._url = "https://ndownloader.figshare.com/files/13039562"
    self._hash = "c487f59ce0d48505c36633b4b202027" \
                 "d0c915474b081e8fb0bde8d5474ee59a1"
def test_load_dataset_dict(self):
    dataset_dict = _load_dataset_dict()
    # Check to make sure all datasets are present and have string type keys
    self.assertEqual(set(dataset_dict.keys()), set(self.dataset_names))
    # Check the validity of each set of values in each dataset
    for value in dataset_dict.values():
        # Check to make sure each dataset has all attributes
        # and string type keys
        self.assertEqual(set(value.keys()), set(self.dataset_attributes))
        # Make sure string attributes have string values
        for item in ['file_type', 'url', 'hash', 'reference',
                     'description']:
            self.assertIsInstance(value[item], str)
        # Make sure int attributes have int values
        self.assertIsInstance(value['num_entries'], int)
        # Make sure refs are in a list and are strings
        self.assertIsInstance(value['bibtex_refs'], list)
        for ref in value['bibtex_refs']:
            self.assertIsInstance(ref, str)
        # Make sure columns is a dict and it has string valued entries
        self.assertIsInstance(value['columns'], dict)
        for column_name, column_description in value['columns'].items():
            self.assertIsInstance(column_name, str)
            self.assertIsInstance(column_description, str)
def load_dataset(name, data_home=None, download_if_missing=True):
    """
    Loads a dataframe containing the dataset specified with the 'name' field.

    Dataset file is stored/loaded from data_home if specified, otherwise at
    the MATMINER_DATA environment variable if set or at matminer/datasets
    by default.

    Args:
        name (str): keyword specifying what dataset to load, run
            matminer.datasets.get_available_datasets() for options

        data_home (str): path to folder to look for dataset file

        download_if_missing (bool): whether to download the dataset if it
            is not found on disk

    Returns: (pd.DataFrame)
    """
    global _dataset_dict

    if _dataset_dict is None:
        _dataset_dict = _load_dataset_dict()

    if name not in _dataset_dict:
        error_string = "Unrecognized dataset name: {}. \n" \
                       "Use matminer.datasets.get_available_datasets() " \
                       "to see a list of currently available " \
                       "datasets".format(name)

        # Very simple attempt to match unrecognized keyword to existing
        # dataset names in an attempt to give the user immediate feedback
        possible_matches = [
            x for x in _dataset_dict.keys() if name.lower() in x.lower()
        ]

        if possible_matches:
            error_string += "\nCould you have been looking for these " \
                            "similar matches?:\n{}".format(possible_matches)

        raise ValueError(error_string)

    dataset_metadata = _dataset_dict[name]
    data_path = os.path.join(
        _get_data_home(data_home),
        name + "." + dataset_metadata['file_type']
    )
    _validate_dataset(data_path, dataset_metadata['url'],
                      dataset_metadata['hash'], download_if_missing)

    df = load_dataframe_from_json(data_path)

    return df
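# Usage sketch, not part of the module above: loading a registered dataset
# by keyword. The first call downloads and caches the file; later calls
# read it from disk. 'elastic_tensor_2015' is one of the names registered
# in the dataset dictionary.
from matminer.datasets import load_dataset

df = load_dataset("elastic_tensor_2015")
print(df.shape)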
def get_available_datasets(print_format="medium", sort_method='alphabetical'):
    """
    Function for retrieving the datasets available within matminer.

    Args:
        print_format (None, str): None, "short", "medium", or "long":
            None: Don't print anything
            "short": only the dataset names
            "medium": dataset names and their descriptions
            "long": All info associated with the dataset

        sort_method (str): By what metric to sort the datasets when
            retrieving their information.
            alphabetical: sorts by dataset name,
            num_entries: sorts by number of dataset entries

    Returns: (list)
    """
    global _dataset_dict

    if _dataset_dict is None:
        _dataset_dict = _load_dataset_dict()

    if sort_method not in {"alphabetical", "num_entries"}:
        raise ValueError("Error, unsupported sorting metric {}, "
                         "see docs for options".format(sort_method))

    if sort_method == 'num_entries':
        dataset_names = sorted(
            _dataset_dict.keys(),
            key=lambda x: _dataset_dict[x]["num_entries"],
            reverse=True
        )
    else:
        dataset_names = sorted(_dataset_dict.keys())

    # Format checks done before the loop to avoid unnecessary
    # repeated evaluation on each iteration
    if print_format is not None:
        dataset_string = ""
        if print_format == "short":
            for dataset_name in dataset_names:
                dataset_string += f"{dataset_name}\n"
        elif print_format == "medium":
            for dataset_name in dataset_names:
                dataset_description = get_dataset_description(dataset_name)
                dataset_string += f"{dataset_name}: " \
                                  f"{dataset_description}\n\n"
        elif print_format == "long":
            for dataset_name in dataset_names:
                dataset_string += f"{get_all_dataset_info(dataset_name)}"
        print(dataset_string)

    return dataset_names
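# Usage sketch, assuming the module context above: "short" prints the bare
# names, while print_format=None suppresses printing and only returns the
# sorted list. Sorting by 'num_entries' puts the largest datasets first.
names = get_available_datasets(print_format="short",
                               sort_method="num_entries")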
def get_available_datasets(print_datasets=True, print_descriptions=True,
                           sort_method='alphabetical'):
    """
    Function for retrieving the datasets available within matminer.

    Args:
        print_datasets (bool): Whether to, along with returning a list of
            dataset names, also print info on each dataset

        print_descriptions (bool): Whether to print the description of the
            dataset along with the name. Ignored if print_datasets is False

        sort_method (str): By what metric to sort the datasets when
            retrieving their information.
            alphabetical: sorts by dataset name,
            num_entries: sorts by number of dataset entries

    Returns: (list)
    """
    global _dataset_dict

    if _dataset_dict is None:
        _dataset_dict = _load_dataset_dict()

    if sort_method not in {"alphabetical", "num_entries"}:
        raise ValueError("Error, unsupported sorting metric {}, "
                         "see docs for options".format(sort_method))

    if sort_method == 'num_entries':
        dataset_names = sorted(
            _dataset_dict.keys(),
            key=lambda x: _dataset_dict[x]["num_entries"],
            reverse=True
        )
    else:
        dataset_names = sorted(_dataset_dict.keys())

    # Flag checks done before the loop to avoid unnecessary
    # repeated evaluation on each iteration
    if print_datasets and print_descriptions:
        for name in dataset_names:
            # Print a trailing blank entry with sep="\n" to give an
            # extra line break between datasets
            print(name, _dataset_dict[name]["description"], "", sep="\n")
    elif print_datasets:
        for name in dataset_names:
            print(name)

    return dataset_names
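# Usage sketch for the boolean-flag variant above: fetch the name list
# silently for programmatic use, without any console output.
names = get_available_datasets(print_datasets=False)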
def get_dataset_attribute(dataset_name, attrib_key):
    """
    Helper function for getting generic attributes of the dataset

    Args:
        dataset_name (str): Name of the dataset querying info from

        attrib_key (str): Name of attribute to pull

    Returns: Dataset attribute
    """
    # Load the dictionary into a global variable,
    # keep around for future access
    global _dataset_dict

    if _dataset_dict is None:
        _dataset_dict = _load_dataset_dict()

    return _dataset_dict[dataset_name][attrib_key]
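# Usage sketch, assuming the module context above: pulling one metadata
# field for a dataset. Valid attribute keys are those listed in
# dataset_attributes, e.g. 'num_entries' or 'url'.
n_rows = get_dataset_attribute("elastic_tensor_2015", "num_entries")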
def load_dataset(name, data_home=None, download_if_missing=True,
                 include_metadata=False):
    """
    Loads a dataframe containing the dataset specified with the 'name' field.

    Dataset file is stored/loaded from data_home if specified, otherwise at
    the MATMINER_DATA environment variable if set or at matminer/datasets
    by default.

    Args:
        name (str): keyword specifying what dataset to load, run
            matminer.datasets.available_datasets() for options

        data_home (str): path to folder to look for dataset file

        download_if_missing (bool): whether to download the dataset if it
            is not found on disk

        include_metadata (bool): optional argument for some datasets with
            metadata fields

    Returns: (pd.DataFrame)
    """
    dataset_dict = _load_dataset_dict()

    if name not in dataset_dict:
        error_string = "Unrecognized dataset name: {}. \n" \
                       "Use matminer.datasets.available_datasets() " \
                       "to see a list of currently available " \
                       "datasets".format(name)

        # Very simple attempt to match unrecognized keyword to existing
        # dataset names in an attempt to give the user immediate feedback
        possible_matches = [
            x for x in dataset_dict.keys() if name.lower() in x.lower()
        ]

        if possible_matches:
            error_string += "\nCould you have been looking for these " \
                            "similar matches?:\n{}".format(possible_matches)

        raise ValueError(error_string)

    dataset_metadata = dataset_dict[name]
    data_path = os.path.join(
        _get_data_home(data_home),
        name + "." + dataset_metadata['file_type']
    )
    _validate_dataset(data_path, dataset_metadata['url'],
                      dataset_metadata['hash'], download_if_missing)

    df = load_dataframe_from_json(data_path)

    if not include_metadata:
        if name == "elastic_tensor_2015":
            df = df.drop(['cif', 'kpoint_density', 'poscar'], axis=1)
        elif name in {"piezoelectric_tensor", "dielectric_constant"}:
            df = df.drop(['cif', 'meta', 'poscar'], axis=1)

    return df
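# Usage sketch for the include_metadata variant above: keeping the
# metadata columns that the default call drops. For 'dielectric_constant'
# this retains the 'cif', 'meta', and 'poscar' columns.
df = load_dataset("dielectric_constant", include_metadata=True)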
            ref = input()
            new_reference = "\n".join(reference_lines).strip()
            if new_reference:
                print('The following will be added:')
                print(new_reference)
                reference = new_reference
        elif attrib_name == "url":
            url = input("Add a file download url: ").strip()
        else:
            print("Invalid option")


if __name__ == '__main__':
    _dataset_dict = _load_dataset_dict()

    with open(".dataset_data_backup.json", 'w') as outfile:
        json.dump(_dataset_dict, outfile, indent=4, sort_keys=True)

    unsaved_changes = False
    _temp_dataset = deepcopy(_dataset_dict)

    quit_flag = False
    while not quit_flag:
        print(intro_message)
        command = input(">>> ")
        command = command.strip().lower()

        # Show current datasets
        if command == "1":
            print("Current Datasets:")
            pprint(_temp_dataset, width=150)
""" Metadata for matbench. """ from matminer.datasets.utils import _load_dataset_dict from monty.serialization import loadfn from matbench.constants import ( MBV01_DATASET_METADATA_PATH, MBV01_VALIDATION_DATA_PATH, ) from matbench.util import RecursiveDotDict MATMINER_DATASET_METADATA = _load_dataset_dict() mbv01_metadata = loadfn(MBV01_DATASET_METADATA_PATH) for d in mbv01_metadata.keys(): mbv01_metadata[d].update(MATMINER_DATASET_METADATA[d]) mbv01_metadata = RecursiveDotDict(mbv01_metadata) mbv01_validation = loadfn(MBV01_VALIDATION_DATA_PATH) mbv01_validation = RecursiveDotDict(mbv01_validation)