def load_dataset(name, data_home=None, download_if_missing=True):
    """
    Loads a dataframe containing the dataset specified with the 'name' field.

    Dataset file is stored/loaded from data_home if specified, otherwise at
    the MATMINER_DATA environment variable if set or at matminer/datasets
    by default.

    Args:
        name (str): keyword specifying what dataset to load, run
            matminer.datasets.get_available_datasets() for options

        data_home (str): path to folder to look for dataset file

        download_if_missing (bool): whether to download the dataset
            if is not found on disk

    Returns: (pd.DataFrame)

    Raises:
        ValueError: if ``name`` is not a recognized dataset keyword.
    """
    global _dataset_dict

    # Lazily load the dataset registry once per process; subsequent calls
    # reuse the module-level cache instead of re-reading the metadata file.
    if _dataset_dict is None:
        _dataset_dict = _load_dataset_dict()

    if name not in _dataset_dict:
        error_string = "Unrecognized dataset name: {}. \n" \
                       "Use matminer.datasets.get_available_datasets() " \
                       "to see a list of currently available " \
                       "datasets".format(name)

        # Very simple attempt to match unrecognized keyword to existing
        # dataset names in an attempt to give the user immediate feedback
        possible_matches = [
            x for x in _dataset_dict.keys() if name.lower() in x.lower()
        ]

        if possible_matches:
            error_string += "\nCould you have been looking for these similar " \
                            "matches?:\n{}".format(possible_matches)

        raise ValueError(error_string)

    dataset_metadata = _dataset_dict[name]
    # Dataset file lives under the data home, named "<name>.<file_type>".
    data_path = os.path.join(_get_data_home(data_home),
                             name + "." + dataset_metadata['file_type'])
    # Ensures the file exists and matches the expected hash, downloading
    # it if permitted; raises otherwise.
    _validate_dataset(data_path, dataset_metadata['url'],
                      dataset_metadata['hash'], download_if_missing)

    return load_dataframe_from_json(data_path)
def load_dataset(name, data_home=None, download_if_missing=True,
                 include_metadata=False):
    """
    Loads a dataframe containing the dataset specified with the 'name' field.

    Dataset file is stored/loaded from data_home if specified, otherwise at
    the MATMINER_DATA environment variable if set or at matminer/datasets
    by default.

    Args:
        name (str): keyword specifying what dataset to load, run
            matminer.datasets.available_datasets() for options

        data_home (str): path to folder to look for dataset file

        download_if_missing (bool): whether to download the dataset
            if is not found on disk

        include_metadata (bool): optional argument for some datasets with
            metadata fields

    Returns: (pd.DataFrame)

    Raises:
        ValueError: if ``name`` is not a recognized dataset keyword.
    """
    dataset_dict = _load_dataset_dict()

    if name not in dataset_dict:
        msg = ("Unrecognized dataset name: {}. \n"
               "Use matminer.datasets.available_datasets() "
               "to see a list of currently available "
               "datasets".format(name))

        # Cheap substring search over known names so the user gets an
        # immediate hint at what they may have meant.
        similar = [key for key in dataset_dict if name.lower() in key.lower()]
        if similar:
            msg += ("\nCould you have been looking for these similar "
                    "matches?:\n{}".format(similar))

        raise ValueError(msg)

    metadata = dataset_dict[name]
    file_name = name + "." + metadata['file_type']
    data_path = os.path.join(_get_data_home(data_home), file_name)
    # Ensures the file exists with the expected hash, downloading if allowed.
    _validate_dataset(data_path, metadata['url'], metadata['hash'],
                      download_if_missing)

    df = load_dataframe_from_json(data_path)

    if not include_metadata:
        # Columns considered "metadata" per dataset; dropped by default.
        extra_columns = {
            "elastic_tensor_2015": ['cif', 'kpoint_density', 'poscar'],
            "piezoelectric_tensor": ['cif', 'meta', 'poscar'],
            "dielectric_constant": ['cif', 'meta', 'poscar'],
        }
        if name in extra_columns:
            df = df.drop(extra_columns[name], axis=1)

    return df
def test_get_data_home(self):
    """_get_data_home returns the fixture's default dir with no argument
    and echoes back an explicitly supplied path."""
    # Default resolution: no argument means the test's dataset directory.
    self.assertEqual(_get_data_home(), self.dataset_dir)
    # Explicit path: returned unchanged.
    self.assertEqual(_get_data_home('/some/specified/path'),
                     '/some/specified/path')