def test_load_dataframe_from_json(self):
    """
    Check that load_dataframe_from_json restores the expected structure.

    The same comparison is run against the three test files
    (dataframe.json, dataframe.json.gz and dataframe.json.bz2): the first
    entry of the 'structure' column must equal self.diamond.
    """
    filenames = ('dataframe.json', 'dataframe.json.gz', 'dataframe.json.bz2')
    for filename in filenames:
        # One sub-test per file so that a failure on one variant does not
        # prevent the remaining variants from being checked.
        with self.subTest(filename=filename):
            df = load_dataframe_from_json(os.path.join(test_dir, filename))
            # assertEqual (rather than assertTrue(a == b)) reports both
            # operands when the comparison fails.
            self.assertEqual(self.diamond, df['structure'][0],
                             "Dataframe contents do not match")
def test_load_dataframe_from_json(self):
    """
    Check that load_dataframe_from_json restores the expected structure.

    The same comparison is run against the three test files
    (dataframe.json, dataframe.json.gz and dataframe.json.bz2): the first
    entry of the 'structure' column must equal self.diamond.
    """
    filenames = ('dataframe.json', 'dataframe.json.gz', 'dataframe.json.bz2')
    for filename in filenames:
        # One sub-test per file so that a failure on one variant does not
        # prevent the remaining variants from being checked.
        with self.subTest(filename=filename):
            df = load_dataframe_from_json(os.path.join(test_dir, filename))
            # assertEqual (rather than assertTrue(a == b)) reports both
            # operands when the comparison fails.
            self.assertEqual(self.diamond, df['structure'][0],
                             "Dataframe contents do not match")
def run_task(self, fw_spec):
    """
    Fit a MatPipe on a whole dataset and save the fitted pipeline.

    Args:
        fw_spec (dict): Task specification. Keys read here: "pipe_config"
            (a dict with "learner_name", "learner_kwargs",
            "reducer_kwargs", "cleaner_kwargs" and
            "autofeaturizer_kwargs"), "target", "data_file" and
            "base_save_dir".

    Environment:
        AMM_DATASET_DIR: directory that "data_file" is joined onto.
        AMM_SINGLE_FIT_DIR: directory that "base_save_dir" is joined onto.

    Raises:
        KeyError: If a required fw_spec key or environment variable is
            missing.
        ValueError: If "learner_name" is neither "TPOTAdaptor" nor "rf".
    """
    # Read data from fw_spec
    pipe_config_dict = fw_spec["pipe_config"]
    target = fw_spec["target"]
    data_file = fw_spec["data_file"]
    learner_name = pipe_config_dict["learner_name"]
    learner_kwargs = pipe_config_dict["learner_kwargs"]
    reducer_kwargs = pipe_config_dict["reducer_kwargs"]
    cleaner_kwargs = pipe_config_dict["cleaner_kwargs"]
    autofeaturizer_kwargs = pipe_config_dict["autofeaturizer_kwargs"]

    # Modify data_file based on computing resource
    data_dir = os.environ["AMM_DATASET_DIR"]
    data_file = os.path.join(data_dir, data_file)

    # Modify save_dir based on computing resource
    bench_dir = os.environ["AMM_SINGLE_FIT_DIR"]
    base_save_dir = fw_spec["base_save_dir"]
    base_save_dir = os.path.join(bench_dir, base_save_dir)
    # exist_ok avoids the race between an existence check and the creation
    # when several workers target the same directory at the same time.
    os.makedirs(base_save_dir, exist_ok=True)

    # Set up pipeline config
    if learner_name == "TPOTAdaptor":
        learner = TPOTAdaptor(**learner_kwargs)
    elif learner_name == "rf":
        warnings.warn(
            "Learner kwargs passed into RF regressor/classifiers bc. rf being used."
        )
        # The same kwargs are handed to both the regressor and the classifier.
        learner = SinglePipelineAdaptor(
            regressor=RandomForestRegressor(**learner_kwargs),
            classifier=RandomForestClassifier(**learner_kwargs),
        )
    else:
        raise ValueError("{} not supported yet!".format(learner_name))

    pipe_config = {
        "learner": learner,
        "reducer": FeatureReducer(**reducer_kwargs),
        "cleaner": DataCleaner(**cleaner_kwargs),
        "autofeaturizer": AutoFeaturizer(**autofeaturizer_kwargs),
    }
    pipe = MatPipe(**pipe_config)

    # Set up dataset
    # Dataset should already be set up correctly as json beforehand.
    # this includes targets being converted to classification, removing
    # extra columns, having the names of featurization cols set to the
    # same as the matpipe config, etc.
    df = load_dataframe_from_json(data_file)

    pipe.fit(df, target)
    pipe.save(os.path.join(base_save_dir, "pipe.p"))
def load_dataset(name, data_home=None, download_if_missing=True):
    """
    Return the dataset registered under 'name' as a dataframe.

    The dataset file is stored/loaded from data_home if specified,
    otherwise at the MATMINER_DATA environment variable if set or at
    matminer/datasets by default.

    Args:
        name (str): keyword specifying what dataset to load, run
            matminer.datasets.get_available_datasets() for options

        data_home (str): path to folder to look for dataset file

        download_if_missing (bool): whether to download the dataset if is
            not found on disk

    Returns: (pd.DataFrame)

    Raises:
        ValueError: if 'name' is not a known dataset; the message lists
            known names that contain 'name' (case-insensitively), if any.
    """
    global _dataset_dict

    # The metadata table is loaded on first use and kept at module level.
    if _dataset_dict is None:
        _dataset_dict = _load_dataset_dict()

    if name not in _dataset_dict:
        message = "Unrecognized dataset name: {}. \n" \
                  "Use matminer.datasets.get_available_datasets() " \
                  "to see a list of currently available " \
                  "datasets".format(name)

        # Very simple substring search over the known names so the user
        # gets an immediate hint about what they may have meant
        similar_names = []
        for known_name in _dataset_dict.keys():
            if name.lower() in known_name.lower():
                similar_names.append(known_name)

        if similar_names:
            message += "\nCould you have been looking for these similar " \
                       "matches?:\n{}".format(similar_names)

        raise ValueError(message)

    metadata = _dataset_dict[name]
    home_dir = _get_data_home(data_home)
    file_name = name + "." + metadata['file_type']
    data_path = os.path.join(home_dir, file_name)

    _validate_dataset(data_path, metadata['url'], metadata['hash'],
                      download_if_missing)

    return load_dataframe_from_json(data_path)
def __init__(self,
             initial_ltol: float = 0.2,
             initial_stol: float = 0.3,
             initial_angle_tol: float = 5.,
             use_fingerprint_matching: bool = True,
             fingerprint_distance_cutoff: float = 0.4):
    """
    Load the packaged mineral database and store the matching settings.

    Every argument is stored unchanged on the instance under the same
    name; how the values are interpreted is decided by the methods that
    read them (not visible here).

    Args:
        initial_ltol: stored as ``self.initial_ltol``.
        initial_stol: stored as ``self.initial_stol``.
        initial_angle_tol: stored as ``self.initial_angle_tol``.
        use_fingerprint_matching: stored as
            ``self.use_fingerprint_matching``.
        fingerprint_distance_cutoff: stored as
            ``self.fingerprint_distance_cutoff``.
    """
    # The database ships as a resource of the 'robocrys.condense' package.
    self.mineral_db = load_dataframe_from_json(
        resource_filename('robocrys.condense', 'mineral_db.json.gz'))

    # Tolerance settings
    self.initial_ltol = initial_ltol
    self.initial_stol = initial_stol
    self.initial_angle_tol = initial_angle_tol

    # Fingerprint matching settings
    self.fingerprint_distance_cutoff = fingerprint_distance_cutoff
    self.use_fingerprint_matching = use_fingerprint_matching

    # Per-structure state, empty until set elsewhere.
    # NOTE(review): _mineral_db is a separate attribute from mineral_db
    # above - presumably a working copy filled in later; confirm.
    self._structure = None
    self._mineral_db = None
# -- start import os import pandas as pd import numpy as np import keras import matminer import pickle from keras.models import load_model from matminer.utils.io import load_dataframe_from_json from sklearn.preprocessing import MinMaxScaler from sklearn.model_selection import train_test_split from supplement import searchNN, trainNN, evaluateModel, createRF #stores custom methods/functions fdf = load_dataframe_from_json('Batteries_feat.json') print("The starting dataset has {}".format(fdf.shape)) print(fdf.head()) ''' Block 1 - Random Forest ''' print('\n---\n') print('Selecting target variable...') excluded = [ 'Spacegroup', 'Capacity Grav', 'Capacity Vol', 'Specific E Wh/kg', 'E Density Wh/l', 'Stability Charge', 'Stability Discharge', 'Ion', 'Reduced Formula', 'Id', 'composition', 'composition_oxid', 'HOMO_character', 'HOMO_element', 'LUMO_character', 'LUMO_element' ]
def load_dataset(name, data_home=None, download_if_missing=True,
                 include_metadata=False):
    """
    Return the dataset registered under 'name' as a dataframe.

    The dataset file is stored/loaded from data_home if specified,
    otherwise at the MATMINER_DATA environment variable if set or at
    matminer/datasets by default.

    Args:
        name (str): keyword specifying what dataset to load, run
            matminer.datasets.available_datasets() for options

        data_home (str): path to folder to look for dataset file

        download_if_missing (bool): whether to download the dataset if is
            not found on disk

        include_metadata (bool): when False (the default), the metadata
            columns of the datasets that carry them are dropped

    Returns: (pd.DataFrame)

    Raises:
        ValueError: if 'name' is not a known dataset; the message lists
            known names that contain 'name' (case-insensitively), if any.
    """
    known_datasets = _load_dataset_dict()

    if name not in known_datasets:
        message = "Unrecognized dataset name: {}. \n" \
                  "Use matminer.datasets.available_datasets() " \
                  "to see a list of currently available " \
                  "datasets".format(name)

        # Very simple substring search over the known names so the user
        # gets an immediate hint about what they may have meant
        suggestions = []
        for candidate in known_datasets.keys():
            if name.lower() in candidate.lower():
                suggestions.append(candidate)

        if suggestions:
            message += "\nCould you have been looking for these similar " \
                       "matches?:\n{}".format(suggestions)

        raise ValueError(message)

    metadata = known_datasets[name]
    home_dir = _get_data_home(data_home)
    data_path = os.path.join(home_dir, name + "." + metadata['file_type'])

    _validate_dataset(data_path, metadata['url'], metadata['hash'],
                      download_if_missing)

    df = load_dataframe_from_json(data_path)

    # Metadata columns to strip, keyed by dataset name
    metadata_columns = {
        "elastic_tensor_2015": ['cif', 'kpoint_density', 'poscar'],
        "piezoelectric_tensor": ['cif', 'meta', 'poscar'],
        "dielectric_constant": ['cif', 'meta', 'poscar'],
    }
    if not include_metadata and name in metadata_columns:
        df = df.drop(metadata_columns[name], axis=1)

    return df
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold
from automatminer.utils.ml import regression_or_classification
from automatminer.utils.ml import AMM_CLF_NAME, AMM_REG_NAME
from automatminer_dev.config import BENCHMARK_FULL_SET, GLASS, EXPT_IS_METAL, EXPT_GAP
from matminer.utils.io import load_dataframe_from_json

# NOTE(review): `os` is used below but is not imported in this fragment -
# presumably imported earlier in the file; confirm.
benchmark_dir = os.environ["AMM_DATASET_DIR"]

# The full benchmark set is assigned and then immediately replaced by a
# three-problem subset, so only the subset is iterated.
bmarks = BENCHMARK_FULL_SET
bmarks = [GLASS, EXPT_GAP, EXPT_IS_METAL]

for p in bmarks:
    pname = p["name"]
    print("Loading {}".format(pname))
    df = load_dataframe_from_json(os.path.join(benchmark_dir, p["data_file"]))
    target = p["target"]
    ltype = p["problem_type"]

    # Pick the splitter, dummy estimator and scoring name by problem type.
    # kf, estimator, scoring and multiplier are not consumed in the visible
    # part of the loop - presumably used by code that follows (TODO confirm).
    if ltype == AMM_REG_NAME:
        kf = KFold(n_splits=5, random_state=18012019, shuffle=True)
        estimator = DummyRegressor(strategy="mean")
        scoring = "neg_mean_absolute_error"
        # presumably flips the sign of the negated score back; verify at use
        multiplier = -1
    elif ltype == AMM_CLF_NAME:
        kf = StratifiedKFold(n_splits=5, random_state=18012019, shuffle=True)
        estimator = DummyClassifier(strategy="stratified")
        multiplier = 1
        scoring = "roc_auc"
    else:
        raise ValueError("problem type {} is not known.".format(ltype))
import pandas as pd
import numpy as np
import os
from pymatgen import MPRester
import matminer
from matminer.data_retrieval.retrieve_MP import MPDataRetrieval
from matminer.utils.io import store_dataframe_as_json
from matminer.utils.io import load_dataframe_from_json
from matminer.figrecipes.plot import PlotlyFig

# The bare string below is a no-op expression used as a section banner.
''' #Block 1 - Loading and filtering the experimental dataframe '''
df = load_dataframe_from_json('data/Batteries_raw.json')

# Select the working ion among {Li, Al, Zr, Mg}
select = 'Li'

# Initial filter based on the selected element: the 'Ion' column is passed
# to StrToComposition, and only rows whose resulting "composition" has an
# atomic fraction of exactly 1 for `select` are kept.
from matminer.featurizers.conversions import StrToComposition
fdf = StrToComposition().featurize_dataframe(df, 'Ion')
select_at = fdf["composition"].apply(lambda x: x.get_atomic_fraction(select))
fdf = fdf[select_at == 1]

# Debug
# NOTE(review): `fdf.describe` is formatted without being called, so the
# bound method's repr is printed rather than summary statistics -
# presumably `fdf.describe()` (or `fdf.shape`) was intended; confirm.
print("Remaining samples: {}".format(fdf.describe))
# The helper "composition" column is removed again once the filter is done.
fdf = fdf.drop(['composition'], axis=1)
def test_featurize_bsdos(self, refresh_df_init=False, limit=1):
    """
    Tests featurize_dos and featurize_bandstructure.

    Args:
        refresh_df_init (bool): for developers, if the test need to be
            updated set to True. Otherwise set to False to make the final
            test independent of MPRester and faster.
        limit (int): the maximum final number of entries.

    Returns (None):
    """
    target = "color"
    df_bsdos_pickled = "mp_data_with_dos_bandstructure.pickle"
    save_path = os.path.join(TEST_DIR, df_bsdos_pickled)
    if refresh_df_init:
        # Re-download the reference entry and overwrite the stored fixture.
        mpdr = MPDataRetrieval()
        df = mpdr.get_dataframe(
            criteria={"material_id": "mp-149"},
            properties=[
                "pretty_formula",
                "dos",
                "bandstructure",
                "bandstructure_uniform",
            ],
        )
        store_dataframe_as_json(df, save_path)
    else:
        df = load_dataframe_from_json(save_path)
    df = df.dropna(axis=0)
    # The uniform band structure takes over the "bandstructure" column name;
    # the original one is kept under "line bandstructure".
    df = df.rename(
        columns={
            "bandstructure_uniform": "bandstructure",
            "bandstructure": "line bandstructure",
        }
    )
    df[target] = [["red"]]
    n_cols_init = df.shape[1]

    featurizer = AutoFeaturizer(
        preset="express", ignore_errors=False, multiindex=False
    )
    df = featurizer.fit_transform(df, target)

    # sanity checks
    # The former assertTrue(len(df), limit) treated `limit` as the failure
    # message, so only non-emptiness was checked. Keep that check and add
    # the documented upper bound on the number of entries.
    self.assertGreater(len(df), 0)
    self.assertLessEqual(len(df), limit)
    self.assertGreater(len(df.columns), n_cols_init)

    # DOSFeaturizer:
    self.assertEqual(df["cbm_character_1"][0], "p")

    # DopingFermi:
    self.assertAlmostEqual(df["fermi_c1e+20T300"][0], -0.539, 3)

    # Hybridization:
    self.assertAlmostEqual(df["vbm_sp"][0], 0.181, 3)
    self.assertAlmostEqual(df["cbm_s"][0], 0.4416, 3)
    self.assertAlmostEqual(df["cbm_sp"][0], 0.9864, 3)

    # BandFeaturizer:
    self.assertAlmostEqual(df["direct_gap"][0], 2.556, 3)
    self.assertAlmostEqual(df["n_ex1_norm"][0], 0.6285, 4)

    # BranchPointEnergy:
    self.assertAlmostEqual(df["branch_point_energy"][0], 5.7677, 4)
def transform(self, df, target, prevent_cache_overwrite=False):
    """
    Decorate a dataframe containing composition, structure, bandstructure,
    and/or DOS objects with descriptors.

    If self.cache_src is set and the file exists, the features are read
    from that cache (rows selected by the index of df) and no featurizer
    is run. Otherwise the configured featurizers are applied and, unless
    prevented, the result is written to cache_src.

    Args:
        df (pandas.DataFrame): The dataframe not containing features.
        target (str): The ML-target property contained in the df.
        prevent_cache_overwrite (bool): If True, does not try to write
            any new features to the cache.

    Returns:
        df (pandas.DataFrame): Transformed dataframe containing features.

    Raises:
        AutomatminerError: If the cache lacks an index entry of df, or
            if the cached and input targets are of different problem
            types (as reported by regression_or_classification).
    """
    if self.cache_src and os.path.exists(self.cache_src):
        logger.debug(self._log_prefix +
                     "Reading cache_src {}".format(self.cache_src))
        cached_df = load_dataframe_from_json(self.cache_src)
        if not all(loc in cached_df.index for loc in df.index):
            raise AutomatminerError("Feature cache does not contain all "
                                    "entries (by DataFrame index) needed "
                                    "to transform the input df.")

        cached_subdf = cached_df.loc[df.index]
        if target in cached_subdf.columns:
            if target not in df.columns:
                # logger.warn is a deprecated alias of logger.warning.
                logger.warning(
                    self._log_prefix +
                    "Target not present in both cached df and input df."
                    " Cannot perform comparison to ensure index match."
                )
            else:
                cached_targets = cached_subdf[target]
                input_targets = df[target]
                cached_type = regression_or_classification(
                    cached_targets)
                input_type = regression_or_classification(
                    input_targets)
                if cached_type != input_type:
                    raise AutomatminerError(
                        "Cached targets appear to be '{}' type, while "
                        "input targets appear to be '{}'."
                        "".format(cached_type, input_type))

                # Collect rows whose cached and input targets disagree
                # beyond floating point tolerance.
                problems = {}
                for ix in input_targets.index:
                    iv = input_targets[ix]
                    cv = cached_targets[ix]
                    if iv != cv:
                        try:
                            if not math.isclose(iv, cv):
                                problems[ix] = [iv, cv]
                        except TypeError:
                            # Values math.isclose cannot compare are
                            # deliberately not reported.
                            pass
                if problems:
                    logger.warning(
                        self._log_prefix +
                        "Mismatch between cached targets and input "
                        "targets: \n{}".format(problems))

        logger.info(self._log_prefix +
                    "Restored {} features on {} samples from "
                    "cache {}".format(len(cached_subdf.columns),
                                      len(df.index), self.cache_src))
        return cached_subdf

    # No usable cache: featurize from scratch. The identity check has to
    # happen before _prescreen_df rebinds df.
    transforming_on_fitted = df is self.fitted_input_df
    df = self._prescreen_df(df, inplace=True)

    if transforming_on_fitted:
        df = self.converted_input_df
    else:
        df = self._add_composition_from_structure(df)

    for featurizer_type, featurizers in self.featurizers.items():
        if featurizer_type in df.columns:
            if not transforming_on_fitted:
                df = self._tidy_column(df, featurizer_type)

            for f in featurizers:
                logger.info(self._log_prefix +
                            "Featurizing with {}."
                            "".format(f.__class__.__name__))
                df = f.featurize_dataframe(
                    df,
                    featurizer_type,
                    ignore_errors=self.ignore_errors,
                    multiindex=self.multiindex,
                    inplace=False,
                )
            # Drop the input column only after all of its featurizers ran.
            if self.drop_inputs:
                df = df.drop(columns=[featurizer_type])
        else:
            logger.info(self._log_prefix +
                        "Featurizer type {} not in the dataframe. "
                        "Skipping...".format(featurizer_type))

    if self.functionalize:
        ff = FunctionFeaturizer()
        ff.set_n_jobs(self.n_jobs)
        # Featurizer input columns are excluded. The former cols.pop(ft)
        # passed a column label where list.pop expects an integer index.
        cols = [col for col in df.columns.tolist()
                if col not in self.featurizers]
        df = ff.fit_featurize_dataframe(
            df,
            cols,
            ignore_errors=self.ignore_errors,
            multiindex=self.multiindex,
            inplace=False,
        )

    if (self.cache_src and not os.path.exists(self.cache_src)
            and not prevent_cache_overwrite):
        store_dataframe_as_json(df, self.cache_src)
    return df
# %% where there are several mp_ids, pick the one with lowest energy above convex hull def get_e_above_hull(mp_id: str) -> float: return mpr.query(mp_id, ["e_above_hull"])["e_above_hull"] phonons["es_above_hull"] = phonons.likely_mp_ids.progress_apply( lambda ids: [get_e_above_hull(id) for id in ids]) phonons["likely_mp_id"] = phonons.apply( lambda row: row.likely_mp_ids[np.argmin(row.es_above_hull)], axis=1) # %% cols = ["structure", "last phdos peak", "likely_mp_id"] store_dataframe_as_json(phonons[cols], "matbench-phonons-with-mp-id.json.gz") phonons[cols] = load_dataframe_from_json("matbench-phonons-with-mp-id.json.gz") # %% phonons[["sg_symbol", "sg_number"]] = phonons.progress_apply( lambda row: row.structure.get_space_group_info(), axis=1, result_type="expand") phonons["crystal_system"] = phonons.structure.progress_apply( lambda struct: SpacegroupAnalyzer(struct).get_crystal_system()) phonons[["sg_symbol", "sg_number", "crystal_system", "volume", "formula"]].to_csv("additional-df-cols.csv", index=False) # %% phonons[["sg_symbol", "sg_number", "crystal_system", "volume",
def run_task(self, fw_spec):
    """
    Benchmark a MatPipe on one cross-validation fold and record the scores.

    Builds the pipeline described by fw_spec["pipe_config"], loads the
    dataset, runs pipe.benchmark on the requested fold, writes the pipeline
    and result tables into the save directory and finally merges the scores
    and bookkeeping values into fw_spec in place.

    Args:
        fw_spec (dict): Task specification. Keys read here: "pipe_config"
            (a dict with "learner_name", "learner_kwargs",
            "reducer_kwargs", "cleaner_kwargs" and
            "autofeaturizer_kwargs"), "fold", "kfold_config", "target",
            "data_file", "clf_pos_label", "problem_type", "cache",
            "base_save_dir" and "save_dir". "save_dir" is popped, and
            both directory keys are written back as the joined paths.

    Environment:
        AMM_DATASET_DIR: directory that "data_file" is joined onto.
        AMM_BENCH_DIR: directory that "base_save_dir" is joined onto.

    Raises:
        KeyError: If a required fw_spec key or environment variable is
            missing.
        TypeError: If the classification positive label is neither str
            nor bool.
        ValueError: If the learner or problem type is unsupported, the
            positive label is absent from the target column, the target
            has more than two classes, or fold is out of range.
    """
    # Read data from fw_spec
    pipe_config_dict = fw_spec["pipe_config"]
    fold = fw_spec["fold"]
    kfold_config = fw_spec["kfold_config"]
    target = fw_spec["target"]
    data_file = fw_spec["data_file"]
    clf_pos_label = fw_spec["clf_pos_label"]
    problem_type = fw_spec["problem_type"]
    learner_name = pipe_config_dict["learner_name"]
    cache = fw_spec["cache"]
    learner_kwargs = pipe_config_dict["learner_kwargs"]
    reducer_kwargs = pipe_config_dict["reducer_kwargs"]
    cleaner_kwargs = pipe_config_dict["cleaner_kwargs"]
    autofeaturizer_kwargs = pipe_config_dict["autofeaturizer_kwargs"]

    # Modify data_file based on computing resource
    data_dir = os.environ["AMM_DATASET_DIR"]
    data_file = os.path.join(data_dir, data_file)

    # Modify save_dir based on computing resource
    bench_dir = os.environ["AMM_BENCH_DIR"]
    base_save_dir = fw_spec["base_save_dir"]
    base_save_dir = os.path.join(bench_dir, base_save_dir)
    save_dir = fw_spec.pop("save_dir")
    save_dir = os.path.join(base_save_dir, save_dir)
    # exist_ok avoids the race between an existence check and the creation
    # when several folds start at the same time.
    os.makedirs(save_dir, exist_ok=True)

    # Set up pipeline config
    if learner_name == "TPOTAdaptor":
        learner = TPOTAdaptor(**learner_kwargs)
    elif learner_name == "rf":
        warnings.warn(
            "Learner kwargs passed into RF regressor/classifiers bc. rf being used."
        )
        # The same kwargs are handed to both the regressor and the classifier.
        learner = SinglePipelineAdaptor(
            regressor=RandomForestRegressor(**learner_kwargs),
            classifier=RandomForestClassifier(**learner_kwargs),
        )
    else:
        raise ValueError("{} not supported by RunPipe yet!"
                         "".format(learner_name))
    if cache:
        # The feature cache lives in base_save_dir, not the per-fold save_dir.
        autofeaturizer_kwargs["cache_src"] = os.path.join(
            base_save_dir, "features.json")
    pipe_config = {
        "learner": learner,
        "reducer": FeatureReducer(**reducer_kwargs),
        "cleaner": DataCleaner(**cleaner_kwargs),
        "autofeaturizer": AutoFeaturizer(**autofeaturizer_kwargs),
    }
    pipe = MatPipe(**pipe_config)

    # Set up dataset
    # Dataset should already be set up correctly as json beforehand.
    # this includes targets being converted to classification, removing
    # extra columns, having the names of featurization cols set to the
    # same as the matpipe config, etc.
    df = load_dataframe_from_json(data_file)

    # Check other parameters that would otherwise not be checked until after
    # benchmarking, hopefully saves some errors at the end during scoring.
    if problem_type not in [AMM_CLF_NAME, AMM_REG_NAME]:
        raise ValueError("Problem must be either classification or "
                         "regression.")
    elif problem_type == AMM_CLF_NAME:
        if not isinstance(clf_pos_label, (str, bool)):
            raise TypeError(
                "The classification positive label should be a "
                "string, or bool not {}."
                "".format(type(clf_pos_label)))
        # `label in series` tests the index labels, not the values, so the
        # membership test is done on the values explicitly.
        elif not df[target].isin([clf_pos_label]).any():
            raise ValueError("The classification positive label should be "
                             "present in the target column.")
        elif len(df[target].unique()) > 2:
            raise ValueError("Only binary classification scoring available "
                             "at this time.")

    # Set up testing scheme
    if problem_type == AMM_REG_NAME:
        kfold = KFold(**kfold_config)
    else:
        kfold = StratifiedKFold(**kfold_config)
    if fold >= kfold.n_splits:
        raise ValueError("{} is out of range for KFold with n_splits="
                         "{}".format(fold, kfold.n_splits))

    # Run the benchmark
    # NOTE(review): cache=True is passed regardless of fw_spec["cache"] -
    # confirm this is intended.
    t1 = time.time()
    results = pipe.benchmark(df, target, kfold, fold_subset=[fold],
                             cache=True)
    result_df = results[0]
    elapsed_time = time.time() - t1

    # Save everything
    pipe.save(os.path.join(save_dir, "pipe.p"))
    pipe.inspect(filename=os.path.join(save_dir, "digest.txt"))
    result_df.to_csv(os.path.join(save_dir, "test_df.csv"))
    pipe.post_fit_df.to_csv(os.path.join(save_dir, "fitted_df.csv"))

    # Evaluate model
    true = result_df[target]
    test = result_df[target + " predicted"]

    pass_to_storage = {}
    if problem_type == AMM_REG_NAME:
        pass_to_storage["r2"] = r2_score(true, test)
        pass_to_storage["mae"] = mean_absolute_error(true, test)
        pass_to_storage["rmse"] = sqrt(mean_squared_error(true, test))
    elif problem_type == AMM_CLF_NAME:
        pass_to_storage["f1"] = f1_score(true, test,
                                         pos_label=clf_pos_label)
        pass_to_storage["roc_auc"] = roc_auc_score(true, test)
        pass_to_storage["accuracy"] = accuracy_score(true, test)
    else:
        raise ValueError("Scoring method for problem type {} not supported"
                         "".format(problem_type))

    # Extract important inspect for storage
    try:
        # TPOT Adaptor
        best_pipeline = [
            str(step) for step in pipe.learner.best_pipeline.steps
        ]
    except AttributeError:
        # best_pipeline has no .steps attribute: store its string form.
        best_pipeline = str(pipe.learner.best_pipeline)

    features = pipe.learner.features
    n_features = len(features)
    # Sizes of the requested fold as split on the unprocessed dataframe.
    fold_orig = list(kfold.split(df, y=df[target]))[fold]
    n_samples_train_original = len(fold_orig[0])
    n_samples_test_original = len(fold_orig[1])

    pass_to_storage.update({
        "target": target,
        "best_pipeline": best_pipeline,
        "elapsed_time": elapsed_time,
        "features": features,
        "n_features": n_features,
        "n_test_samples_original": n_samples_test_original,
        "n_train_samples_original": n_samples_train_original,
        "n_train_samples": len(pipe.post_fit_df),
        "n_test_samples": len(test),
        "test_sample_frac_retained": len(test) / n_samples_test_original,
        "completion_time": datetime.datetime.now(),
        "base_save_dir": base_save_dir,
        "save_dir": save_dir,
    })
    fw_spec.update(pass_to_storage)