Example #1
File: test_io.py Project: ardunn/MatMiner
    def test_load_dataframe_from_json(self):

        df = load_dataframe_from_json(os.path.join(test_dir, 'dataframe.json'))
        self.assertTrue(self.diamond == df['structure'][0],
                        "Dataframe contents do not match")

        df = load_dataframe_from_json(os.path.join(test_dir, 'dataframe.json.gz'))
        self.assertTrue(self.diamond == df['structure'][0],
                        "Dataframe contents do not match")

        df = load_dataframe_from_json(os.path.join(test_dir, 'dataframe.json.bz2'))
        self.assertTrue(self.diamond == df['structure'][0],
                        "Dataframe contents do not match")
Example #2
    def test_load_dataframe_from_json(self):

        df = load_dataframe_from_json(os.path.join(test_dir, 'dataframe.json'))
        self.assertTrue(self.diamond == df['structure'][0],
                        "Dataframe contents do not match")

        df = load_dataframe_from_json(
            os.path.join(test_dir, 'dataframe.json.gz'))
        self.assertTrue(self.diamond == df['structure'][0],
                        "Dataframe contents do not match")

        df = load_dataframe_from_json(
            os.path.join(test_dir, 'dataframe.json.bz2'))
        self.assertTrue(self.diamond == df['structure'][0],
                        "Dataframe contents do not match")
Example #3
    def run_task(self, fw_spec):
        # Read data from fw_spec
        pipe_config_dict = fw_spec["pipe_config"]
        target = fw_spec["target"]
        data_file = fw_spec["data_file"]
        learner_name = pipe_config_dict["learner_name"]
        learner_kwargs = pipe_config_dict["learner_kwargs"]
        reducer_kwargs = pipe_config_dict["reducer_kwargs"]
        cleaner_kwargs = pipe_config_dict["cleaner_kwargs"]
        autofeaturizer_kwargs = pipe_config_dict["autofeaturizer_kwargs"]

        # Modify data_file based on computing resource
        data_dir = os.environ["AMM_DATASET_DIR"]
        data_file = os.path.join(data_dir, data_file)

        # Modify save_dir based on computing resource
        bench_dir = os.environ["AMM_SINGLE_FIT_DIR"]
        base_save_dir = fw_spec["base_save_dir"]
        base_save_dir = os.path.join(bench_dir, base_save_dir)

        if not os.path.exists(base_save_dir):
            os.makedirs(base_save_dir)

        # Set up pipeline config
        if learner_name == "TPOTAdaptor":
            learner = TPOTAdaptor(**learner_kwargs)
        elif learner_name == "rf":
            warnings.warn(
                "Learner kwargs are passed to both the RandomForest regressor "
                "and classifier because learner_name is 'rf'."
            )
            learner = SinglePipelineAdaptor(
                regressor=RandomForestRegressor(**learner_kwargs),
                classifier=RandomForestClassifier(**learner_kwargs),
            )
        else:
            raise ValueError("{} not supported yet!" "".format(learner_name))
        pipe_config = {
            "learner": learner,
            "reducer": FeatureReducer(**reducer_kwargs),
            "cleaner": DataCleaner(**cleaner_kwargs),
            "autofeaturizer": AutoFeaturizer(**autofeaturizer_kwargs),
        }
        pipe = MatPipe(**pipe_config)

        # Set up dataset
        # Dataset should already be set up correctly as json beforehand.
        # This includes targets being converted to classification, removing
        # extra columns, having the names of featurization cols set to the
        # same as the matpipe config, etc.
        df = load_dataframe_from_json(data_file)

        pipe.fit(df, target)
        pipe.save(os.path.join(base_save_dir, "pipe.p"))
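For context, a sketch of the fw_spec dictionary this task consumes; the keys mirror the lookups above, while every concrete value (learner kwargs, target column, file and directory names) is purely illustrative:

# Hypothetical fw_spec for the task above; all values are placeholders.
fw_spec = {
    "pipe_config": {
        "learner_name": "rf",
        "learner_kwargs": {"n_estimators": 500},
        "reducer_kwargs": {},
        "cleaner_kwargs": {},
        "autofeaturizer_kwargs": {"preset": "express"},
    },
    "target": "gap expt",            # assumed target column name
    "data_file": "expt_gap.json",    # joined onto $AMM_DATASET_DIR
    "base_save_dir": "single_fit",   # joined onto $AMM_SINGLE_FIT_DIR
}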
Example #4
def load_dataset(name, data_home=None, download_if_missing=True):
    """
    Loads a dataframe containing the dataset specified with the 'name' field.

    Dataset file is stored/loaded from data_home if specified, otherwise at
    the MATMINER_DATA environment variable if set or at matminer/datasets
    by default.

    Args:
        name (str): keyword specifying what dataset to load, run
            matminer.datasets.get_available_datasets() for options

        data_home (str): path to folder to look for dataset file

        download_if_missing (bool): whether to download the dataset if it is
            not found on disk

    Returns: (pd.DataFrame)
    """
    global _dataset_dict

    if _dataset_dict is None:
        _dataset_dict = _load_dataset_dict()

    if name not in _dataset_dict:
        error_string = "Unrecognized dataset name: {}. \n" \
                       "Use matminer.datasets.get_available_datasets() " \
                       "to see a list of currently available " \
                       "datasets".format(name)

        # Very simple attempt to match unrecognized keyword to existing
        # dataset names in an attempt to give the user immediate feedback
        possible_matches = [
            x for x in _dataset_dict.keys() if name.lower() in x.lower()
        ]

        if possible_matches:
            error_string += "\nCould you have been looking for these similar " \
                            "matches?:\n{}".format(possible_matches)

        raise ValueError(error_string)

    dataset_metadata = _dataset_dict[name]
    data_path = os.path.join(_get_data_home(data_home),
                             name + "." + dataset_metadata['file_type'])
    _validate_dataset(data_path, dataset_metadata['url'],
                      dataset_metadata['hash'], download_if_missing)

    df = load_dataframe_from_json(data_path)

    return df
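A minimal usage sketch for the loader defined above; the dataset keyword must be one of the names reported by get_available_datasets(), and "elastic_tensor_2015" is used here only as an example:

from matminer.datasets import get_available_datasets, load_dataset

print(get_available_datasets())           # names accepted by load_dataset
df = load_dataset("elastic_tensor_2015")  # fetched on first use if not on disk
print(df.columns)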
Example #5
    def __init__(self,
                 initial_ltol: float = 0.2,
                 initial_stol: float = 0.3,
                 initial_angle_tol: float = 5.,
                 use_fingerprint_matching: bool = True,
                 fingerprint_distance_cutoff: float = 0.4):
        db_file = resource_filename('robocrys.condense', 'mineral_db.json.gz')
        self.mineral_db = load_dataframe_from_json(db_file)
        self.initial_ltol = initial_ltol
        self.initial_stol = initial_stol
        self.initial_angle_tol = initial_angle_tol
        self.fingerprint_distance_cutoff = fingerprint_distance_cutoff
        self.use_fingerprint_matching = use_fingerprint_matching
        self._structure = None
        self._mineral_db = None
Example #6
File: load.py Project: alvarotb/scm2020
# -- start

import os
import pandas as pd
import numpy as np
import keras
import matminer
import pickle

from keras.models import load_model
from matminer.utils.io import load_dataframe_from_json
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from supplement import searchNN, trainNN, evaluateModel, createRF  #stores custom methods/functions

fdf = load_dataframe_from_json('Batteries_feat.json')
print("The starting dataset has {}".format(fdf.shape))
print(fdf.head())
'''
Block 1 - Random Forest
'''
print('\n---\n')
print('Selecting target variable...')

excluded = [
    'Spacegroup', 'Capacity Grav', 'Capacity Vol', 'Specific E Wh/kg',
    'E Density Wh/l', 'Stability Charge', 'Stability Discharge', 'Ion',
    'Reduced Formula', 'Id', 'composition', 'composition_oxid',
    'HOMO_character', 'HOMO_element', 'LUMO_character', 'LUMO_element'
]
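A plausible continuation of this block, sketching how the excluded columns might be separated from the feature matrix before fitting the random forest; the choice of 'Capacity Grav' as the target is an assumption for illustration:

from sklearn.model_selection import train_test_split

target = 'Capacity Grav'            # assumed target; any of the excluded properties could be used
y = fdf[target]
X = fdf.drop(columns=excluded)      # keep only the featurized columns
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=0)
print(X_train.shape, X_test.shape)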
Example #7
def load_dataset(name,
                 data_home=None,
                 download_if_missing=True,
                 include_metadata=False):
    """
    Loads a dataframe containing the dataset specified with the 'name' field.

    Dataset file is stored/loaded from data_home if specified, otherwise at
    the MATMINER_DATA environment variable if set or at matminer/datasets
    by default.

    Args:
        name (str): keyword specifying what dataset to load, run
            matminer.datasets.available_datasets() for options

        data_home (str): path to folder to look for dataset file

        download_if_missing (bool): whether to download the dataset if it is
            not found on disk

        include_metadata (bool): optional argument for some datasets with
            metadata fields

    Returns: (pd.DataFrame)
    """
    dataset_dict = _load_dataset_dict()

    if name not in dataset_dict:
        error_string = "Unrecognized dataset name: {}. \n" \
                       "Use matminer.datasets.available_datasets() " \
                       "to see a list of currently available " \
                       "datasets".format(name)

        # Very simple attempt to match unrecognized keyword to existing
        # dataset names in an attempt to give the user immediate feedback
        possible_matches = [
            x for x in dataset_dict.keys() if name.lower() in x.lower()
        ]

        if possible_matches:
            error_string += "\nCould you have been looking for these similar " \
                            "matches?:\n{}".format(possible_matches)

        raise ValueError(error_string)

    dataset_metadata = dataset_dict[name]
    data_path = os.path.join(_get_data_home(data_home),
                             name + "." + dataset_metadata['file_type'])
    _validate_dataset(data_path, dataset_metadata['url'],
                      dataset_metadata['hash'], download_if_missing)

    df = load_dataframe_from_json(data_path)

    if not include_metadata:
        if name == "elastic_tensor_2015":
            df = df.drop(['cif', 'kpoint_density', 'poscar'], axis=1)

        elif name in {"piezoelectric_tensor", "dielectric_constant"}:
            df = df.drop(['cif', 'meta', 'poscar'], axis=1)

    return df
Example #8
import os

from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold
from automatminer.utils.ml import regression_or_classification
from automatminer.utils.ml import AMM_CLF_NAME, AMM_REG_NAME
from automatminer_dev.config import BENCHMARK_FULL_SET, GLASS, EXPT_IS_METAL, EXPT_GAP
from matminer.utils.io import load_dataframe_from_json

benchmark_dir = os.environ["AMM_DATASET_DIR"]

bmarks = BENCHMARK_FULL_SET
bmarks = [GLASS, EXPT_GAP, EXPT_IS_METAL]

for p in bmarks:
    pname = p["name"]
    print("Loading {}".format(pname))
    df = load_dataframe_from_json(os.path.join(benchmark_dir, p["data_file"]))
    target = p["target"]
    ltype = p["problem_type"]
    if ltype == AMM_REG_NAME:
        kf = KFold(n_splits=5, random_state=18012019, shuffle=True)
        estimator = DummyRegressor(strategy="mean")
        scoring = "neg_mean_absolute_error"
        multiplier = -1
    elif ltype == AMM_CLF_NAME:
        kf = StratifiedKFold(n_splits=5, random_state=18012019, shuffle=True)
        estimator = DummyClassifier(strategy="stratified")
        multiplier = 1
        scoring = "roc_auc"
    else:
        raise ValueError("problem type {} is not known.".format(ltype))
Example #9
import pandas as pd
import numpy as np
import os

from pymatgen import MPRester

import matminer
from matminer.data_retrieval.retrieve_MP import MPDataRetrieval
from matminer.utils.io import store_dataframe_as_json
from matminer.utils.io import load_dataframe_from_json
from matminer.figrecipes.plot import PlotlyFig
'''
#Block 1 - Loading and filtering the experimental dataframe
'''
df = load_dataframe_from_json('data/Batteries_raw.json')

# Select the working ion among {Li, Al, Zr, Mg}
select = 'Li'

# Initial filter based on the selected element
from matminer.featurizers.conversions import StrToComposition
fdf = StrToComposition().featurize_dataframe(df, 'Ion')

select_at = fdf["composition"].apply(lambda x: x.get_atomic_fraction(select))
fdf = fdf[select_at == 1]

# Debug
print("Remaining samples: {}".format(fdf.describe))
fdf = fdf.drop(['composition'], axis=1)
Example #10
    def test_featurize_bsdos(self, refresh_df_init=False, limit=1):
        """
        Tests featurize_dos and featurize_bandstructure.

        Args:
            refresh_df_init (bool): for developers, if the test need to be
                updated set to True. Otherwise set to False to make the final
                test independent of MPRester and faster.
            limit (int): the maximum final number of entries.

        Returns (None):
        """
        target = "color"
        df_bsdos_pickled = "mp_data_with_dos_bandstructure.pickle"
        save_path = os.path.join(TEST_DIR, df_bsdos_pickled)
        if refresh_df_init:
            mpdr = MPDataRetrieval()
            df = mpdr.get_dataframe(
                criteria={"material_id": "mp-149"},
                properties=[
                    "pretty_formula",
                    "dos",
                    "bandstructure",
                    "bandstructure_uniform",
                ],
            )
            store_dataframe_as_json(df, save_path)
        else:
            df = load_dataframe_from_json(save_path)
        df = df.dropna(axis=0)
        df = df.rename(
            columns={
                "bandstructure_uniform": "bandstructure",
                "bandstructure": "line bandstructure",
            }
        )
        df[target] = [["red"]]
        n_cols_init = df.shape[1]

        featurizer = AutoFeaturizer(
            preset="express", ignore_errors=False, multiindex=False
        )
        df = featurizer.fit_transform(df, target)

        # sanity checks
        self.assertEqual(len(df), limit)
        self.assertGreater(len(df.columns), n_cols_init)

        # DOSFeaturizer:
        self.assertEqual(df["cbm_character_1"][0], "p")

        # DopingFermi:
        self.assertAlmostEqual(df["fermi_c1e+20T300"][0], -0.539, 3)

        # Hybridization:
        self.assertAlmostEqual(df["vbm_sp"][0], 0.181, 3)
        self.assertAlmostEqual(df["cbm_s"][0], 0.4416, 3)
        self.assertAlmostEqual(df["cbm_sp"][0], 0.9864, 3)

        # BandFeaturizer:
        self.assertAlmostEqual(df["direct_gap"][0], 2.556, 3)
        self.assertAlmostEqual(df["n_ex1_norm"][0], 0.6285, 4)

        # BranchPointEnergy:
        self.assertAlmostEqual(df["branch_point_energy"][0], 5.7677, 4)
Example #11
File: core.py Project: suthzx/automatminer
    def transform(self, df, target, prevent_cache_overwrite=False):
        """
        Decorate a dataframe containing composition, structure, bandstructure,
        and/or DOS objects with descriptors.

        Args:
            df (pandas.DataFrame): The dataframe not containing features.
            target (str): The ML-target property contained in the df.
            prevent_cache_overwrite (bool): If True, does not try to write any
                new features to the cache.

        Returns:
            df (pandas.DataFrame): Transformed dataframe containing features.
        """
        if self.cache_src and os.path.exists(self.cache_src):
            logger.debug(self._log_prefix +
                         "Reading cache_src {}".format(self.cache_src))
            cached_df = load_dataframe_from_json(self.cache_src)
            if not all([loc in cached_df.index for loc in df.index]):
                raise AutomatminerError("Feature cache does not contain all "
                                        "entries (by DataFrame index) needed "
                                        "to transform the input df.")
            else:
                cached_subdf = cached_df.loc[df.index]
                if target in cached_subdf.columns:
                    if target not in df.columns:
                        logger.warning(
                            self._log_prefix +
                            "Target not present in both cached df and input df."
                            " Cannot perform comparison to ensure index match."
                        )
                    else:
                        cached_targets = cached_subdf[target]
                        input_targets = df[target]
                        cached_type = regression_or_classification(
                            cached_targets)
                        input_type = regression_or_classification(
                            input_targets)
                        if cached_type != input_type:
                            raise AutomatminerError(
                                "Cached targets appear to be '{}' type, while "
                                "input targets appear to be '{}'."
                                "".format(cached_type, input_type))

                        problems = {}
                        for ix in input_targets.index:
                            iv = input_targets[ix]
                            cv = cached_targets[ix]
                            if iv != cv:
                                try:
                                    if not math.isclose(iv, cv):
                                        problems[ix] = [iv, cv]
                                except TypeError:
                                    pass
                        if problems:
                            logger.warning(
                                self._log_prefix +
                                "Mismatch between cached targets and input "
                                "targets: \n{}".format(problems))

                logger.info(self._log_prefix +
                            "Restored {} features on {} samples from "
                            "cache {}".format(len(cached_subdf.columns),
                                              len(df.index), self.cache_src))
                return cached_subdf
        else:
            transforming_on_fitted = df is self.fitted_input_df
            df = self._prescreen_df(df, inplace=True)

            if transforming_on_fitted:
                df = self.converted_input_df
            else:
                df = self._add_composition_from_structure(df)

            for featurizer_type, featurizers in self.featurizers.items():
                if featurizer_type in df.columns:
                    if not transforming_on_fitted:
                        df = self._tidy_column(df, featurizer_type)

                    for f in featurizers:
                        logger.info(self._log_prefix + "Featurizing with {}."
                                    "".format(f.__class__.__name__))
                        df = f.featurize_dataframe(
                            df,
                            featurizer_type,
                            ignore_errors=self.ignore_errors,
                            multiindex=self.multiindex,
                            inplace=False,
                        )
                    if self.drop_inputs:
                        df = df.drop(columns=[featurizer_type])
                else:
                    logger.info(self._log_prefix +
                                "Featurizer type {} not in the dataframe. "
                                "Skipping...".format(featurizer_type))
            if self.functionalize:
                ff = FunctionFeaturizer()
                ff.set_n_jobs(self.n_jobs)
                cols = df.columns.tolist()
                for ft in self.featurizers.keys():
                    if ft in cols:
                        cols.remove(ft)
                df = ff.fit_featurize_dataframe(
                    df,
                    cols,
                    ignore_errors=self.ignore_errors,
                    multiindex=self.multiindex,
                    inplace=False,
                )
            if (self.cache_src and not os.path.exists(self.cache_src)
                    and not prevent_cache_overwrite):
                store_dataframe_as_json(df, self.cache_src)
            return df
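A small sketch of the caching behaviour implemented above, with an AutoFeaturizer built directly; the cache file name and the target column are assumptions. The first call featurizes the dataframe and writes it to cache_src, and a later transform on the same index is restored from that file instead of being recomputed:

# df is assumed to be a dataframe with structure/composition columns and a target column.
af = AutoFeaturizer(preset="express", cache_src="features.json")
df_featurized = af.fit_transform(df, target="gap expt")   # featurizes, then writes the cache
df_cached = af.transform(df, target="gap expt")           # restored from features.json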
Example #12
# %% where there are several mp_ids, pick the one with lowest energy above convex hull
def get_e_above_hull(mp_id: str) -> float:
    return mpr.query(mp_id, ["e_above_hull"])["e_above_hull"]


phonons["es_above_hull"] = phonons.likely_mp_ids.progress_apply(
    lambda ids: [get_e_above_hull(id) for id in ids])

phonons["likely_mp_id"] = phonons.apply(
    lambda row: row.likely_mp_ids[np.argmin(row.es_above_hull)], axis=1)

# %%
cols = ["structure", "last phdos peak", "likely_mp_id"]
store_dataframe_as_json(phonons[cols], "matbench-phonons-with-mp-id.json.gz")

phonons[cols] = load_dataframe_from_json("matbench-phonons-with-mp-id.json.gz")

# %%
phonons[["sg_symbol", "sg_number"]] = phonons.progress_apply(
    lambda row: row.structure.get_space_group_info(),
    axis=1,
    result_type="expand")

phonons["crystal_system"] = phonons.structure.progress_apply(
    lambda struct: SpacegroupAnalyzer(struct).get_crystal_system())

phonons[["sg_symbol", "sg_number", "crystal_system", "volume",
         "formula"]].to_csv("additional-df-cols.csv", index=False)

# %%
phonons[["sg_symbol", "sg_number", "crystal_system", "volume",
Example #13
    def run_task(self, fw_spec):
        # Read data from fw_spec
        pipe_config_dict = fw_spec["pipe_config"]
        fold = fw_spec["fold"]
        kfold_config = fw_spec["kfold_config"]
        target = fw_spec["target"]
        data_file = fw_spec["data_file"]
        clf_pos_label = fw_spec["clf_pos_label"]
        problem_type = fw_spec["problem_type"]
        learner_name = pipe_config_dict["learner_name"]
        cache = fw_spec["cache"]
        learner_kwargs = pipe_config_dict["learner_kwargs"]
        reducer_kwargs = pipe_config_dict["reducer_kwargs"]
        cleaner_kwargs = pipe_config_dict["cleaner_kwargs"]
        autofeaturizer_kwargs = pipe_config_dict["autofeaturizer_kwargs"]

        # Modify data_file based on computing resource
        data_dir = os.environ["AMM_DATASET_DIR"]
        data_file = os.path.join(data_dir, data_file)

        # Modify save_dir based on computing resource
        bench_dir = os.environ["AMM_BENCH_DIR"]
        base_save_dir = fw_spec["base_save_dir"]
        base_save_dir = os.path.join(bench_dir, base_save_dir)
        save_dir = fw_spec.pop("save_dir")
        save_dir = os.path.join(base_save_dir, save_dir)

        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        # Set up pipeline config
        if learner_name == "TPOTAdaptor":
            learner = TPOTAdaptor(**learner_kwargs)
        elif learner_name == "rf":
            warnings.warn(
                "Learner kwargs are passed to both the RandomForest regressor "
                "and classifier because learner_name is 'rf'."
            )
            learner = SinglePipelineAdaptor(
                regressor=RandomForestRegressor(**learner_kwargs),
                classifier=RandomForestClassifier(**learner_kwargs),
            )
        else:
            raise ValueError("{} not supported by RunPipe yet!"
                             "".format(learner_name))
        if cache:
            autofeaturizer_kwargs["cache_src"] = os.path.join(
                base_save_dir, "features.json")
        pipe_config = {
            "learner": learner,
            "reducer": FeatureReducer(**reducer_kwargs),
            "cleaner": DataCleaner(**cleaner_kwargs),
            "autofeaturizer": AutoFeaturizer(**autofeaturizer_kwargs),
        }

        pipe = MatPipe(**pipe_config)

        # Set up dataset
        # Dataset should already be set up correctly as json beforehand.
        # This includes targets being converted to classification, removing
        # extra columns, having the names of featurization cols set to the
        # same as the matpipe config, etc.
        df = load_dataframe_from_json(data_file)

        # Check other parameters that would otherwise not be checked until after
        # benchmarking, hopefully saves some errors at the end during scoring.
        if problem_type not in [AMM_CLF_NAME, AMM_REG_NAME]:
            raise ValueError("Problem must be either classification or "
                             "regression.")
        elif problem_type == AMM_CLF_NAME:
            if not isinstance(clf_pos_label, (str, bool)):
                raise TypeError(
                    "The classification positive label should be a "
                    "string, or bool not {}."
                    "".format(type(clf_pos_label)))
            elif clf_pos_label not in df[target].values:
                raise ValueError("The classification positive label should be "
                                 "present in the target column.")
            elif len(df[target].unique()) > 2:
                raise ValueError("Only binary classification scoring available"
                                 "at this time.")

        # Set up testing scheme
        if problem_type == AMM_REG_NAME:
            kfold = KFold(**kfold_config)
        else:
            kfold = StratifiedKFold(**kfold_config)
        if fold >= kfold.n_splits:
            raise ValueError("{} is out of range for KFold with n_splits="
                             "{}".format(fold, kfold))

        # Run the benchmark
        t1 = time.time()
        results = pipe.benchmark(df,
                                 target,
                                 kfold,
                                 fold_subset=[fold],
                                 cache=True)
        result_df = results[0]
        elapsed_time = time.time() - t1

        # Save everything
        pipe.save(os.path.join(save_dir, "pipe.p"))
        pipe.inspect(filename=os.path.join(save_dir, "digest.txt"))
        result_df.to_csv(os.path.join(save_dir, "test_df.csv"))
        pipe.post_fit_df.to_csv(os.path.join(save_dir, "fitted_df.csv"))

        # Evaluate model
        true = result_df[target]
        test = result_df[target + " predicted"]

        pass_to_storage = {}
        if problem_type == AMM_REG_NAME:
            pass_to_storage["r2"] = r2_score(true, test)
            pass_to_storage["mae"] = mean_absolute_error(true, test)
            pass_to_storage["rmse"] = sqrt(mean_squared_error(true, test))
        elif problem_type == AMM_CLF_NAME:
            pass_to_storage["f1"] = f1_score(true,
                                             test,
                                             pos_label=clf_pos_label)
            pass_to_storage["roc_auc"] = roc_auc_score(true, test)
            pass_to_storage["accuracy"] = accuracy_score(true, test)
        else:
            raise ValueError("Scoring method for problem type {} not supported"
                             "".format(problem_type))

        # Extract important inspect for storage
        try:
            # TPOT Adaptor
            best_pipeline = [
                str(step) for step in pipe.learner.best_pipeline.steps
            ]
        except AttributeError:
            best_pipeline = str(pipe.learner.best_pipeline)

        features = pipe.learner.features
        n_features = len(features)
        fold_orig = list(kfold.split(df, y=df[target]))[fold]
        n_samples_train_original = len(fold_orig[0])
        n_samples_test_original = len(fold_orig[1])

        pass_to_storage.update({
            "target": target,
            "best_pipeline": best_pipeline,
            "elapsed_time": elapsed_time,
            "features": features,
            "n_features": n_features,
            "n_test_samples_original": n_samples_test_original,
            "n_train_samples_original": n_samples_train_original,
            "n_train_samples": len(pipe.post_fit_df),
            "n_test_samples": len(test),
            "test_sample_frac_retained": len(test) / n_samples_test_original,
            "completion_time": datetime.datetime.now(),
            "base_save_dir": base_save_dir,
            "save_dir": save_dir,
        })
        fw_spec.update(pass_to_storage)