Example #1
    def test_BaseEstimator(self):
        learner = SinglePipelineAdaptor(regressor=RandomForestRegressor(),
                                        classifier=RandomForestClassifier())
        target_key = "K_VRH"
        learner.fit(self.train_df, target_key)
        test_w_predictions = learner.predict(self.test_df, target_key)
        y_true = test_w_predictions[target_key]
        y_test = test_w_predictions[target_key + " predicted"]
        self.assertGreater(r2_score(y_true, y_test), 0.75)
Example #2
    def test_Pipeline(self):
        modelr = Pipeline([("scaler", StandardScaler()),
                           ("rfr", RandomForestRegressor())])
        modelc = Pipeline([("scaler", StandardScaler()),
                           ("rfc", RandomForestClassifier())])
        learner = SinglePipelineAdaptor(regressor=modelr, classifier=modelc)
        target_key = "K_VRH"
        learner.fit(self.train_df, target_key)
        test_w_predictions = learner.predict(self.test_df, target_key)
        y_true = test_w_predictions[target_key]
        y_test = test_w_predictions[target_key + " predicted"]
        score = r2_score(y_true, y_test)
        print(score)
        self.assertGreater(score, 0.75)
Example #3
    def test_BaseEstimator_classification(self):
        learner = SinglePipelineAdaptor(regressor=RandomForestRegressor(),
                                        classifier=RandomForestClassifier())
        # Prepare dataset for classification
        train_df = self.train_df
        test_df = self.test_df
        for df in [train_df, test_df]:
            df["K_VRH"] = df["K_VRH"] > 150
            df.rename(columns={"K_VRH": "K_VRH > 50"}, inplace=True)

        print(train_df["K_VRH > 50"].value_counts())
        print(test_df["K_VRH > 50"].value_counts())

        target_key = "K_VRH > 50"
        learner.fit(self.train_df, target_key)
        test_w_predictions = learner.predict(self.test_df, target_key)
        y_true = test_w_predictions[target_key]
        y_test = test_w_predictions[target_key + " predicted"]
        print(f1_score(y_true, y_test))
        self.assertGreater(f1_score(y_true, y_test), 0.65)
Example #4
    def run_task(self, fw_spec):
        # Read data from fw_spec
        pipe_config_dict = fw_spec["pipe_config"]
        target = fw_spec["target"]
        data_file = fw_spec["data_file"]
        learner_name = pipe_config_dict["learner_name"]
        learner_kwargs = pipe_config_dict["learner_kwargs"]
        reducer_kwargs = pipe_config_dict["reducer_kwargs"]
        cleaner_kwargs = pipe_config_dict["cleaner_kwargs"]
        autofeaturizer_kwargs = pipe_config_dict["autofeaturizer_kwargs"]

        # Modify data_file based on computing resource
        data_dir = os.environ["AMM_DATASET_DIR"]
        data_file = os.path.join(data_dir, data_file)

        # Modify save_dir based on computing resource
        bench_dir = os.environ["AMM_SINGLE_FIT_DIR"]
        base_save_dir = fw_spec["base_save_dir"]
        base_save_dir = os.path.join(bench_dir, base_save_dir)

        if not os.path.exists(base_save_dir):
            os.makedirs(base_save_dir)

        # Set up pipeline config
        if learner_name == "TPOTAdaptor":
            learner = TPOTAdaptor(**learner_kwargs)
        elif learner_name == "rf":
            warnings.warn(
                "Learner kwargs are passed into the random forest "
                "regressor/classifier because the 'rf' learner is being used."
            )
            learner = SinglePipelineAdaptor(
                regressor=RandomForestRegressor(**learner_kwargs),
                classifier=RandomForestClassifier(**learner_kwargs),
            )
        else:
            raise ValueError("{} not supported yet!" "".format(learner_name))
        pipe_config = {
            "learner": learner,
            "reducer": FeatureReducer(**reducer_kwargs),
            "cleaner": DataCleaner(**cleaner_kwargs),
            "autofeaturizer": AutoFeaturizer(**autofeaturizer_kwargs),
        }
        pipe = MatPipe(**pipe_config)

        # Set up dataset
        # The dataset should already be set up correctly as JSON beforehand.
        # This includes targets being converted to classification, extra
        # columns being removed, featurization column names matching the
        # MatPipe config, etc.
        df = load_dataframe_from_json(data_file)

        pipe.fit(df, target)
        pipe.save(os.path.join(base_save_dir, "pipe.p"))
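For context, the keys this task reads imply a fw_spec shaped roughly like the sketch below. The key names are grounded in the code above; every value is a hypothetical placeholder, not taken from the source.

# Hypothetical fw_spec for the single-fit task; values are illustrative only.
fw_spec = {
    "target": "K_VRH",
    "data_file": "elasticity.json",     # joined onto $AMM_DATASET_DIR
    "base_save_dir": "single_fit_run",  # joined onto $AMM_SINGLE_FIT_DIR
    "pipe_config": {
        "learner_name": "rf",           # or "TPOTAdaptor"
        "learner_kwargs": {"n_estimators": 500},
        "reducer_kwargs": {},
        "cleaner_kwargs": {},
        "autofeaturizer_kwargs": {"preset": "express"},
    },
}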
Example #5
    def test_feature_mismatching(self):
        learner = SinglePipelineAdaptor(regressor=RandomForestRegressor(),
                                        classifier=RandomForestClassifier())
        target_key = "K_VRH"
        df1 = self.train_df
        df2 = self.test_df.rename(columns={"mean X": "some other feature"})
        learner.fit(df1, target_key)
        with self.assertRaises(AutomatminerError):
            learner.predict(df2, target_key)
Example #6
    def run_task(self, fw_spec):
        # Read data from fw_spec
        pipe_config_dict = fw_spec["pipe_config"]
        fold = fw_spec["fold"]
        kfold_config = fw_spec["kfold_config"]
        target = fw_spec["target"]
        data_pickle = fw_spec["data_pickle"]
        clf_pos_label = fw_spec["clf_pos_label"]
        problem_type = fw_spec["problem_type"]
        learner_name = pipe_config_dict["learner_name"]
        cache = fw_spec["cache"]
        learner_kwargs = pipe_config_dict["learner_kwargs"]
        reducer_kwargs = pipe_config_dict["reducer_kwargs"]
        cleaner_kwargs = pipe_config_dict["cleaner_kwargs"]
        autofeaturizer_kwargs = pipe_config_dict["autofeaturizer_kwargs"]

        # Modify data_pickle based on computing resource
        data_dir = os.environ["AMM_DATASET_DIR"]
        data_file = os.path.join(data_dir, data_pickle)

        # Modify save_dir based on computing resource
        bench_dir = os.environ["AMM_BENCH_DIR"]
        base_save_dir = fw_spec["base_save_dir"]
        base_save_dir = os.path.join(bench_dir, base_save_dir)
        save_dir = fw_spec.pop("save_dir")
        save_dir = os.path.join(base_save_dir, save_dir)

        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

        from multiprocessing import cpu_count
        ont = os.environ.get("OMP_NUM_THREADS", None)
        print("Number of omp threads: {}".format(ont))
        print("Number of cpus: {}".format(cpu_count()))
        # n_jobs = int(cpu_count()/2)
        # print("Setting number of featurization jobs to: {}".format(n_jobs))
        # autofeaturizer_kwargs["n_jobs"] = n_jobs
        # learner_kwargs["verbosity"] = 3

        # Set up pipeline config
        if learner_name == "TPOTAdaptor":
            learner = TPOTAdaptor(**learner_kwargs)
        elif learner_name == "rf":
            warnings.warn(
                "Learner kwargs are passed into the random forest "
                "regressor/classifier because the 'rf' learner is being used."
            )
            learner = SinglePipelineAdaptor(
                regressor=RandomForestRegressor(**learner_kwargs),
                classifier=RandomForestClassifier(**learner_kwargs))
        else:
            raise ValueError("{} not supported by RunPipe yet!"
                             "".format(learner_name))
        if cache:
            autofeaturizer_kwargs["cache_src"] = os.path.join(
                base_save_dir, "features.json")
        pipe_config = {
            "learner": learner,
            "reducer": FeatureReducer(**reducer_kwargs),
            "cleaner": DataCleaner(**cleaner_kwargs),
            "autofeaturizer": AutoFeaturizer(**autofeaturizer_kwargs)
        }

        logger = initialize_logger(AMM_LOGGER_BASENAME, filepath=save_dir)
        pipe = MatPipe(**pipe_config, logger=logger)

        # Set up dataset
        # Dataset should already be set up correctly as pickle beforehand.
        # this includes targets being converted to classification, removing
        # extra columns, having the names of featurization cols set to the
        # same as the matpipe config, etc.
        df = pd.read_pickle(data_file)

        # Check other parameters that would otherwise not be checked until after
        # benchmarking, hopefully saves some errors at the end during scoring.
        if problem_type not in [AMM_CLF_NAME, AMM_REG_NAME]:
            raise ValueError("Problem must be either classification or "
                             "regression.")
        elif problem_type == AMM_CLF_NAME:
            if not isinstance(clf_pos_label, (str, bool)):
                raise TypeError(
                    "The classification positive label should be a "
                    "string or bool, not {}.".format(type(clf_pos_label)))
            # Note: `in` on a pandas Series checks the index, not the values.
            elif clf_pos_label not in df[target].values:
                raise ValueError("The classification positive label should be "
                                 "present in the target column.")
            elif len(df[target].unique()) > 2:
                raise ValueError("Only binary classification scoring available"
                                 "at this time.")

        # Set up testing scheme
        if problem_type == AMM_REG_NAME:
            kfold = KFold(**kfold_config)
        else:
            kfold = StratifiedKFold(**kfold_config)
        if fold >= kfold.n_splits:
            raise ValueError("{} is out of range for KFold with n_splits="
                             "{}".format(fold, kfold))

        # Run the benchmark
        t1 = time.time()
        results = pipe.benchmark(df,
                                 target,
                                 kfold,
                                 fold_subset=[fold],
                                 cache=True)
        result_df = results[0]
        elapsed_time = time.time() - t1

        # Save everything
        pipe.save(os.path.join(save_dir, "pipe.p"))
        pipe.digest(os.path.join(save_dir, "digest.txt"))
        result_df.to_csv(os.path.join(save_dir, "test_df.csv"))
        pipe.post_fit_df.to_csv(os.path.join(save_dir, "fitted_df.csv"))

        # Evaluate model
        true = result_df[target]
        test = result_df[target + " predicted"]

        pass_to_storage = {}
        if problem_type == AMM_REG_NAME:
            pass_to_storage["r2"] = r2_score(true, test)
            pass_to_storage["mae"] = mean_absolute_error(true, test)
            pass_to_storage["rmse"] = sqrt(mean_squared_error(true, test))
        elif problem_type == AMM_CLF_NAME:
            pass_to_storage["f1"] = f1_score(true,
                                             test,
                                             pos_label=clf_pos_label)
            pass_to_storage["roc_auc"] = roc_auc_score(true, test)
            pass_to_storage["accuracy"] = accuracy_score(true, test)
        else:
            raise ValueError("Scoring method for problem type {} not supported"
                             "".format(problem_type))

        # Extract important details for storage
        try:
            # TPOT Adaptor
            best_pipeline = [
                str(step) for step in pipe.learner.best_pipeline.steps
            ]
        except AttributeError:
            best_pipeline = str(pipe.learner.best_pipeline)

        features = pipe.learner.features
        n_features = len(features)
        fold_orig = list(kfold.split(df, y=df[target]))[fold]
        n_samples_train_original = len(fold_orig[0])
        n_samples_test_original = len(fold_orig[1])

        pass_to_storage.update({
            "target": target,
            "best_pipeline": best_pipeline,
            "elapsed_time": elapsed_time,
            "features": features,
            "n_features": n_features,
            "n_test_samples_original": n_samples_test_original,
            "n_train_samples_original": n_samples_train_original,
            "n_train_samples": len(pipe.post_fit_df),
            "n_test_samples": len(test),
            "test_sample_frac_retained": len(test) / n_samples_test_original,
            "completion_time": datetime.datetime.now(),
            "base_save_dir": base_save_dir,
            "save_dir": save_dir
        })
        fw_spec.update(pass_to_storage)
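The benchmark task reads a richer spec than the single-fit task. A sketch of the keys it consumes follows; again, only the key names are grounded in the code, and all values are hypothetical placeholders.

# Hypothetical fw_spec for one fold of the benchmark task.
fw_spec = {
    "target": "K_VRH",
    "data_pickle": "elasticity.pickle",  # joined onto $AMM_DATASET_DIR
    "base_save_dir": "bench_run",        # joined onto $AMM_BENCH_DIR
    "save_dir": "fold_0",                # per-fold subdirectory
    "fold": 0,                           # which CV fold this Firework runs
    "kfold_config": {"n_splits": 5, "shuffle": True, "random_state": 0},
    "problem_type": AMM_REG_NAME,        # or AMM_CLF_NAME
    "clf_pos_label": True,               # only checked for classification
    "cache": True,                       # share features.json across folds
    "pipe_config": {
        "learner_name": "TPOTAdaptor",
        "learner_kwargs": {"max_time_mins": 60},
        "reducer_kwargs": {},
        "cleaner_kwargs": {},
        "autofeaturizer_kwargs": {"preset": "express"},
    },
}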
Example #7
"""
Reduce the number of jobs (n_jobs) for less memory usage on consumer machines.
"""

from automatminer import MatPipe
from automatminer.featurization import AutoFeaturizer
from automatminer.preprocessing import DataCleaner, FeatureReducer
from automatminer.automl.adaptors import TPOTAdaptor, SinglePipelineAdaptor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

from matbench.bench import MatbenchBenchmark

# The learner is a single 500-estimator Random Forest model
learner = SinglePipelineAdaptor(
    regressor=RandomForestRegressor(n_estimators=500),
    classifier=RandomForestClassifier(n_estimators=500),
)
pipe_config = {
    "learner": learner,
    "reducer": FeatureReducer(reducers=[]),
    "cleaner": DataCleaner(feature_na_method="mean",
                           max_na_frac=0.01,
                           na_method_fit="drop",
                           na_method_transform="mean"),
    "autofeaturizer": AutoFeaturizer(n_jobs=10, preset="debug"),
}