def test_BaseEstimator(self):
    learner = SinglePipelineAdaptor(
        regressor=RandomForestRegressor(),
        classifier=RandomForestClassifier(),
    )
    target_key = "K_VRH"
    learner.fit(self.train_df, target_key)
    test_w_predictions = learner.predict(self.test_df, target_key)
    y_true = test_w_predictions[target_key]
    y_test = test_w_predictions[target_key + " predicted"]
    self.assertGreater(r2_score(y_true, y_test), 0.75)
def test_Pipeline(self):
    modelr = Pipeline(
        [("scaler", StandardScaler()), ("rfr", RandomForestRegressor())]
    )
    modelc = Pipeline(
        [("scaler", StandardScaler()), ("rfc", RandomForestClassifier())]
    )
    learner = SinglePipelineAdaptor(regressor=modelr, classifier=modelc)
    target_key = "K_VRH"
    learner.fit(self.train_df, target_key)
    test_w_predictions = learner.predict(self.test_df, target_key)
    y_true = test_w_predictions[target_key]
    y_test = test_w_predictions[target_key + " predicted"]
    print(r2_score(y_true, y_test))
    self.assertGreater(r2_score(y_true, y_test), 0.75)
def test_BaseEstimator_classification(self):
    learner = SinglePipelineAdaptor(
        regressor=RandomForestRegressor(),
        classifier=RandomForestClassifier(),
    )

    # Prepare dataset for classification: binarize K_VRH at 150
    train_df = self.train_df
    test_df = self.test_df
    for df in [train_df, test_df]:
        df["K_VRH"] = df["K_VRH"] > 150
        df.rename(columns={"K_VRH": "K_VRH > 150"}, inplace=True)

    print(train_df["K_VRH > 150"].value_counts())
    print(test_df["K_VRH > 150"].value_counts())

    target_key = "K_VRH > 150"
    learner.fit(train_df, target_key)
    test_w_predictions = learner.predict(test_df, target_key)
    y_true = test_w_predictions[target_key]
    y_test = test_w_predictions[target_key + " predicted"]
    print(f1_score(y_true, y_test))
    self.assertGreater(f1_score(y_true, y_test), 0.65)
def run_task(self, fw_spec):
    # Read data from fw_spec
    pipe_config_dict = fw_spec["pipe_config"]
    target = fw_spec["target"]
    data_file = fw_spec["data_file"]
    learner_name = pipe_config_dict["learner_name"]
    learner_kwargs = pipe_config_dict["learner_kwargs"]
    reducer_kwargs = pipe_config_dict["reducer_kwargs"]
    cleaner_kwargs = pipe_config_dict["cleaner_kwargs"]
    autofeaturizer_kwargs = pipe_config_dict["autofeaturizer_kwargs"]

    # Modify data_file based on computing resource
    data_dir = os.environ["AMM_DATASET_DIR"]
    data_file = os.path.join(data_dir, data_file)

    # Modify save_dir based on computing resource
    bench_dir = os.environ["AMM_SINGLE_FIT_DIR"]
    base_save_dir = fw_spec["base_save_dir"]
    base_save_dir = os.path.join(bench_dir, base_save_dir)

    if not os.path.exists(base_save_dir):
        os.makedirs(base_save_dir)

    # Set up pipeline config
    if learner_name == "TPOTAdaptor":
        learner = TPOTAdaptor(**learner_kwargs)
    elif learner_name == "rf":
        warnings.warn(
            "Learner kwargs passed into RF regressor/classifier because "
            "rf is being used."
        )
        learner = SinglePipelineAdaptor(
            regressor=RandomForestRegressor(**learner_kwargs),
            classifier=RandomForestClassifier(**learner_kwargs),
        )
    else:
        raise ValueError("{} not supported yet!".format(learner_name))

    pipe_config = {
        "learner": learner,
        "reducer": FeatureReducer(**reducer_kwargs),
        "cleaner": DataCleaner(**cleaner_kwargs),
        "autofeaturizer": AutoFeaturizer(**autofeaturizer_kwargs),
    }
    pipe = MatPipe(**pipe_config)

    # Set up dataset
    # The dataset should already be set up correctly as json beforehand.
    # This includes targets being converted to classification, extra columns
    # being removed, and the names of featurization columns matching the
    # MatPipe config, etc.
    df = load_dataframe_from_json(data_file)

    pipe.fit(df, target)
    pipe.save(os.path.join(base_save_dir, "pipe.p"))
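# For reference, a minimal sketch of the fw_spec this single-fit task expects,
# with keys mirroring exactly what run_task reads above. The dataset filename,
# directory name, and kwargs are hypothetical placeholders rather than values
# from any real run; at runtime data_file is resolved against $AMM_DATASET_DIR
# and base_save_dir against $AMM_SINGLE_FIT_DIR.
example_fw_spec = {
    "target": "K_VRH",
    "data_file": "some_dataset.json",  # joined onto $AMM_DATASET_DIR
    "base_save_dir": "single_fit_rf",  # joined onto $AMM_SINGLE_FIT_DIR
    "pipe_config": {
        "learner_name": "rf",          # or "TPOTAdaptor"
        "learner_kwargs": {"n_estimators": 500},
        "reducer_kwargs": {},
        "cleaner_kwargs": {},
        "autofeaturizer_kwargs": {"preset": "debug"},
    },
}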
def test_feature_mismatching(self):
    learner = SinglePipelineAdaptor(
        regressor=RandomForestRegressor(),
        classifier=RandomForestClassifier(),
    )
    target_key = "K_VRH"
    df1 = self.train_df
    df2 = self.test_df.rename(columns={"mean X": "some other feature"})
    learner.fit(df1, target_key)
    with self.assertRaises(AutomatminerError):
        learner.predict(df2, target_key)
def run_task(self, fw_spec):
    # Read data from fw_spec
    pipe_config_dict = fw_spec["pipe_config"]
    fold = fw_spec["fold"]
    kfold_config = fw_spec["kfold_config"]
    target = fw_spec["target"]
    data_pickle = fw_spec["data_pickle"]
    clf_pos_label = fw_spec["clf_pos_label"]
    problem_type = fw_spec["problem_type"]
    learner_name = pipe_config_dict["learner_name"]
    cache = fw_spec["cache"]
    learner_kwargs = pipe_config_dict["learner_kwargs"]
    reducer_kwargs = pipe_config_dict["reducer_kwargs"]
    cleaner_kwargs = pipe_config_dict["cleaner_kwargs"]
    autofeaturizer_kwargs = pipe_config_dict["autofeaturizer_kwargs"]

    # Modify data_pickle based on computing resource
    data_dir = os.environ["AMM_DATASET_DIR"]
    data_file = os.path.join(data_dir, data_pickle)

    # Modify save_dir based on computing resource
    bench_dir = os.environ["AMM_BENCH_DIR"]
    base_save_dir = fw_spec["base_save_dir"]
    base_save_dir = os.path.join(bench_dir, base_save_dir)
    save_dir = fw_spec.pop("save_dir")
    save_dir = os.path.join(base_save_dir, save_dir)

    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    from multiprocessing import cpu_count
    ont = os.environ.get("OMP_NUM_THREADS", None)
    print("Number of omp threads: {}".format(ont))
    print("Number of cpus: {}".format(cpu_count()))
    # n_jobs = int(cpu_count() / 2)
    # print("Setting number of featurization jobs to: {}".format(n_jobs))
    # autofeaturizer_kwargs["n_jobs"] = n_jobs
    # learner_kwargs["verbosity"] = 3

    # Set up pipeline config
    if learner_name == "TPOTAdaptor":
        learner = TPOTAdaptor(**learner_kwargs)
    elif learner_name == "rf":
        warnings.warn(
            "Learner kwargs passed into RF regressor/classifier because "
            "rf is being used."
        )
        learner = SinglePipelineAdaptor(
            regressor=RandomForestRegressor(**learner_kwargs),
            classifier=RandomForestClassifier(**learner_kwargs),
        )
    else:
        raise ValueError(
            "{} not supported by RunPipe yet!".format(learner_name)
        )

    if cache:
        autofeaturizer_kwargs["cache_src"] = os.path.join(
            base_save_dir, "features.json"
        )

    pipe_config = {
        "learner": learner,
        "reducer": FeatureReducer(**reducer_kwargs),
        "cleaner": DataCleaner(**cleaner_kwargs),
        "autofeaturizer": AutoFeaturizer(**autofeaturizer_kwargs),
    }
    logger = initialize_logger(AMM_LOGGER_BASENAME, filepath=save_dir)
    pipe = MatPipe(**pipe_config, logger=logger)

    # Set up dataset
    # The dataset should already be set up correctly as a pickle beforehand.
    # This includes targets being converted to classification, extra columns
    # being removed, and the names of featurization columns matching the
    # MatPipe config, etc.
    df = pd.read_pickle(data_file)

    # Check other parameters that would otherwise not be checked until after
    # benchmarking; this hopefully catches errors before scoring.
    if problem_type not in [AMM_CLF_NAME, AMM_REG_NAME]:
        raise ValueError(
            "Problem must be either classification or regression."
        )
    elif problem_type == AMM_CLF_NAME:
        if not isinstance(clf_pos_label, (str, bool)):
            raise TypeError(
                "The classification positive label should be a string or "
                "bool, not {}.".format(type(clf_pos_label))
            )
        elif clf_pos_label not in df[target]:
            raise ValueError(
                "The classification positive label should be present in "
                "the target column."
            )
        elif len(df[target].unique()) > 2:
            raise ValueError(
                "Only binary classification scoring is available at this "
                "time."
            )

    # Set up testing scheme
    if problem_type == AMM_REG_NAME:
        kfold = KFold(**kfold_config)
    else:
        kfold = StratifiedKFold(**kfold_config)
    if fold >= kfold.n_splits:
        raise ValueError(
            "{} is out of range for KFold with n_splits={}".format(
                fold, kfold.n_splits
            )
        )

    # Run the benchmark
    t1 = time.time()
    results = pipe.benchmark(
        df, target, kfold, fold_subset=[fold], cache=True
    )
    result_df = results[0]
    elapsed_time = time.time() - t1

    # Save everything
    pipe.save(os.path.join(save_dir, "pipe.p"))
    pipe.digest(os.path.join(save_dir, "digest.txt"))
    result_df.to_csv(os.path.join(save_dir, "test_df.csv"))
    pipe.post_fit_df.to_csv(os.path.join(save_dir, "fitted_df.csv"))

    # Evaluate model
    true = result_df[target]
    test = result_df[target + " predicted"]

    pass_to_storage = {}
    if problem_type == AMM_REG_NAME:
        pass_to_storage["r2"] = r2_score(true, test)
        pass_to_storage["mae"] = mean_absolute_error(true, test)
        pass_to_storage["rmse"] = sqrt(mean_squared_error(true, test))
    elif problem_type == AMM_CLF_NAME:
        pass_to_storage["f1"] = f1_score(true, test, pos_label=clf_pos_label)
        pass_to_storage["roc_auc"] = roc_auc_score(true, test)
        pass_to_storage["accuracy"] = accuracy_score(true, test)
    else:
        raise ValueError(
            "Scoring method for problem type {} not supported"
            "".format(problem_type)
        )

    # Extract important details for storage
    try:
        # TPOTAdaptor stores the best pipeline as a set of sklearn steps
        best_pipeline = [
            str(step) for step in pipe.learner.best_pipeline.steps
        ]
    except AttributeError:
        best_pipeline = str(pipe.learner.best_pipeline)

    features = pipe.learner.features
    n_features = len(features)
    fold_orig = list(kfold.split(df, y=df[target]))[fold]
    n_samples_train_original = len(fold_orig[0])
    n_samples_test_original = len(fold_orig[1])

    pass_to_storage.update({
        "target": target,
        "best_pipeline": best_pipeline,
        "elapsed_time": elapsed_time,
        "features": features,
        "n_features": n_features,
        "n_test_samples_original": n_samples_test_original,
        "n_train_samples_original": n_samples_train_original,
        "n_train_samples": len(pipe.post_fit_df),
        "n_test_samples": len(test),
        "test_sample_frac_retained": len(test) / n_samples_test_original,
        "completion_time": datetime.datetime.now(),
        "base_save_dir": base_save_dir,
        "save_dir": save_dir,
    })
    fw_spec.update(pass_to_storage)
Reduce the number of featurization jobs (n_jobs) for lower memory usage on
consumer machines.
"""
from automatminer import MatPipe
from automatminer.featurization import AutoFeaturizer
from automatminer.preprocessing import DataCleaner, FeatureReducer
from automatminer.automl.adaptors import SinglePipelineAdaptor, TPOTAdaptor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from matbench.bench import MatbenchBenchmark

# The learner is a single 500-estimator Random Forest model
learner = SinglePipelineAdaptor(
    regressor=RandomForestRegressor(n_estimators=500),
    classifier=RandomForestClassifier(n_estimators=500),
)
pipe_config = {
    "learner": learner,
    "reducer": FeatureReducer(reducers=[]),
    "cleaner": DataCleaner(
        feature_na_method="mean",
        max_na_frac=0.01,
        na_method_fit="drop",
        na_method_transform="mean",
    ),
    "autofeaturizer": AutoFeaturizer(n_jobs=10, preset="debug"),
}
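# A minimal sketch of how this pipe_config might be run against the Matbench
# suite, assuming the standard MatbenchBenchmark fold API (load / folds /
# get_train_and_val_data / get_test_data / record). The output filename and the
# assumption that task.metadata carries the target name under "target" are
# placeholders; a fresh MatPipe is fit per fold and the "<target> predicted"
# column produced by MatPipe.predict is recorded.
mb = MatbenchBenchmark(autoload=False)
for task in mb.tasks:
    task.load()
    target = task.metadata["target"]
    for fold in task.folds:
        # Assemble a single dataframe (input column + target), which is the
        # form MatPipe.fit expects
        train_inputs, train_outputs = task.get_train_and_val_data(fold)
        df_train = train_inputs.to_frame()
        df_train[target] = train_outputs

        pipe = MatPipe(**pipe_config)
        pipe.fit(df_train, target)

        # Predict on the held-out fold and record the predictions
        test_inputs = task.get_test_data(fold, include_target=False)
        df_test = test_inputs.to_frame()
        predictions = pipe.predict(df_test)[target + " predicted"]
        task.record(fold, predictions)

mb.to_file("rf_benchmark.json.gz")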