def run_task(self, fw_spec):
    # Read data from fw_spec
    pipe_config_dict = fw_spec["pipe_config"]
    target = fw_spec["target"]
    data_file = fw_spec["data_file"]
    learner_name = pipe_config_dict["learner_name"]
    learner_kwargs = pipe_config_dict["learner_kwargs"]
    reducer_kwargs = pipe_config_dict["reducer_kwargs"]
    cleaner_kwargs = pipe_config_dict["cleaner_kwargs"]
    autofeaturizer_kwargs = pipe_config_dict["autofeaturizer_kwargs"]

    # Modify data_file based on computing resource
    data_dir = os.environ["AMM_DATASET_DIR"]
    data_file = os.path.join(data_dir, data_file)

    # Modify save_dir based on computing resource
    bench_dir = os.environ["AMM_SINGLE_FIT_DIR"]
    base_save_dir = fw_spec["base_save_dir"]
    base_save_dir = os.path.join(bench_dir, base_save_dir)

    if not os.path.exists(base_save_dir):
        os.makedirs(base_save_dir)

    # Set up pipeline config
    if learner_name == "TPOTAdaptor":
        learner = TPOTAdaptor(**learner_kwargs)
    elif learner_name == "rf":
        warnings.warn(
            "Learner kwargs passed into RF regressor/classifiers bc. rf being used."
        )
        learner = SinglePipelineAdaptor(
            regressor=RandomForestRegressor(**learner_kwargs),
            classifier=RandomForestClassifier(**learner_kwargs),
        )
    else:
        raise ValueError("{} not supported yet!".format(learner_name))

    pipe_config = {
        "learner": learner,
        "reducer": FeatureReducer(**reducer_kwargs),
        "cleaner": DataCleaner(**cleaner_kwargs),
        "autofeaturizer": AutoFeaturizer(**autofeaturizer_kwargs),
    }
    pipe = MatPipe(**pipe_config)

    # Set up dataset
    # Dataset should already be set up correctly as json beforehand.
    # This includes targets being converted to classification, removing
    # extra columns, and having the names of featurization cols set to the
    # same as the matpipe config, etc.
    df = load_dataframe_from_json(data_file)

    pipe.fit(df, target)
    pipe.save(os.path.join(base_save_dir, "pipe.p"))
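For orientation, here is a minimal sketch of the fw_spec that would drive the task above. The key names mirror the reads at the top of run_task; the concrete values (learner name, kwargs, file and directory names) are placeholders, not values from the original workflow.

# Hypothetical fw_spec for the single-fit task above; keys follow run_task's
# reads, values are placeholders only.
fw_spec = {
    "pipe_config": {
        "learner_name": "rf",                     # or "TPOTAdaptor"
        "learner_kwargs": {"n_estimators": 500},  # forwarded to the learner
        "reducer_kwargs": {},
        "cleaner_kwargs": {},
        "autofeaturizer_kwargs": {"preset": "express"},
    },
    "target": "bulk_modulus",        # placeholder target column
    "data_file": "my_dataset.json",  # resolved against $AMM_DATASET_DIR
    "base_save_dir": "my_fit",       # resolved against $AMM_SINGLE_FIT_DIR
}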
def get_preset_config(preset: str = 'express', **powerups) -> dict:
    """
    Preset configs for MatPipe.

    USER:
    "production": Used for making production predictions and benchmarks.
        Balances accuracy and timeliness.
    "heavy" - When high accuracy is required, and you have access to (very)
        powerful computing resources. May be buggier and more difficult to
        run than production.
    "express" - Good for quick benchmarks with moderate accuracy.
    "express_single" - Same as express but uses XGB trees as single models
        instead of automl TPOT. Good for even more express results.

    DEBUG:
    "debug" - Debugging with automl enabled.
    "debug_single" - Debugging with a single model.

    Args:
        preset (str): The name of the preset config you'd like to use.
        **powerups: Various modifications as kwargs.
            cache_src (str): A file path. If specified, Autofeaturizer will
                use feature caching with a file stored at this location. See
                Autofeaturizer's cache_src argument for more information.

    Returns:
        (dict) The desired preset config.
    """
    caching_kwargs = {"cache_src": powerups.get("cache_src", None)}

    if preset == "production":
        production_config = {
            "learner": TPOTAdaptor(max_time_mins=720, max_eval_time_mins=20),
            "reducer": FeatureReducer(reducers=('pca',)),
            "autofeaturizer": AutoFeaturizer(preset="best", **caching_kwargs),
            "cleaner": DataCleaner()
        }
        return production_config
    elif preset == "heavy":
        heavy_config = {
            "learner": TPOTAdaptor(max_time_mins=1440),
            "reducer": FeatureReducer(reducers=("corr", "rebate")),
            "autofeaturizer": AutoFeaturizer(preset="all", **caching_kwargs),
            "cleaner": DataCleaner()
        }
        return heavy_config
    elif preset == "express":
        express_config = {
            "learner": TPOTAdaptor(max_time_mins=60, population_size=20),
            "reducer": FeatureReducer(reducers=('corr',)),
            "autofeaturizer": AutoFeaturizer(preset="fast", **caching_kwargs),
            "cleaner": DataCleaner()
        }
        return express_config
    elif preset == "express_single":
        xgb_kwargs = {"n_estimators": 300, "max_depth": 3, "n_jobs": -1}
        express_config = {
            "learner": SinglePipelineAdaptor(
                regressor=XGBRegressor(**xgb_kwargs),
                classifier=XGBClassifier(**xgb_kwargs)),
            "reducer": FeatureReducer(reducers=('corr',)),
            "autofeaturizer": AutoFeaturizer(preset="fast", **caching_kwargs),
            "cleaner": DataCleaner()
        }
        return express_config
    elif preset == "debug":
        debug_config = {
            "learner": TPOTAdaptor(max_time_mins=2, max_eval_time_mins=1,
                                   population_size=10),
            "reducer": FeatureReducer(reducers=('corr', 'tree')),
            "autofeaturizer": AutoFeaturizer(preset="fast", **caching_kwargs),
            "cleaner": DataCleaner()
        }
        return debug_config
    elif preset == "debug_single":
        rf_kwargs = {"n_estimators": 10, "n_jobs": -1}
        debug_single_config = {
            "learner": SinglePipelineAdaptor(
                classifier=RandomForestClassifier(**rf_kwargs),
                regressor=RandomForestRegressor(**rf_kwargs)),
            "reducer": FeatureReducer(reducers=('corr',)),
            "autofeaturizer": AutoFeaturizer(preset="fast", **caching_kwargs),
            "cleaner": DataCleaner()
        }
        return debug_single_config
    else:
        raise ValueError("{} unknown preset.".format(preset))
def get_preset_config(preset: str = "express", **powerups) -> dict: """ Preset configs for MatPipe. USER: "******" - Good for quick benchmarks with moderate accuracy. "express_single" - Same as express but uses XGB trees as single models instead of automl TPOT. Good for even more express results. "production": Used for making production predictions and benchmarks. Balances accuracy and timeliness. "heavy" - When high accuracy is required, and you have access to (very) powerful computing resources. May be buggier and more difficult to run than production. DEBUG: "debug" - Debugging with automl enabled. "debug_single" - Debugging with a single model. Args: preset (str): The name of the preset config you'd like to use. **powerups: Various modifications as kwargs. cache_src (str): A file path. If specified, Autofeaturizer will use feature caching with a file stored at this location. See Autofeaturizer's cache_src argument for more information. n_jobs (int): The number of parallel process to use when running. Particularly important for AutoFeaturixer and TPOTAdaptor. Returns: (dict) The desired preset config. """ caching_kwargs = {"cache_src": powerups.get("cache_src", None)} n_jobs_kwargs = {"n_jobs": powerups.get("n_jobs", os.cpu_count())} if preset not in get_available_presets(): raise ValueError("{} unknown preset.".format(preset)) elif preset == "production": config = { "learner": TPOTAdaptor(max_time_mins=1440, max_eval_time_mins=20, **n_jobs_kwargs), "reducer": FeatureReducer(reducers=("corr", "tree"), tree_importance_percentile=0.99), "autofeaturizer": AutoFeaturizer(preset="express", **caching_kwargs, **n_jobs_kwargs), "cleaner": DataCleaner(), } elif preset == "heavy": config = { "learner": TPOTAdaptor(max_time_mins=2880, **n_jobs_kwargs), "reducer": FeatureReducer(reducers=("corr", "rebate")), "autofeaturizer": AutoFeaturizer(preset="heavy", **caching_kwargs, **n_jobs_kwargs), "cleaner": DataCleaner(), } elif preset == "express": config = { "learner": TPOTAdaptor(max_time_mins=60, population_size=20, **n_jobs_kwargs), "reducer": FeatureReducer(reducers=("corr", "tree"), tree_importance_percentile=0.99), "autofeaturizer": AutoFeaturizer(preset="express", **caching_kwargs, **n_jobs_kwargs), "cleaner": DataCleaner(), } elif preset == "express_single": xgb_kwargs = { "n_estimators": 300, "max_depth": 3, "n_jobs": n_jobs_kwargs } config = { "learner": SinglePipelineAdaptor( regressor=XGBRegressor(**xgb_kwargs), classifier=XGBClassifier(**xgb_kwargs), ), "reducer": FeatureReducer(reducers=("corr", )), "autofeaturizer": AutoFeaturizer(preset="express", **caching_kwargs, **n_jobs_kwargs), "cleaner": DataCleaner(), } elif preset == "debug": if "n_jobs" not in powerups: n_jobs_kwargs["n_jobs"] = 2 config = { "learner": TPOTAdaptor(max_time_mins=1, max_eval_time_mins=1, population_size=10, **n_jobs_kwargs), "reducer": FeatureReducer(reducers=("corr", "tree")), "autofeaturizer": AutoFeaturizer(preset="debug", **caching_kwargs, **n_jobs_kwargs), "cleaner": DataCleaner(), } elif preset == "debug_single": rf_kwargs = {"n_estimators": 10, "n_jobs": n_jobs_kwargs["n_jobs"]} config = { "learner": SinglePipelineAdaptor( classifier=RandomForestClassifier(**rf_kwargs), regressor=RandomForestRegressor(**rf_kwargs), ), "reducer": FeatureReducer(reducers=("corr", )), "autofeaturizer": AutoFeaturizer(preset="debug", **caching_kwargs, **n_jobs_kwargs), "cleaner": DataCleaner(), } return config
          sum(r2_scores) / len(r2_scores)))
    sleep(1)

# COMPARE TO MATBENCH
df = load_tehrani_superhard_mat(data="basic_descriptors")
df = df.drop(["formula", "material_id", "shear_modulus",
              "initial_structure"], axis=1)
traindf = df.iloc[:floor(.8 * len(df))]
testdf = df.iloc[floor(.8 * len(df)):]
target = "bulk_modulus"

# Get top-level transformers
autofeater = AutoFeaturizer()
cleaner = DataCleaner()
reducer = FeatureReducer()
learner = TPOTAdaptor("regression", max_time_mins=5)

# Fit transformers on training data
traindf = autofeater.fit_transform(traindf, target)
traindf = cleaner.fit_transform(traindf, target)
traindf = reducer.fit_transform(traindf, target)
learner.fit(traindf, target)

# Apply the same transformations to the testing data
testdf = autofeater.transform(testdf, target)
testdf = cleaner.transform(testdf, target)
testdf = reducer.transform(testdf, target)
testdf = learner.predict(testdf, target)  # predict validation data
print(testdf)
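Since the snippet above ends by printing the raw prediction dataframe, a scoring step along these lines could follow; the "<target> predicted" output column name mirrors the convention used by the benchmarking task elsewhere in this document and should be treated as an assumption here.

# Hedged follow-up: score the held-out predictions against the true values.
# Assumes the learner appends a "<target> predicted" column, as in the
# benchmark scoring code elsewhere in this document.
from sklearn.metrics import mean_absolute_error, r2_score

true = testdf[target]
pred = testdf[target + " predicted"]
print("MAE: {:.3f}, r2: {:.3f}".format(mean_absolute_error(true, pred),
                                       r2_score(true, pred)))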
def run_task(self, fw_spec):
    # Read data from fw_spec
    pipe_config_dict = fw_spec["pipe_config"]
    fold = fw_spec["fold"]
    kfold_config = fw_spec["kfold_config"]
    target = fw_spec["target"]
    data_pickle = fw_spec["data_pickle"]
    clf_pos_label = fw_spec["clf_pos_label"]
    problem_type = fw_spec["problem_type"]
    learner_name = pipe_config_dict["learner_name"]
    cache = fw_spec["cache"]
    learner_kwargs = pipe_config_dict["learner_kwargs"]
    reducer_kwargs = pipe_config_dict["reducer_kwargs"]
    cleaner_kwargs = pipe_config_dict["cleaner_kwargs"]
    autofeaturizer_kwargs = pipe_config_dict["autofeaturizer_kwargs"]

    # Modify data_pickle based on computing resource
    data_dir = os.environ['AMM_DATASET_DIR']
    data_file = os.path.join(data_dir, data_pickle)

    # Modify save_dir based on computing resource
    bench_dir = os.environ['AMM_BENCH_DIR']
    base_save_dir = fw_spec["base_save_dir"]
    base_save_dir = os.path.join(bench_dir, base_save_dir)
    save_dir = fw_spec.pop("save_dir")
    save_dir = os.path.join(base_save_dir, save_dir)

    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    from multiprocessing import cpu_count
    ont = os.environ.get("OMP_NUM_THREADS", None)
    print("Number of omp threads: {}".format(ont))
    print("Number of cpus: {}".format(cpu_count()))
    # n_jobs = int(cpu_count()/2)
    # print("Setting number of featurization jobs to: {}".format(n_jobs))
    # autofeaturizer_kwargs["n_jobs"] = n_jobs
    # learner_kwargs["verbosity"] = 3

    # Set up pipeline config
    if learner_name == "TPOTAdaptor":
        learner = TPOTAdaptor(**learner_kwargs)
    elif learner_name == "rf":
        warnings.warn(
            "Learner kwargs passed into RF regressor/classifiers bc. rf being used."
        )
        learner = SinglePipelineAdaptor(
            regressor=RandomForestRegressor(**learner_kwargs),
            classifier=RandomForestClassifier(**learner_kwargs))
    else:
        raise ValueError("{} not supported by RunPipe yet!"
                         "".format(learner_name))

    if cache:
        autofeaturizer_kwargs["cache_src"] = os.path.join(
            base_save_dir, "features.json")

    pipe_config = {
        "learner": learner,
        "reducer": FeatureReducer(**reducer_kwargs),
        "cleaner": DataCleaner(**cleaner_kwargs),
        "autofeaturizer": AutoFeaturizer(**autofeaturizer_kwargs)
    }

    logger = initialize_logger(AMM_LOGGER_BASENAME, filepath=save_dir)
    pipe = MatPipe(**pipe_config, logger=logger)

    # Set up dataset
    # Dataset should already be set up correctly as pickle beforehand.
    # This includes targets being converted to classification, removing
    # extra columns, and having the names of featurization cols set to the
    # same as the matpipe config, etc.
    df = pd.read_pickle(data_file)

    # Check other parameters that would otherwise not be checked until after
    # benchmarking; hopefully saves some errors at the end during scoring.
    if problem_type not in [AMM_CLF_NAME, AMM_REG_NAME]:
        raise ValueError("Problem must be either classification or "
                         "regression.")
    elif problem_type == AMM_CLF_NAME:
        if not isinstance(clf_pos_label, (str, bool)):
            raise TypeError("The classification positive label should be a "
                            "string or bool, not {}."
                            "".format(type(clf_pos_label)))
        elif clf_pos_label not in df[target]:
            raise ValueError("The classification positive label should be "
                             "present in the target column.")
        elif len(df[target].unique()) > 2:
            raise ValueError("Only binary classification scoring available "
                             "at this time.")

    # Set up testing scheme
    if problem_type == AMM_REG_NAME:
        kfold = KFold(**kfold_config)
    else:
        kfold = StratifiedKFold(**kfold_config)
    if fold >= kfold.n_splits:
        raise ValueError("{} is out of range for KFold with n_splits="
                         "{}".format(fold, kfold.n_splits))

    # Run the benchmark
    t1 = time.time()
    results = pipe.benchmark(df, target, kfold, fold_subset=[fold],
                             cache=True)
    result_df = results[0]
    elapsed_time = time.time() - t1

    # Save everything
    pipe.save(os.path.join(save_dir, "pipe.p"))
    pipe.digest(os.path.join(save_dir, "digest.txt"))
    result_df.to_csv(os.path.join(save_dir, "test_df.csv"))
    pipe.post_fit_df.to_csv(os.path.join(save_dir, "fitted_df.csv"))

    # Evaluate model
    true = result_df[target]
    test = result_df[target + " predicted"]

    pass_to_storage = {}
    if problem_type == AMM_REG_NAME:
        pass_to_storage["r2"] = r2_score(true, test)
        pass_to_storage["mae"] = mean_absolute_error(true, test)
        pass_to_storage["rmse"] = sqrt(mean_squared_error(true, test))
    elif problem_type == AMM_CLF_NAME:
        pass_to_storage["f1"] = f1_score(true, test,
                                         pos_label=clf_pos_label)
        pass_to_storage["roc_auc"] = roc_auc_score(true, test)
        pass_to_storage["accuracy"] = accuracy_score(true, test)
    else:
        raise ValueError("Scoring method for problem type {} not supported"
                         "".format(problem_type))

    # Extract important details for storage
    try:
        # TPOT Adaptor
        best_pipeline = [
            str(step) for step in pipe.learner.best_pipeline.steps
        ]
    except AttributeError:
        best_pipeline = str(pipe.learner.best_pipeline)

    features = pipe.learner.features
    n_features = len(features)
    fold_orig = list(kfold.split(df, y=df[target]))[fold]
    n_samples_train_original = len(fold_orig[0])
    n_samples_test_original = len(fold_orig[1])

    pass_to_storage.update({
        "target": target,
        "best_pipeline": best_pipeline,
        "elapsed_time": elapsed_time,
        "features": features,
        "n_features": n_features,
        "n_test_samples_original": n_samples_test_original,
        "n_train_samples_original": n_samples_train_original,
        "n_train_samples": len(pipe.post_fit_df),
        "n_test_samples": len(test),
        "test_sample_frac_retained": len(test) / n_samples_test_original,
        "completion_time": datetime.datetime.now(),
        "base_save_dir": base_save_dir,
        "save_dir": save_dir
    })
    fw_spec.update(pass_to_storage)
import numpy as np

from automatminer.base import LoggableMixin, DataframeTransformer
from automatminer.featurization import AutoFeaturizer
from automatminer.preprocessing import DataCleaner, FeatureReducer
from automatminer.automl.adaptors import TPOTAdaptor
from automatminer.utils.ml_tools import regression_or_classification
from automatminer.utils.package_tools import check_fitted, set_fitted, \
    return_attrs_recursively

performance_config = {}

default_config = {
    "learner": TPOTAdaptor(max_time_mins=120),
    "reducer": FeatureReducer(),
    "autofeaturizer": AutoFeaturizer(),
    "cleaner": DataCleaner()
}

fast_config = {
    "learner": TPOTAdaptor(max_time_mins=30, population_size=50),
    "reducer": FeatureReducer(reducers=('corr', 'tree')),
    "autofeaturizer": AutoFeaturizer(),
    "cleaner": DataCleaner()
}

debug_config = {
    "learner": TPOTAdaptor(max_time_mins=1, population_size=10),
    "reducer": FeatureReducer(reducers=('corr',)),
    "autofeaturizer": AutoFeaturizer(),
    "cleaner": DataCleaner()
}
from matbench.bench import MatbenchBenchmark

from automatminer import MatPipe
from automatminer.automl.adaptors import SinglePipelineAdaptor
from automatminer.featurization import AutoFeaturizer
from automatminer.preprocessing import DataCleaner, FeatureReducer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# The learner is a single 500-estimator Random Forest model
learner = SinglePipelineAdaptor(
    regressor=RandomForestRegressor(n_estimators=500),
    classifier=RandomForestClassifier(n_estimators=500),
)
pipe_config = {
    "learner": learner,
    "reducer": FeatureReducer(reducers=[]),
    "cleaner": DataCleaner(feature_na_method="mean",
                           max_na_frac=0.01,
                           na_method_fit="drop",
                           na_method_transform="mean"),
    "autofeaturizer": AutoFeaturizer(n_jobs=10, preset="debug"),
}

pipe = MatPipe(**pipe_config)

mb = MatbenchBenchmark(autoload=False)

for task in mb.tasks:
    task.load()
    for fold in task.folds:
        df_train = task.get_train_and_val_data(fold, as_type="df")
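        # Hedged sketch of how the fold loop typically continues (not part of
        # the original excerpt): fit on the train+val split, predict the test
        # split, and record predictions with matbench. The "<target> predicted"
        # output column follows the convention used by the benchmarking task
        # above; task.metadata.target and the exact matbench accessors are
        # assumptions to verify against the matbench docs.
        target = task.metadata.target
        pipe.fit(df_train, target)
        df_test = task.get_test_data(fold, include_target=False, as_type="df")
        predictions = pipe.predict(df_test)[target + " predicted"]
        task.record(fold, predictions)

# mb.to_file("rf_benchmark.json.gz")  # hypothetical output filename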
import numpy as np

from automatminer.base import LoggableMixin, DataframeTransformer
from automatminer.featurization import AutoFeaturizer
from automatminer.preprocessing import DataCleaner, FeatureReducer
from automatminer.automl.adaptors import TPOTAdaptor
from automatminer.utils.ml_tools import regression_or_classification
from automatminer.utils.package_tools import check_fitted, set_fitted, \
    return_attrs_recursively

performance_config = {}

default_config = {
    "learner": TPOTAdaptor(max_time_mins=120),
    "reducer": FeatureReducer(),
    "autofeaturizer": AutoFeaturizer(),
    "cleaner": DataCleaner()
}

fast_config = {
    "learner": TPOTAdaptor(max_time_mins=30, population_size=50),
    "reducer": FeatureReducer(reducers=('corr', 'tree')),
    "autofeaturizer": AutoFeaturizer(),
    "cleaner": DataCleaner()
}

debug_config = {
    "learner": TPOTAdaptor(max_time_mins=1, population_size=10),
    "reducer": FeatureReducer(reducers=('corr',)),
    "autofeaturizer": AutoFeaturizer(),
    "cleaner": DataCleaner()
}


class MatPipe(DataframeTransformer, LoggableMixin):
    """
    Establish an ML pipeline for transforming compositions, structures,
    bandstructures, and DOS objects into machine-learned properties.