import json
import os
import pickle
import shutil

import numpy as np
import pandas as pd
from mlblocks import MLPipeline

# NOTE: ROOT_DIR, the Task class and the _evaluate_pipeline helper are assumed
# to be defined elsewhere in this module / package.


def _load_pipeline(pipeline, hyperparams=None):
    """Load an MLPipeline from a file path or a pipeline specification.

    If ``hyperparams`` is given, set them on the loaded pipeline.
    """
    if isinstance(pipeline, str) and os.path.isfile(pipeline):
        pipeline = MLPipeline.load(pipeline)
    else:
        pipeline = MLPipeline(pipeline)

    if hyperparams is not None:
        pipeline.set_hyperparameters(hyperparams)

    return pipeline
def _load_pipeline(pipeline):
    """Resolve ``pipeline`` into an MLPipeline instance.

    Accepts an MLPipeline instance, a path to a serialized pipeline, or a
    pipeline dict.
    """
    if isinstance(pipeline, MLPipeline):
        return pipeline

    if isinstance(pipeline, str):
        return MLPipeline.load(pipeline)

    if isinstance(pipeline, dict):
        return MLPipeline.from_dict(pipeline)

    raise ValueError('Invalid pipeline: {}'.format(pipeline))
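# Illustrative usage sketch for ``_load_pipeline`` (not part of the benchmark
# code; the pipeline path and primitive name below are hypothetical examples):
#
#     pipeline = _load_pipeline('pipelines/classification.xgb.json')
#     pipeline = _load_pipeline({'primitives': ['xgboost.XGBClassifier']})
#
# Either call returns an MLPipeline instance ready for fitting and prediction.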
def evaluate_task(task, metrics=None, feature_matrix=None, output_path=None,
                  save_intermedia_data=True, save_model=True, save_hyperparameters=True):
    """Run benchmark testing on a task.

    Save intermediate data, trained models, and optimized hyperparameters.
    Return testing results.

    Args:
        task (Task): a task instance storing meta information of the task.
        metrics (list): a list of strings to identify the metric functions.
        feature_matrix (pd.DataFrame): a dataframe consisting of both feature
            values and target values.
        output_path (str): a directory path to store the intermediate data,
            model and hyperparameters.
        save_intermedia_data (boolean): whether to store the intermediate data,
            including an entity set and a feature matrix, if the beginning
            stage is "data_loader" or "problem_definition".
        save_model (boolean): whether to store the trained model.
        save_hyperparameters (boolean): whether to store the hyperparameters
            if task.tuned is true.

    Returns:
        list: benchmarking results of each run.
    """
    # Load pipeline.
    pipeline = MLPipeline.load(os.path.join(ROOT_DIR, task.path_to_pipeline))

    # Set hyperparameters.
    if task.path_to_hyperparameters is not None:
        hyperparameters_path = os.path.join(ROOT_DIR, task.path_to_hyperparameters)
        _, extension = os.path.splitext(hyperparameters_path)
        if extension == '.json':
            with open(hyperparameters_path) as f:
                init_hyperparameters = json.load(f)
        elif extension == '.pkl':
            # Pickle files must be opened in binary mode.
            with open(hyperparameters_path, 'rb') as f:
                init_hyperparameters = pickle.load(f)
        else:
            raise TypeError("Unsupported file type {}.".format(extension))

        pipeline.set_hyperparameters(init_hyperparameters)

    # Load dataset.
    if feature_matrix is None:
        if task.beginning_stage == "data_loader":
            raise NotImplementedError

        elif task.beginning_stage == "problem_definition":
            raise NotImplementedError

        elif task.beginning_stage == "featurization":
            feature_matrix = pd.read_csv(os.path.join(ROOT_DIR, task.path_to_dataset),
                                         index_col=0)

        else:
            raise ValueError("Beginning stage should be either \"data_loader\", "
                             "\"problem_definition\" or \"featurization\".")

    # Run the pipeline task.run_num times and record each run.
    results = []
    records = []
    for i in range(task.run_num):
        scores, model, hyperparameters = _evaluate_pipeline(
            i, pipeline, feature_matrix, task.pipeline_name, task.problem_name,
            task.dataset_name, task.beginning_stage, task.tuned, metrics)
        results.append(scores)
        records.append((model, hyperparameters))

    # Store the output results.
    if output_path is not None:
        # Initialize the output directory.
        if os.path.exists(output_path):
            shutil.rmtree(output_path)
        os.mkdir(output_path)

        # Save task meta information.
        task.save_as(os.path.join(output_path, "meta.json"))

        # Pick the best run according to the F1 Macro score.
        metric = 'F1 Macro'
        best_index = np.argmax([scores[metric] for scores in results])
        model, hyperparameters = records[best_index]

        # Save the pipeline model if required.
        if save_model:
            with open(os.path.join(output_path, "model.pkl"), 'wb') as f:
                pickle.dump(model, f)

        # Save the pipeline hyperparameters if required.
        if save_hyperparameters and hyperparameters is not None:
            with open(os.path.join(output_path, "hyperparameters.pkl"), 'wb') as f:
                pickle.dump(hyperparameters, f)

    return results
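# Illustrative usage sketch for ``evaluate_task`` (a sketch under assumptions:
# how the ``task`` object is built, the CSV path, the metric name and the output
# directory below are hypothetical; only the keyword arguments mirror the
# signature above):
#
#     feature_matrix = pd.read_csv('path/to/feature_matrix.csv', index_col=0)
#     results = evaluate_task(
#         task,                           # a previously constructed Task instance
#         metrics=['F1 Macro'],
#         feature_matrix=feature_matrix,
#         output_path='benchmark_output',
#     )
#     # ``results`` holds one score dict per run (task.run_num entries in total).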