def run(self, input_dataset, eval_datasets=None, return_pipeline=False):
    """
    Converts the internal pipeline architecture dict into a pipeline and runs it.

    Args:
    - input_dataset: Input dataset to train on
    - eval_datasets: Datasets to evaluate on
    - return_pipeline: Whether to return the pipeline that was fitted and produced the preds

    Returns:
    - If return_pipeline is False, returns just the predictions,
      otherwise returns a tuple (preds, pipeline)
    """
    # Avoid a mutable default argument.
    if eval_datasets is None:
        eval_datasets = []

    pipeline = self.load_pipeline_architecture(self.pipeline_architecture_dict)
    pipeline.check()

    runtime = Runtime(pipeline, context=Context.TESTING)
    runtime.fit(inputs=[input_dataset], return_values=['outputs.0'])

    all_preds = []
    for dataset in eval_datasets:
        all_preds.append(runtime.produce(inputs=[dataset], return_values=['outputs.0']))

    results = all_preds
    if return_pipeline:
        results = (all_preds, pipeline)
    return results
def score_pipeline(dataset_root, problem, pipeline_path):
    train_dataset = load_dataset(dataset_root, 'TRAIN')
    test_dataset = load_dataset(dataset_root, 'SCORE', 'TEST')
    pipeline = load_pipeline(pipeline_path)

    # Creating an instance of the runtime with the pipeline description and problem description.
    runtime = Runtime(
        pipeline=pipeline,
        problem_description=problem,
        context=Context.TESTING
    )

    print("Fitting the pipeline")
    fit_results = runtime.fit(inputs=[train_dataset])
    fit_results.check_success()

    # Producing results using the fitted pipeline.
    print("Producing predictions")
    produce_results = runtime.produce(inputs=[test_dataset])
    produce_results.check_success()
    predictions = produce_results.values['outputs.0']
    metrics = problem['problem']['performance_metrics']

    print("Computing the score")
    scoring_pipeline = load_pipeline('ta2/pipelines/scoring_pipeline.yml')
    scores, scoring_pipeline_run = score(
        scoring_pipeline,
        problem,
        predictions,
        [test_dataset],
        metrics,
        context=Context.TESTING,
        random_seed=0,
    )
    return scores.iloc[0].value
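# Usage sketch for score_pipeline above (an illustration, not from the original
# source): the dataset root and pipeline path are hypothetical placeholders, and
# load_problem is the helper already used in the run_pipeline snippet below.
dataset_root = '/data/185_baseball'                              # hypothetical dataset root
problem = load_problem(dataset_root, 'TRAIN')                    # assumed helper
print('Score:', score_pipeline(dataset_root, problem, 'pipelines/example_pipeline.yml'))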
def run_pipeline(pipeline, dataset_name, datasets_path):
    ensure_downloaded(dataset_name, datasets_path)

    root_path = os.path.join(os.path.abspath(datasets_path), dataset_name)
    train_dataset = load_dataset(root_path, 'TRAIN')
    train_problem = load_problem(root_path, 'TRAIN')

    # Creating an instance of the runtime with the pipeline description and problem description.
    runtime = Runtime(pipeline=pipeline, problem_description=train_problem, context=Context.TESTING)

    # Fitting pipeline on input dataset.
    fit_results = runtime.fit(inputs=[train_dataset])
    fit_results.check_success()

    # Producing results using the fitted pipeline.
    test_dataset = load_dataset(root_path, 'TEST')
    produce_results = runtime.produce(inputs=[test_dataset])
    produce_results.check_success()

    print('Pipeline ran successfully')

    output = list(produce_results.values.values())[0]
    print(output.shape)
    print(output.head())
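# Usage sketch for run_pipeline above (illustrative only): the pipeline file,
# dataset name, and datasets directory are hypothetical, and load_pipeline is
# the same helper used in the surrounding snippets.
pipeline = load_pipeline('pipelines/example_pipeline.json')      # hypothetical path
run_pipeline(pipeline, '185_baseball', '/data/datasets')         # hypothetical dataset name and path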
def __run_pipeline(self, pipeline_description, data, volume_dir='/volumes'):
    runtime = Runtime(pipeline=pipeline_description,
                      context=metadata_base.Context.TESTING,
                      volumes_dir=volume_dir)
    fit_result = runtime.fit([data])
    return fit_result
def fitproduce(self, input_item):
    problem_doc, pipeline_json, dataset_train, dataset_test = input_item[1:]

    # Run pipeline
    pipeline = Pipeline.from_json(pipeline_json)
    pipeline_runtime = Runtime(pipeline, context=Context.TESTING)
    pipeline_runtime.fit(inputs=[dataset_train], return_values=['outputs.0'])
    score_predictions = pipeline_runtime.produce(
        inputs=[dataset_test], return_values=['outputs.0'])
    score_predictions = score_predictions.values['outputs.0']

    # Write predictions to output path
    path = self.get_predictions_save_path()
    utils.utils.write_predictions_to_file(score_predictions, path, problem_doc)
    path_uri = "file://%s" % path
    return path_uri
def score(self, input_item):
    problem_doc, metric, pipeline_json, dataset_train, dataset_test = input_item[1:]

    # Run pipeline
    pipeline = Pipeline.from_json(pipeline_json)
    pipeline_runtime = Runtime(pipeline, context=Context.TESTING)
    pipeline_runtime.fit(inputs=[dataset_train], return_values=['outputs.0'])
    score_predictions = pipeline_runtime.produce(
        inputs=[dataset_test], return_values=['outputs.0'])
    score_predictions = score_predictions.values['outputs.0']

    # Evaluate scores on score dir
    achieved_score = utils.train_utils.score(score_predictions,
                                             dataset_test,
                                             problem_doc,
                                             override_metric_key=metric)
    return achieved_score
def score_pipeline(dataset, problem, pipeline_path, static=None, output_path=None):
    pipeline = load_pipeline(pipeline_path)

    # Creating an instance of the runtime with the pipeline description and problem description.
    runtime = Runtime(
        pipeline=pipeline,
        problem_description=problem,
        context=Context.EVALUATION,
        volumes_dir=static,
    )

    LOGGER.info("Fitting pipeline %s", pipeline_path)
    fit_results = runtime.fit(inputs=[dataset])
    fit_results.check_success()

    dataset_doc_path = dataset.metadata.query(())['location_uris'][0]
    dataset_root = dataset_doc_path[:-len('/TRAIN/dataset_TRAIN/datasetDoc.json')]
    test_dataset = load_dataset(dataset_root, 'SCORE', 'TEST')

    # Producing results using the fitted pipeline.
    LOGGER.info("Producing predictions for pipeline %s", pipeline_path)
    produce_results = runtime.produce(inputs=[test_dataset])
    produce_results.check_success()
    predictions = produce_results.values['outputs.0']
    metrics = problem['problem']['performance_metrics']

    LOGGER.info("Computing the score for pipeline %s", pipeline_path)
    scoring_pipeline = load_pipeline(DEFAULT_SCORING_PIPELINE_PATH)
    scores, scoring_pipeline_run = score(
        scoring_pipeline=scoring_pipeline,
        problem_description=problem,
        predictions=predictions,
        score_inputs=[test_dataset],
        metrics=metrics,
        context=Context.EVALUATION,
        random_seed=0,
    )

    evaluated_pipeline_run = produce_results.pipeline_run
    evaluated_pipeline_run.is_standard_pipeline = True
    evaluated_pipeline_run.set_scores(scores, metrics)
    evaluated_pipeline_run.set_scoring_pipeline_run(scoring_pipeline_run.pipeline_run, [dataset])

    _to_yaml_run(evaluated_pipeline_run, output_path)

    return scores.iloc[0].value
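# Usage sketch for this score_pipeline variant (illustrative only): all paths
# are hypothetical, and load_dataset/load_problem are the helpers used in the
# other snippets. The dataset must come from a TRAIN split so the dataset root
# can be recovered from its location URI, as done inside the function.
train_dataset = load_dataset('/data/185_baseball', 'TRAIN')      # assumed helper, hypothetical path
train_problem = load_problem('/data/185_baseball', 'TRAIN')      # assumed helper, hypothetical path
final_score = score_pipeline(
    train_dataset,
    train_problem,
    'pipelines/candidate_pipeline.json',    # hypothetical pipeline path
    static='/static',                       # hypothetical primitive volumes directory
    output_path='runs/candidate_run.yml',   # hypothetical pipeline-run output path
)
print('Score:', final_score)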
import sys

# Assumed import paths for the d3m core package used by this script.
from d3m.metadata.base import Context
from d3m.metadata.pipeline import Pipeline
from d3m.runtime import Runtime

import utils.utils
import utils.train_utils

if __name__ == "__main__":
    # Get args
    try:
        path_to_pipeline_json = sys.argv[1]
        inputdir = sys.argv[2]

        # Load datasets
        problem_doc, dataset = utils.utils.load_data_from_dir(inputdir)

        # Create pipeline
        with open(path_to_pipeline_json, "r") as f:
            pipeline = Pipeline.from_json(f.read())
        pipeline_runtime = Runtime(pipeline, context=Context.TESTING)
        pipeline_runtime.fit(inputs=[dataset], return_values=['outputs.0'])

        problem_doc_score, dataset_score = utils.utils.load_data_from_dir(inputdir, mode="score")
        score_predictions = pipeline_runtime.produce(inputs=[dataset_score],
                                                     return_values=['outputs.0'])
        score_predictions = score_predictions.values['outputs.0']

        # Evaluate scores on score dir
        achieved_score = utils.train_utils.score(score_predictions, dataset_score, problem_doc_score)
        print(achieved_score)
    except Exception:
        print("N/A")
def write_pipeline_run(self, problem_description, dataset, filename_yaml):
    runtime = Runtime(pipeline=self.pipeline_description,
                      problem_description=problem_description,
                      context=Context.TESTING,
                      is_standard_pipeline=True)
    output = runtime.fit(inputs=dataset)
    pipeline_run = output.pipeline_run
    with open(filename_yaml, "w") as out:
        # Serialize the pipeline run to the open YAML file handle.
        pipeline_run.to_yaml(file=out)
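# Usage sketch for write_pipeline_run above (illustrative only): 'builder' is a
# hypothetical instance that stores its pipeline under self.pipeline_description,
# and train_problem/train_dataset are assumed to come from the helpers used in
# the other snippets. Note that the dataset argument is forwarded directly to
# runtime.fit(inputs=...), so it should already be a list of datasets.
builder.write_pipeline_run(train_problem, [train_dataset], 'pipeline_run.yml')  # hypothetical instance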
outputdir = sys.argv[3]

################
# Load dataset #
################
problem_doc_train, dataset_train = (
    utils.utils.load_data_from_dir(inputdir, mode="train"))

###################
# Create pipeline #
###################
with open(path_to_pipeline_json, "r") as f:
    pipeline = utils.primitive_pipeline_utils.load_pipeline(f.read())
pipeline_runtime = Runtime(pipeline)
pipeline_runtime.fit(inputs=[dataset_train])

############################
# Try scoring on SCORE set #
############################
try:
    problem_doc_score, dataset_score = utils.utils.load_data_from_dir(inputdir, mode="score")
    score_predictions = pipeline_runtime.produce(inputs=[dataset_score]).values['outputs.0']
    validation_score = utils.train_utils.get_score_on_score_set(inputdir,
                                                                problem_doc_score,
                                                                score_predictions)
    print("PipelineId: %s, Score: %s" % (pipeline.id, validation_score))
except Exception:
    print("-------------------------------")
    print("Failed to evaluate on SCORE set")
    print("-------------------------------")
    traceback.print_exc()
# Loading problem description.
problem_description = problem.parse_problem_description(problem_path)

# Loading datasets.
path = 'file://{uri}'.format(uri=os.path.abspath(dataset_train_path))
dataset = D3MDatasetLoader().load(dataset_uri=path)
path2 = 'file://{uri}'.format(uri=os.path.abspath(dataset_predict_path))
dataset_predict = D3MDatasetLoader().load(dataset_uri=path2)

# Loading pipeline description file.
with open(pipeline_path, 'r') as file:
    pipeline_description = pipeline_module.Pipeline.from_json(string_or_file=file)

# Creating an instance of the runtime with the pipeline description and problem description.
runtime = Runtime(pipeline=pipeline_description,
                  problem_description=problem_description,
                  context=metadata_base.Context.TESTING)

# Fitting pipeline on input dataset.
fit_results = runtime.fit(inputs=[dataset])
fit_results.check_success()

# Producing results using the fitted pipeline.
produce_results = runtime.produce(inputs=[dataset_predict])
produce_results.check_success()
print(produce_results.values)