def write_exported_pipeline(self, pipeline_id, rank=None):
    """Write a ranked pipeline to disk as D3M JSON plus a ``.rank`` file.

    Emits two files into ``self._ranked_pipelines_dir``:
    ``<pipeline_id>.json`` (the D3M pipeline description) and
    ``<pipeline_id>.rank`` (the rank as a decimal string).

    :param pipeline_id: database id of the pipeline to export.
    :param rank: rank to record; if ``None``, it is derived from the most
        recent cross-validation score for the search's first metric
        (``1.0 - normalized_score``), or ``1000.0`` if no score exists.
    """
    # First metric of the search is the one used for ranking.
    metric = self.metrics[0]['metric'].name
    db = self.DBSession()
    try:
        # Get pipeline
        pipeline = db.query(database.Pipeline).get(pipeline_id)
        if rank is None:
            # Find most recent cross-validation
            crossval_id = (
                select([database.CrossValidation.id])
                .where(database.CrossValidation.pipeline_id == pipeline_id)
                .order_by(database.CrossValidation.date.desc())
            ).as_scalar()
            # Get score from that cross-validation
            # NOTE(review): db.query(...) returns a Query object here, so the
            # `score is None` test and the later `score.value` attribute
            # access look suspect (a Query is never None and has no .value).
            # Presumably a .scalar()/.one() call is intended — confirm against
            # the ORM usage elsewhere in this project.
            score = db.query(
                select([func.avg(database.CrossValidationScore.value)])
                .where(
                    database.CrossValidationScore.cross_validation_id ==
                    crossval_id
                )
                .where(database.CrossValidationScore.metric == metric)
                .as_scalar()
            )
            if score is None:
                # No score available: push the pipeline to the bottom of the
                # ranking with a sentinel rank.
                rank = 1000.0
                logger.error("Writing pipeline JSON for pipeline %s, but "
                             "it is not scored for %s. Rank set to %s. "
                             "origin=%s",
                             pipeline_id, metric, rank, pipeline.origin)
            else:
                logger.warning("Writing pipeline JSON for pipeline %s "
                               "%s=%s origin=%s",
                               pipeline_id, metric, score.value,
                               pipeline.origin)
                # Lower rank is better: invert the normalized (0..1) score.
                rank = 1.0 - self.metrics[0]['metric'].normalize(score.value)
        else:
            logger.warning("Writing pipeline JSON for pipeline %s with "
                           "provided rank %s. origin=%s",
                           pipeline_id, rank, pipeline.origin)
        obj = to_d3m_json(pipeline)
        with open(os.path.join(self._ranked_pipelines_dir,
                               '%s.json' % pipeline_id), 'w') as fout:
            json.dump(obj, fout, indent=2)
        with open(os.path.join(self._ranked_pipelines_dir,
                               '%s.rank' % pipeline_id), 'w') as fout:
            fout.write(str(rank))
    finally:
        # Always return the session to the pool, even on error.
        db.close()
def evaluate(pipeline, data_pipeline, dataset, metrics, problem,
             scoring_config, dataset_uri, timeout_run):
    """Score a pipeline in a subprocess, returning per-fold metric values.

    Runs the D3M scoring worker in a separate process so a hung or crashed
    pipeline cannot take down the search.

    :param pipeline: database pipeline object (converted to D3M JSON here).
    :param data_pipeline: data-preparation pipeline passed to the worker.
    :param dataset: loaded D3M dataset (may be sampled for collections).
    :param metrics: metrics to score against.
    :param problem: D3M problem description.
    :param scoring_config: scoring parameters; a 'method' key, if present,
        is stripped before handing it to the worker.
    :param dataset_uri: dataset URI; the 'file://' prefix (7 chars) is
        stripped to test whether it is a collection.
    :param timeout_run: seconds to wait for the worker before giving up.
    :return: dict mapping fold index -> {metric_name: value}.
    :raises TimeoutError: if the worker did not finish within *timeout_run*.
    :raises RuntimeError: if any pipeline run reported an error.
    """
    # Collections can be huge; score on a sample instead.
    if is_collection(dataset_uri[7:]):
        dataset = get_dataset_sample(dataset, problem)

    json_pipeline = convert.to_d3m_json(pipeline)

    # Special case: MtLDB pipelines on graph tasks are assumed perfect
    # (two folds, ACCURACY=1.0) without actually running them.
    if TaskKeyword.GRAPH in problem['problem'][
            'task_keywords'] and json_pipeline['description'].startswith(
            'MtLDB'):
        return {0: {'ACCURACY': 1.0}, 1: {'ACCURACY': 1.0}}

    logger.info(
        "Pipeline to be scored:\n\t%s",
        '\n\t'.join(
            [x['primitive']['python_path'] for x in json_pipeline['steps']]))

    d3m_pipeline = Pipeline.from_json_structure(json_pipeline)

    # 'method' is our own knob; the D3M scorer does not understand it.
    scoring_config.pop('method', None)

    # Run the scoring worker in its own process so we can enforce a timeout.
    manager = Manager()
    return_dict = manager.dict()
    p = Process(target=worker,
                args=(d3m_pipeline, data_pipeline, scoring_pipeline, problem,
                      dataset, scoring_config, metrics, return_dict))
    p.start()
    p.join(timeout_run)
    # Kill the worker if it is still alive after the timeout.
    p.terminate()

    if 'run_results' not in return_dict or 'run_scores' not in return_dict:
        raise TimeoutError('Reached timeout (%d seconds) to score a pipeline'
                           % timeout_run)

    run_results = return_dict['run_results']
    run_scores = return_dict['run_scores']

    for result in run_results:
        if result.has_error():
            raise RuntimeError(result.pipeline_run.status['message'])

    combined_folds = d3m.runtime.combine_folds(list(run_scores))
    scores = {}
    for _, row in combined_folds.iterrows():
        scores.setdefault(row['fold'], {})[row['metric']] = row['value']
    return scores
def execute(pipeline_id, dataset, problem, results_path, msg_queue, db):
    """Execute a stored pipeline on a dataset in a worker subprocess.

    :param pipeline_id: database id of the pipeline to run.
    :param dataset: dataset URI, loaded with ``Dataset.load``.
    :param problem: D3M problem description.
    :param results_path: if not None, CSV path where 'outputs.0' is written.
    :param msg_queue: unused here (kept for the common worker signature).
    :param db: open SQLAlchemy session used to fetch the pipeline.
    :return: the fit result values mapping.
    :raises TimeoutError: if the worker did not finish within 180 seconds.
    """
    # Get pipeline from database, eagerly loading its graph.
    pipeline = (
        db.query(database.Pipeline)
        .filter(database.Pipeline.id == pipeline_id)
        .options(joinedload(database.Pipeline.modules),
                 joinedload(database.Pipeline.connections))
    ).one()

    logger.info('About to execute pipeline, id=%s, dataset=%r',
                pipeline_id, dataset)

    # Load data
    dataset = Dataset.load(dataset)
    logger.info('Loaded dataset')

    json_pipeline = convert.to_d3m_json(pipeline)
    logger.info(
        'Pipeline to be executed:\n%s',
        '\n'.join(
            [x['primitive']['python_path'] for x in json_pipeline['steps']]))

    d3m_pipeline = d3m.metadata.pipeline.Pipeline.from_json_structure(
        json_pipeline)
    runtime = d3m.runtime.Runtime(pipeline=d3m_pipeline,
                                  problem_description=problem,
                                  context=metadata_base.Context.TESTING)

    # Run in a subprocess so a hung pipeline cannot block the caller.
    manager = Manager()
    return_dict = manager.dict()
    p = Process(target=worker, args=(runtime, dataset, return_dict))
    p.start()
    p.join(180)  # Maximum 3 minutes
    # Kill the worker if it outlived the timeout; previously it was left
    # running and the missing key surfaced as an opaque KeyError.
    p.terminate()

    if 'fit_results' not in return_dict:
        raise TimeoutError(
            'Reached timeout (180 seconds) to execute a pipeline')

    fit_results = return_dict['fit_results']
    fit_results.check_success()

    if results_path is not None:
        logger.info('Storing fit results at %s', results_path)
        fit_results.values['outputs.0'].to_csv(results_path)
    else:
        logger.info('NOT storing fit results')

    return fit_results.values
def train(pipeline_id, dataset, problem, storage_dir, steps_to_expose,
          msg_queue, db):
    """Fit a stored pipeline on a dataset and persist the fitted solution.

    Writes ``fitted_solution_<pipeline_id>.pkl`` into *storage_dir*, plus a
    ``fit_<pipeline_id>_<step_id>.csv`` for each exposed step whose output
    is a DataFrame.

    :param pipeline_id: database id of the pipeline to fit.
    :param dataset: dataset URI, loaded with ``Dataset.load``.
    :param problem: D3M problem description.
    :param storage_dir: directory receiving the pickle and CSV outputs.
    :param steps_to_expose: step ids whose produced outputs should be saved;
        an empty sequence disables output exposure entirely.
    :param msg_queue: unused here (kept for the common worker signature).
    :param db: open SQLAlchemy session used to fetch the pipeline.
    """
    # Get pipeline from database, eagerly loading its graph.
    pipeline = (
        db.query(database.Pipeline)
        .filter(database.Pipeline.id == pipeline_id)
        .options(joinedload(database.Pipeline.modules),
                 joinedload(database.Pipeline.connections))
    ).one()

    logger.info('About to train pipeline, id=%s, dataset=%r',
                pipeline_id, dataset)

    # Load data
    dataset = Dataset.load(dataset)
    logger.info('Loaded dataset')

    # Training step - fit pipeline on training data
    logger.info('Running training')
    d3m_pipeline = d3m.metadata.pipeline.Pipeline.from_json_structure(
        convert.to_d3m_json(pipeline))

    # Only pay the cost of exposing intermediate outputs if any were asked for.
    expose_outputs = bool(steps_to_expose)
    fitted_pipeline, predictions, results = d3m.runtime.fit(
        d3m_pipeline, [dataset],
        problem_description=problem,
        context=metadata_base.Context.TESTING,
        volumes_dir=os.environ.get('D3MSTATICDIR', None),
        random_seed=0,
        expose_produced_outputs=expose_outputs)
    results.check_success()

    logger.info('Storing fit results at %s', storage_dir)
    for step_id in results.values:
        # Only requested steps with tabular output are written to CSV.
        if step_id in steps_to_expose and isinstance(results.values[step_id],
                                                     DataFrame):
            results.values[step_id].to_csv(
                join(storage_dir, 'fit_%s_%s.csv' % (pipeline_id, step_id)))

    with open(join(storage_dir, 'fitted_solution_%s.pkl' % pipeline_id),
              'wb') as fout:
        pickle.dump(fitted_pipeline, fout)
def write_scored_pipeline(self, pipeline_id):
    """Dump the D3M JSON of a scored pipeline into the scored-pipelines dir.

    A no-op when no output directory is configured. Any failure is logged
    rather than propagated, and the DB session is always closed.

    :param pipeline_id: database id of the pipeline to write out.
    """
    if not self._scored_pipelines_dir:
        logger.info("Not writing log file")
        return

    session = self.DBSession()
    try:
        # Look the pipeline up by primary key.
        record = session.query(database.Pipeline).get(pipeline_id)
        logger.warning("Writing scored_pipeline JSON for pipeline %s "
                       "origin=%s", pipeline_id, record.origin)
        d3m_obj = to_d3m_json(record)
        out_path = os.path.join(self._scored_pipelines_dir,
                                '%s.json' % pipeline_id)
        with open(out_path, 'w') as fout:
            json.dump(d3m_obj, fout, indent=2)
    except Exception:
        # Best-effort write: never let a logging artifact break the search.
        logger.exception("Error writing scored_pipeline for %s", pipeline_id)
    finally:
        session.close()