Example #1
    def write_exported_pipeline(self, pipeline_id, rank=None):
        metric = self.metrics[0]['metric'].name

        db = self.DBSession()
        try:
            # Get pipeline
            pipeline = db.query(database.Pipeline).get(pipeline_id)

            if rank is None:
                # Find most recent cross-validation
                crossval_id = (
                    select([database.CrossValidation.id])
                    .where(database.CrossValidation.pipeline_id == pipeline_id)
                    .order_by(database.CrossValidation.date.desc())
                ).as_scalar()
                # Average the scores for our metric across that
                # cross-validation; .scalar() executes the query and
                # returns the value (or None if there are no scores)
                score = db.query(
                    select([func.avg(database.CrossValidationScore.value)])
                    .where(
                        database.CrossValidationScore.cross_validation_id ==
                        crossval_id
                    )
                    .where(database.CrossValidationScore.metric == metric)
                    .as_scalar()
                ).scalar()
                if score is None:
                    rank = 1000.0
                    logger.error("Writing pipeline JSON for pipeline %s, but "
                                 "it is not scored for %s. Rank set to %s. "
                                 "origin=%s",
                                 pipeline_id, metric, rank, pipeline.origin)
                else:
                    logger.warning("Writing pipeline JSON for pipeline %s "
                                   "%s=%s origin=%s",
                                   pipeline_id, metric, score,
                                   pipeline.origin)
                    # normalize() maps the score onto [0, 1] (1 is best),
                    # so smaller ranks are better
                    rank = 1.0 - self.metrics[0]['metric'].normalize(score)
            else:
                logger.warning("Writing pipeline JSON for pipeline %s with "
                               "provided rank %s. origin=%s",
                               pipeline_id, rank, pipeline.origin)

            obj = to_d3m_json(pipeline)

            with open(os.path.join(self._ranked_pipelines_dir, '%s.json' % pipeline_id), 'w') as fout:
                json.dump(obj, fout, indent=2)
            with open(os.path.join(self._ranked_pipelines_dir, '%s.rank' % pipeline_id), 'w') as fout:
                fout.write(str(rank))

        finally:
            db.close()
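
The rank written above inverts the normalized score, so rank 0 is the best pipeline. A minimal sketch of that computation, assuming the d3m package's PerformanceMetric (the score value is made up for illustration):

from d3m.metadata.problem import PerformanceMetric

score = 0.85  # hypothetical cross-validation score
# normalize() maps a raw metric value onto [0, 1], where 1 is best;
# inverting it gives a rank where smaller is better
rank = 1.0 - PerformanceMetric.ACCURACY.normalize(score)
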
Example #2
def evaluate(pipeline, data_pipeline, dataset, metrics, problem,
             scoring_config, dataset_uri, timeout_run):
    # Strip the 'file://' scheme before the collection check; collection
    # datasets are sampled down before scoring
    if is_collection(dataset_uri[7:]):
        dataset = get_dataset_sample(dataset, problem)

    json_pipeline = convert.to_d3m_json(pipeline)

    # Shortcut: graph-task pipelines coming from MtLDB are returned with a
    # perfect score instead of being run
    if (TaskKeyword.GRAPH in problem['problem']['task_keywords']
            and json_pipeline['description'].startswith('MtLDB')):
        return {0: {'ACCURACY': 1.0}, 1: {'ACCURACY': 1.0}}

    logger.info(
        "Pipeline to be scored:\n\t%s", '\n\t'.join(
            [x['primitive']['python_path'] for x in json_pipeline['steps']]))

    d3m_pipeline = Pipeline.from_json_structure(json_pipeline)
    # Drop the 'method' key before passing scoring_config to the runtime
    scoring_config.pop('method', None)

    # Score in a subprocess so a hung pipeline can be killed after the
    # timeout; 'worker' and 'scoring_pipeline' are defined at module level
    # in the surrounding codebase
    manager = Manager()
    return_dict = manager.dict()
    p = Process(target=worker,
                args=(d3m_pipeline, data_pipeline, scoring_pipeline, problem,
                      dataset, scoring_config, metrics, return_dict))
    p.start()
    p.join(timeout_run)
    p.terminate()

    if 'run_results' not in return_dict or 'run_scores' not in return_dict:
        raise TimeoutError('Reached timeout (%d seconds) to score a pipeline' %
                           timeout_run)

    run_results = return_dict['run_results']
    run_scores = return_dict['run_scores']

    for result in run_results:
        if result.has_error():
            raise RuntimeError(result.pipeline_run.status['message'])

    # save_pipeline_runs(run_results.pipeline_runs)
    combined_folds = d3m.runtime.combine_folds(list(run_scores))
    scores = {}

    for _, row in combined_folds.iterrows():
        if row['fold'] not in scores:
            scores[row['fold']] = {}
        scores[row['fold']][row['metric']] = row['value']

    return scores
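
The worker target is defined outside this snippet; a sketch of what it presumably does, under the assumption that d3m.runtime.evaluate takes the pipeline, the data and scoring pipelines, the problem, inputs, data params, and metrics, and returns (scores, results); the exact keyword names should be checked against the installed d3m version:

def worker(d3m_pipeline, data_pipeline, scoring_pipeline, problem,
           dataset, scoring_config, metrics, return_dict):
    # Sketch only: run the D3M evaluation in the child process and hand the
    # scores and results back through the manager dict
    run_scores, run_results = d3m.runtime.evaluate(
        pipeline=d3m_pipeline,
        data_pipeline=data_pipeline,
        scoring_pipeline=scoring_pipeline,
        problem_description=problem,
        inputs=[dataset],
        data_params=scoring_config,
        metrics=metrics,
        context=metadata_base.Context.TESTING,
        random_seed=0,
    )
    return_dict['run_scores'] = run_scores
    return_dict['run_results'] = run_results
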
Example #3
def execute(pipeline_id, dataset, problem, results_path, msg_queue, db):
    # Get pipeline from database

    pipeline = (
        db.query(database.Pipeline)
        .filter(database.Pipeline.id == pipeline_id)
        .options(joinedload(database.Pipeline.modules),
                 joinedload(database.Pipeline.connections))
    ).one()

    logger.info('About to execute pipeline, id=%s, dataset=%r', pipeline_id,
                dataset)

    # Load data
    dataset = Dataset.load(dataset)
    logger.info('Loaded dataset')

    json_pipeline = convert.to_d3m_json(pipeline)
    logger.info(
        'Pipeline to be executed:\n%s', '\n'.join(
            [x['primitive']['python_path'] for x in json_pipeline['steps']]))

    d3m_pipeline = d3m.metadata.pipeline.Pipeline.from_json_structure(
        json_pipeline)

    runtime = d3m.runtime.Runtime(pipeline=d3m_pipeline,
                                  problem_description=problem,
                                  context=metadata_base.Context.TESTING)

    manager = Manager()
    return_dict = manager.dict()
    p = Process(target=worker, args=(runtime, dataset, return_dict))
    p.start()
    p.join(180)  # Maximum 3 minutes
    p.terminate()
    if 'fit_results' not in return_dict:
        raise TimeoutError('Reached timeout (180 seconds) to execute a '
                           'pipeline')
    fit_results = return_dict['fit_results']
    fit_results.check_success()

    if results_path is not None:
        logger.info('Storing fit results at %s', results_path)
        fit_results.values['outputs.0'].to_csv(results_path)
    else:
        logger.info('NOT storing fit results')

    return fit_results.values
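
The worker here is also external to the snippet; under the same Manager/Process pattern it presumably reduces to fitting the runtime and storing the Result (a sketch, not the actual implementation):

def worker(runtime, dataset, return_dict):
    # Sketch: fit the runtime on the dataset in the child process and pass
    # the Result (checked with check_success() above) back to the parent
    return_dict['fit_results'] = runtime.fit(inputs=[dataset])
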
Example #4
def train(pipeline_id, dataset, problem, storage_dir, steps_to_expose, msg_queue, db):
    # Get pipeline from database
    pipeline = (
        db.query(database.Pipeline)
            .filter(database.Pipeline.id == pipeline_id)
            .options(joinedload(database.Pipeline.modules),
                     joinedload(database.Pipeline.connections))
    ).one()

    logger.info('About to train pipeline, id=%s, dataset=%r',
                pipeline_id, dataset)

    # Load data
    dataset = Dataset.load(dataset)
    logger.info('Loaded dataset')

    # Training step - fit pipeline on training data
    logger.info('Running training')

    d3m_pipeline = d3m.metadata.pipeline.Pipeline.from_json_structure(
        convert.to_d3m_json(pipeline),
    )

    # Only expose intermediate step outputs if any were requested
    expose_outputs = len(steps_to_expose) > 0

    fitted_pipeline, predictions, results = d3m.runtime.fit(d3m_pipeline, [dataset], problem_description=problem,
                                                            context=metadata_base.Context.TESTING,
                                                            volumes_dir=os.environ.get('D3MSTATICDIR', None),
                                                            random_seed=0,
                                                            expose_produced_outputs=expose_outputs)

    results.check_success()

    logger.info('Storing fit results at %s', storage_dir)
    for step_id in results.values:
        if step_id in steps_to_expose and isinstance(results.values[step_id], DataFrame):
            results.values[step_id].to_csv(join(storage_dir, 'fit_%s_%s.csv' % (pipeline_id, step_id)))

    with open(join(storage_dir, 'fitted_solution_%s.pkl' % pipeline_id), 'wb') as fout:
        pickle.dump(fitted_pipeline, fout)
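
To use the artifacts written by train(), the pickled fitted pipeline can be reloaded and run on held-out data. A sketch assuming d3m.runtime.produce accepts (fitted_pipeline, test_inputs) and returns predictions plus a Result; test_dataset_uri is a placeholder:

import pickle
from os.path import join

import d3m.runtime
from d3m.container import Dataset

# Reload the fitted pipeline pickled by train()
with open(join(storage_dir, 'fitted_solution_%s.pkl' % pipeline_id), 'rb') as fin:
    fitted_pipeline = pickle.load(fin)

test_dataset = Dataset.load(test_dataset_uri)  # test_dataset_uri: placeholder
predictions, produce_results = d3m.runtime.produce(fitted_pipeline, [test_dataset])
produce_results.check_success()
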
Example #5
    def write_scored_pipeline(self, pipeline_id):
        if not self._scored_pipelines_dir:
            logger.info("Not writing scored_pipeline JSON")
            return

        db = self.DBSession()
        try:
            # Get pipeline
            pipeline = db.query(database.Pipeline).get(pipeline_id)

            logger.warning("Writing scored_pipeline JSON for pipeline %s "
                           "origin=%s",
                           pipeline_id, pipeline.origin)

            filename = os.path.join(self._scored_pipelines_dir,
                                    '%s.json' % pipeline_id)
            obj = to_d3m_json(pipeline)
            with open(filename, 'w') as fp:
                json.dump(obj, fp, indent=2)
        except Exception:
            logger.exception("Error writing scored_pipeline for %s",
                             pipeline_id)
        finally:
            db.close()
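
The JSON written here is the same D3M pipeline structure that the earlier examples load back with Pipeline.from_json_structure; a minimal round-trip sketch (the path is a placeholder):

import json
from d3m.metadata.pipeline import Pipeline

# Reload a pipeline JSON written by write_scored_pipeline()
with open('scored_pipelines/<pipeline_id>.json') as fp:  # placeholder path
    d3m_pipeline = Pipeline.from_json_structure(json.load(fp))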