Example #1
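A unit test that runs RayRunner.evaluate_pipeline on a single classification pipeline, combining the "TRAINING_DATA" splitting pipeline, the default scoring pipeline, and the 'no_split' data preparation parameters, then checks the returned accuracy and F1 scores.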
    def test_evaluate_pipeline(self):
        pipeline = get_classification_pipeline()
        ray_runner = RayRunner(random_seed=42,
                               volumes_dir=None,
                               scratch_dir=self.test_dir,
                               n_workers=1)
        problem_description, dataset = get_data()
        data_pipeline = schemas_utils.get_splitting_pipeline("TRAINING_DATA")
        scoring_pipeline = schemas_utils.get_scoring_pipeline()

        no_split = schemas_utils.DATA_PREPARATION_PARAMS['no_split']

        result = ray_runner.evaluate_pipeline(
            problem_description=problem_description,
            pipeline=pipeline,
            input_data=[dataset],
            metrics=schemas_utils.MULTICLASS_CLASSIFICATION_METRICS,
            data_preparation_pipeline=data_pipeline,
            scoring_pipeline=scoring_pipeline,
            data_preparation_params=no_split)

        self.assertEqual(result.error, None)
        self.assertEqual(
            result.scores.values.tolist(),
            [['ACCURACY', 0.9133333333333333, 0.9133333333333333, 42, 0],
             ['F1_MICRO', 0.9133333333333333, 0.9133333333333333, 42, 0],
             ['F1_MACRO', 0.9123688388315397, 0.9123688388315397, 42, 0]])
Example #2
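A ScoreSolution RPC handler: it looks up the stored solution by id, decodes the inputs, metrics, and scoring configuration from the request, builds the matching splitting pipeline, submits an asynchronous evaluate_pipeline_request to the backend, and returns the request id in the response.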
    def ScoreSolution(self, request, context):
        solution_id = request.solution_id
        logger.info('method=ScoreSolution, solution_id=%s', solution_id)

        pipeline, problem_description, _ = self.get_solution_problem(
            solution_id)
        if pipeline is None:
            logger.info(
                'method=ScoreSolution, solution_id=%s, status=ERRORED, error=Solution_id not found',
                solution_id)
            response = core_pb2.ScoreSolutionResponse()
            return response

        input_data = [load_data(utils.decode_value(x)) for x in request.inputs]
        metrics = [
            utils.decode_performance_metric(metric)
            for metric in request.performance_metrics
        ]
        scoring_pipeline = schemas_utils.get_scoring_pipeline()
        data_preparation_params = decode_scoring_configuration(
            request.configuration)
        data_preparation_pipeline = schemas_utils.get_splitting_pipeline(
            data_preparation_params['method'])

        request_id = self.backend.evaluate_pipeline_request(
            problem_description=problem_description,
            pipeline=pipeline,
            input_data=input_data,
            metrics=metrics,
            data_preparation_pipeline=data_preparation_pipeline,
            scoring_pipeline=scoring_pipeline,
            data_preparation_params=data_preparation_params)

        response = core_pb2.ScoreSolutionResponse(request_id=request_id)
        return response
Example #3
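A SplitData handler that streams its results: it prepares train/test/score splits with runtime_module.prepare_data using the splitting pipeline selected by the scoring configuration, saves each split as a datasetDoc.json under the temporary storage root, and yields one response per fold with the three dataset URIs.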
    def SplitData(self, request, context):
        input_data = [load_data(utils.decode_value(x)) for x in request.inputs]
        scoring_configuration = decode_scoring_configuration(
            request.scoring_configuration)
        problem_description = utils.decode_problem_description(request.problem)
        data_pipeline = schemas_utils.get_splitting_pipeline(
            scoring_configuration['method'])

        data_random_seed = 0
        outputs, data_result = runtime_module.prepare_data(
            data_pipeline=data_pipeline,
            problem_description=problem_description,
            inputs=input_data,
            data_params=scoring_configuration,
            context=Context.TESTING,
            random_seed=data_random_seed,
            volumes_dir=EnvVars.D3MSTATICDIR,
            scratch_dir=Path.TEMP_STORAGE_ROOT,
            runtime_environment=None,
        )

        if data_result.has_error():
            logger.info('method=SplitData, error=%s', data_result.error)
            response = core_pb2.SplitDataResponse()
            yield response
            return
        else:
            for i, (train_output, test_output,
                    score_output) in enumerate(zip(*outputs)):
                uri_list = []
                for output, tag in (
                    (train_output, 'train'),
                    (test_output, 'test'),
                    (score_output, 'score'),
                ):
                    path = os.path.join(Path.TEMP_STORAGE_ROOT,
                                        '{}_output_{}'.format(tag, i),
                                        'datasetDoc.json')
                    uri = get_uri(path)
                    output.save(uri)
                    uri_list.append(uri)
                # response
                response = core_pb2.SplitDataResponse(
                    train_output=value_pb2.Value(dataset_uri=uri_list[0]),
                    test_output=value_pb2.Value(dataset_uri=uri_list[1]),
                    score_output=value_pb2.Value(dataset_uri=uri_list[2]),
                )
                yield response
Example #4
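The constructor of a pipeline-search class: besides selecting candidate pipelines for the task, it preconfigures the "TRAINING_DATA" splitting pipeline, the scoring pipeline, and the 'no_split' preparation parameters used when evaluating candidates.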
    def __init__(self, problem_description, backend, *, primitives_blocklist=None, ranking_function=None):
        super().__init__(problem_description=problem_description, backend=backend,
                         primitives_blocklist=primitives_blocklist, ranking_function=ranking_function)
        if self.ranking_function is None:
            self.ranking_function = dummy_ranking_function
        self.task_description = schemas_utils.get_task_description(self.problem_description['problem']['task_keywords'])

        self.available_pipelines = self._return_pipelines(
            self.task_description['task_type'], self.task_description['task_subtype'], self.task_description['data_types'])

        # TODO update this to be defined on problem/metrics terms
        self.data_preparation_pipeline = schemas_utils.get_splitting_pipeline("TRAINING_DATA")
        self.metrics = self.problem_description['problem']['performance_metrics']

        self.scoring_pipeline = schemas_utils.get_scoring_pipeline()
        self.data_preparation_params = schemas_utils.DATA_PREPARATION_PARAMS['no_split']

        self.offset = 10
        self.current_pipeline_index = 0
Example #5
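The constructor of a tuner base class (the super() call suggests TunableBase): it sets the same "TRAINING_DATA" splitting pipeline and 'no_split' parameters as evaluation defaults and initializes the tuning state (oracle, hyperparameters, pipeline candidates, number of evaluation trials).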
    def __init__(self, problem_description, backend,
                 primitives_blocklist=None, ranking_function=None, num_eval_trials=None):
        if ranking_function is None:
            ranking_function = dummy_ranking_function
        if num_eval_trials is None:
            num_eval_trials = multiprocessing.cpu_count()
        super(TunableBase, self).__init__(problem_description, backend,
                                          primitives_blocklist=primitives_blocklist, ranking_function=ranking_function)
        # TODO update this to be defined on problem/metrics terms
        self.data_preparation_pipeline = schemas_utils.get_splitting_pipeline("TRAINING_DATA")
        self.data_preparation_params = schemas_utils.DATA_PREPARATION_PARAMS['no_split']

        self.scoring_pipeline = schemas_utils.get_scoring_pipeline()
        self.scoring_params = None

        self.metrics = problem_description['problem']['performance_metrics']

        self.oracle = None
        self.tuner_id = 'tuner'
        self.hyperparameters = HyperParameters()
        self.pipeline_candidates = {}
        self.num_eval_trials = num_eval_trials
Example #6
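The batch counterpart of example #1: the same classification pipeline is submitted three times through RayRunner.evaluate_pipelines, and every result is expected to finish with no error and with status 'COMPLETED'.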
    def test_evaluate_pipelines(self):
        pipeline = get_classification_pipeline()
        ray_runner = RayRunner(random_seed=42,
                               volumes_dir=None,
                               scratch_dir=self.test_dir,
                               n_workers=1)
        problem_description, dataset = get_data()
        data_pipeline = schemas_utils.get_splitting_pipeline("TRAINING_DATA")
        scoring_pipeline = schemas_utils.get_scoring_pipeline()

        no_split = schemas_utils.DATA_PREPARATION_PARAMS['no_split']

        results = ray_runner.evaluate_pipelines(
            problem_description=problem_description,
            pipelines=[pipeline] * 3,
            input_data=[dataset],
            metrics=schemas_utils.MULTICLASS_CLASSIFICATION_METRICS,
            data_preparation_pipeline=data_pipeline,
            scoring_pipeline=scoring_pipeline,
            data_preparation_params=no_split)

        for result in results:
            self.assertEqual(result.error, None)
            self.assertEqual(result.status, 'COMPLETED')
Example #7
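A small helper that builds the "TRAINING_DATA" splitting pipeline on demand; the local import shows that get_splitting_pipeline comes from axolotl.utils.schemas.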
def _generate_data_preparation_pipeline():
    from axolotl.utils import schemas as schemas_utils
    data_preparation_pipeline = schemas_utils.get_splitting_pipeline(
        "TRAINING_DATA")
    return data_preparation_pipeline