Example #1
def test_pipelinesearcher(makedirs_mock):
    # static methods
    assert hasattr(PipelineSearcher, '_find_datasets')
    assert hasattr(PipelineSearcher, '_new_pipeline')

    # default parameters
    instance = PipelineSearcher()

    makedirs_mock.assert_called_with(instance.ranked_dir, exist_ok=True)

    assert instance.input == 'input'
    assert instance.output == 'output'
    assert not instance.dump
    assert instance.ranked_dir == '{}/pipelines_ranked'.format(instance.output)
    assert isinstance(instance.data_pipeline, Pipeline)
    assert isinstance(instance.scoring_pipeline, Pipeline)

    # other parameters
    instance = PipelineSearcher(input_dir='new-input',
                                output_dir='new-output',
                                dump=True)

    makedirs_mock.assert_called_with(instance.ranked_dir, exist_ok=True)

    assert instance.input == 'new-input'
    assert instance.output == 'new-output'
    assert instance.dump
    assert instance.ranked_dir == '{}/pipelines_ranked'.format(instance.output)
    assert isinstance(instance.data_pipeline, Pipeline)
    assert isinstance(instance.scoring_pipeline, Pipeline)
    assert instance.datasets == {}
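
Note: the makedirs_mock argument above is not created inside the test itself; in the original test module it is presumably injected by a unittest.mock.patch decorator. A minimal sketch of that wiring, assuming os.makedirs as the patch target and ta2.search as the module defining PipelineSearcher (both are assumptions inferred from the assertions here and from the patch('ta2.search.open', ...) calls in later examples):

from unittest.mock import patch

from ta2.search import PipelineSearcher


@patch('os.makedirs')  # assumed patch target; supplies makedirs_mock
def test_pipelinesearcher(makedirs_mock):
    instance = PipelineSearcher()

    # the constructor is expected to create its output directories
    makedirs_mock.assert_called_with(instance.ranked_dir, exist_ok=True)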
Example #2
def test_pipelinesearcher_stop():
    instance = PipelineSearcher()

    assert not hasattr(instance, '_stop')

    # setting _stop
    instance.stop()
    assert instance._stop
Example #3
def search(dataset_root, problem, args):
    pps = PipelineSearcher(
        args.input,
        args.output,
        args.static,
        dump=True,
        hard_timeout=args.hard,
    )

    return pps.search(problem, args.timeout, args.budget, args.template)
Example #4
def test_pipelinesearcher_setup_search():
    instance = PipelineSearcher()

    assert hasattr(instance, 'solutions')
    assert not hasattr(instance, '_stop')
    assert not hasattr(instance, 'done')
    assert not hasattr(instance, 'start_time')
    assert not hasattr(instance, 'timeout')
    assert not hasattr(instance, 'max_end_time')

    # without timeout
    instance.timeout = None
    instance.setup_search()

    assert instance.solutions == []
    assert instance._stop is False
    assert instance.done is False
    assert hasattr(instance, 'start_time')
    assert instance.timeout is None
    assert instance.max_end_time is None

    # with timeout
    instance.timeout = 0.5
    instance.setup_search()

    assert instance.timeout == 0.5
    assert instance.max_end_time == instance.start_time + timedelta(seconds=0.5)
Example #5
def test_pipelinesearcher_load_pipeline(json_loader_mock, yaml_loader_mock):
    instance = PipelineSearcher()
    open_mock = mock_open(read_data='data')

    json_loader_mock.reset_mock()
    yaml_loader_mock.reset_mock()

    # yaml file
    with patch('ta2.search.open', open_mock):
        instance._load_pipeline('test.yml')

    open_mock.assert_called_with('{}/test.yml'.format(PIPELINES_DIR), 'r')

    assert yaml_loader_mock.call_count == 1
    assert json_loader_mock.call_count == 0

    # json file
    with patch('ta2.search.open', open_mock):
        instance._load_pipeline('test.json')

    open_mock.assert_called_with('{}/test.json'.format(PIPELINES_DIR), 'r')

    assert yaml_loader_mock.call_count == 1
    assert json_loader_mock.call_count == 1

    # without file extension
    with patch('ta2.search.open', open_mock):
        instance._load_pipeline('test')

    open_mock.assert_called_with('{}/test.json'.format(PIPELINES_DIR), 'r')

    assert yaml_loader_mock.call_count == 1
    assert json_loader_mock.call_count == 2
Example #6
def test_pipelinesearcher_find_datasets(tmp_path):
    input_dir = tmp_path / 'test-input'
    input_dir.mkdir()

    content = {
        'about': {
            'datasetID': None
        }
    }

    num_datasets = 3
    for i in range(num_datasets):
        dataset_dir = input_dir / 'dataset-{}'.format(i)
        dataset_dir.mkdir()

        content['about']['datasetID'] = 'dataset-{}'.format(i)

        file = dataset_dir / 'datasetDoc.json'
        file.write_text(json.dumps(content))

    result = PipelineSearcher._find_datasets(input_dir)

    assert len(result) == num_datasets

    for i in range(num_datasets):
        dataset_id = 'dataset-{}'.format(i)

        assert dataset_id in result
        assert result[dataset_id] == 'file://{}/{}/datasetDoc.json'.format(input_dir, dataset_id)
Example #7
def test_pipelinesearcher_get_template(logger_mock):
    instance = PipelineSearcher()
    data = {'problem': {'task_type': None}}

    # classification
    data['problem']['task_type'] = TaskType.CLASSIFICATION

    result = instance._get_template(None, data)  # dataset (None) is not used

    assert logger_mock.call_count == 1
    assert result == 'xgb_classification.all_hp.yml'

    # regression
    data['problem']['task_type'] = TaskType.REGRESSION

    result = instance._get_template(None, data)  # dataset (None) is not used

    assert logger_mock.call_count == 2
    assert result == 'xgb_regression.all_hp.yml'

    # not supported
    data['problem']['task_type'] = 'other-task-type'

    with pytest.raises(ValueError):
        instance._get_template(None, data)  # dataset (None) is not used
Example #8
def test_pipelinesearcher_defaults(makedirs_mock):
    instance = PipelineSearcher()

    expected_calls = [
        call('output/pipelines_ranked', exist_ok=True),
        call('output/pipelines_scored', exist_ok=True),
        call('output/pipelines_searched', exist_ok=True),
    ]
    assert makedirs_mock.call_args_list == expected_calls

    assert instance.input == 'input'
    assert instance.output == 'output'
    assert not instance.dump
    assert instance.ranked_dir == 'output/pipelines_ranked'
    assert instance.scored_dir == 'output/pipelines_scored'
    assert instance.searched_dir == 'output/pipelines_searched'
    assert isinstance(instance.data_pipeline, Pipeline)
    assert isinstance(instance.scoring_pipeline, Pipeline)
Example #9
def test_pipelinesearcher(makedirs_mock, from_yaml_mock):
    instance = PipelineSearcher(input_dir='new-input', output_dir='new-output', dump=True)

    expected_calls = [
        call('new-output/pipeline_runs', exist_ok=True),
        call('new-output/pipelines_ranked', exist_ok=True),
        call('new-output/pipelines_scored', exist_ok=True),
        call('new-output/pipelines_searched', exist_ok=True),
    ]
    assert makedirs_mock.call_args_list == expected_calls

    assert instance.input == 'new-input'
    assert instance.output == 'new-output'
    assert instance.dump
    assert instance.ranked_dir == 'new-output/pipelines_ranked'
    assert instance.scored_dir == 'new-output/pipelines_scored'
    assert instance.searched_dir == 'new-output/pipelines_searched'
    assert instance.data_pipeline == from_yaml_mock.return_value
    assert instance.scoring_pipeline == from_yaml_mock.return_value
Example #10
def test_pipelinesearcher_save_pipeline(random_mock):
    id = 'test-id'
    score = 1.0
    random_mock.return_value = 2
    pipeline_mock = MagicMock(id=id, score=score)
    pipeline_mock.to_json_structure = MagicMock(return_value={})
    open_mock = mock_open()

    # avoid saving pipeline on file
    instance = PipelineSearcher(dump=False)
    instance.solutions = []  # normally set in `PipelineSearcher.setup_search`

    # normalized_score (None) is not used in this case
    result = instance._save_pipeline(pipeline_mock, None)

    assert result is None
    assert pipeline_mock.to_json_structure.call_count == 1
    assert instance.solutions == [{'score': score}]
    assert not random_mock.called
    assert not open_mock.called

    # saving the pipeline on file (dump = True)
    instance = PipelineSearcher(dump=True)
    instance.solutions = []  # normally set in `PipelineSearcher.setup_search`

    with patch('ta2.search.open', open_mock):
        result = instance._save_pipeline(pipeline_mock, 1)

    assert result is None
    assert pipeline_mock.to_json_structure.call_count == 2
    assert instance.solutions == [{'score': score, 'pipeline_rank': 2.e-12}]
    assert random_mock.call_count == 1
    assert open_mock.call_count == 1

    open_mock.assert_called_with('{}/{}.json'.format(instance.ranked_dir, id),
                                 'w')
Example #11
def test_pipelinesearcher(makedirs_mock):
    instance = PipelineSearcher(input_dir='new-input',
                                output_dir='new-output',
                                dump=True)

    expected_calls = [
        call('new-output/pipelines_ranked', exist_ok=True),
        call('new-output/pipelines_scored', exist_ok=True),
        call('new-output/pipelines_searched', exist_ok=True),
    ]
    assert makedirs_mock.call_args_list == expected_calls

    assert instance.input == 'new-input'
    assert instance.output == 'new-output'
    assert instance.dump
    assert instance.ranked_dir == 'new-output/pipelines_ranked'
    assert instance.scored_dir == 'new-output/pipelines_scored'
    assert instance.searched_dir == 'new-output/pipelines_searched'
    assert isinstance(instance.data_pipeline, Pipeline)
    assert isinstance(instance.scoring_pipeline, Pipeline)
    assert instance.datasets == {}
Example #12
def test_pipelinesearcher_check_stop(datetime_mock):
    datetime_mock.now = MagicMock(return_value=10)

    # no stop
    instance = PipelineSearcher()
    instance._stop = False       # normally set in `PipelineSearcher.setup_search`
    instance.timeout = None      # normally set in `PipelineSearcher.setup_search`

    assert instance.check_stop() is None

    # stop by `_stop` attribute
    instance._stop = True

    with pytest.raises(KeyboardInterrupt):
        instance.check_stop()

    # stop by `max_end_time`
    instance._stop = False
    instance.timeout = 10
    instance.max_end_time = 5

    with pytest.raises(KeyboardInterrupt):
        instance.check_stop()
Example #13
def test_pipelinesearcher_score_pipeline(evaluate_mock):
    instance = PipelineSearcher()
    expected_scores = [MagicMock(value=[1])]
    evaluate_mock.return_value = (expected_scores, expected_scores)

    # parameters
    dataset = {}
    problem = {'problem': {'performance_metrics': None}}
    pipeline_mock = MagicMock()
    metrics = {'test': 'metric'}
    random_seed = 0
    folds = 5
    stratified = False
    shuffle = False

    data_params = {
        'number_of_folds': json.dumps(folds),
        'stratified': json.dumps(stratified),
        'shuffle': json.dumps(shuffle),
    }

    # with custom metrics
    instance.score_pipeline(
        dataset, problem, pipeline_mock,
        metrics=metrics, random_seed=random_seed,
        folds=folds, stratified=stratified, shuffle=shuffle
    )

    evaluate_mock.assert_called_with(
        pipeline=pipeline_mock,
        inputs=[dataset],
        data_pipeline=instance.data_pipeline,
        scoring_pipeline=instance.scoring_pipeline,
        problem_description=problem,
        data_params=data_params,            # folds, stratified, shuffle
        metrics=metrics,                    # custom metrics
        context=Context.TESTING,
        random_seed=random_seed,
        data_random_seed=random_seed,
        scoring_random_seed=random_seed,
        volumes_dir=instance.static
    )

    assert pipeline_mock.cv_scores == [score.value[0] for score in expected_scores]

    # with problem metrics

    instance.score_pipeline(
        dataset, problem, pipeline_mock,
        metrics=None, random_seed=random_seed,
        folds=folds, stratified=stratified, shuffle=shuffle
    )

    evaluate_mock.assert_called_with(
        pipeline=pipeline_mock,
        inputs=[dataset],
        data_pipeline=instance.data_pipeline,
        scoring_pipeline=instance.scoring_pipeline,
        problem_description=problem,
        data_params=data_params,                            # folds, stratified, shuffle
        metrics=problem['problem']['performance_metrics'],  # problem metrics
        context=Context.TESTING,
        random_seed=random_seed,
        data_random_seed=random_seed,
        scoring_random_seed=random_seed,
        volumes_dir=instance.static
    )

    assert pipeline_mock.cv_scores == [score.value[0] for score in expected_scores]
Example #14
def search(dataset_root, problem, args):
    pps = PipelineSearcher(args.input, args.output, dump=True)

    return pps.search(problem, timeout=args.timeout, budget=args.budget)
Example #15
def process_dataset(dataset_name, dataset, problem, args):
    box_print("Processing dataset {}".format(dataset_name), True)

    output_path = os.path.join(args.output, dataset_name)
    os.makedirs(output_path, exist_ok=True)

    LOGGER.info("Searching Pipeline for dataset {}".format(dataset_name))
    try:
        start_ts = datetime.utcnow()
        pps = PipelineSearcher(args.input,
                               output_path,
                               args.static,
                               dump=True,
                               hard_timeout=args.hard,
                               ignore_errors=args.ignore_errors,
                               cv_folds=args.folds,
                               subprocess_timeout=args.subprocess_timeout,
                               max_errors=args.max_errors,
                               store_summary=True)
        result = pps.search(dataset, problem, args.timeout, args.budget,
                            args.templates_csv)

        result['elapsed'] = datetime.utcnow() - start_ts
        result['dataset'] = dataset_name

    except Exception as ex:
        result = {
            'dataset': dataset_name,
            'error': '{}: {}'.format(type(ex).__name__, ex),
        }
    else:
        try:
            summary = result.pop('summary')
            candidates = _select_candidates(summary)
            if candidates.empty:
                box_print('No valid pipelines found for dataset {}'.format(
                    dataset_name))
            else:
                ranked_path = os.path.join(output_path, 'pipelines_ranked')
                test_scores = list()
                for _, candidate in candidates.iterrows():
                    try:
                        pipeline = candidate.pipeline
                        pipeline_path = os.path.join(ranked_path, pipeline)
                        test_score = score_pipeline(dataset, problem,
                                                    pipeline_path, args.static,
                                                    output_path)
                        test_scores.append(test_score)
                    except Exception:
                        test_scores.append(None)

                candidates['test_score'] = test_scores
                candidates = candidates.sort_values('test_score',
                                                    ascending=False)

                best = candidates.iloc[0]
                result['test_score'] = best.test_score
                result['template'] = best.template
                result['cv_score'] = best.score
                box_print('Best pipelines for dataset {}:\n{}'.format(
                    dataset_name, candidates.to_string()))

        except Exception as ex:
            LOGGER.exception('Error while testing the winner pipeline')
            result['error'] = 'TEST Error: {}'.format(ex)

    return result