def test_hyperparams_repository_should_save_success_trial(tmpdir):
    hyperparams_json_repository = HyperparamsJSONRepository(tmpdir)
    hyperparams = HyperparameterSamples(HYPERPARAMS)

    hyperparams_json_repository._save_successful_trial_json(hyperparams, FIRST_SCORE_FOR_TRIAL)

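    # Successful trials are expected to be saved as '<score>_<trial hash>.json',
    # with the dot in the score replaced by a comma.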
    trial_hash = hyperparams_json_repository._get_trial_hash(hyperparams.to_flat_as_dict_primitive())
    file_name = str(float(FIRST_SCORE_FOR_TRIAL)).replace('.', ',') + '_' + trial_hash + '.json'
    path = os.path.join(tmpdir, file_name)
    with open(path) as f:
        trial_json = json.load(f)
    assert trial_json['hyperparams'] == hyperparams.to_flat_as_dict_primitive()
    assert trial_json['score'] == FIRST_SCORE_FOR_TRIAL


def test_hyperparams_repository_should_create_new_trial(tmpdir):
    hyperparams_json_repository = HyperparamsJSONRepository(tmpdir)
    hyperparams = HyperparameterSamples(HYPERPARAMS)

    hyperparams_json_repository.create_new_trial(hyperparams)

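    # New trials are expected to be saved as 'NEW_<trial hash>.json' with a null score.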
    trial_hash = hyperparams_json_repository._get_trial_hash(hyperparams.to_flat_as_dict_primitive())
    file_name = 'NEW_' + trial_hash + '.json'
    path = os.path.join(tmpdir, file_name)
    with open(path) as f:
        trial_json = json.load(f)
    assert trial_json['hyperparams'] == hyperparams.to_flat_as_dict_primitive()
    assert trial_json['score'] is None


def test_hyperparams_repository_should_save_failed_trial(tmpdir):
    hyperparams_json_repository = HyperparamsJSONRepository(tmpdir)
    hyperparams = HyperparameterSamples(HYPERPARAMS)

    hyperparams_json_repository._save_failed_trial_json(hyperparams, Exception('trial failed'))

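    # Failed trials are expected to be saved as 'FAILED_<trial hash>.json',
    # keeping the serialized exception and a null score.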
    trial_hash = hyperparams_json_repository._get_trial_hash(hyperparams.to_flat_as_dict_primitive())
    file_name = 'FAILED_' + trial_hash + '.json'
    path = os.path.join(tmpdir, file_name)
    with open(path) as f:
        trial_json = json.load(f)
    assert trial_json['hyperparams'] == hyperparams.to_flat_as_dict_primitive()
    assert 'exception' in trial_json.keys()
    assert trial_json['score'] is None


def test_hyperparams_repository_should_load_all_trials(tmpdir):
    tmpdir = os.path.join(tmpdir, "__json__")
    os.mkdir(tmpdir)
    hyperparams_json_repository = HyperparamsJSONRepository(tmpdir)
    n_trials = 3
    for i in range(n_trials):
        hyperparams = HyperparameterSamples({'learning_rate': 0.01 + i * 0.01})
        hyperparams_json_repository.save_score_for_success_trial(hyperparams, i)

    trials = hyperparams_json_repository.load_all_trials()

    assert len(trials) == n_trials
    for i in range(n_trials):
        assert trials[i].hyperparams == HyperparameterSamples(
            {'learning_rate': 0.01 + i * 0.01}).to_flat_as_dict_primitive(), (i, str(trials))


def test_automl_sequential_wrapper(tmpdir):
    # Given
    data_inputs = np.array(range(100))
    expected_outputs = np.array(range(100, 200))

    hyperparameter_space = HyperparameterSpace({
        'multiplication_1__multiply_by': RandInt(1, 3),
        'multiplication_2__multiply_by': RandInt(1, 3),
        'multiplication_3__multiply_by': RandInt(1, 3),
    })
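    # The keys above follow the '<step name>__<hyperparam name>' convention to
    # target the hyperparams of each named pipeline step.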

    pipeline = Pipeline(
        [('multiplication_1', MultiplyByN()),
         ('multiplication_2', MultiplyByN()),
         ('multiplication_3', MultiplyByN())],
        cache_folder=tmpdir).set_hyperparams_space(hyperparameter_space)

    auto_ml = RandomSearch(
        pipeline,
        hyperparams_repository=HyperparamsJSONRepository(tmpdir),
        n_iter=100)

    # When
    auto_ml: AutoMLSequentialWrapper = auto_ml.fit(data_inputs,
                                                   expected_outputs)
    best_model: Pipeline = auto_ml.get_best_model()
    predicted_outputs = best_model.transform(data_inputs)

    # Then
    actual_mse = ((predicted_outputs - expected_outputs)**2).mean()
    assert actual_mse < 5000


def test_hyperparams_repository_should_load_all_trials(tmpdir):
    hyperparams_json_repository = HyperparamsJSONRepository(tmpdir)
    for i in range(2):
        hyperparams = HyperparameterSamples({'learning_rate': 0.01 + i * 0.01})
        hyperparams_json_repository.save_score_for_success_trial(
            hyperparams, i)

    trials = hyperparams_json_repository.load_all_trials()

    assert len(trials) == 2
    assert trials[0].hyperparams == HyperparameterSamples(
        {'learning_rate': 0.01 + 0 * 0.01}).to_flat_as_dict_primitive()
    assert trials[1].hyperparams == HyperparameterSamples(
        {'learning_rate': 0.01 + 1 * 0.01}).to_flat_as_dict_primitive()


def test_automl_savebestmodel_callback(tmpdir):
    # Given
    hp_repository = HyperparamsJSONRepository(cache_folder=str(tmpdir))
    validation_splitter = ValidationSplitter(0.20)
    auto_ml = AutoML(
        pipeline=Pipeline([
            MultiplyByN(2).set_hyperparams_space(HyperparameterSpace({
                'multiply_by': FixedHyperparameter(2)
            })),
            NumpyReshape(new_shape=(-1, 1)),
            linear_model.LinearRegression()
        ]),
        validation_splitter=validation_splitter,
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        scoring_callback=ScoringCallback(mean_squared_error, higher_score_is_better=False),
        callbacks=[
            BestModelCheckpoint()
        ],
        n_trials=1,
        epochs=10,
        refit_trial=False,
        print_func=print,
        hyperparams_repository=hp_repository,
        continue_loop_on_error=False
    )

    data_inputs = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    expected_outputs = data_inputs * 4

    # When
    auto_ml.fit(data_inputs=data_inputs, expected_outputs=expected_outputs)

    # Then
    trials: Trials = hp_repository.load_all_trials()
    best_trial = trials.get_best_trial()
    best_trial_score = best_trial.get_validation_score()
    best_trial.cache_folder = hp_repository.cache_folder
    best_model = best_trial.get_model('best')
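    # Recompute the validation split used during training to score the reloaded best model.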
    _, _, valid_inputs, valid_outputs = validation_splitter.split(
        data_inputs, expected_outputs)
    predicted_output = best_model.predict(valid_inputs)
    score = mean_squared_error(valid_outputs, predicted_output)

    assert best_trial_score == score


def test_hyperparams_json_repository_should_be_observable_in_memory():
    # todo: make a test that asserts that an observer can receive updates from the HyperparamsJSONRepository
    # todo: given trial, a repo, and an observer
    repo: HyperparamsJSONRepository = HyperparamsJSONRepository()

    # todo: when repo.subscribe(observer)
    # todo: when repo.save_trial(trial)

    # todo: then observer.events[0] == trial
    pass


def test_hyperparams_json_repository_should_be_observable_with_file_system_changes():
    # todo: make a test that asserts that an observer can receive updates from the HyperparamsJSONRepository
    # todo: given trial, a repo, and an observer
    repo: HyperparamsJSONRepository = HyperparamsJSONRepository()

    # todo: when repo.subscribe_to_cache_folder_changes(observer)
    # todo: when repo.save_trial(trial)

    # todo: then observer.events[0] == trial
    pass


def _make_autoML_loop(tmpdir, p: Pipeline):
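    """Build a one-trial AutoML loop around the given pipeline, saving trial
    results to a JSON repository under ``tmpdir``."""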
    hp_repository = HyperparamsJSONRepository(cache_folder=tmpdir)
    # hp_repository = InMemoryHyperparamsRepository(cache_folder=str(tmpdir) + "_hp")
    n_epochs = 1
    return AutoML(
        pipeline=p,
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        validation_splitter=ValidationSplitter(0.20),
        scoring_callback=ScoringCallback(mean_squared_error, higher_score_is_better=False),
        n_trials=1,
        refit_trial=True,
        epochs=n_epochs,
        hyperparams_repository=hp_repository,
        cache_folder_when_no_handle=str(tmpdir),
        continue_loop_on_error=False
    )


    def test_logger_automl(self, tmpdir):
        # Given
        context = ExecutionContext()
        self.tmpdir = str(tmpdir)
        hp_repository = HyperparamsJSONRepository(cache_folder=self.tmpdir)
        n_epochs = 2
        n_trials = 4
        auto_ml = AutoML(
            pipeline=Pipeline([
                MultiplyByN(2).set_hyperparams_space(
                    HyperparameterSpace(
                        {'multiply_by': FixedHyperparameter(2)})),
                NumpyReshape(new_shape=(-1, 1)),
                LoggingStep()
            ]),
            hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
            validation_splitter=ValidationSplitter(0.20),
            scoring_callback=ScoringCallback(mean_squared_error,
                                             higher_score_is_better=False),
            n_trials=n_trials,
            refit_trial=True,
            epochs=n_epochs,
            hyperparams_repository=hp_repository,
            continue_loop_on_error=False)

        # When
        data_container = DataContainer(
            data_inputs=np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
            expected_outputs=np.array([10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]))
        auto_ml.handle_fit(data_container, context)

        # Then
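        # One log file per trial is expected in the repository's cache folder.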
        file_paths = [
            os.path.join(hp_repository.cache_folder, f"trial_{i}.log")
            for i in range(n_trials)
        ]
        assert len(file_paths) == n_trials

        for f in file_paths:
            assert os.path.exists(f)

        for f in file_paths:
            with open(f, 'r') as log_file:
                log = log_file.readlines()
                assert len(log) == 36


def _test_within_auto_ml_loop(tmpdir, pipeline):
    X_train = np.random.random((25, 50)).astype(np.float32)
    Y_train = np.random.random((25, )).astype(np.float32)

    validation_splitter = KFoldCrossValidationSplitter(3)
    scoring_callback = ScoringCallback(median_absolute_error,
                                       higher_score_is_better=False)

    auto_ml = AutoML(
        pipeline=pipeline,
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        validation_splitter=validation_splitter,
        scoring_callback=scoring_callback,
        n_trials=2,
        epochs=1,
        hyperparams_repository=HyperparamsJSONRepository(cache_folder=tmpdir),
        refit_trial=True,
        continue_loop_on_error=False)

    auto_ml.fit(X_train, Y_train)


def test_automl_sequential_wrapper_with_validation_split_wrapper(tmpdir):
    # Given
    data_inputs = np.array(range(100))
    expected_outputs = np.array(range(100, 200))

    hyperparameter_space = HyperparameterSpace({
        'multiplication_1__multiply_by': RandInt(1, 3),
        'multiplication_2__multiply_by': RandInt(1, 3),
        'multiplication_3__multiply_by': RandInt(1, 3),
    })

    pipeline = Pipeline(
        [('multiplication_1', MultiplyByN()),
         ('multiplication_2', MultiplyByN()),
         ('multiplication_3', MultiplyByN())],
        cache_folder=tmpdir).set_hyperparams_space(hyperparameter_space)

    random_search = RandomSearch(
        pipeline,
        validation_technique=ValidationSplitWrapper(
            test_size=0.2,
            scoring_function=mean_squared_error,
            run_validation_split_in_test_mode=False),
        hyperparams_repository=HyperparamsJSONRepository(tmpdir),
        higher_score_is_better=False,
        n_iter=100)

    # When
    mse_before = ((data_inputs - expected_outputs)**2).mean()
    random_search: AutoMLSequentialWrapper = random_search.fit(
        data_inputs, expected_outputs)
    best_model: Pipeline = random_search.get_best_model()
    predicted_outputs = best_model.transform(data_inputs)

    # Then
    actual_mse = ((predicted_outputs - expected_outputs)**2).mean()
    assert actual_mse < mse_before


def test_automl_with_kfold(tmpdir):
    # Given
    hp_repository = HyperparamsJSONRepository(cache_folder=str(tmpdir))
    auto_ml = AutoML(
        pipeline=Pipeline([
            MultiplyByN(2).set_hyperparams_space(
                HyperparameterSpace({'multiply_by': FixedHyperparameter(2)})),
            NumpyReshape(new_shape=(-1, 1)),
            linear_model.LinearRegression()
        ]),
        validation_splitter=ValidationSplitter(0.20),
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        scoring_callback=ScoringCallback(mean_squared_error,
                                         higher_score_is_better=False),
        callbacks=[
            MetricCallback('mse',
                           metric_function=mean_squared_error,
                           higher_score_is_better=False),
        ],
        n_trials=1,
        epochs=10,
        refit_trial=True,
        print_func=print,
        hyperparams_repository=hp_repository,
        continue_loop_on_error=False)

    data_inputs = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    expected_outputs = data_inputs * 4

    # When
    auto_ml.fit(data_inputs=data_inputs, expected_outputs=expected_outputs)

    # Then
    p = auto_ml.get_best_model()
    outputs = p.transform(data_inputs)
    mse = mean_squared_error(expected_outputs, outputs)

    assert mse < 1000


def main():
    # Define classification models, and hyperparams.
    # See also HyperparameterSpace documentation : https://www.neuraxle.org/stable/api/neuraxle.hyperparams.space.html#neuraxle.hyperparams.space.HyperparameterSpace

    decision_tree_classifier = SKLearnWrapper(
        DecisionTreeClassifier(),
        HyperparameterSpace({
            'criterion': Choice(['gini', 'entropy']),
            'splitter': Choice(['best', 'random']),
            'min_samples_leaf': RandInt(2, 5),
            'min_samples_split': RandInt(2, 4)
        }))

    extra_tree_classifier = SKLearnWrapper(
        ExtraTreeClassifier(),
        HyperparameterSpace({
            'criterion': Choice(['gini', 'entropy']),
            'splitter': Choice(['best', 'random']),
            'min_samples_leaf': RandInt(2, 5),
            'min_samples_split': RandInt(2, 4)
        }))

    ridge_classifier = Pipeline([
        OutputTransformerWrapper(NumpyRavel()),
        SKLearnWrapper(
            RidgeClassifier(),
            HyperparameterSpace({
                'alpha': Choice([0.0, 1.0, 10.0, 100.0]),
                'fit_intercept': Boolean(),
                'normalize': Boolean()
            }))
    ]).set_name('RidgeClassifier')

    logistic_regression = Pipeline([
        OutputTransformerWrapper(NumpyRavel()),
        SKLearnWrapper(
            LogisticRegression(),
            HyperparameterSpace({
                'C': LogUniform(0.01, 10.0),
                'fit_intercept': Boolean(),
                'penalty': Choice(['none', 'l2']),
                'max_iter': RandInt(20, 200)
            }))
    ]).set_name('LogisticRegression')

    random_forest_classifier = Pipeline([
        OutputTransformerWrapper(NumpyRavel()),
        SKLearnWrapper(
            RandomForestClassifier(),
            HyperparameterSpace({
                'n_estimators': RandInt(50, 600),
                'criterion': Choice(['gini', 'entropy']),
                'min_samples_leaf': RandInt(2, 5),
                'min_samples_split': RandInt(2, 4),
                'bootstrap': Boolean()
            }))
    ]).set_name('RandomForestClassifier')

    # Define a classification pipeline that lets the AutoML loop choose one of the classifiers.
    # See also ChooseOneStepOf documentation : https://www.neuraxle.org/stable/api/neuraxle.steps.flow.html#neuraxle.steps.flow.ChooseOneStepOf

    pipeline = Pipeline([
        ChooseOneStepOf([
            decision_tree_classifier, extra_tree_classifier, ridge_classifier,
            logistic_regression, random_forest_classifier
        ])
    ])

    # Create the AutoML loop object.
    # See also AutoML documentation : https://www.neuraxle.org/stable/api/neuraxle.metaopt.auto_ml.html#neuraxle.metaopt.auto_ml.AutoML

    auto_ml = AutoML(
        pipeline=pipeline,
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        validation_splitter=ValidationSplitter(test_size=0.20),
        scoring_callback=ScoringCallback(accuracy_score,
                                         higher_score_is_better=True),
        n_trials=7,
        epochs=1,
        hyperparams_repository=HyperparamsJSONRepository(cache_folder='cache'),
        refit_trial=True,
        continue_loop_on_error=False)

    # Load data, and launch the AutoML loop!

    X_train, y_train, X_test, y_test = generate_classification_data()
    auto_ml = auto_ml.fit(X_train, y_train)

    # Get the model from the best trial, and make predictions using predict.
    # See also predict documentation : https://www.neuraxle.org/stable/api/neuraxle.base.html#neuraxle.base.BaseStep.predict

    best_pipeline = auto_ml.get_best_model()
    y_pred = best_pipeline.predict(X_test)

    accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
    print("Test accuracy score:", accuracy)

    shutil.rmtree('cache')
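

# Note: `generate_classification_data()` is referenced above but not defined in
# this snippet. Below is a minimal, hypothetical sketch of such a helper,
# assuming scikit-learn's make_classification and train_test_split; the
# original helper may differ.
def generate_classification_data():
    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split

    # Build a small synthetic binary classification dataset and split it so the
    # returned order matches how main() unpacks it: X_train, y_train, X_test, y_test.
    X, y = make_classification(n_samples=500, n_features=20, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=42)
    return X_train, y_train, X_test, y_test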