Пример #1
0
def test_automl_early_stopping_callback(tmpdir):
    # TODO: fix this unit test
    # Given
    hp_repository = InMemoryHyperparamsRepository(cache_folder=str(tmpdir))
    n_epochs = 60
    auto_ml = AutoML(
        pipeline=Pipeline([
            FitTransformCallbackStep().set_name('callback'),
            MultiplyByN(2).set_hyperparams_space(
                HyperparameterSpace({'multiply_by': FixedHyperparameter(2)})),
            NumpyReshape(new_shape=(-1, 1)),
            linear_model.LinearRegression()
        ]),
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        validation_splitter=ValidationSplitter(0.20),
        scoring_callback=ScoringCallback(mean_squared_error,
                                         higher_score_is_better=False),
        callbacks=[
            MetricCallback('mse',
                           metric_function=mean_squared_error,
                           higher_score_is_better=False),
        ],
        n_trials=1,
        refit_trial=True,
        epochs=n_epochs,
        hyperparams_repository=hp_repository)

    # When
    data_inputs = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    expected_outputs = data_inputs * 2
    auto_ml = auto_ml.fit(data_inputs=data_inputs,
                          expected_outputs=expected_outputs)

    # Then
    p = auto_ml.get_best_model()
Пример #2
0
def test_logger():
    file_path = "test.log"

    if os.path.exists(file_path):
        os.remove(file_path)

    # Given
    logger = logging.getLogger('test')
    file_handler = logging.FileHandler(file_path)
    file_handler.setLevel('DEBUG')
    logger.addHandler(file_handler)
    logger.setLevel('DEBUG')
    context = ExecutionContext(logger=logger)
    pipeline = Pipeline([
        MultiplyByN(2).set_hyperparams_space(
            HyperparameterSpace({'multiply_by': FixedHyperparameter(2)})),
        NumpyReshape(new_shape=(-1, 1)),
        LoggingStep()
    ])

    # When
    data_container = DataContainer(
        data_inputs=np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]))
    pipeline.handle_fit(data_container, context)

    # Then
    assert os.path.exists(file_path)
    with open(file_path) as f:
        l = f.read()

    # Teardown
    file_handler.close()
    os.remove(file_path)
Пример #3
0
def test_trainer_train():
    data_inputs = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    expected_outputs = data_inputs * 4
    p = Pipeline([
        MultiplyByN(2).set_hyperparams_space(
            HyperparameterSpace({'multiply_by': FixedHyperparameter(2)})),
        NumpyReshape(new_shape=(-1, 1)),
        linear_model.LinearRegression()
    ])

    trainer: Trainer = Trainer(
        epochs=10,
        scoring_callback=ScoringCallback(mean_squared_error,
                                         higher_score_is_better=False),
        validation_splitter=ValidationSplitter(test_size=0.20))

    repo_trial: Trial = trainer.train(pipeline=p,
                                      data_inputs=data_inputs,
                                      expected_outputs=expected_outputs)

    trained_pipeline = repo_trial.get_trained_pipeline(split_number=0)

    outputs = trained_pipeline.transform(data_inputs)
    mse = mean_squared_error(expected_outputs, outputs)

    assert mse < 1
Пример #4
0
    def test_logger_automl(self, tmpdir):
        # Given
        context = ExecutionContext()
        self.tmpdir = str(tmpdir)
        hp_repository = HyperparamsJSONRepository(cache_folder=self.tmpdir)
        n_epochs = 2
        n_trials = 4
        auto_ml = AutoML(
            pipeline=Pipeline([
                MultiplyByN(2).set_hyperparams_space(
                    HyperparameterSpace(
                        {'multiply_by': FixedHyperparameter(2)})),
                NumpyReshape(new_shape=(-1, 1)),
                LoggingStep()
            ]),
            hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(
            ),
            validation_splitter=ValidationSplitter(0.20),
            scoring_callback=ScoringCallback(mean_squared_error,
                                             higher_score_is_better=False),
            n_trials=n_trials,
            refit_trial=True,
            epochs=n_epochs,
            hyperparams_repository=hp_repository,
            continue_loop_on_error=False)

        # When
        data_container = DataContainer(
            data_inputs=np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
            expected_outputs=np.array([10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]))
        auto_ml.handle_fit(data_container, context)

        # Then
        file_paths = [
            os.path.join(hp_repository.cache_folder, f"trial_{i}.log")
            for i in range(n_trials)
        ]
        assert len(file_paths) == n_trials

        for f in file_paths:
            assert os.path.exists(f)

        for f in file_paths:
            with open(f, 'r') as f:
                log = f.readlines()
                assert len(log) == 36
Пример #5
0
def test_automl_savebestmodel_callback(tmpdir):
    # Given
    hp_repository = HyperparamsJSONRepository(cache_folder=str('caching'))
    validation_splitter = ValidationSplitter(0.20)
    auto_ml = AutoML(
        pipeline=Pipeline([
            MultiplyByN(2).set_hyperparams_space(HyperparameterSpace({
                'multiply_by': FixedHyperparameter(2)
            })),
            NumpyReshape(new_shape=(-1, 1)),
            linear_model.LinearRegression()
        ]),
        validation_splitter=validation_splitter,
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        scoring_callback=ScoringCallback(mean_squared_error, higher_score_is_better=False),
        callbacks=[
            BestModelCheckpoint()
        ],
        n_trials=1,
        epochs=10,
        refit_trial=False,
        print_func=print,
        hyperparams_repository=hp_repository,
        continue_loop_on_error=False
    )

    data_inputs = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    expected_outputs = data_inputs * 4

    # When
    auto_ml.fit(data_inputs=data_inputs, expected_outputs=expected_outputs)


    #Then
    trials: Trials = hp_repository.load_all_trials()
    best_trial = trials.get_best_trial()
    best_trial_score = best_trial.get_validation_score()
    best_trial.cache_folder = hp_repository.cache_folder
    best_model = best_trial.get_model('best')
    _, _, valid_inputs, valid_outputs = ValidationSplitter(0.20).split(data_inputs, expected_outputs)
    predicted_output = best_model.predict(valid_inputs)
    score = mean_squared_error(valid_outputs, predicted_output)

    assert best_trial_score == score
Пример #6
0
def test_automl_with_kfold(tmpdir):
    # Given
    hp_repository = HyperparamsJSONRepository(cache_folder=str('caching'))
    auto_ml = AutoML(
        pipeline=Pipeline([
            MultiplyByN(2).set_hyperparams_space(
                HyperparameterSpace({'multiply_by': FixedHyperparameter(2)})),
            NumpyReshape(new_shape=(-1, 1)),
            linear_model.LinearRegression()
        ]),
        validation_splitter=ValidationSplitter(0.20),
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        scoring_callback=ScoringCallback(mean_squared_error,
                                         higher_score_is_better=False),
        callbacks=[
            MetricCallback('mse',
                           metric_function=mean_squared_error,
                           higher_score_is_better=False),
        ],
        n_trials=1,
        epochs=10,
        refit_trial=True,
        print_func=print,
        hyperparams_repository=hp_repository,
        continue_loop_on_error=False)

    data_inputs = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    expected_outputs = data_inputs * 4

    # When
    auto_ml.fit(data_inputs=data_inputs, expected_outputs=expected_outputs)

    # Then
    p = auto_ml.get_best_model()
    outputs = p.transform(data_inputs)
    mse = mean_squared_error(expected_outputs, outputs)

    assert mse < 1000
Пример #7
0
def test_automl_early_stopping_callback(tmpdir):
    # Given
    hp_repository = InMemoryHyperparamsRepository(cache_folder=str(tmpdir))
    n_epochs = 10
    max_epochs_without_improvement = 3
    auto_ml = AutoML(
        pipeline=Pipeline([
            MultiplyByN(2).set_hyperparams_space(
                HyperparameterSpace({'multiply_by': FixedHyperparameter(2)})),
            NumpyReshape(new_shape=(-1, 1)),
        ]),
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        validation_splitter=ValidationSplitter(0.20),
        scoring_callback=ScoringCallback(mean_squared_error,
                                         higher_score_is_better=False),
        callbacks=[
            MetricCallback('mse',
                           metric_function=mean_squared_error,
                           higher_score_is_better=False),
            EarlyStoppingCallback(max_epochs_without_improvement)
        ],
        n_trials=1,
        refit_trial=True,
        epochs=n_epochs,
        hyperparams_repository=hp_repository,
        continue_loop_on_error=False)

    # When
    data_inputs = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    expected_outputs = data_inputs * 2
    auto_ml.fit(data_inputs=data_inputs, expected_outputs=expected_outputs)

    # Then
    trial = hp_repository.trials[0]
    assert len(trial.validation_splits) == 1
    validation_scores = trial.validation_splits[0].get_validation_scores()
    nepochs_executed = len(validation_scores)
    assert nepochs_executed == max_epochs_without_improvement + 1
Пример #8
0
def main():
    def accuracy(data_inputs, expected_outputs):
        return np.mean(
            np.argmax(np.array(data_inputs), axis=1) == np.argmax(
                np.array(expected_outputs), axis=1))

    # load the dataset
    df = read_csv('data/winequality-white.csv', sep=';')
    data_inputs = df.values
    data_inputs[:, -1] = data_inputs[:, -1] - 1
    n_features = data_inputs.shape[1] - 1
    n_classes = 10

    p = Pipeline([
        TrainOnlyWrapper(DataShuffler()),
        ColumnTransformerInputOutput(
            input_columns=[(
                [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], ToNumpy(np.float32)
            )],
            output_columns=[(11, Identity())]
        ),
        OutputTransformerWrapper(PlotDistribution(column=-1)),
        MiniBatchSequentialPipeline([
            Tensorflow2ModelStep(
                create_model=create_model,
                create_loss=create_loss,
                create_optimizer=create_optimizer
            ) \
                .set_hyperparams(HyperparameterSamples({
                'n_dense_layers': 2,
                'input_dim': n_features,
                'optimizer': 'adam',
                'activation': 'relu',
                'kernel_initializer': 'he_uniform',
                'learning_rate': 0.01,
                'hidden_dim': 20,
                'n_classes': 3
            })).set_hyperparams_space(HyperparameterSpace({
                'n_dense_layers': RandInt(2, 4),
                'hidden_dim_layer_multiplier': Uniform(0.30, 1),
                'input_dim': FixedHyperparameter(n_features),
                'optimizer': Choice([
                    OPTIMIZERS.ADAM.value,
                    OPTIMIZERS.SGD.value,
                    OPTIMIZERS.ADAGRAD.value
                ]),
                'activation': Choice([
                    ACTIVATIONS.RELU.value,
                    ACTIVATIONS.TANH.value,
                    ACTIVATIONS.SIGMOID.value,
                    ACTIVATIONS.ELU.value,
                ]),
                'kernel_initializer': Choice([
                    KERNEL_INITIALIZERS.GLOROT_NORMAL.value,
                    KERNEL_INITIALIZERS.GLOROT_UNIFORM.value,
                    KERNEL_INITIALIZERS.HE_UNIFORM.value
                ]),
                'learning_rate': LogUniform(0.005, 0.01),
                'hidden_dim': RandInt(3, 80),
                'n_classes': FixedHyperparameter(n_classes)
            }))
        ], batch_size=33),
        OutputTransformerWrapper(Pipeline([
            ExpandDim(),
            OneHotEncoder(nb_columns=n_classes, name='classes')
        ]))
    ])

    auto_ml = AutoML(
        pipeline=p,
        hyperparams_repository=InMemoryHyperparamsRepository(
            cache_folder='trials'),
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        validation_splitter=ValidationSplitter(test_size=0.30),
        scoring_callback=ScoringCallback(accuracy,
                                         higher_score_is_better=True),
        callbacks=[
            MetricCallback(
                name='classification_report_imbalanced_metric',
                metric_function=classificaiton_report_imbalanced_metric,
                higher_score_is_better=True),
            MetricCallback(name='f1',
                           metric_function=f1_score_weighted,
                           higher_score_is_better=True),
            MetricCallback(name='recall',
                           metric_function=recall_score_weighted,
                           higher_score_is_better=True),
            MetricCallback(name='precision',
                           metric_function=precision_score_weighted,
                           higher_score_is_better=True),
            EarlyStoppingCallback(max_epochs_without_improvement=3)
        ],
        n_trials=200,
        refit_trial=True,
        epochs=75)

    auto_ml = auto_ml.fit(data_inputs=data_inputs)