Example #1
def main():
    p = Pipeline([('step1', MultiplyByN()), ('step2', MultiplyByN()),
                  Pipeline([Identity(),
                            Identity(),
                            SKLearnWrapper(PCA(n_components=4))])])

    p.set_hyperparams_space(
        HyperparameterSpace({
            'step1__multiply_by': RandInt(42, 50),
            'step2__multiply_by': RandInt(-10, 0),
            'Pipeline__SKLearnWrapper_PCA__n_components': RandInt(2, 3)
        }))

    samples = p.get_hyperparams_space().rvs()
    p.set_hyperparams(samples)

    samples = p.get_hyperparams()
    assert 42 <= samples['step1__multiply_by'] <= 50
    assert -10 <= samples['step2__multiply_by'] <= 0
    assert samples['Pipeline__SKLearnWrapper_PCA__n_components'] in [2, 3]
    assert p['Pipeline']['SKLearnWrapper_PCA'].get_wrapped_sklearn_predictor(
    ).n_components in [2, 3]
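The flat double-underscore keys above address nested steps by name. A minimal
sketch of the equivalence, assuming HyperparameterSamples exposes the same
to_nested_dict() used in Example #10:

samples = HyperparameterSamples({'step1__multiply_by': 2})
# Each '__' introduces one level of nesting, keyed by step name:
assert samples.to_nested_dict() == {'step1': {'multiply_by': 2}}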
Example #2
    def __init__(self,
                 wrapped: BaseTransformer,
                 enabled: bool = True,
                 nullified_return_value=None,
                 cache_folder_when_no_handle=None,
                 use_hyperparameter_space=True,
                 nullify_hyperparams=True):
        hyperparameter_space = HyperparameterSpace({
            OPTIONAL_ENABLED_HYPERPARAM: Boolean()
        }) if use_hyperparameter_space else {}

        MetaStep.__init__(self,
                          hyperparams=HyperparameterSamples(
                              {OPTIONAL_ENABLED_HYPERPARAM: enabled}),
                          hyperparams_space=hyperparameter_space,
                          wrapped=wrapped)
        ForceHandleOnlyMixin.__init__(self, cache_folder_when_no_handle)

        if nullified_return_value is None:
            nullified_return_value = []
        self.nullified_return_value = nullified_return_value
        self.nullify_hyperparams = nullify_hyperparams
Example #3
def test_automl_with_kfold(tmpdir):
    # Given
    hp_repository = HyperparamsJSONRepository(cache_folder=str(tmpdir))
    auto_ml = AutoML(
        pipeline=Pipeline([
            MultiplyByN(2).set_hyperparams_space(
                HyperparameterSpace({'multiply_by': FixedHyperparameter(2)})),
            NumpyReshape(new_shape=(-1, 1)),
            linear_model.LinearRegression()
        ]),
        validation_splitter=ValidationSplitter(0.20),
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        scoring_callback=ScoringCallback(mean_squared_error,
                                         higher_score_is_better=False),
        callbacks=[
            MetricCallback('mse',
                           metric_function=mean_squared_error,
                           higher_score_is_better=False),
        ],
        n_trials=1,
        epochs=10,
        refit_trial=True,
        print_func=print,
        hyperparams_repository=hp_repository,
        continue_loop_on_error=False)

    data_inputs = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    expected_outputs = data_inputs * 4

    # When
    auto_ml.fit(data_inputs=data_inputs, expected_outputs=expected_outputs)

    # Then
    p = auto_ml.get_best_model()
    outputs = p.transform(data_inputs)
    mse = mean_squared_error(expected_outputs, outputs)

    assert mse < 1000
Example #4
def test_automl_early_stopping_callback(tmpdir):
    # Given
    hp_repository = InMemoryHyperparamsRepository(cache_folder=str(tmpdir))
    n_epochs = 10
    max_epochs_without_improvement = 3
    auto_ml = AutoML(
        pipeline=Pipeline([
            MultiplyByN(2).set_hyperparams_space(
                HyperparameterSpace({'multiply_by': FixedHyperparameter(2)})),
            NumpyReshape(new_shape=(-1, 1)),
        ]),
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        validation_splitter=ValidationSplitter(0.20),
        scoring_callback=ScoringCallback(mean_squared_error,
                                         higher_score_is_better=False),
        callbacks=[
            MetricCallback('mse',
                           metric_function=mean_squared_error,
                           higher_score_is_better=False),
            EarlyStoppingCallback(max_epochs_without_improvement)
        ],
        n_trials=1,
        refit_trial=True,
        epochs=n_epochs,
        hyperparams_repository=hp_repository,
        continue_loop_on_error=False)

    # When
    data_inputs = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    expected_outputs = data_inputs * 2
    auto_ml.fit(data_inputs=data_inputs, expected_outputs=expected_outputs)

    # Then
    trial = hp_repository.trials[0]
    assert len(trial.validation_splits) == 1
    validation_scores = trial.validation_splits[0].get_validation_scores()
    nepochs_executed = len(validation_scores)
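    # With 'multiply_by' fixed, the validation score cannot improve after the
    # first epoch, so early stopping runs 1 + max_epochs_without_improvement epochs.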
    assert nepochs_executed == max_epochs_without_improvement + 1
Example #5
def test_automl_sequential_wrapper(tmpdir):
    # Setting seed for reproducibility
    np.random.seed(68)
    # Given
    data_inputs = np.array(range(100))
    expected_outputs = np.array(range(100, 200))

    hyperparameter_space = HyperparameterSpace({
        'multiplication_1__multiply_by': RandInt(1, 3),
        'multiplication_2__multiply_by': RandInt(1, 3),
        'multiplication_3__multiply_by': RandInt(1, 3),
    })

    pipeline = Pipeline(
        [('multiplication_1', MultiplyByN()),
         ('multiplication_2', MultiplyByN()),
         ('multiplication_3', MultiplyByN())],
        cache_folder=tmpdir).set_hyperparams_space(hyperparameter_space)

    auto_ml = RandomSearch(
        KFoldCrossValidationWrapper().set_step(pipeline),
        hyperparams_repository=HyperparamsJSONRepository(tmpdir),
        n_iter=10)

    # When
    auto_ml: AutoMLSequentialWrapper = auto_ml.fit(data_inputs,
                                                   expected_outputs)
    best_model: Pipeline = auto_ml.get_best_model()
    predicted_outputs = best_model.transform(data_inputs)

    # Then
    actual_mse = ((predicted_outputs - expected_outputs)**2).mean()
    assert actual_mse < 20000
Example #6
def test_hyperparam_space():
    p = Pipeline([
        AddFeatures([
            SomeStep(hyperparams_space=HyperparameterSpace(
                {"n_components": RandInt(1, 5)})),
            SomeStep(hyperparams_space=HyperparameterSpace(
                {"n_components": RandInt(1, 5)}))
        ]),
        ModelStacking([
            SomeStep(hyperparams_space=HyperparameterSpace(
                {"n_estimators": RandInt(1, 1000)})),
            SomeStep(hyperparams_space=HyperparameterSpace(
                {"n_estimators": RandInt(1, 1000)})),
            SomeStep(hyperparams_space=HyperparameterSpace(
                {"max_depth": RandInt(1, 100)})),
            SomeStep(hyperparams_space=HyperparameterSpace(
                {"max_depth": RandInt(1, 100)}))
        ],
                      joiner=NumpyTranspose(),
                      judge=SomeStep(hyperparams_space=HyperparameterSpace(
                          {"alpha": LogUniform(0.1, 10.0)})))
    ])

    rvsed = p.get_hyperparams_space().rvs()
    p.set_hyperparams(rvsed)

    hyperparams = p.get_hyperparams()

    assert "AddFeatures" in hyperparams.keys()
    assert "SomeStep" in hyperparams["AddFeatures"]
    assert "n_components" in hyperparams["AddFeatures"]["SomeStep"]
    assert "SomeStep1" in hyperparams["AddFeatures"]
    assert "n_components" in hyperparams["AddFeatures"]["SomeStep1"]
    assert "SomeStep" in hyperparams["ModelStacking"]
    assert "n_estimators" in hyperparams["ModelStacking"]["SomeStep"]
    assert "SomeStep1" in hyperparams["ModelStacking"]
    assert "max_depth" in hyperparams["ModelStacking"]["SomeStep2"]
Example #7
def main(tmpdir, sleep_time: float = 0, n_iter: int = 10):
    DATA_INPUTS = np.array(range(100))
    EXPECTED_OUTPUTS = np.array(range(100, 200))

    HYPERPARAMETER_SPACE = HyperparameterSpace({
        'multiplication_1__multiply_by': RandInt(1, 2),
        'multiplication_2__multiply_by': RandInt(1, 2),
        'multiplication_3__multiply_by': RandInt(1, 2),
    })

    print('Classic Pipeline:')

    pipeline = Pipeline([
        ('multiplication_1', MultiplyByN()),
        ('sleep_1', ForEachDataInput(Sleep(sleep_time))),
        ('multiplication_2', MultiplyByN()),
        ('sleep_2', ForEachDataInput(Sleep(sleep_time))),
        ('multiplication_3', MultiplyByN()),
    ]).set_hyperparams_space(HYPERPARAMETER_SPACE)

    time_a = time.time()
    best_model = RandomSearch(pipeline,
                              n_iter=n_iter,
                              higher_score_is_better=True).fit(
                                  DATA_INPUTS, EXPECTED_OUTPUTS)
    outputs = best_model.transform(DATA_INPUTS)
    time_b = time.time()

    actual_score = mean_squared_error(EXPECTED_OUTPUTS, outputs)
    print('{0} seconds'.format(time_b - time_a))
    print('output: {0}'.format(outputs))
    print('smallest mse: {0}'.format(actual_score))
    print('best hyperparams: {0}'.format(pipeline.get_hyperparams()))

    assert isinstance(actual_score, float)

    print('Resumable Pipeline:')

    pipeline = ResumablePipeline(
        [('multiplication_1', MultiplyByN()),
         ('ForEach(sleep_1)', ForEachDataInput(Sleep(sleep_time))),
         ('checkpoint1', ExpandDim(DefaultCheckpoint())),
         ('multiplication_2', MultiplyByN()),
         ('sleep_2', ForEachDataInput(Sleep(sleep_time))),
         ('checkpoint2', ExpandDim(DefaultCheckpoint())),
         ('multiplication_3', MultiplyByN())],
        cache_folder=tmpdir).set_hyperparams_space(HYPERPARAMETER_SPACE)

    time_a = time.time()
    best_model = RandomSearch(pipeline,
                              n_iter=n_iter,
                              higher_score_is_better=True).fit(
                                  DATA_INPUTS, EXPECTED_OUTPUTS)
    outputs = best_model.transform(DATA_INPUTS)
    time_b = time.time()
    pipeline.flush_all_cache()

    actual_score = mean_squared_error(EXPECTED_OUTPUTS, outputs)
    print('{0} seconds'.format(time_b - time_a))
    print('output: {0}'.format(outputs))
    print('smallest mse: {0}'.format(actual_score))
    print('best hyperparams: {0}'.format(pipeline.get_hyperparams()))

    assert isinstance(actual_score, float)
Example #8
def main(tmpdir, sleep_time: float = 0.001, n_iter: int = 10):
    DATA_INPUTS = np.array(range(100))
    EXPECTED_OUTPUTS = np.array(range(100, 200))

    HYPERPARAMETER_SPACE = HyperparameterSpace({
        'multiplication_1__multiply_by': RandInt(1, 2),
        'multiplication_2__multiply_by': RandInt(1, 2),
        'multiplication_3__multiply_by': RandInt(1, 2),
    })

    print('Classic Pipeline:')
    classic_pipeline_folder = os.path.join(str(tmpdir), 'classic')

    pipeline = Pipeline([
        ('multiplication_1', MultiplyByN()),
        ('sleep_1', ForEachDataInput(Sleep(sleep_time))),
        ('multiplication_2', MultiplyByN()),
        ('sleep_2', ForEachDataInput(Sleep(sleep_time))),
        ('multiplication_3', MultiplyByN()),
    ], cache_folder=classic_pipeline_folder).set_hyperparams_space(HYPERPARAMETER_SPACE)

    time_a = time.time()
    auto_ml = AutoML(
        pipeline,
        refit_trial=True,
        n_trials=n_iter,
        cache_folder_when_no_handle=classic_pipeline_folder,
        validation_splitter=ValidationSplitter(0.2),
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        scoring_callback=ScoringCallback(mean_squared_error, higher_score_is_better=False),
        callbacks=[
            MetricCallback('mse', metric_function=mean_squared_error, higher_score_is_better=False)
        ],
    )
    auto_ml = auto_ml.fit(DATA_INPUTS, EXPECTED_OUTPUTS)
    outputs = auto_ml.get_best_model().predict(DATA_INPUTS)
    time_b = time.time()

    actual_score = mean_squared_error(EXPECTED_OUTPUTS, outputs)
    print('{0} seconds'.format(time_b - time_a))
    print('output: {0}'.format(outputs))
    print('smallest mse: {0}'.format(actual_score))
    print('best hyperparams: {0}'.format(pipeline.get_hyperparams()))

    assert isinstance(actual_score, float)

    print('Resumable Pipeline:')
    resumable_pipeline_folder = os.path.join(str(tmpdir), 'resumable')

    pipeline = ResumablePipeline([
        ('multiplication_1', MultiplyByN()),
        ('ForEach(sleep_1)', ForEachDataInput(Sleep(sleep_time))),
        ('checkpoint1', ExpandDim(DefaultCheckpoint())),
        ('multiplication_2', MultiplyByN()),
        ('sleep_2', ForEachDataInput(Sleep(sleep_time))),
        ('checkpoint2', ExpandDim(DefaultCheckpoint())),
        ('multiplication_3', MultiplyByN())
    ], cache_folder=resumable_pipeline_folder).set_hyperparams_space(HYPERPARAMETER_SPACE)

    time_a = time.time()
    auto_ml = AutoML(
        pipeline,
        refit_trial=True,
        n_trials=n_iter,
        cache_folder_when_no_handle=resumable_pipeline_folder,
        validation_splitter=ValidationSplitter(0.2),
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        scoring_callback=ScoringCallback(mean_squared_error, higher_score_is_better=False),
        callbacks=[
            MetricCallback('mse', metric_function=mean_squared_error, higher_score_is_better=False)
        ]
    )
    auto_ml = auto_ml.fit(DATA_INPUTS, EXPECTED_OUTPUTS)
    outputs = auto_ml.get_best_model().predict(DATA_INPUTS)
    time_b = time.time()
    pipeline.flush_all_cache()

    actual_score = mean_squared_error(EXPECTED_OUTPUTS, outputs)
    print('{0} seconds'.format(time_b - time_a))
    print('output: {0}'.format(outputs))
    print('smallest mse: {0}'.format(actual_score))
    print('best hyperparams: {0}'.format(pipeline.get_hyperparams()))

    assert isinstance(actual_score, float)
Example #9
def plot_distribution_space(hyperparameter_space: HyperparameterSpace,
                            num_bins=50):
    for title, distribution in hyperparameter_space.items():
        print(title + ":")
        plot_histogram(title, distribution, num_bins=num_bins)
        plot_pdf_cdf(title, distribution)
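A usage sketch, assuming the plot_histogram and plot_pdf_cdf helpers above are
defined elsewhere in the example:

plot_distribution_space(HyperparameterSpace({
    'learning_rate': LogUniform(0.001, 0.1),
    'hidden_size': RandInt(16, 512),
}))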
Example #10
    assert from_flat_dic.to_nested_dict() == expected_dic


HYPE_SPACE = HyperparameterSpace({
    "a__test": Boolean(),
    "a__lr": Choice([0, 1, False, "Test"]),
    "a__b__c": PriorityChoice([0, 1, False, "Test"]),
    "a__b__q": Quantized(Uniform(-10, 10)),
    "d__param": RandInt(-10, 10),
    "d__u": Uniform(-10, 10),
    "e__other": LogUniform(0.001, 10),
    "e__alpha": Normal(0.0, 1.0),
    "e__f__g": LogNormal(0.0, 2.0),
    "p__other_nondistribution_params": "hey",
    "p__could_also_be_as_fixed": FixedHyperparameter("also hey"),
    "p__its_over_9k": 9001
})


def test_hyperparams_space_rvs_outputs_samples():
    # Minimal sketch of the intended body: rvs() draws one concrete sample per
    # key, yielding HyperparameterSamples rather than distributions.
    samples = HYPE_SPACE.rvs()

    assert isinstance(samples, HyperparameterSamples)
    assert len(samples) == len(HYPE_SPACE)
Example #11
def main():
    def accuracy(data_inputs, expected_outputs):
        return np.mean(
            np.argmax(np.array(data_inputs), axis=1) == np.argmax(
                np.array(expected_outputs), axis=1))

    # load the dataset
    df = read_csv('data/winequality-white.csv', sep=';')
    data_inputs = df.values
    data_inputs[:, -1] = data_inputs[:, -1] - 1
    n_features = data_inputs.shape[1] - 1
    n_classes = 10

    p = Pipeline([
        TrainOnlyWrapper(DataShuffler()),
        ColumnTransformerInputOutput(
            input_columns=[(
                [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], ToNumpy(np.float32)
            )],
            output_columns=[(11, Identity())]
        ),
        OutputTransformerWrapper(PlotDistribution(column=-1)),
        MiniBatchSequentialPipeline([
            Tensorflow2ModelStep(
                create_model=create_model,
                create_loss=create_loss,
                create_optimizer=create_optimizer
            ).set_hyperparams(HyperparameterSamples({
                'n_dense_layers': 2,
                'input_dim': n_features,
                'optimizer': 'adam',
                'activation': 'relu',
                'kernel_initializer': 'he_uniform',
                'learning_rate': 0.01,
                'hidden_dim': 20,
                'n_classes': n_classes
            })).set_hyperparams_space(HyperparameterSpace({
                'n_dense_layers': RandInt(2, 4),
                'hidden_dim_layer_multiplier': Uniform(0.30, 1),
                'input_dim': FixedHyperparameter(n_features),
                'optimizer': Choice([
                    OPTIMIZERS.ADAM.value,
                    OPTIMIZERS.SGD.value,
                    OPTIMIZERS.ADAGRAD.value
                ]),
                'activation': Choice([
                    ACTIVATIONS.RELU.value,
                    ACTIVATIONS.TANH.value,
                    ACTIVATIONS.SIGMOID.value,
                    ACTIVATIONS.ELU.value,
                ]),
                'kernel_initializer': Choice([
                    KERNEL_INITIALIZERS.GLOROT_NORMAL.value,
                    KERNEL_INITIALIZERS.GLOROT_UNIFORM.value,
                    KERNEL_INITIALIZERS.HE_UNIFORM.value
                ]),
                'learning_rate': LogUniform(0.005, 0.01),
                'hidden_dim': RandInt(3, 80),
                'n_classes': FixedHyperparameter(n_classes)
            }))
        ], batch_size=33),
        OutputTransformerWrapper(Pipeline([
            ExpandDim(),
            OneHotEncoder(nb_columns=n_classes, name='classes')
        ]))
    ])

    auto_ml = AutoML(
        pipeline=p,
        hyperparams_repository=InMemoryHyperparamsRepository(
            cache_folder='trials'),
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        validation_splitter=ValidationSplitter(test_size=0.30),
        scoring_callback=ScoringCallback(accuracy,
                                         higher_score_is_better=True),
        callbacks=[
            MetricCallback(
                name='classification_report_imbalanced_metric',
                metric_function=classificaiton_report_imbalanced_metric,
                higher_score_is_better=True),
            MetricCallback(name='f1',
                           metric_function=f1_score_weighted,
                           higher_score_is_better=True),
            MetricCallback(name='recall',
                           metric_function=recall_score_weighted,
                           higher_score_is_better=True),
            MetricCallback(name='precision',
                           metric_function=precision_score_weighted,
                           higher_score_is_better=True),
            EarlyStoppingCallback(max_epochs_without_improvement=3)
        ],
        n_trials=200,
        refit_trial=True,
        epochs=75)

    auto_ml = auto_ml.fit(data_inputs=data_inputs)
Example #12
def main():
    boston = load_boston()
    X, y = shuffle(boston.data, boston.target, random_state=13)
    X = X.astype(np.float32)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        shuffle=False)

    # Note that the hyperparameter spaces are defined here during the pipeline definition, but they could already be
    # set within the classes at their definition if using custom classes, or defined after declaring the pipeline
    # using a flat dict or a nested dict (see the short sketch after this example).

    p = Pipeline([
        AddFeatures([
            SKLearnWrapper(
                PCA(n_components=2),
                HyperparameterSpace({"n_components": RandInt(1, 3)})),
            SKLearnWrapper(
                FastICA(n_components=2),
                HyperparameterSpace({"n_components": RandInt(1, 3)})),
        ]),
        ModelStacking(
            [
                SKLearnWrapper(
                    GradientBoostingRegressor(),
                    HyperparameterSpace({
                        "n_estimators": RandInt(50, 600),
                        "max_depth": RandInt(1, 10),
                        "learning_rate": LogUniform(0.07, 0.7)
                    })),
                SKLearnWrapper(
                    KMeans(),
                    HyperparameterSpace({"n_clusters": RandInt(5, 10)})),
            ],
            joiner=NumpyTranspose(),
            judge=SKLearnWrapper(
                Ridge(),
                HyperparameterSpace({
                    "alpha": LogUniform(0.7, 1.4),
                    "fit_intercept": Boolean()
                })),
        )
    ])
    print("Meta-fitting on train:")
    p = p.meta_fit(X_train,
                   y_train,
                   metastep=RandomSearch(
                       n_iter=10,
                       higher_score_is_better=True,
                       validation_technique=KFoldCrossValidationWrapper(
                           scoring_function=r2_score, k_fold=10)))
    # Here is an alternative way to do it, more "pipeliney":
    # p = RandomSearch(
    #     p,
    #     n_iter=15,
    #     higher_score_is_better=True,
    #     validation_technique=KFoldCrossValidation(scoring_function=r2_score, k_fold=3)
    # ).fit(X_train, y_train)

    print("")

    print("Transforming train and test:")
    y_train_predicted = p.predict(X_train)
    y_test_predicted = p.predict(X_test)

    print("")

    print("Evaluating transformed train:")
    score_transform = r2_score(y_train, y_train_predicted)
    print('R2 regression score:', score_transform)

    print("")

    print("Evaluating transformed test:")
    score_test = r2_score(y_test, y_test_predicted)
    print('R2 regression score:', score_test)
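As the comment in this example notes, the space can also be attached after the
pipeline is declared. A minimal sketch using a flat dict, assuming the
auto-generated step names (SKLearnWrapper_PCA, SKLearnWrapper_Ridge) follow the
naming seen in Example #1:

p.set_hyperparams_space(HyperparameterSpace({
    'AddFeatures__SKLearnWrapper_PCA__n_components': RandInt(1, 3),
    'ModelStacking__SKLearnWrapper_Ridge__alpha': LogUniform(0.7, 1.4),
}))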
Example #13
from neuraxle.base import MetaStepMixin, BaseStep, NonFittableMixin, NonTransformableMixin
from neuraxle.hyperparams.distributions import RandInt, Boolean
from neuraxle.hyperparams.space import HyperparameterSpace, HyperparameterSamples
from neuraxle.steps.loop import StepClonerForEachDataInput
from testing.test_pipeline import SomeStep

SOME_STEP_HP_KEY = 'somestep_hyperparam'
RAND_INT_SOME_STEP = RandInt(-10, 0)
RAND_INT_STEP_CLONER = RandInt(0, 10)

META_STEP_HP = 'metastep_hyperparam'
SOME_STEP_HP = "SomeStep__somestep_hyperparam"
META_STEP_HP_VALUE = 1
SOME_STEP_HP_VALUE = 2

HYPE_SPACE = HyperparameterSpace({"a__test": Boolean()})

HYPE_SAMPLE = HyperparameterSamples({"a__test": True})


class SomeMetaStepMixin(NonTransformableMixin, NonFittableMixin, MetaStepMixin,
                        BaseStep):
    pass


class SomeStepInverseTransform(SomeStep):
    def fit_transform(self, data_inputs, expected_outputs=None):
        return self, 'fit_transform'

    def inverse_transform(self, processed_outputs):
        return 'inverse_transform'
Example #14
def test_flat_to_dict_hyperparams_with_hyperparameter_space(
        flat: dict, expected_dic: dict):
    dic = HyperparameterSpace(flat).to_nested_dict_as_dict_primitive()

    assert dict(dic) == dict(expected_dic)
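A (flat, expected_dic) pair this parametrized test might receive, sketched from
the double-underscore convention used throughout these examples:

flat = {'b__a__learning_rate': 7, 'b__learning_rate': 9}
expected_dic = {'b': {'a': {'learning_rate': 7}, 'learning_rate': 9}}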
Example #15
def get_params_space(self, deep=True):
    neuraxle_params = HyperparameterSpace(
        self.p.get_hyperparams_space()).to_flat_as_dict_primitive()
    return neuraxle_params
Example #16
boston = load_boston()
X, y = shuffle(boston.data, boston.target, random_state=13)
X = X.astype(np.float32)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    shuffle=False)

# Note that the hyperparameter spaces are defined here during the pipeline definition, but they could already be set
# within the classes at their definition if using custom classes, or defined after declaring the pipeline using a
# flat dict or a nested dict (see the sketch after Example #12).
p = Pipeline([
    AddFeatures([
        SKLearnWrapper(PCA(n_components=2),
                       HyperparameterSpace({"n_components": RandInt(1, 3)})),
        SKLearnWrapper(FastICA(n_components=2),
                       HyperparameterSpace({"n_components": RandInt(1, 3)})),
    ]),
    ModelStacking(
        [
            SKLearnWrapper(
                GradientBoostingRegressor(),
                HyperparameterSpace({
                    "n_estimators": RandInt(50, 600),
                    "max_depth": RandInt(1, 10),
                    "learning_rate": LogUniform(0.07, 0.7)
                })),
            SKLearnWrapper(
                GradientBoostingRegressor(),
                HyperparameterSpace({
Example #17
def test_deep_learning_pipeline():
    # Given
    boston = load_boston()
    data_inputs, expected_outputs = shuffle(boston.data,
                                            boston.target,
                                            random_state=13)
    expected_outputs = expected_outputs.astype(np.float32)
    data_inputs = data_inputs.astype(np.float32)

    pipeline = Pipeline([
        AddFeatures([
            SKLearnWrapper(
                PCA(n_components=2),
                HyperparameterSpace({"n_components": RandInt(1, 3)})),
            SKLearnWrapper(
                FastICA(n_components=2),
                HyperparameterSpace({"n_components": RandInt(1, 3)})),
        ]),
        ModelStacking(
            [
                SKLearnWrapper(
                    GradientBoostingRegressor(),
                    HyperparameterSpace({
                        "n_estimators": RandInt(50, 600),
                        "max_depth": RandInt(1, 10),
                        "learning_rate": LogUniform(0.07, 0.7)
                    })),
                SKLearnWrapper(
                    KMeans(n_clusters=7),
                    HyperparameterSpace({"n_clusters": RandInt(5, 10)})),
            ],
            joiner=NumpyTranspose(),
            judge=SKLearnWrapper(
                Ridge(),
                HyperparameterSpace({
                    "alpha": LogUniform(0.7, 1.4),
                    "fit_intercept": Boolean()
                })),
        )
    ])

    p = DeepLearningPipeline(
        pipeline,
        validation_size=VALIDATION_SIZE,
        batch_size=BATCH_SIZE,
        batch_metrics={'mse': to_numpy_metric_wrapper(mean_squared_error)},
        shuffle_in_each_epoch_at_train=True,
        n_epochs=N_EPOCHS,
        epochs_metrics={'mse': to_numpy_metric_wrapper(mean_squared_error)},
        scoring_function=to_numpy_metric_wrapper(mean_squared_error),
    )

    # When
    p, outputs = p.fit_transform(data_inputs, expected_outputs)

    # Then
    batch_mse_train = p.get_batch_metric_train('mse')
    epoch_mse_train = p.get_epoch_metric_train('mse')

    batch_mse_validation = p.get_batch_metric_validation('mse')
    epoch_mse_validation = p.get_epoch_metric_validation('mse')

    assert len(epoch_mse_train) == N_EPOCHS
    assert len(epoch_mse_validation) == N_EPOCHS

    expected_len_batch_mse_train = math.ceil(
        (len(data_inputs) / BATCH_SIZE) * (1 - VALIDATION_SIZE)) * N_EPOCHS
    expected_len_batch_mse_validation = math.ceil(
        (len(data_inputs) / BATCH_SIZE) * VALIDATION_SIZE) * N_EPOCHS

    assert len(batch_mse_train) == expected_len_batch_mse_train
    assert len(batch_mse_validation) == expected_len_batch_mse_validation

    last_batch_mse_validation = batch_mse_validation[-1]
    last_batch_mse_train = batch_mse_train[-1]

    last_epoch_mse_train = epoch_mse_train[-1]
    last_epoch_mse_validation = epoch_mse_validation[-1]

    assert last_batch_mse_train < last_batch_mse_validation
    assert last_epoch_mse_train < last_epoch_mse_validation
    assert last_batch_mse_train < 1
    assert last_epoch_mse_train < 1
Example #18
class TensorflowV1ModelStep(BaseTensorflowModelStep):
    """
    Base class for tensorflow 1 steps.
    It uses :class:`TensorflowV1StepSaver` for saving the model.

    .. seealso::
        `Using the saved model format <https://www.tensorflow.org/guide/checkpoint>`_,
        :class:`~neuraxle.base.BaseStep`
    """
    HYPERPARAMS = HyperparameterSamples({})
    HYPERPARAMS_SPACE = HyperparameterSpace({})

    def __init__(self,
                 create_graph,
                 create_loss,
                 create_optimizer,
                 create_feed_dict=None,
                 data_inputs_dtype=None,
                 expected_outputs_dtype=None,
                 variable_scope=None,
                 has_expected_outputs=True,
                 print_loss=False,
                 print_func=None):
        BaseTensorflowModelStep.__init__(
            self,
            create_model=create_graph,
            create_loss=create_loss,
            create_optimizer=create_optimizer,
            create_inputs=create_feed_dict,
            data_inputs_dtype=data_inputs_dtype,
            expected_outputs_dtype=expected_outputs_dtype,
            step_saver=TensorflowV1StepSaver(),
            print_loss=print_loss,
            print_func=print_func)

        if variable_scope is None:
            variable_scope = self.name
        self.variable_scope = variable_scope
        self.has_expected_outputs = has_expected_outputs
        self.create_feed_dict = create_feed_dict

    def setup(self) -> BaseStep:
        """
        Setup tensorflow 1 graph, and session using a variable scope.

        :return: self
        :rtype: BaseStep
        """
        if self.is_initialized:
            return self

        self.graph = tf.Graph()
        with self.graph.as_default():
            with tf.variable_scope(self.variable_scope, reuse=tf.AUTO_REUSE):
                self.session = tf.Session(
                    config=tf.ConfigProto(log_device_placement=True),
                    graph=self.graph)

                model = self.create_model(self)
                if not isinstance(model, tuple):
                    tf.identity(model, name='output')
                else:
                    tf.identity(model[0], name='output')
                    tf.identity(model[1], name='inference_output')

                tf.identity(self.create_loss(self), name='loss')
                self.create_optimizer(self).minimize(self['loss'],
                                                     name='optimizer')

                init = tf.global_variables_initializer()
                self.session.run(init)
                self.is_initialized = True

        return self

    def teardown(self) -> BaseStep:
        """
        Close session on teardown.

        :return:
        """
        if self.session is not None:
            self.session.close()
        self.is_initialized = False

        return self

    def strip(self):
        """
        Strip tensorflow 1 properties from the step to make it serializable.

        :return: stripped step
        :rtype: BaseStep
        """
        self.graph = None
        self.session = None

        return self

    def fit(self, data_inputs, expected_outputs=None) -> 'BaseStep':
        with tf.variable_scope(self.variable_scope, reuse=tf.AUTO_REUSE):
            return self.fit_model(data_inputs, expected_outputs)

    def fit_model(self, data_inputs, expected_outputs=None) -> BaseStep:
        """
        Fit tensorflow model using the variable scope.

        :param data_inputs: data inputs
        :param expected_outputs: expected outputs to fit on
        :return: fitted self
        :rtype: BaseStep
        """
        feed_dict = {self['data_inputs']: data_inputs}

        if self.has_expected_outputs:
            feed_dict.update({self['expected_outputs']: expected_outputs})

        if self.create_inputs is not None:
            additional_feed_dict_arguments = self.create_inputs(
                self, data_inputs, expected_outputs)
            feed_dict.update(additional_feed_dict_arguments)

        results = self.session.run([self['optimizer'], self['loss']],
                                   feed_dict=feed_dict)

        loss = results[1]
        self.add_new_loss(loss)

        return self

    def transform(self, data_inputs, expected_outputs=None):
        with tf.variable_scope(self.variable_scope, reuse=tf.AUTO_REUSE):
            return self.transform_model(data_inputs)

    def transform_model(self, data_inputs):
        """
        Transform tensorflow model using the variable scope.

        :param data_inputs:
        :return:
        """
        inference_output_name = self._get_inference_output_name()

        feed_dict = {self['data_inputs']: data_inputs}

        results = self.session.run([self[inference_output_name], self['loss']],
                                   feed_dict=feed_dict)
        self.add_new_loss(results[1], test_only=True)

        return results[0]

    def _get_inference_output_name(self):
        """
        Return the output tensor name for inference (transform).
        In create_graph, the user can return a tuple of two elements : the output tensor for training, and the output tensor for inference.

        :return:
        """
        inference_output_name = 'output'
        if len(self['inference_output'].get_shape().as_list()) > 0:
            inference_output_name = 'inference_output'

        return inference_output_name

    def __getitem__(self, item):
        """
        Get a graph tensor by name using get item.

        :param item: tensor name
        :type item: str

        :return: tensor
        :rtype: tf.Tensor
        """
        if ":" in item:
            split = item.split(":")
            tensor_name = split[0]
            device = split[1]
        else:
            tensor_name = item
            device = "0"

        try:
            result = self.graph.get_tensor_by_name("{0}/{1}:{2}".format(
                self.variable_scope, tensor_name, device))
        except KeyError:
            result = None

        if result is None:
            try:
                result = self.graph.get_operation_by_name("{0}/{1}".format(
                    self.variable_scope, tensor_name))
            except KeyError:
                result = tf.get_variable(tensor_name, [])

        return result
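A sketch of a create_graph factory this step could consume, following the tuple
contract described in _get_inference_output_name (tensor names, shapes, and the
toy model are assumptions for illustration):

def create_graph(step: TensorflowV1ModelStep):
    # Placeholders are retrieved later through step['data_inputs'] and
    # step['expected_outputs'] inside the step's variable scope.
    data_inputs = tf.placeholder(tf.float32, [None, 10], name='data_inputs')
    expected_outputs = tf.placeholder(tf.float32, [None, 1], name='expected_outputs')

    train_output = tf.layers.dense(data_inputs, 1)
    inference_output = tf.nn.sigmoid(train_output)

    # Returning a tuple makes the second tensor the one used at transform time.
    return train_output, inference_output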
Example #19
def main(tmpdir):
    boston = load_boston()
    X, y = shuffle(boston.data, boston.target, random_state=13)
    X = X.astype(np.float32)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

    # Note that the hyperparameter spaces are defined here during the pipeline definition, but they could already be
    # set within the classes at their definition if using custom classes, or defined after declaring the pipeline
    # using a flat dict or a nested dict (see the sketch after Example #12).

    p = Pipeline([
        AddFeatures([
            SKLearnWrapper(
                PCA(n_components=2),
                HyperparameterSpace({"n_components": RandInt(1, 3)})
            ),
            SKLearnWrapper(
                FastICA(n_components=2),
                HyperparameterSpace({"n_components": RandInt(1, 3)})
            ),
        ]),
        ModelStacking([
            SKLearnWrapper(
                GradientBoostingRegressor(),
                HyperparameterSpace({
                    "n_estimators": RandInt(50, 300), "max_depth": RandInt(1, 4),
                    "learning_rate": LogUniform(0.07, 0.7)
                })
            ),
            SKLearnWrapper(
                KMeans(),
                HyperparameterSpace({"n_clusters": RandInt(5, 10)})
            ),
        ],
            joiner=NumpyTranspose(),
            judge=SKLearnWrapper(
                Ridge(),
                HyperparameterSpace({"alpha": LogUniform(0.7, 1.4), "fit_intercept": Boolean()})
            ),
        )
    ])

    print("Meta-fitting on train:")
    auto_ml = AutoML(
        p,
        validation_splitter=ValidationSplitter(0.20),
        refit_trial=True,
        n_trials=10,
        epochs=1,  # 1 epoch here because the sklearn models fit in a single pass.
        cache_folder_when_no_handle=str(tmpdir),
        scoring_callback=ScoringCallback(mean_squared_error, higher_score_is_better=False),
        callbacks=[MetricCallback('mse', metric_function=mean_squared_error, higher_score_is_better=False)],
        hyperparams_repository=InMemoryHyperparamsRepository(cache_folder=str(tmpdir))
    )

    random_search = auto_ml.fit(X_train, y_train)
    p = random_search.get_best_model()
    print("")

    print("Transforming train and test:")
    y_train_predicted = p.predict(X_train)
    y_test_predicted = p.predict(X_test)

    print("")

    print("Evaluating transformed train:")
    score_transform = r2_score(y_train, y_train_predicted)
    print('R2 regression score:', score_transform)

    print("")

    print("Evaluating transformed test:")
    score_test = r2_score(y_test, y_test_predicted)
    print('R2 regression score:', score_test)
Example #20
def set_params(self, **params) -> BaseEstimator:
    self.p.set_hyperparams(HyperparameterSpace(params))
    return self
Example #21
from neuraxle.base import BaseStep, TruncableSteps, NonFittableMixin, MetaStepMixin
from neuraxle.hyperparams.distributions import LogUniform, Quantized, RandInt, Boolean
from neuraxle.hyperparams.space import HyperparameterSpace, HyperparameterSamples

HYPERPARAMETERS_SPACE = HyperparameterSpace({
    'learning_rate': LogUniform(0.0001, 0.1),
    'l2_weight_reg': LogUniform(0.0001, 0.1),
    'momentum': LogUniform(0.01, 1.0),
    'hidden_size': Quantized(LogUniform(16, 512)),
    'num_layers': RandInt(1, 4),
    'num_lstm_layers': RandInt(1, 2),
    'use_xavier_init': Boolean(),
    'use_max_pool_else_avg_pool': Boolean(),
    'dropout_drop_proba': LogUniform(0.3, 0.7)
})

HYPERPARAMETERS = HyperparameterSamples({
    'learning_rate': 0.1,
    'l2_weight_reg': 0.001,
    'hidden_size': 32,
    'num_layers': 3,
    'num_lstm_layers': 1,
Example #22
def set_hyperparams_space(
        self, hyperparams_space: HyperparameterSpace) -> 'BaseStep':
    self.hyperparams_space = HyperparameterSpace(hyperparams_space)
    return self
Example #23
def main():
    # Define classification models, and hyperparams.
    # See also HyperparameterSpace documentation : https://www.neuraxle.org/stable/api/neuraxle.hyperparams.space.html#neuraxle.hyperparams.space.HyperparameterSpace

    decision_tree_classifier = SKLearnWrapper(
        DecisionTreeClassifier(),
        HyperparameterSpace({
            'criterion': Choice(['gini', 'entropy']),
            'splitter': Choice(['best', 'random']),
            'min_samples_leaf': RandInt(2, 5),
            'min_samples_split': RandInt(2, 4)
        }))

    extra_tree_classifier = SKLearnWrapper(
        ExtraTreeClassifier(),
        HyperparameterSpace({
            'criterion': Choice(['gini', 'entropy']),
            'splitter': Choice(['best', 'random']),
            'min_samples_leaf': RandInt(2, 5),
            'min_samples_split': RandInt(2, 4)
        }))

    ridge_classifier = Pipeline([
        OutputTransformerWrapper(NumpyRavel()),
        SKLearnWrapper(
            RidgeClassifier(),
            HyperparameterSpace({
                'alpha': Choice([0.0, 1.0, 10.0, 100.0]),
                'fit_intercept': Boolean(),
                'normalize': Boolean()
            }))
    ]).set_name('RidgeClassifier')

    logistic_regression = Pipeline([
        OutputTransformerWrapper(NumpyRavel()),
        SKLearnWrapper(
            LogisticRegression(),
            HyperparameterSpace({
                'C': LogUniform(0.01, 10.0),
                'fit_intercept': Boolean(),
                'penalty': Choice(['none', 'l2']),
                'max_iter': RandInt(20, 200)
            }))
    ]).set_name('LogisticRegression')

    random_forest_classifier = Pipeline([
        OutputTransformerWrapper(NumpyRavel()),
        SKLearnWrapper(
            RandomForestClassifier(),
            HyperparameterSpace({
                'n_estimators': RandInt(50, 600),
                'criterion': Choice(['gini', 'entropy']),
                'min_samples_leaf': RandInt(2, 5),
                'min_samples_split': RandInt(2, 4),
                'bootstrap': Boolean()
            }))
    ]).set_name('RandomForestClassifier')

    # Define a classification pipeline that lets the AutoML loop choose one of the classifiers.
    # See also ChooseOneStepOf documentation : https://www.neuraxle.org/stable/api/neuraxle.steps.flow.html#neuraxle.steps.flow.ChooseOneStepOf

    pipeline = Pipeline([
        ChooseOneStepOf([
            decision_tree_classifier, extra_tree_classifier, ridge_classifier,
            logistic_regression, random_forest_classifier
        ])
    ])

    # Create the AutoML loop object.
    # See also AutoML documentation : https://www.neuraxle.org/stable/api/neuraxle.metaopt.auto_ml.html#neuraxle.metaopt.auto_ml.AutoML

    auto_ml = AutoML(
        pipeline=pipeline,
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        validation_splitter=ValidationSplitter(test_size=0.20),
        scoring_callback=ScoringCallback(accuracy_score,
                                         higher_score_is_better=True),
        n_trials=7,
        epochs=1,
        hyperparams_repository=HyperparamsJSONRepository(cache_folder='cache'),
        refit_trial=True,
        continue_loop_on_error=False)

    # Load data, and launch the AutoML loop!

    X_train, y_train, X_test, y_test = generate_classification_data()
    auto_ml = auto_ml.fit(X_train, y_train)

    # Get the model from the best trial, and make predictions using predict.
    # See also predict documentation : https://www.neuraxle.org/stable/api/neuraxle.base.html#neuraxle.base.BaseStep.predict

    best_pipeline = auto_ml.get_best_model()
    y_pred = best_pipeline.predict(X_test)

    accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
    print("Test accuracy score:", accuracy)

    shutil.rmtree('cache')
Example #24
class Tensorflow2ModelStep(BaseTensorflowModelStep):
    """
    Base class for tensorflow 2 steps.
    It uses :class:`TensorflowV2StepSaver` for saving the model.

    .. seealso::
        `Using the checkpoint model format <https://www.tensorflow.org/guide/checkpoint>`_,
        :class:`~neuraxle.base.BaseStep`
    """
    HYPERPARAMS = HyperparameterSamples({})
    HYPERPARAMS_SPACE = HyperparameterSpace({})

    def __init__(
            self,
            create_model,
            create_loss,
            create_optimizer,
            create_inputs=None,
            data_inputs_dtype=None,
            expected_outputs_dtype=None,
            tf_model_checkpoint_folder=None,
            print_loss=False,
            print_func=None,
            device_name=None
    ):
        BaseTensorflowModelStep.__init__(
            self,
            create_model=create_model,
            create_loss=create_loss,
            create_optimizer=create_optimizer,
            create_inputs=create_inputs,
            data_inputs_dtype=data_inputs_dtype,
            expected_outputs_dtype=expected_outputs_dtype,
            step_saver=TensorflowV2StepSaver(),
            print_loss=print_loss,
            print_func=print_func
        )

        if device_name is None:
            device_name = '/CPU:0'
        self.device_name = device_name

        if tf_model_checkpoint_folder is None:
            tf_model_checkpoint_folder = 'tensorflow_ckpts'
        self.tf_model_checkpoint_folder = tf_model_checkpoint_folder

    def setup(self) -> BaseStep:
        """
        Setup optimizer, model, and checkpoints for saving.

        :return: step
        :rtype: BaseStep
        """
        if self.is_initialized:
            return self

        with tf.device(self.device_name):
            self.optimizer = self.create_optimizer(self)
            self.model = self.create_model(self)

            self.checkpoint = tf.train.Checkpoint(step=tf.Variable(1), optimizer=self.optimizer, net=self.model)
            self.checkpoint_manager = tf.train.CheckpointManager(
                self.checkpoint,
                self.tf_model_checkpoint_folder,
                max_to_keep=3
            )

        self.is_initialized = True

        return self

    def strip(self):
        """
        Strip tensorflow 2 properties from the step to make it serializable.

        :return:
        """
        self.optimizer = None
        self.model = None
        self.checkpoint = None
        self.checkpoint_manager = None

        return self

    def fit(self, data_inputs, expected_outputs=None) -> 'BaseStep':
        with tf.device(self.device_name):
            self._fit_model(data_inputs, expected_outputs)

        return self

    def _fit_model(self, data_inputs, expected_outputs):
        inputs = self._create_inputs(data_inputs, expected_outputs)
        with tf.GradientTape() as tape:
            output = self.model(inputs, training=True)
            loss = self.create_loss(
                self,
                expected_outputs=tf.convert_to_tensor(expected_outputs, dtype=self.expected_outputs_dtype),
                predicted_outputs=output
            )
            self.add_new_loss(loss)
            self.model.losses.append(loss)

        self.optimizer.apply_gradients(zip(
            tape.gradient(loss, self.model.trainable_variables),
            self.model.trainable_variables
        ))

    def _transform_data_container(self, data_container: DataContainer, context: ExecutionContext) -> DataContainer:
        data_inputs = data_container.data_inputs
        expected_outputs = data_container.expected_outputs

        with tf.device(self.device_name):
            output = self._transform_model(data_inputs, expected_outputs)

        data_container.set_data_inputs(output.numpy())
        return data_container

    def _transform_model(self, data_inputs, expected_outputs):
        output = self.model(self._create_inputs(data_inputs), training=False)

        if expected_outputs is not None:
            loss = self.create_loss(
                self,
                expected_outputs=tf.convert_to_tensor(expected_outputs, dtype=self.expected_outputs_dtype),
                predicted_outputs=output
            )
            self.add_new_loss(loss, test_only=True)
        return output

    def transform(self, data_inputs):
        with tf.device(self.device_name):
            output = self.model(self._create_inputs(data_inputs), training=False)
        return output.numpy()

    def _create_inputs(self, data_inputs, expected_outputs=None):
        if self.create_inputs is not None:
            inputs = self.create_inputs(self, data_inputs, expected_outputs)
        else:
            inputs = tf.convert_to_tensor(data_inputs, self.data_inputs_dtype)
        return inputs
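A sketch of the three factory functions this step consumes, matching the call
signatures visible above (the toy Keras model, MSE loss, and Adam optimizer are
assumptions for illustration):

def create_model(step: Tensorflow2ModelStep):
    return tf.keras.Sequential([tf.keras.layers.Dense(1)])

def create_loss(step: Tensorflow2ModelStep, expected_outputs, predicted_outputs):
    return tf.reduce_mean(tf.keras.losses.MSE(expected_outputs, predicted_outputs))

def create_optimizer(step: Tensorflow2ModelStep):
    return tf.keras.optimizers.Adam(learning_rate=0.01)

model_step = Tensorflow2ModelStep(
    create_model=create_model,
    create_loss=create_loss,
    create_optimizer=create_optimizer)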
Example #25
from neuraxle.metaopt.callbacks import MetricCallback, ScoringCallback
from neuraxle.metaopt.tpe import TreeParzenEstimatorHyperparameterSelectionStrategy
from neuraxle.pipeline import Pipeline
from neuraxle.steps.misc import FitTransformCallbackStep
from neuraxle.steps.numpy import AddN
import os


@pytest.mark.parametrize(
    "expected_output_mult, pipeline",
    [(3.5,
      Pipeline([
          FitTransformCallbackStep().set_name('callback'),
          AddN(0.).set_hyperparams_space(
              HyperparameterSpace({
                  'add': Choice(choice_list=[0, 1.5, 2, 3.5, 4, 5, 6]),
              })),
          AddN(0.).set_hyperparams_space(
              HyperparameterSpace({
                  'add': Choice(choice_list=[0, 1.5, 2, 3.5, 4, 5, 6]),
              }))
      ])),
     (3.5,
      Pipeline([
          FitTransformCallbackStep().set_name('callback'),
          AddN(0.).set_hyperparams_space(
              HyperparameterSpace({
                  'add': Quantized(hd=Uniform(0, 10)),
              })),
          AddN(0.).set_hyperparams_space(