Example #1
def test_pipeline_nested_mutate_inverse_transform():
    expected_tape = ["1", "2", "3", "4", "5", "6", "7", "7", "6", "5", "4", "3", "2", "1"]
    tape = TapeCallbackFunction()

    p = Pipeline([
        Identity(),
        TransformCallbackStep(tape.callback, ["1"]),
        TransformCallbackStep(tape.callback, ["2"]),
        Pipeline([
            Identity(),
            TransformCallbackStep(tape.callback, ["3"]),
            TransformCallbackStep(tape.callback, ["4"]),
            TransformCallbackStep(tape.callback, ["5"]),
            Identity()
        ]),
        TransformCallbackStep(tape.callback, ["6"]),
        TransformCallbackStep(tape.callback, ["7"]),
        Identity()
    ])

    p, _ = p.fit_transform(np.ones((1, 1)))  # will add range(1, 8) to tape.

    print("[mutating]")
    p = p.mutate(new_method="inverse_transform", method_to_assign_to="transform")

    p.transform(np.ones((1, 1)))  # will add reversed(range(1, 8)) to tape.

    print(expected_tape)
    print(tape.get_name_tape())
    assert expected_tape == tape.get_name_tape()
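
These tape-based tests rely on a simple callback recorder. As a rough sketch (a hypothetical simplification, not Neuraxle's exact test helpers), TapeCallbackFunction and TransformCallbackStep could look like this:

from neuraxle.base import BaseStep, NonFittableMixin


class TapeCallbackFunction:
    """Records each callback invocation so tests can assert on call order."""

    def __init__(self):
        self.data = []
        self.name_tape = []

    def callback(self, data_inputs, name=""):
        self.data.append(data_inputs)
        self.name_tape.append(name)

    def get_name_tape(self):
        return self.name_tape


class TransformCallbackStep(NonFittableMixin, BaseStep):
    """Calls the callback on transform AND inverse_transform, passing data through unchanged."""

    def __init__(self, callback_function, more_arguments):
        BaseStep.__init__(self)
        NonFittableMixin.__init__(self)
        self.callback_function = callback_function
        self.more_arguments = more_arguments

    def transform(self, data_inputs):
        self.callback_function(data_inputs, *self.more_arguments)
        return data_inputs

    def inverse_transform(self, processed_outputs):
        self.callback_function(processed_outputs, *self.more_arguments)
        return processed_outputs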
Example #2
def test_forcehandleidentity_does_not_crash(tmpdir):
    p = Pipeline([
        ForceHandleIdentity()
    ])
    data_inputs = np.array([0, 1, 2, 3])
    expected_outputs = data_inputs * 2
    p.fit(data_inputs, expected_outputs)
    p.fit_transform(data_inputs, expected_outputs)
    p.transform(data_inputs=data_inputs)
Example #3
def test_add_service_assertions_should_fail_when_services_are_missing(tmpdir):
    with pytest.raises(AssertionError) as exception_info:
        context = ExecutionContext(root=tmpdir)
        p = Pipeline([
            SomeStep().assert_has_services(BaseService),
        ]).with_context(context=context)
        data_inputs = np.array([0, 1, 2, 3])

        p.transform(data_inputs=data_inputs)

    assert 'BaseService dependency missing' in exception_info.value.args[0]
Example #4
def test_with_context_should_inject_dependencies_properly(tmpdir):
    data_inputs = np.array([0, 1, 2, 3])
    context = ExecutionContext(root=tmpdir)
    service = SomeService()
    context.set_service_locator({BaseService: service})
    p = Pipeline([
        SomeStep().assert_has_services(BaseService),
    ]).with_context(context=context)

    p.transform(data_inputs=data_inputs)

    assert np.array_equal(service.data, data_inputs)
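
For context, here is a plausible sketch of the service-injection fixtures used above (hypothetical: the class bodies, the store method, and the import paths are assumptions, since only the names appear in the tests):

from neuraxle.base import BaseService, ExecutionContext, ForceHandleIdentity


class SomeService(BaseService):
    """Hypothetical service: it just keeps whatever data it is handed."""

    def store(self, data_inputs):
        self.data = data_inputs


class SomeStep(ForceHandleIdentity):
    """Hypothetical step: fetches the service from the execution context and feeds it the data."""

    def _transform_data_container(self, data_container, context: ExecutionContext):
        service = context.get_service(BaseService)
        service.store(data_container.data_inputs)
        return data_container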
Example #5
def main():
    p = Pipeline([MultiplyByN(2), MultiplyByN(4)])

    outputs = p.transform(list(range(10)))
    print('transform: {}'.format(outputs))

    p = p.mutate(new_method='inverse_transform',
                 method_to_assign_to='transform')

    outputs = p.transform(list(range(10)))
    print('inverse_transform: {}'.format(outputs))
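
For reference, this prints roughly the following (assuming MultiplyByN(n) multiplies by n in transform and divides by n in inverse_transform, so the two steps compose to x*8 forward and x/8 after the mutate; exact numpy formatting may differ):

    transform: [ 0  8 16 24 32 40 48 56 64 72]
    inverse_transform: [0.    0.125 0.25  0.375 0.5   0.625 0.75  0.875 1.    1.125]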
Example #6
def test_localassert_should_assert_dependencies_properly_at_exec(tmpdir):
    data_inputs = np.array([0, 1, 2, 3])
    context = ExecutionContext(root=tmpdir)
    p = Pipeline([
        RegisterServiceDynamically(),
        SomeStep().assert_has_services_at_execution(SomeBaseService)
    ]).with_context(context=context)

    p.transform(data_inputs=data_inputs)
    service = context.get_service(SomeBaseService)
    assert np.array_equal(service.data, data_inputs)
Example #7
def main():
    p = Pipeline([
        ForceAlwaysAlwaysHandleMixinStep(),
    ])

    p = p.fit(np.array([0, 1]), np.array([0, 1]))
    p = p.transform(np.array([0, 1]))
Example #8
def test_model_stacking_transform():
    model_stacking = Pipeline([
        ModelStacking(
            [
                SKLearnWrapper(
                    GradientBoostingRegressor(),
                    HyperparameterSpace({
                        "n_estimators": RandInt(50, 600),
                        "max_depth": RandInt(1, 10),
                        "learning_rate": LogUniform(0.07, 0.7)
                    })),
                SKLearnWrapper(
                    KMeans(),
                    HyperparameterSpace({"n_clusters": RandInt(5, 10)})),
            ],
            joiner=NumpyTranspose(),
            judge=SKLearnWrapper(
                Ridge(),
                HyperparameterSpace({
                    "alpha": LogUniform(0.7, 1.4),
                    "fit_intercept": Boolean()
                })),
        )
    ])
    expected_outputs_shape = (379, 1)
    data_inputs_shape = (379, 13)
    data_inputs = _create_data(data_inputs_shape)
    expected_outputs = _create_data(expected_outputs_shape)

    model_stacking = model_stacking.fit(data_inputs, expected_outputs)
    outputs = model_stacking.transform(data_inputs)

    assert outputs.shape == expected_outputs_shape
Example #9
def main():
    p = Pipeline([MultiplyByN(multiply_by=2)])

    data_inputs = np.array([1, 2])
    generated_outputs = p.transform(data_inputs)
    regenerated_inputs = p.inverse_transform(generated_outputs)

    assert np.array_equal(regenerated_inputs, data_inputs)
    assert np.array_equal(generated_outputs, 2 * data_inputs)
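
For context, a minimal sketch of an invertible step like MultiplyByN, assuming Neuraxle's NonFittableMixin/BaseStep API (simplified; the real class also registers multiply_by as a hyperparameter):

import numpy as np
from neuraxle.base import BaseStep, NonFittableMixin


class MultiplyByN(NonFittableMixin, BaseStep):
    """Multiplies inputs by a constant; inverse_transform divides to undo it."""

    def __init__(self, multiply_by=1):
        BaseStep.__init__(self)
        NonFittableMixin.__init__(self)
        self.multiply_by = multiply_by

    def transform(self, data_inputs):
        return np.array(data_inputs) * self.multiply_by

    def inverse_transform(self, processed_outputs):
        return np.array(processed_outputs) / self.multiply_by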
Example #10
def test_pipeline_fit_then_transform(steps_list, pipeline_runner):
    data_input_ = [AN_INPUT]
    expected_output_ = [AN_EXPECTED_OUTPUT]
    p = Pipeline(steps_list, pipeline_runner=pipeline_runner())

    p = p.fit(data_input_, expected_output_)
    result = p.transform(data_input_)

    assert tuple(result) == tuple(expected_output_)
Example #11
def main():
    p = Pipeline([
        NonFittableStep(),
        NonTransformableStep(),
        Identity()  # Note: Identity does nothing, as it inherits from both NonFittableMixin and NonTransformableMixin.
    ])

    p = p.fit(np.array([0, 1]), np.array([0, 1]))

    out = p.transform(np.array([0, 1]))
Example #12
def main():
    boston = load_boston()
    X, y = shuffle(boston.data, boston.target, random_state=13)
    X = X.astype(np.float32)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, shuffle=False)

    p = Pipeline([
        NumpyShapePrinter(),
        AddFeatures([
            PCA(n_components=2),
            FastICA(n_components=2),
        ]),
        NumpyShapePrinter(),
        RidgeModelStacking([
            GradientBoostingRegressor(),
            GradientBoostingRegressor(n_estimators=500),
            GradientBoostingRegressor(max_depth=5),
            KMeans(),
        ]),
        NumpyShapePrinter(),
    ])

    print("Fitting on train:")
    p = p.fit(X_train, y_train)
    print("")
    print("Transforming train and test:")
    y_train_predicted = p.transform(X_train)
    y_test_predicted = p.transform(X_test)
    print("")
    print("Evaluating transformed train:")
    score_train = r2_score(y_train, y_train_predicted)  # sklearn expects (y_true, y_pred)
    print('R2 regression score:', score_train)
    print("")
    print("Evaluating transformed test:")
    score_test = r2_score(y_test, y_test_predicted)
    print('R2 regression score:', score_test)

    assert y_train_predicted.shape == (379,)
    assert y_test_predicted.shape == (127,)
    assert isinstance(score_train, float)
    assert isinstance(score_test, float)

    return y_train_predicted, y_test_predicted, score_train, score_test
Example #13
def test_wrapped_queued_pipeline_with_n_workers_step():
    p = Pipeline([
        SequentialQueuedPipeline(
            [(1, MultiplyByN(2)), (1, MultiplyByN(2)),
             (1, MultiplyByN(2)), (1, MultiplyByN(2))],
            batch_size=10,
            max_queue_size=5)
    ])

    outputs = p.transform(list(range(100)))

    assert np.array_equal(outputs, EXPECTED_OUTPUTS)
Example #14
def test_pipeline_simple_mutate_inverse_transform():
    expected_tape = ["1", "2", "3", "4", "4", "3", "2", "1"]
    tape = TapeCallbackFunction()

    p = Pipeline([
        Identity(),
        TransformCallbackStep(tape.callback, ["1"]),
        TransformCallbackStep(tape.callback, ["2"]),
        TransformCallbackStep(tape.callback, ["3"]),
        TransformCallbackStep(tape.callback, ["4"]),
        Identity()
    ])

    p, _ = p.fit_transform(np.ones((1, 1)))

    print("[mutating]")
    p = p.mutate(new_method="inverse_transform", method_to_assign_to="transform")

    p.transform(np.ones((1, 1)))

    assert expected_tape == tape.get_name_tape()
Example #15
def test_feature_union_should_transform_with_numpy_transpose():
    p = Pipeline(
        [FeatureUnion([
            Identity(),
            Identity(),
        ], joiner=NumpyTranspose())])
    data_inputs = np.random.randint((1, 20))  # NB: yields a length-2 array (one int in [0, 1), one in [0, 20)), not a (1, 20) matrix

    outputs = p.transform(data_inputs)

    assert np.array_equal(outputs,
                          np.array([data_inputs, data_inputs]).transpose())
Example #16
def test_should_transform_each_steps(steps: List[BaseStep],
                                     expected_tape: List[str]):
    pipeline = Pipeline(steps=steps)
    pipeline = pipeline.fit(data_inputs)
    tape.data = []
    tape.name_tape = []

    actual_data_inputs = pipeline.transform(data_inputs)

    actual_tape = tape.get_name_tape()
    assert actual_tape == expected_tape
    assert np.array_equal(actual_data_inputs, data_inputs)
Example #17
def test_feature_union_should_transform_with_zip_features():
    p = Pipeline(
        [FeatureUnion([
            Identity(),
            Identity(),
        ], joiner=ZipFeatures())])
    data_inputs = np.random.randint(low=0, high=100, size=(2, 20))

    outputs = p.transform(data_inputs)

    assert np.array_equal(outputs, np.stack([data_inputs, data_inputs],
                                            axis=1))
Example #18
def test_feature_union_should_transform_with_concatenate_inner_features():
    p = Pipeline([
        FeatureUnion([
            Identity(),
            Identity(),
        ], joiner=NumpyConcatenateInnerFeatures())
    ])
    data_inputs = np.random.randint((1, 20))  # NB: yields a length-2 array (one int in [0, 1), one in [0, 20)), not a (1, 20) matrix

    outputs = p.transform(data_inputs)

    assert np.array_equal(outputs, np.concatenate([data_inputs, data_inputs]))
Example #19
def test_transform_should_transform_all_steps_for_each_data_inputs_expected_outputs():
    tape = TapeCallbackFunction()
    p = Pipeline([
        ForEachDataInput(Pipeline([
            TransformCallbackStep(tape.callback, ["1"]),
            TransformCallbackStep(tape.callback, ["2"]),
        ]))
    ])
    data_inputs = [[0, 1], [1, 2]]

    outputs = p.transform(data_inputs)

    assert tape.get_name_tape() == ["1", "2", "1", "2"]
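
ForEachDataInput applies its wrapped pipeline once per item of the data inputs, which is why the tape reads ["1", "2", "1", "2"] for the two inner lists. A rough sketch of the idea (a hypothetical simplification, not the real class):

class ForEachDataInput:
    """Applies a wrapped step independently to each data input item."""

    def __init__(self, wrapped):
        self.wrapped = wrapped

    def transform(self, data_inputs):
        return [self.wrapped.transform(di) for di in data_inputs]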
Example #20
def main():
    value_caching_folder = 'value_caching'
    if not os.path.exists(value_caching_folder):
        os.makedirs(value_caching_folder)

    data_inputs = list(range(100))

    sleep_time = 0.001
    a = time.time()
    for i in range(5):
        p = Pipeline([
            PickleValueCachingWrapper(
                ForEach(Pipeline([Sleep(sleep_time=sleep_time), MultiplyByN(2)])),
                cache_folder=value_caching_folder)
        ])
        outputs_value_caching = p.transform(data_inputs)
    b = time.time()
    time_value_caching_pipeline = b - a
    print('Pipeline with ValueCachingWrapper')
    print('execution time: {} seconds'.format(time_value_caching_pipeline))

    a = time.time()
    for i in range(5):
        p = Pipeline([
            ForEach(Pipeline([Sleep(sleep_time=sleep_time),
                              MultiplyByN(2)])),
        ])

        outputs_vanilla = p.transform(data_inputs)
    b = time.time()
    time_vanilla_pipeline = b - a
    print('Pipeline without value caching')
    print('execution time: {} seconds'.format(time_vanilla_pipeline))

    shutil.rmtree(value_caching_folder)

    assert np.array_equal(outputs_value_caching, outputs_vanilla)
    assert time_value_caching_pipeline < time_vanilla_pipeline
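
Conceptually, a value-caching wrapper keys each input value, reuses the stored output on a cache hit, and computes only the misses, which is why the cached loop above wins once the cache is warm. A hypothetical standalone illustration of the idea (not Neuraxle's actual implementation):

import hashlib
import os
import pickle


def cached_transform(step, data_inputs, cache_folder):
    """Transform each input value, reusing a pickled output when the same value was seen before."""
    outputs = []
    for di in data_inputs:
        key = hashlib.md5(pickle.dumps(di)).hexdigest()
        path = os.path.join(cache_folder, key + '.pickle')
        if os.path.exists(path):
            with open(path, 'rb') as f:
                outputs.append(pickle.load(f))  # cache hit: skip the expensive step
        else:
            out = step.transform([di])[0]  # cache miss: compute and store
            with open(path, 'wb') as f:
                pickle.dump(out, f)
            outputs.append(out)
    return outputs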
Example #21
def test_predict_should_transform_with_initial_is_train_mode_after_predict():
    tape_fit = TapeCallbackFunction()
    tape_transform = TapeCallbackFunction()
    p = Pipeline([
        TestOnlyWrapper(
            CallbackWrapper(MultiplyByN(2), tape_transform, tape_fit)),
        TrainOnlyWrapper(
            CallbackWrapper(MultiplyByN(4), tape_transform, tape_fit))
    ])

    p.predict(np.array([1, 1]))
    outputs = p.transform(np.array([1, 1]))

    assert np.array_equal(outputs, np.array([4, 4]))
Example #22
def test_choose_one_step_of_set_hyperparams(method_name, args, kwargs):
    a_callback = TapeCallbackFunction()
    b_callback = TapeCallbackFunction()
    c_callback = TapeCallbackFunction()
    d_callback = TapeCallbackFunction()

    choose_one_step_of = ChooseOneStepOf([
        ('a',
         FitTransformCallbackStep(
             a_callback, c_callback,
             transform_function=lambda di: di * 2).set_name("step_1")),
        ('b',
         FitTransformCallbackStep(
             b_callback, d_callback,
             transform_function=lambda di: di * 2).set_name("step_1"))
    ])

    p = Pipeline([choose_one_step_of])

    p.transform(DATA_INPUTS)

    assert len(a_callback.data) == 1
    assert all(a_callback.data[0] == DATA_INPUTS)
    assert len(b_callback.data) == 0
    assert len(c_callback.data) == 0
    assert len(d_callback.data) == 0

    getattr(choose_one_step_of, method_name)(*args, **kwargs)

    p.transform(DATA_INPUTS)

    assert len(a_callback.data) == 1
    assert all(a_callback.data[0] == DATA_INPUTS)
    assert len(b_callback.data) == 1
    assert all(b_callback.data[0] == DATA_INPUTS)
    assert len(c_callback.data) == 0
    assert len(d_callback.data) == 0
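
In this parametrized test, method_name, args and kwargs are expected to switch the choice to 'b'; hypothetically, the getattr call above expands to something like:

    choose_one_step_of.set_hyperparams({'choice': 'b'})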
Example #23
def test_choose_one_step_of_update_hyperparams():
    a_callback = TapeCallbackFunction()
    b_callback = TapeCallbackFunction()
    c_callback = TapeCallbackFunction()
    d_callback = TapeCallbackFunction()

    choose_one_step_of = ChooseOneStepOf([
        ('a',
         FitTransformCallbackStep(
             a_callback, c_callback,
             transform_function=lambda di: di * 2).set_name("step_1")),
        ('b',
         FitTransformCallbackStep(
             b_callback, d_callback,
             transform_function=lambda di: di * 2).set_name("step_1"))
    ])

    p = Pipeline([choose_one_step_of])

    p.transform(DATA_INPUTS)

    assert len(a_callback.data) == 1
    assert all(a_callback.data[0] == DATA_INPUTS)
    assert len(b_callback.data) == 0
    assert len(c_callback.data) == 0
    assert len(d_callback.data) == 0

    choose_one_step_of.update_hyperparams({'choice': 'b'})

    p.transform(DATA_INPUTS)

    assert len(a_callback.data) == 1
    assert all(a_callback.data[0] == DATA_INPUTS)
    assert len(b_callback.data) == 1
    assert all(b_callback.data[0] == DATA_INPUTS)
    assert len(c_callback.data) == 0
    assert len(d_callback.data) == 0
Example #24
def test_transform_should_use_cache(tmpdir):
    tape_transform = TapeCallbackFunction()
    tape_fit = TapeCallbackFunction()
    p = Pipeline([
        JoblibValueCachingWrapper(
            LogFitTransformCallbackStep(tape_transform,
                                        tape_fit,
                                        transform_function=np.log), tmpdir)
    ])

    outputs = p.transform([1, 1, 2, 2])

    assert outputs == EXPECTED_OUTPUTS
    assert tape_transform.data == [[1], [2]]
    assert tape_fit.data == []
Example #25
def main():
    p = Pipeline([
        NonFittableStep(),
        NonTransformableStep(),
        Identity()  # Note: Identity does nothing, as it inherits from both NonFittableMixin and NonTransformableMixin.
    ])

    some_data = np.array([0, 1])
    p = p.fit(some_data)
    # Out:
    #     NonFittableStep: I transformed.
    #     NonTransformableStep: I fitted.

    out = p.transform(some_data)
    # Out:
    #     NonFittableStep: I transformed.

    assert np.array_equal(out, some_data)
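
The printed lines in the comments above hint at how such steps are written. A minimal sketch consistent with that output, assuming Neuraxle's mixin API (simplified class bodies):

from neuraxle.base import BaseStep, NonFittableMixin, NonTransformableMixin


class NonFittableStep(NonFittableMixin, BaseStep):
    """fit is a no-op thanks to NonFittableMixin; only transform does work."""

    def __init__(self):
        BaseStep.__init__(self)
        NonFittableMixin.__init__(self)

    def transform(self, data_inputs):
        print("NonFittableStep: I transformed.")
        return data_inputs


class NonTransformableStep(NonTransformableMixin, BaseStep):
    """transform passes data through thanks to NonTransformableMixin; only fit does work."""

    def __init__(self):
        BaseStep.__init__(self)
        NonTransformableMixin.__init__(self)

    def fit(self, data_inputs, expected_outputs=None) -> 'NonTransformableStep':
        print("NonTransformableStep: I fitted.")
        return self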
Example #26
def test_expand_dim_transform():
    handle_fit_callback = TapeCallbackFunction()
    handle_transform_callback = TapeCallbackFunction()
    handle_fit_transform_callback = TapeCallbackFunction()
    p = Pipeline([
        ExpandDim(
            HandleCallbackStep(handle_fit_callback, handle_transform_callback,
                               handle_fit_transform_callback))
    ])
    p['ExpandDim'].hashers = [SomeSummaryHasher(fake_summary_id=SUMMARY_ID)]

    outputs = p.transform(np.array(range(10)))

    assert np.array_equal(outputs, np.array(range(10)))
    assert handle_fit_callback.data == []
    assert handle_transform_callback.data[0][0].current_ids == [SUMMARY_ID]
    assert np.array_equal(
        np.array(handle_transform_callback.data[0][0].data_inputs),
        np.array([np.array(range(10))]))
    assert np.array_equal(
        np.array(handle_transform_callback.data[0][0].expected_outputs),
        np.array([[None] * 10]))
    assert handle_fit_transform_callback.data == []
Example #27
def test_expand_dim_transform():
    handle_fit_callback = TapeCallbackFunction()
    handle_transform_callback = TapeCallbackFunction()
    handle_fit_transform_callback = TapeCallbackFunction()
    p = Pipeline([
        ExpandDim(
            HandleCallbackStep(handle_fit_callback, handle_transform_callback,
                               handle_fit_transform_callback))
    ])

    outputs = p.transform(np.array(range(10)))

    assert np.array_equal(outputs, np.array(range(10)))
    assert handle_fit_callback.data == []
    assert handle_transform_callback.data[0][0].current_ids == [
        '781e5e245d69b566979b86e28d23f2c7'
    ]
    assert np.array_equal(
        np.array(handle_transform_callback.data[0][0].data_inputs),
        np.array([np.array(range(10))]))
    assert np.array_equal(
        np.array(handle_transform_callback.data[0][0].expected_outputs),
        np.array([[None] * 10]))
    assert handle_fit_transform_callback.data == []
Example #28
def main():
    boston = load_boston()
    X, y = shuffle(boston.data, boston.target, random_state=13)
    X = X.astype(np.float32)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        shuffle=False)

    # Note that the hyperparameter spaces are defined here during the pipeline definition, but they could
    # already be set within the classes at their definition when using custom classes, or they could be
    # defined after declaring the pipeline, using a flat or nested hyperparameter dict.
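    # For example (hypothetical flat-dict keys, assuming Neuraxle's double-underscore
    # naming for nested steps), one could later write:
    #     p.set_hyperparams_space(HyperparameterSpace({
    #         'AddFeatures__SKLearnWrapper_PCA__n_components': RandInt(1, 3),
    #     }))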

    p = Pipeline([
        AddFeatures([
            SKLearnWrapper(
                PCA(n_components=2),
                HyperparameterSpace({"n_components": RandInt(1, 3)})),
            SKLearnWrapper(
                FastICA(n_components=2),
                HyperparameterSpace({"n_components": RandInt(1, 3)})),
        ]),
        ModelStacking(
            [
                SKLearnWrapper(
                    GradientBoostingRegressor(),
                    HyperparameterSpace({
                        "n_estimators": RandInt(50, 600),
                        "max_depth": RandInt(1, 10),
                        "learning_rate": LogUniform(0.07, 0.7)
                    })),
                SKLearnWrapper(
                    KMeans(),
                    HyperparameterSpace({"n_clusters": RandInt(5, 10)})),
            ],
            joiner=NumpyTranspose(),
            judge=SKLearnWrapper(
                Ridge(),
                HyperparameterSpace({
                    "alpha": LogUniform(0.7, 1.4),
                    "fit_intercept": Boolean()
                })),
        )
    ])
    print("Meta-fitting on train:")
    p = p.meta_fit(X_train,
                   y_train,
                   metastep=RandomSearch(
                       n_iter=10,
                       higher_score_is_better=True,
                       validation_technique=KFoldCrossValidation(
                           scoring_function=r2_score, k_fold=10)))
    # Here is an alternative way to do it, more "pipeliney":
    # p = RandomSearch(
    #     p,
    #     n_iter=15,
    #     higher_score_is_better=True,
    #     validation_technique=KFoldCrossValidation(scoring_function=r2_score, k_fold=3)
    # ).fit(X_train, y_train)

    print("")

    print("Transforming train and test:")
    y_train_predicted = p.transform(X_train)
    y_test_predicted = p.transform(X_test)

    print("")

    print("Evaluating transformed train:")
    score_transform = r2_score(y_train, y_train_predicted)  # sklearn expects (y_true, y_pred)
    print('R2 regression score:', score_transform)

    print("")

    print("Evaluating transformed test:")
    score_test = r2_score(y_test, y_test_predicted)
    print('R2 regression score:', score_test)
print("Meta-fitting on train:")
p = p.meta_fit(X_train,
               y_train,
               metastep=RandomSearch(n_iter=10,
                                     higher_score_is_better=True,
                                     validation_technique=KFoldCrossValidation(
                                         scoring_function=r2_score,
                                         k_fold=10)))
# Here is an alternative way to do it, more "pipeliney":
# p = RandomSearch(
#     n_iter=15,
#     higher_score_is_better=True,
#     validation_technique=KFoldCrossValidation(scoring_function=r2_score, k_fold=3)
# ).set_step(p).fit(X_train, y_train).get_best_model()
print("")

print("Transforming train and test:")
y_train_predicted = p.transform(X_train)
y_test_predicted = p.transform(X_test)
print("")

print("Evaluating transformed train:")
score = r2_score(y_train, y_train_predicted)  # sklearn expects (y_true, y_pred)
print('R2 regression score:', score)
print("")

print("Evaluating transformed test:")
score = r2_score(y_test, y_test_predicted)
print('R2 regression score:', score)
Example #30
def main():
    boston = load_boston()
    X, y = shuffle(boston.data, boston.target, random_state=13)
    X = X.astype(np.float32)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        shuffle=False)

    pipeline = Pipeline([
        AddFeatures([
            PCA(n_components=2),
            FastICA(n_components=2),
        ]),
        RidgeModelStacking([
            GradientBoostingRegressor(),
            KMeans(),
        ]),
    ])

    print("Fitting on train:")
    pipeline = pipeline.fit(X_train, y_train)
    print("")
    print("Transforming train and test:")
    y_train_predicted = pipeline.transform(X_train)
    y_test_predicted = pipeline.transform(X_test)
    print("")
    print("Evaluating transformed train:")
    score = r2_score(y_train, y_train_predicted)  # sklearn expects (y_true, y_pred)
    print('R2 regression score:', score)
    print("")
    print("Evaluating transformed test:")
    score = r2_score(y_test, y_test_predicted)
    print('R2 regression score:', score)
    print("Deploying the application by routing data to the transform method:")

    class CustomJSONDecoderFor2DArray(JSONDataBodyDecoder):
        """This is a custom JSON decoder class that precedes the pipeline's transformation."""
        def decode(self, data_inputs):
            """
            Transform a JSON list object into an np.array object.

            :param data_inputs: json object
            :return: np array for data inputs
            """
            return np.array(data_inputs)

    class CustomJSONEncoderOfOutputs(JSONDataResponseEncoder):
        """This is a custom JSON response encoder class for converting the pipeline's transformation outputs."""
        def encode(self, data_inputs) -> dict:
            """
            Convert predictions to a dict for creating a JSON Response object.

            :param data_inputs:
            :return:
            """
            return {'predictions': list(data_inputs)}

    app = FlaskRestApiWrapper(
        json_decoder=CustomJSONDecoderFor2DArray(),
        wrapped=pipeline,
        json_encoder=CustomJSONEncoderOfOutputs()).get_app()

    print("Finally, run the app by uncommenting this next line of code:")

    # app.run(debug=False, port=5000)

    print("You can now call your pipeline over HTTP with a (JSON) REST API.")

    # test_predictions = requests.post(
    #     url='http://127.0.0.1:5000/',
    #     json=X_test.tolist()
    # )
    # print(test_predictions)
    # print(test_predictions.content)

    assert isinstance(app, Flask)

    return app