def test_hyperparam_space():
    """Check that nested step hyperparameters are reachable by step name.

    The pipeline nests ``SomeStep`` instances, each declaring a one-entry
    hyperparameter space, inside ``AddFeatures`` and ``ModelStacking``. The
    pipeline's hyperparameter space is passed back to ``set_hyperparams`` and
    the result of ``get_hyperparams`` is checked for every nested step name
    (``SomeStep``, ``SomeStep1``, ...) together with the parameter that step
    declared.
    """
    p = Pipeline([
        AddFeatures([
            SomeStep(hyperparams_space=HyperparameterSpace({"n_components": RandInt(1, 5)})),
            SomeStep(hyperparams_space=HyperparameterSpace({"n_components": RandInt(1, 5)}))
        ]),
        ModelStacking([
            SomeStep(hyperparams_space=HyperparameterSpace({"n_estimators": RandInt(1, 1000)})),
            SomeStep(hyperparams_space=HyperparameterSpace({"n_estimators": RandInt(1, 1000)})),
            SomeStep(hyperparams_space=HyperparameterSpace({"max_depth": RandInt(1, 100)})),
            SomeStep(hyperparams_space=HyperparameterSpace({"max_depth": RandInt(1, 100)}))
        ],
            joiner=NumpyTranspose(),
            judge=SomeStep(hyperparams_space=HyperparameterSpace({"alpha": LogUniform(0.1, 10.0)}))
        )
    ])

    # NOTE(review): the space object itself (not a sample drawn from it) is
    # handed to set_hyperparams -- confirm this is intended.
    space = p.get_hyperparams_space()
    p.set_hyperparams(space)

    hyperparams = p.get_hyperparams()

    # Feature-union branch: two steps, both exposing n_components.
    assert "AddFeatures" in hyperparams
    assert "SomeStep" in hyperparams["AddFeatures"]
    assert "n_components" in hyperparams["AddFeatures"]["SomeStep"]
    assert "SomeStep1" in hyperparams["AddFeatures"]
    assert "n_components" in hyperparams["AddFeatures"]["SomeStep1"]

    # Stacking branch: every one of the four stacked steps is checked, so a
    # missing step or parameter is reported by a failed assert rather than
    # by a KeyError (or not at all).
    assert "ModelStacking" in hyperparams
    assert "SomeStep" in hyperparams["ModelStacking"]
    assert "n_estimators" in hyperparams["ModelStacking"]["SomeStep"]
    assert "SomeStep1" in hyperparams["ModelStacking"]
    assert "n_estimators" in hyperparams["ModelStacking"]["SomeStep1"]
    assert "SomeStep2" in hyperparams["ModelStacking"]
    assert "max_depth" in hyperparams["ModelStacking"]["SomeStep2"]
    assert "SomeStep3" in hyperparams["ModelStacking"]
    assert "max_depth" in hyperparams["ModelStacking"]["SomeStep3"]
def main():
    """Sample a nested pipeline's hyperparameter space and apply the sample.

    A flat ``name__subname__param`` space is set on the pipeline, one sample
    is drawn with ``rvs()`` and applied, and the resulting hyperparameters are
    checked against the bounds declared for each entry, including the value
    that reaches the wrapped sklearn PCA predictor.
    """
    pipeline = Pipeline([
        ('step1', MultiplyByN()),
        ('step2', MultiplyByN()),
        Pipeline([
            Identity(),
            Identity(),
            PCA(n_components=4),
        ]),
    ])

    search_space = {
        'step1__multiply_by': RandInt(42, 50),
        'step2__multiply_by': RandInt(-10, 0),
        'Pipeline__PCA__n_components': RandInt(2, 3),
    }
    pipeline.set_hyperparams_space(search_space)

    # Draw one sample from the space and push it into the steps.
    drawn = pipeline.get_hyperparams_space().rvs()
    pipeline.set_hyperparams(drawn)

    # Read the hyperparameters back in flat form for the range checks.
    flat = pipeline.get_hyperparams().to_flat_as_dict_primitive()

    assert 42 <= flat['step1__multiply_by'] <= 50
    assert -10 <= flat['step2__multiply_by'] <= 0
    assert flat['Pipeline__PCA__n_components'] in (2, 3)

    sklearn_pca = pipeline['Pipeline']['PCA'].get_wrapped_sklearn_predictor()
    assert sklearn_pca.n_components in (2, 3)
def test_pipeline_set_one_hyperparam_level_two_dict():
    """Set hyperparameters through a nested dict on a two-level pipeline.

    The dict targets the inner pipeline ``b`` (``learning_rate`` 9) and its
    step ``a`` (``learning_rate`` 7); the untouched steps must keep empty
    hyperparameters.
    """
    p = Pipeline([
        ("a", SomeStep()),
        ("b", Pipeline([
            ("a", SomeStep()),
            ("b", SomeStep()),
            ("c", SomeStep()),
        ])),
        ("c", SomeStep()),
    ])

    nested_hyperparams = {
        "b": {
            "a": {"learning_rate": 7},
            "learning_rate": 9,
        }
    }
    p.set_hyperparams(nested_hyperparams)
    print(p.get_hyperparams())

    inner = p["b"]
    assert inner["a"].hyperparams["learning_rate"] == 7
    assert inner["c"].hyperparams == {}
    assert inner.hyperparams["learning_rate"] == 9
    assert p["c"].hyperparams == {}
def test_apply_on_pipeline_with_meta_step_and_positional_argument_should_call_method_on_each_steps():
    """Apply ``set_hyperparams`` by name on a pipeline containing a wrapper step.

    After the call, the pipeline, the step wrapped by
    ``OutputTransformerWrapper`` and the plain ``MultiplyByN`` step must all
    report ``multiply_by == 2``.
    """
    pipeline = Pipeline([
        OutputTransformerWrapper(MultiplyByN(1)),
        MultiplyByN(1),
    ])

    new_hyperparams = HyperparameterSamples({'multiply_by': 2})
    pipeline.apply('set_hyperparams', hyperparams=new_hyperparams)

    assert pipeline.get_hyperparams()['multiply_by'] == 2

    wrapped_step = pipeline['OutputTransformerWrapper'].wrapped
    assert wrapped_step.get_hyperparams()['multiply_by'] == 2

    plain_step = pipeline['MultiplyByN']
    assert plain_step.get_hyperparams()['multiply_by'] == 2
def test_apply_on_pipeline_with_positional_argument_should_call_method_on_each_steps():
    """Apply ``set_hyperparams`` by name on a pipeline of two plain steps.

    The pipeline itself and both steps (``MultiplyByN`` and ``MultiplyByN1``)
    must report ``multiply_by == 2`` afterwards.
    """
    pipeline = Pipeline([MultiplyByN(1), MultiplyByN(1)])

    new_hyperparams = HyperparameterSamples({'multiply_by': 2})
    pipeline.apply('set_hyperparams', hyperparams=new_hyperparams)

    assert pipeline.get_hyperparams()['multiply_by'] == 2
    # Steps are looked up one at a time so a failure points at a single step.
    for step_name in ('MultiplyByN', 'MultiplyByN1'):
        assert pipeline[step_name].get_hyperparams()['multiply_by'] == 2
def test_apply_method_on_pipeline_should_call_method_on_each_steps():
    """Use ``apply_method`` with a callable on a pipeline of two plain steps.

    The callable sets ``multiply_by`` to 2 on whatever step it receives; the
    pipeline and both of its steps must report that value afterwards.
    """
    def set_multiply_by_two(step):
        # A fresh HyperparameterSamples is built on every invocation.
        return step.set_hyperparams(HyperparameterSamples({'multiply_by': 2}))

    pipeline = Pipeline([MultiplyByN(1), MultiplyByN(1)])
    pipeline.apply_method(set_multiply_by_two)

    assert pipeline.get_hyperparams()['multiply_by'] == 2
    for step_name in ('MultiplyByN', 'MultiplyByN1'):
        assert pipeline[step_name].get_hyperparams()['multiply_by'] == 2
def test_apply_method_on_pipeline_with_meta_step_should_call_method_on_each_steps():
    """Use ``apply_method`` with a callable on a pipeline containing a wrapper.

    The callable sets ``multiply_by`` to 2 on whatever step it receives; the
    pipeline, the step wrapped by ``OutputTransformerWrapper`` and the plain
    ``MultiplyByN`` step must all report that value afterwards.
    """
    def set_multiply_by_two(step):
        # A fresh HyperparameterSamples is built on every invocation.
        return step.set_hyperparams(HyperparameterSamples({'multiply_by': 2}))

    pipeline = Pipeline([
        OutputTransformerWrapper(MultiplyByN(1)),
        MultiplyByN(1),
    ])
    pipeline.apply_method(set_multiply_by_two)

    assert pipeline.get_hyperparams()['multiply_by'] == 2

    wrapped_step = pipeline['OutputTransformerWrapper'].wrapped
    assert wrapped_step.get_hyperparams()['multiply_by'] == 2

    plain_step = pipeline['MultiplyByN']
    assert plain_step.get_hyperparams()['multiply_by'] == 2
def test_pipeline_set_one_hyperparam_level_two_flat():
    """Set one nested hyperparameter through a flat ``b__a__learning_rate`` key.

    The value must land on step ``a`` of the inner pipeline ``b``, show up in
    ``b``'s flattened hyperparameters as ``a__learning_rate``, and leave the
    other steps with empty flattened hyperparameters.
    """
    p = Pipeline([
        ("a", SomeStep()),
        ("b", Pipeline([
            ("a", SomeStep()),
            ("b", SomeStep()),
            ("c", SomeStep()),
        ])),
        ("c", SomeStep()),
    ])

    flat_hyperparams = {"b__a__learning_rate": 7}
    p.set_hyperparams(flat_hyperparams)
    print(p.get_hyperparams())

    inner = p["b"]
    assert inner["a"].hyperparams["learning_rate"] == 7
    assert inner["c"].hyperparams.to_flat_dict() == {}
    assert inner.hyperparams.to_flat_dict() == {'a__learning_rate': 7}
    assert p["c"].hyperparams.to_flat_dict() == {}
def test_hyperparam_space():
    """Check nested and flattened hyperparameter keys of a two-branch pipeline.

    ``SomeStep`` instances declaring one-entry hyperparameter spaces are
    nested inside ``AddFeatures`` and ``ModelStacking``. The pipeline's
    hyperparameter space is passed back to ``set_hyperparams``; the result of
    ``get_hyperparams`` is then checked both through nested lookups and
    through the keys of its ``to_flat_dict()`` form.
    """
    def step_with_space(name, distribution):
        # One SomeStep whose space holds a single named distribution.
        return SomeStep(hyperparams_space=HyperparameterSpace({name: distribution}))

    p = Pipeline([
        AddFeatures([
            step_with_space("n_components", RandInt(1, 5)),
            step_with_space("n_components", RandInt(1, 5)),
        ]),
        ModelStacking(
            [
                step_with_space("n_estimators", RandInt(1, 1000)),
                step_with_space("n_estimators", RandInt(1, 1000)),
                step_with_space("max_depth", RandInt(1, 100)),
                step_with_space("max_depth", RandInt(1, 100)),
            ],
            joiner=NumpyTranspose(),
            judge=step_with_space("alpha", LogUniform(0.1, 10.0)),
        ),
    ])

    # NOTE(review): the space object itself (not a sample drawn from it) is
    # handed to set_hyperparams -- confirm this is intended.
    space = p.get_hyperparams_space()
    p.set_hyperparams(space)

    hyperparams = p.get_hyperparams()
    flat_hyperparams_keys = hyperparams.to_flat_dict().keys()

    # (container name, ((step name, parameter name), ...)) in check order.
    expected_nested = (
        ('AddFeatures', (
            ('SomeStep', 'n_components'),
            ('SomeStep1', 'n_components'),
        )),
        ('ModelStacking', (
            ('SomeStep', 'n_estimators'),
            ('SomeStep1', 'n_estimators'),
            ('SomeStep2', 'max_depth'),
            ('SomeStep3', 'max_depth'),
        )),
    )
    for container_name, step_params in expected_nested:
        assert container_name in hyperparams
        for step_name, param_name in step_params:
            assert step_name in hyperparams[container_name]
            assert param_name in hyperparams[container_name][step_name]

    expected_flat_keys = (
        'AddFeatures__SomeStep1__n_components',
        'AddFeatures__SomeStep__n_components',
        'ModelStacking__SomeStep__n_estimators',
        'ModelStacking__SomeStep1__n_estimators',
        'ModelStacking__SomeStep2__max_depth',
        'ModelStacking__SomeStep3__max_depth',
    )
    for flat_key in expected_flat_keys:
        assert flat_key in flat_hyperparams_keys
def test_pipeline_should_get_hyperparams():
    """Round-trip pipeline-level and step-level hyperparameters.

    Values set through flat keys (``hp``, ``step_1__hp``, ``step_2__hp``) must
    come back from ``get_hyperparams`` as a ``HyperparameterSamples`` that is
    indexable by the same flat keys.
    """
    first_step = SomeStep().set_name('step_1')
    second_step = SomeStep().set_name('step_2')
    p = Pipeline([first_step, second_step])

    expected = {
        'hp': 1,
        'step_1__hp': 2,
        'step_2__hp': 3,
    }
    # A copy is passed so the expectations stay independent of the call.
    p.set_hyperparams(dict(expected))

    hyperparams = p.get_hyperparams()

    assert isinstance(hyperparams, HyperparameterSamples)
    for flat_key, value in expected.items():
        assert hyperparams[flat_key] == value
def test_apply_on_pipeline_with_meta_step_and_positional_argument():
    """Apply ``_set_hyperparams`` by name with distinct values per nesting level.

    The samples carry a different ``multiply_by`` for the pipeline (2), the
    wrapper step (3), the step inside the wrapper (4) and the plain step (5);
    each level must report its own value afterwards.
    """
    pipeline = Pipeline([
        OutputTransformerWrapper(MultiplyByN(1)),
        MultiplyByN(1),
    ])

    per_level_samples = HyperparameterSamples({
        'multiply_by': 2,
        'OutputTransformerWrapper__multiply_by': 3,
        'OutputTransformerWrapper__MultiplyByN__multiply_by': 4,
        'MultiplyByN__multiply_by': 5,
    })
    pipeline.apply('_set_hyperparams', hyperparams=per_level_samples)

    assert pipeline.get_hyperparams()['multiply_by'] == 2

    wrapper = pipeline['OutputTransformerWrapper']
    assert wrapper.get_hyperparams()['multiply_by'] == 3

    wrapped_step = pipeline['OutputTransformerWrapper'].wrapped
    assert wrapped_step.get_hyperparams()['multiply_by'] == 4

    plain_step = pipeline['MultiplyByN']
    assert plain_step.get_hyperparams()['multiply_by'] == 5
def main(tmpdir, sleep_time: float = 0.001, n_iter: int = 10):
    """Run the same AutoML search on a classic and on a resumable pipeline.

    Both pipelines chain three ``MultiplyByN`` steps separated by per-input
    ``Sleep`` steps; the resumable one additionally has two checkpoints. For
    each pipeline an ``AutoML`` loop is fitted, the best model predicts on the
    training inputs, and the elapsed time, outputs, MSE and hyperparameters
    are printed.

    :param tmpdir: base directory; ``classic`` and ``resumable`` cache
        sub-folders are derived from ``str(tmpdir)``
    :param sleep_time: value passed to every ``Sleep`` step
    :param n_iter: number of trials given to ``AutoML`` as ``n_trials``
    """
    inputs = np.array(range(100))
    targets = np.array(range(100, 200))
    space = HyperparameterSpace({
        'multiplication_1__multiply_by': RandInt(1, 2),
        'multiplication_2__multiply_by': RandInt(1, 2),
        'multiplication_3__multiply_by': RandInt(1, 2),
    })

    def search_and_report(pipeline, cache_folder, flush_cache):
        # Fit an AutoML loop on `pipeline`, predict with the best model and
        # print the timing/score report. The timed section covers AutoML
        # construction, fit and predict.
        started_at = time.time()
        auto_ml = AutoML(
            pipeline,
            refit_trial=True,
            n_trials=n_iter,
            cache_folder_when_no_handle=cache_folder,
            validation_splitter=ValidationSplitter(0.2),
            hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
            scoring_callback=ScoringCallback(mean_squared_error, higher_score_is_better=False),
            callbacks=[
                MetricCallback('mse', metric_function=mean_squared_error, higher_score_is_better=False)
            ],
        )
        auto_ml = auto_ml.fit(inputs, targets)
        outputs = auto_ml.get_best_model().predict(inputs)
        finished_at = time.time()

        if flush_cache:
            # Flushing happens after the timed section has ended.
            pipeline.flush_all_cache()

        actual_score = mean_squared_error(targets, outputs)
        print('{0} seconds'.format(finished_at - started_at))
        print('output: {0}'.format(outputs))
        print('smallest mse: {0}'.format(actual_score))
        # NOTE(review): this reports the hyperparams of the pipeline object
        # handed to AutoML, not of auto_ml.get_best_model() -- confirm that
        # this is the intended "best hyperparams".
        print('best hyperparams: {0}'.format(pipeline.get_hyperparams()))

        assert isinstance(actual_score, float)

    print('Classic Pipeline:')
    classic_pipeline_folder = os.path.join(str(tmpdir), 'classic')
    classic_pipeline = Pipeline([
        ('multiplication_1', MultiplyByN()),
        ('sleep_1', ForEachDataInput(Sleep(sleep_time))),
        ('multiplication_2', MultiplyByN()),
        ('sleep_2', ForEachDataInput(Sleep(sleep_time))),
        ('multiplication_3', MultiplyByN()),
    ], cache_folder=classic_pipeline_folder).set_hyperparams_space(space)
    search_and_report(classic_pipeline, classic_pipeline_folder, flush_cache=False)

    print('Resumable Pipeline:')
    resumable_pipeline_folder = os.path.join(str(tmpdir), 'resumable')
    resumable_pipeline = ResumablePipeline([
        ('multiplication_1', MultiplyByN()),
        ('ForEach(sleep_1)', ForEachDataInput(Sleep(sleep_time))),
        ('checkpoint1', ExpandDim(DefaultCheckpoint())),
        ('multiplication_2', MultiplyByN()),
        ('sleep_2', ForEachDataInput(Sleep(sleep_time))),
        ('checkpoint2', ExpandDim(DefaultCheckpoint())),
        ('multiplication_3', MultiplyByN()),
    ], cache_folder=resumable_pipeline_folder).set_hyperparams_space(space)
    search_and_report(resumable_pipeline, resumable_pipeline_folder, flush_cache=True)
def main(tmpdir, sleep_time: float = 0, n_iter: int = 10):
    """Run the same ``RandomSearch`` on a classic and on a resumable pipeline.

    Both pipelines chain three ``MultiplyByN`` steps separated by per-input
    ``Sleep`` steps; the resumable one additionally has two checkpoints and
    uses ``tmpdir`` as its cache folder. For each pipeline a ``RandomSearch``
    is fitted, the fitted object transforms the training inputs, and the
    elapsed time, outputs, MSE and hyperparameters are printed.

    :param tmpdir: cache folder handed to the resumable pipeline
    :param sleep_time: value passed to every ``Sleep`` step
    :param n_iter: number of iterations given to ``RandomSearch``
    """
    inputs = np.array(range(100))
    targets = np.array(range(100, 200))
    space = HyperparameterSpace({
        'multiplication_1__multiply_by': RandInt(1, 2),
        'multiplication_2__multiply_by': RandInt(1, 2),
        'multiplication_3__multiply_by': RandInt(1, 2),
    })

    def search_and_report(pipeline, flush_cache):
        # Fit a RandomSearch around `pipeline`, transform with the result and
        # print the timing/score report. The timed section covers
        # construction, fit and transform.
        started_at = time.time()
        search = RandomSearch(pipeline, n_iter=n_iter, higher_score_is_better=True)
        best_model = search.fit(inputs, targets)
        outputs = best_model.transform(inputs)
        finished_at = time.time()

        if flush_cache:
            # Flushing happens after the timed section has ended.
            pipeline.flush_all_cache()

        actual_score = mean_squared_error(targets, outputs)
        print('{0} seconds'.format(finished_at - started_at))
        print('output: {0}'.format(outputs))
        print('smallest mse: {0}'.format(actual_score))
        # NOTE(review): this reports the hyperparams of the pipeline object
        # handed to RandomSearch, not of the fitted `best_model` -- confirm
        # that this is the intended "best hyperparams".
        print('best hyperparams: {0}'.format(pipeline.get_hyperparams()))

        assert isinstance(actual_score, float)

    print('Classic Pipeline:')
    classic_pipeline = Pipeline([
        ('multiplication_1', MultiplyByN()),
        ('sleep_1', ForEachDataInput(Sleep(sleep_time))),
        ('multiplication_2', MultiplyByN()),
        ('sleep_2', ForEachDataInput(Sleep(sleep_time))),
        ('multiplication_3', MultiplyByN()),
    ]).set_hyperparams_space(space)
    search_and_report(classic_pipeline, flush_cache=False)

    print('Resumable Pipeline:')
    resumable_pipeline = ResumablePipeline([
        ('multiplication_1', MultiplyByN()),
        ('ForEach(sleep_1)', ForEachDataInput(Sleep(sleep_time))),
        ('checkpoint1', ExpandDim(DefaultCheckpoint())),
        ('multiplication_2', MultiplyByN()),
        ('sleep_2', ForEachDataInput(Sleep(sleep_time))),
        ('checkpoint2', ExpandDim(DefaultCheckpoint())),
        ('multiplication_3', MultiplyByN()),
    ], cache_folder=tmpdir).set_hyperparams_space(space)
    search_and_report(resumable_pipeline, flush_cache=True)