def test_model_stacking_fit_transform():
    """Stacked GBR + KMeans judged by a Ridge should emit one column per input row."""
    boosted_trees = SKLearnWrapper(
        GradientBoostingRegressor(),
        HyperparameterSpace({
            "n_estimators": RandInt(50, 600),
            "max_depth": RandInt(1, 10),
            "learning_rate": LogUniform(0.07, 0.7)
        }))
    clusterer = SKLearnWrapper(
        KMeans(),
        HyperparameterSpace({"n_clusters": RandInt(5, 10)}))
    ridge_judge = SKLearnWrapper(
        Ridge(),
        HyperparameterSpace({
            "alpha": LogUniform(0.7, 1.4),
            "fit_intercept": Boolean()
        }))
    stacking_pipeline = Pipeline([
        ModelStacking(
            [boosted_trees, clusterer],
            joiner=NumpyTranspose(),
            judge=ridge_judge,
        )
    ])

    expected_outputs_shape = (379, 1)
    data_inputs_shape = (379, 13)
    data_inputs = _create_data(data_inputs_shape)
    expected_outputs = _create_data(expected_outputs_shape)

    stacking_pipeline, outputs = stacking_pipeline.fit_transform(
        data_inputs, expected_outputs)

    assert outputs.shape == expected_outputs_shape
def test_hyperparams_space_round_robin(to_nested_dict_func_name, to_flat_func_name):
    """Nesting then re-flattening a space must round-trip to the original flat dict."""
    original = copy.deepcopy(HYPE_SPACE)
    print(original.keys())

    nested = HyperparameterSpace(getattr(original, to_nested_dict_func_name)())
    print(nested)
    flattened = HyperparameterSpace(getattr(nested, to_flat_func_name)())
    print(flattened.keys())

    assert flattened.to_flat_as_dict_primitive() == original.to_flat_as_dict_primitive()
def test_meta_step_mixin_update_hyperparams_space_should_update_wrapped_step_hyperparams():
    """Updating one key must leave the meta step's other entries untouched and recurse into the wrapped step."""
    meta_step = SomeMetaStepMixin(SomeStep())
    meta_step.set_hyperparams_space(HyperparameterSpace({
        META_STEP_HP: RAND_INT_META_STEP,
        SOME_STEP_HP: RAND_INT_SOME_STEP,
    }))

    replacement_space = RandInt(0, 100)
    meta_step.update_hyperparams_space(HyperparameterSpace({
        SOME_STEP_HP: replacement_space,
    }))

    # Untouched key survives; the wrapped step received the new distribution.
    assert meta_step.hyperparams_space[META_STEP_HP] == RAND_INT_META_STEP
    assert meta_step.wrapped.get_hyperparams_space()['somestep_hyperparam'] == replacement_space
def test_step_cloner_update_hyperparams_space_should_update_wrapped_step_hyperparams():
    """Partial space update on a StepCloner must only replace the targeted wrapped-step key."""
    cloner = StepClonerForEachDataInput(SomeStep())
    cloner.set_hyperparams_space(HyperparameterSpace({
        META_STEP_HP: RAND_INT_META_STEP,
        SOME_STEP_HP: RAND_INT_SOME_STEP,
    }))

    replacement_space = RandInt(0, 400)
    cloner.update_hyperparams_space(HyperparameterSpace({
        SOME_STEP_HP: replacement_space,
    }))

    assert isinstance(cloner.hyperparams, HyperparameterSamples)
    assert cloner.hyperparams_space[META_STEP_HP] == RAND_INT_META_STEP
    assert cloner.wrapped.get_hyperparams_space()[SOME_STEP_HP_KEY] == replacement_space
def test_can_update_scipy_distribution():
    """Raw scipy distributions passed to update_hyperparams_space get wrapped into Neuraxle types."""
    step = Identity().set_hyperparams_space(HyperparameterSpace({
        'rand_int_neuraxle': RandInt(2, 5)  # neuraxle
    }))

    step.update_hyperparams_space(HyperparameterSpace({
        'rand_int_scipy': randint(low=2, high=5),  # scipy
        'gamma_scipy': gamma(0.2),  # scipy
    }))

    assert isinstance(step.get_hyperparams_space()['rand_int_scipy'],
                      ScipyDiscreteDistributionWrapper)
    assert isinstance(step.get_hyperparams_space()['gamma_scipy'],
                      ScipyContinuousDistributionWrapper)

    # The wrappers must still sample sensibly.
    randint_sample = step.get_hyperparams_space()['rand_int_scipy'].rvs()
    gamma_sample = step.get_hyperparams_space()['gamma_scipy'].rvs()
    assert 5 >= randint_sample >= 2
    assert isinstance(gamma_sample, float)
def test_dict_to_flat_hyperparams_with_hyperparameter_space(
        expected_flat: dict, dic: dict):
    """Flattening a (possibly nested) space dict must match the expected flat form."""
    actual_flat = HyperparameterSpace(dic).to_flat_as_dict_primitive()

    pprint(dict(actual_flat))
    pprint(expected_flat)

    assert dict(actual_flat) == dict(expected_flat)
def test_automl_early_stopping_callback(tmpdir):
    # TODO: fix this unit test
    # Given
    hp_repository = InMemoryHyperparamsRepository(cache_folder=str(tmpdir))
    n_epochs = 60
    pipeline = Pipeline([
        FitTransformCallbackStep().set_name('callback'),
        MultiplyByN(2).set_hyperparams_space(
            HyperparameterSpace({'multiply_by': FixedHyperparameter(2)})),
        NumpyReshape(new_shape=(-1, 1)),
        linear_model.LinearRegression(),
    ])
    auto_ml = AutoML(
        pipeline=pipeline,
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        validation_splitter=ValidationSplitter(0.20),
        scoring_callback=ScoringCallback(
            mean_squared_error, higher_score_is_better=False),
        callbacks=[
            MetricCallback('mse', metric_function=mean_squared_error,
                           higher_score_is_better=False),
        ],
        n_trials=1,
        refit_trial=True,
        epochs=n_epochs,
        hyperparams_repository=hp_repository,
    )

    # When
    data_inputs = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    expected_outputs = data_inputs * 2
    auto_ml = auto_ml.fit(data_inputs=data_inputs,
                          expected_outputs=expected_outputs)

    # Then
    p = auto_ml.get_best_model()
def _create_posterior(self, flat_hyperparameter_space: HyperparameterSpace, trials: Trials) -> HyperparameterSpace:
    """Build a posterior space with one updated distribution per hyperparameter key.

    For every prior in the flat space, the values that past trials used for
    that key are gathered, then the prior is replaced by either a reweighted
    categorical (discrete priors) or a gaussian mixture (continuous priors).

    :param flat_hyperparameter_space: the flat prior space to update
    :param trials: past trials whose sampled hyperparams inform the posterior
    :return: a HyperparameterSpace of posterior distributions
    """
    posteriors: HyperparameterSpace = HyperparameterSpace()

    for hp_key, prior_distribution in flat_hyperparameter_space.items():
        # Values this hyperparam took across all past trials.
        observed_values: List[HyperparameterSamples] = [
            trial.hyperparams.to_flat_as_dict_primitive()[hp_key]
            for trial in trials
        ]

        if prior_distribution.is_discrete():
            posterior = self._reweights_categorical(
                discrete_distribution=prior_distribution,
                trial_hyperparameters=observed_values)
        else:
            posterior = self._create_gaussian_mixture(
                continuous_distribution=prior_distribution,
                trial_hyperparameters=observed_values)

        posteriors.update({hp_key: posterior})

    return posteriors
def test_logger():
    """A pipeline containing a LoggingStep must write into the configured log file."""
    file_path = "test.log"
    if os.path.exists(file_path):
        os.remove(file_path)

    # Given
    logger = logging.getLogger('test')
    file_handler = logging.FileHandler(file_path)
    file_handler.setLevel('DEBUG')
    logger.addHandler(file_handler)
    logger.setLevel('DEBUG')
    context = ExecutionContext(logger=logger)
    pipeline = Pipeline([
        MultiplyByN(2).set_hyperparams_space(
            HyperparameterSpace({'multiply_by': FixedHyperparameter(2)})),
        NumpyReshape(new_shape=(-1, 1)),
        LoggingStep()
    ])

    # When
    data_container = DataContainer(
        data_inputs=np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]))
    pipeline.handle_fit(data_container, context)

    # Then
    assert os.path.exists(file_path)
    with open(file_path) as f:
        log_contents = f.read()
    # Fix: the original read the file into a variable and never checked it,
    # so an empty log would have passed. Assert something was actually logged.
    assert len(log_contents) > 0

    # Teardown
    file_handler.close()
    os.remove(file_path)
def test_automl_sequential_wrapper(tmpdir):
    # Given: three chained multipliers whose factors are searched in [1, 3].
    data_inputs = np.array(range(100))
    expected_outputs = np.array(range(100, 200))

    search_space = HyperparameterSpace({
        'multiplication_1__multiply_by': RandInt(1, 3),
        'multiplication_2__multiply_by': RandInt(1, 3),
        'multiplication_3__multiply_by': RandInt(1, 3),
    })
    pipeline = Pipeline([
        ('multiplication_1', MultiplyByN()),
        ('multiplication_2', MultiplyByN()),
        ('multiplication_3', MultiplyByN()),
    ], cache_folder=tmpdir).set_hyperparams_space(search_space)
    auto_ml = RandomSearch(
        KFoldCrossValidationWrapper().set_step(pipeline),
        hyperparams_repository=HyperparamsJSONRepository(tmpdir),
        n_iter=10)

    # When
    auto_ml: AutoMLSequentialWrapper = auto_ml.fit(data_inputs, expected_outputs)
    best_model: Pipeline = auto_ml.get_best_model()
    predicted_outputs = best_model.transform(data_inputs)

    # Then
    actual_mse = ((predicted_outputs - expected_outputs) ** 2).mean()
    assert actual_mse < 20000
def test_trainer_train():
    """Trainer.train on y = 4x with a linear model should fit almost perfectly."""
    data_inputs = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    expected_outputs = data_inputs * 4

    pipeline = Pipeline([
        MultiplyByN(2).set_hyperparams_space(
            HyperparameterSpace({'multiply_by': FixedHyperparameter(2)})),
        NumpyReshape(new_shape=(-1, 1)),
        linear_model.LinearRegression()
    ])
    trainer: Trainer = Trainer(
        epochs=10,
        scoring_callback=ScoringCallback(
            mean_squared_error, higher_score_is_better=False),
        validation_splitter=ValidationSplitter(test_size=0.20))

    repo_trial: Trial = trainer.train(
        pipeline=pipeline,
        data_inputs=data_inputs,
        expected_outputs=expected_outputs)

    trained_pipeline = repo_trial.get_trained_pipeline(split_number=0)
    outputs = trained_pipeline.transform(data_inputs)

    assert mean_squared_error(expected_outputs, outputs) < 1
def test_automl_should_shallow_copy_data_before_each_epoch():
    # see issue #332 https://github.com/Neuraxio/Neuraxle/issues/332
    data_inputs = np.random.randint(0, 100, (100, 3))
    expected_outputs = np.random.randint(0, 3, 100)

    from sklearn.preprocessing import StandardScaler
    pipeline = Pipeline([
        SKLearnWrapper(StandardScaler()),
        SKLearnWrapper(LinearSVC(),
                       HyperparameterSpace({'C': RandInt(0, 10000)})),
    ])

    auto_ml = AutoML(
        pipeline,
        validation_splitter=ValidationSplitter(0.20),
        refit_trial=True,
        n_trials=10,
        epochs=10,
        cache_folder_when_no_handle='cache',
        scoring_callback=ScoringCallback(
            mean_squared_error, higher_score_is_better=False),
        callbacks=[
            MetricCallback('mse', metric_function=mean_squared_error,
                           higher_score_is_better=False)
        ],
        hyperparams_repository=InMemoryHyperparamsRepository(
            cache_folder='cache'),
        continue_loop_on_error=False)

    fitted_auto_ml = auto_ml.fit(data_inputs, expected_outputs)
    best_model = fitted_auto_ml.get_best_model()

    assert isinstance(best_model, Pipeline)
def main():
    """Demonstrate the two equivalent ways of applying `rvs` over a pipeline."""
    p = Pipeline([
        IdentityWithRvs().set_hyperparams_space(
            HyperparameterSpace({'a': randint(low=2, high=5)})),
        IdentityWithRvs().set_hyperparams_space(
            HyperparameterSpace({'b': randint(low=100, high=400)}))
    ])

    # Apply by passing the function object directly...
    samples: HyperparameterSamples = p.apply(rvs)
    print('p.apply(rvs) ==>')
    print(json.dumps(samples, indent=4))

    # ...or equivalently by naming the method to call.
    samples: HyperparameterSamples = p.apply('_rvs')
    print('p.apply(\'_rvs\') ==>')
    print(json.dumps(samples, indent=4))
def test_hyperparam_space():
    """Hyperparams set from a space must be retrievable under auto-generated step names."""
    p = Pipeline([
        AddFeatures([
            SomeStep(hyperparams_space=HyperparameterSpace({"n_components": RandInt(1, 5)})),
            SomeStep(hyperparams_space=HyperparameterSpace({"n_components": RandInt(1, 5)}))
        ]),
        ModelStacking([
            SomeStep(hyperparams_space=HyperparameterSpace({"n_estimators": RandInt(1, 1000)})),
            SomeStep(hyperparams_space=HyperparameterSpace({"n_estimators": RandInt(1, 1000)})),
            SomeStep(hyperparams_space=HyperparameterSpace({"max_depth": RandInt(1, 100)})),
            SomeStep(hyperparams_space=HyperparameterSpace({"max_depth": RandInt(1, 100)}))
        ],
            joiner=NumpyTranspose(),
            judge=SomeStep(hyperparams_space=HyperparameterSpace({"alpha": LogUniform(0.1, 10.0)}))
        )
    ])

    rvsed = p.get_hyperparams_space()
    p.set_hyperparams(rvsed)

    hyperparams = p.get_hyperparams()

    assert "AddFeatures" in hyperparams.keys()
    assert "SomeStep" in hyperparams["AddFeatures"]
    assert "n_components" in hyperparams["AddFeatures"]["SomeStep"]
    assert "SomeStep1" in hyperparams["AddFeatures"]
    assert "n_components" in hyperparams["AddFeatures"]["SomeStep1"]
    assert "SomeStep" in hyperparams["ModelStacking"]
    assert "n_estimators" in hyperparams["ModelStacking"]["SomeStep"]
    assert "SomeStep1" in hyperparams["ModelStacking"]
    # Fix: assert SomeStep2's presence before indexing into it, matching the
    # pattern used for every other auto-renamed step above.
    assert "SomeStep2" in hyperparams["ModelStacking"]
    assert "max_depth" in hyperparams["ModelStacking"]["SomeStep2"]
def __init__(self, brothers):
    # Judge the stacked brothers with a Ridge regression over a small HP space.
    ridge_judge = SKLearnWrapper(
        Ridge(),
        HyperparameterSpace({
            "alpha": LogUniform(0.1, 10.0),
            "fit_intercept": Boolean()
        }))
    super().__init__(brothers, ridge_judge, joiner=NumpyTranspose())
def test_meta_step_mixin_should_set_hyperparams_space():
    """set_hyperparams_space must split entries between the meta step and its wrapped step."""
    meta_step = SomeMetaStepMixin(SomeStep())

    meta_step.set_hyperparams_space(HyperparameterSpace({
        META_STEP_HP: RAND_INT_META_STEP,
        SOME_STEP_HP: RAND_INT_SOME_STEP,
    }))

    assert meta_step.hyperparams_space[META_STEP_HP] == RAND_INT_META_STEP
    assert meta_step.get_step().hyperparams_space[SOME_STEP_HP_KEY] == RAND_INT_SOME_STEP
def test_choose_one_or_many_step_of_transform_should_choose_step(
        test_case: NeuraxleTestCase):
    """Transforming with the configured choice must route data through the chosen step(s)."""
    pipeline = test_case.pipeline
    pipeline.set_hyperparams_space(HyperparameterSpace(test_case.hyperparams_space))
    pipeline.set_hyperparams(test_case.hyperparams)

    outputs = pipeline.transform(DATA_INPUTS)

    assert np.array_equal(outputs, test_case.expected_processed_outputs)
    assert_callback_data_is_as_expected(test_case)
def test_step_cloner_should_set_steps_hyperparams_space():
    """Setting the cloner's space must propagate the wrapped step's entry downward."""
    cloner = StepClonerForEachDataInput(SomeStep())

    cloner.set_hyperparams_space(HyperparameterSpace({
        META_STEP_HP: RAND_INT_STEP_CLONER,
        SOME_STEP_HP: RAND_INT_SOME_STEP,
    }))

    wrapped_space = cloner.get_step().hyperparams_space
    assert isinstance(wrapped_space, HyperparameterSpace)
    assert wrapped_space[SOME_STEP_HP_KEY] == RAND_INT_SOME_STEP
def test_step_cloner_should_get_hyperparams_space():
    """get_hyperparams_space must return both the cloner's and the wrapped step's entries."""
    cloner = StepClonerForEachDataInput(SomeStep())
    cloner.set_hyperparams_space(HyperparameterSpace({
        META_STEP_HP: RAND_INT_STEP_CLONER,
        SOME_STEP_HP: RAND_INT_SOME_STEP,
    }))

    retrieved_space = cloner.get_hyperparams_space()

    assert retrieved_space[META_STEP_HP] == RAND_INT_STEP_CLONER
    assert retrieved_space[SOME_STEP_HP] == RAND_INT_SOME_STEP
def get_hyperparams_space(self, flat=False):
    """Collect each child step's space under its name, plus this step's own space.

    :param flat: if True, return a flat (double-underscore-keyed) space;
        otherwise return the nested-dict representation.
    :return: the aggregated HyperparameterSpace
    """
    aggregated = HyperparameterSpace()
    for step_name, step in self.steps_as_tuple:
        aggregated.update({step_name: step.get_hyperparams_space(flat=flat)})
    aggregated.update(super().get_hyperparams_space())

    # Normalize the final object to the requested layout.
    return aggregated.to_flat() if flat else aggregated.to_nested_dict()
def create_model_step():
    """Build a TF1 model step: fixed initial learning rate, log-uniform search space."""
    model_step = TensorflowV1ModelStep(
        create_graph=create_graph,
        create_loss=create_loss,
        create_optimizer=create_optimizer,
        has_expected_outputs=True)
    model_step = model_step.set_hyperparams(
        HyperparameterSamples({'learning_rate': 0.01}))
    return model_step.set_hyperparams_space(
        HyperparameterSpace({'learning_rate': LogUniform(0.0001, 0.01)}))
def set_hyperparams_space(self, hyperparams_space: Union[HyperparameterSpace, OrderedDict, dict]) -> BaseStep:
    """Dispatch nested space entries to the matching child steps.

    Entries whose key names a child step are forwarded to that step's own
    ``set_hyperparams_space``; all remaining entries become this step's space.

    :param hyperparams_space: flat or nested hyperparameter space to set
    :return: self (fluent interface)
    """
    hyperparams_space: HyperparameterSpace = HyperparameterSpace(hyperparams_space).to_nested_dict()

    remainders = dict()
    for name, hparams in hyperparams_space.items():
        if name in self.steps.keys():
            self.steps[name].set_hyperparams_space(hparams)
        else:
            remainders[name] = hparams

    # Fix: the leftovers are a hyperparameter *space*, not samples — the
    # original wrongly assigned them to self.hyperparams, silently clobbering
    # the step's sampled hyperparams and leaving its space unset.
    self.hyperparams_space = HyperparameterSpace(remainders)

    return self
def __init__(self, steps, hyperparams=None):
    FeatureUnion.__init__(self, steps, joiner=SelectNonEmptyDataInputs())
    self._make_all_steps_optional()

    if hyperparams is None:
        # Default choice is the first step; the last key is the joiner, skip it.
        step_choices = list(self.keys())[:-1]
        self.set_hyperparams(HyperparameterSamples({
            CHOICE_HYPERPARAM: step_choices[0]
        }))
        self.set_hyperparams_space(HyperparameterSpace({
            CHOICE_HYPERPARAM: Choice(step_choices)
        }))
def test_automl_sklearn_model_with_base_estimator(tmpdir):
    """A bagged gradient-boosting regressor must survive a full AutoML loop."""
    base_estimator = GradientBoostingRegressor()
    bagged_regressor = BaggingRegressor(
        base_estimator, random_state=5, n_jobs=-1)
    wrapped_bagged_regressor = SKLearnWrapper(
        bagged_regressor,
        HyperparameterSpace({
            "n_estimators": RandInt(10, 100),
            "max_features": Uniform(0.6, 1.0)
        }),
        # return_all_sklearn_default_params_on_get=True
    )
    _test_within_auto_ml_loop(tmpdir, wrapped_bagged_regressor)
def test_hyperparam_space():
    """Nested and flat hyperparam layouts must both expose every auto-renamed child step."""
    def _some_step(space: dict) -> SomeStep:
        # Shorthand for a SomeStep with the given search space.
        return SomeStep(hyperparams_space=HyperparameterSpace(space))

    p = Pipeline([
        AddFeatures([
            _some_step({"n_components": RandInt(1, 5)}),
            _some_step({"n_components": RandInt(1, 5)}),
        ]),
        ModelStacking([
            _some_step({"n_estimators": RandInt(1, 1000)}),
            _some_step({"n_estimators": RandInt(1, 1000)}),
            _some_step({"max_depth": RandInt(1, 100)}),
            _some_step({"max_depth": RandInt(1, 100)}),
        ],
            joiner=NumpyTranspose(),
            judge=_some_step({"alpha": LogUniform(0.1, 10.0)}))
    ])

    rvsed = p.get_hyperparams_space()
    p.set_hyperparams(rvsed)

    hyperparams = p.get_hyperparams()
    flat_hyperparams_keys = hyperparams.to_flat_dict().keys()

    # Nested layout: each auto-renamed child appears under its parent with its hp.
    expected_nested = {
        "AddFeatures": {
            "SomeStep": "n_components",
            "SomeStep1": "n_components",
        },
        "ModelStacking": {
            "SomeStep": "n_estimators",
            "SomeStep1": "n_estimators",
            "SomeStep2": "max_depth",
            "SomeStep3": "max_depth",
        },
    }
    for parent, children in expected_nested.items():
        assert parent in hyperparams
        for child, hp_name in children.items():
            assert child in hyperparams[parent]
            assert hp_name in hyperparams[parent][child]

    # Flat layout: double-underscore-joined keys.
    for flat_key in (
        'AddFeatures__SomeStep1__n_components',
        'AddFeatures__SomeStep__n_components',
        'ModelStacking__SomeStep__n_estimators',
        'ModelStacking__SomeStep1__n_estimators',
        'ModelStacking__SomeStep2__max_depth',
        'ModelStacking__SomeStep3__max_depth',
    ):
        assert flat_key in flat_hyperparams_keys
def test_logger_automl(self, tmpdir):
    # Given
    context = ExecutionContext()
    self.tmpdir = str(tmpdir)
    hp_repository = HyperparamsJSONRepository(cache_folder=self.tmpdir)
    n_epochs = 2
    n_trials = 4
    pipeline = Pipeline([
        MultiplyByN(2).set_hyperparams_space(
            HyperparameterSpace({'multiply_by': FixedHyperparameter(2)})),
        NumpyReshape(new_shape=(-1, 1)),
        LoggingStep()
    ])
    auto_ml = AutoML(
        pipeline=pipeline,
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        validation_splitter=ValidationSplitter(0.20),
        scoring_callback=ScoringCallback(
            mean_squared_error, higher_score_is_better=False),
        n_trials=n_trials,
        refit_trial=True,
        epochs=n_epochs,
        hyperparams_repository=hp_repository,
        continue_loop_on_error=False)

    # When
    data_container = DataContainer(
        data_inputs=np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
        expected_outputs=np.array([10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0]))
    auto_ml.handle_fit(data_container, context)

    # Then: one per-trial log file exists, each with the expected line count.
    file_paths = [
        os.path.join(hp_repository.cache_folder, f"trial_{i}.log")
        for i in range(n_trials)
    ]
    assert len(file_paths) == n_trials

    for file_path in file_paths:
        assert os.path.exists(file_path)
    for file_path in file_paths:
        with open(file_path, 'r') as log_file:
            log_lines = log_file.readlines()
        assert len(log_lines) == 36
def __init__(self, hyperparams: HyperparameterSamples = None, hyperparams_space: HyperparameterSpace = None, name: str = None):
    """Initialize the step's hyperparams, space, name, and pending-mutate slot.

    :param hyperparams: initial sampled hyperparams (defaults to empty)
    :param hyperparams_space: initial hyperparam search space (defaults to empty)
    :param name: step name (defaults to the class name)
    """
    # Normalize every optional argument to a concrete default before wrapping.
    self.hyperparams: HyperparameterSamples = HyperparameterSamples(
        hyperparams if hyperparams is not None else dict())
    self.hyperparams_space: HyperparameterSpace = HyperparameterSpace(
        hyperparams_space if hyperparams_space is not None else dict())
    self.name: str = name if name is not None else self.__class__.__name__
    # (step, method_to_replace, new_method): nothing pending yet.
    self.pending_mutate: ('BaseStep', str, str) = (None, None, None)
def test_automl_savebestmodel_callback(tmpdir):
    """BestModelCheckpoint must persist the best model so it can be reloaded and re-scored."""
    # Given
    # Fix: use the tmpdir fixture instead of the hard-coded 'caching' folder,
    # which leaked files into the current working directory across test runs.
    hp_repository = HyperparamsJSONRepository(cache_folder=str(tmpdir))
    validation_splitter = ValidationSplitter(0.20)
    auto_ml = AutoML(
        pipeline=Pipeline([
            MultiplyByN(2).set_hyperparams_space(HyperparameterSpace({
                'multiply_by': FixedHyperparameter(2)
            })),
            NumpyReshape(new_shape=(-1, 1)),
            linear_model.LinearRegression()
        ]),
        validation_splitter=validation_splitter,
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        scoring_callback=ScoringCallback(
            mean_squared_error, higher_score_is_better=False),
        callbacks=[
            BestModelCheckpoint()
        ],
        n_trials=1,
        epochs=10,
        refit_trial=False,
        print_func=print,
        hyperparams_repository=hp_repository,
        continue_loop_on_error=False
    )
    data_inputs = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    expected_outputs = data_inputs * 4

    # When
    auto_ml.fit(data_inputs=data_inputs, expected_outputs=expected_outputs)

    # Then
    trials: Trials = hp_repository.load_all_trials()
    best_trial = trials.get_best_trial()
    best_trial_score = best_trial.get_validation_score()
    best_trial.cache_folder = hp_repository.cache_folder
    best_model = best_trial.get_model('best')
    # Reuse the same splitter configuration to rebuild the validation set the
    # trial was scored on (original constructed a second identical splitter).
    _, _, valid_inputs, valid_outputs = validation_splitter.split(
        data_inputs, expected_outputs)
    predicted_output = best_model.predict(valid_inputs)
    score = mean_squared_error(valid_outputs, predicted_output)

    assert best_trial_score == score
def __init__(self, wrapped: BaseTransformer, enabled: bool = True, nullified_return_value=None,
             cache_folder_when_no_handle=None, use_hyperparameter_space=True, nullify_hyperparams=True):
    """Wrap a step that can be toggled on/off through a boolean hyperparam.

    :param wrapped: the step to make optional
    :param enabled: initial on/off state
    :param nullified_return_value: value returned when disabled (defaults to [])
    :param cache_folder_when_no_handle: forwarded to ForceHandleOnlyMixin
    :param use_hyperparameter_space: expose the toggle as a Boolean hyperparam space
    :param nullify_hyperparams: whether to nullify hyperparams when disabled
    """
    # Only expose the on/off switch in the search space when requested.
    if use_hyperparameter_space:
        toggle_space = HyperparameterSpace({OPTIONAL_ENABLED_HYPERPARAM: Boolean()})
    else:
        toggle_space = {}

    MetaStep.__init__(
        self,
        hyperparams=HyperparameterSamples({OPTIONAL_ENABLED_HYPERPARAM: enabled}),
        hyperparams_space=toggle_space,
        wrapped=wrapped
    )
    ForceHandleOnlyMixin.__init__(self, cache_folder_when_no_handle)

    self.nullified_return_value = (
        nullified_return_value if nullified_return_value is not None else [])
    self.nullify_hyperparams = nullify_hyperparams
def test_automl_sequential_wrapper_with_validation_split_wrapper(tmpdir):
    # Setting seed for reproducibility
    np.random.seed(75)

    # Given
    data_inputs = np.array(range(100))
    expected_outputs = np.array(range(100, 200))

    search_space = HyperparameterSpace({
        'multiplication_1__multiply_by': RandInt(1, 3),
        'multiplication_2__multiply_by': RandInt(1, 3),
        'multiplication_3__multiply_by': RandInt(1, 3),
    })
    pipeline = Pipeline([
        ('multiplication_1', MultiplyByN()),
        ('multiplication_2', MultiplyByN()),
        ('multiplication_3', MultiplyByN()),
    ], cache_folder=tmpdir).set_hyperparams_space(search_space)
    random_search = RandomSearch(
        ValidationSplitWrapper(
            pipeline,
            test_size=0.2,
            scoring_function=mean_squared_error,
            run_validation_split_in_test_mode=False),
        hyperparams_repository=HyperparamsJSONRepository(tmpdir),
        higher_score_is_better=False,
        n_iter=100)

    # When
    mse_before = ((data_inputs - expected_outputs) ** 2).mean()
    random_search: AutoMLSequentialWrapper = random_search.fit(
        data_inputs, expected_outputs)
    best_model: Pipeline = random_search.get_best_model()
    predicted_outputs = best_model.transform(data_inputs)

    # Then: the searched pipeline must beat the identity baseline.
    actual_mse = ((predicted_outputs - expected_outputs) ** 2).mean()
    assert actual_mse < mse_before