def test_hyperparams_repository_should_save_success_trial(tmpdir):
    hyperparams_json_repository = HyperparamsJSONRepository(tmpdir)
    hyperparams = HyperparameterSamples(HYPERPARAMS)

    hyperparams_json_repository._save_successful_trial_json(hyperparams, FIRST_SCORE_FOR_TRIAL)

    trial_hash = hyperparams_json_repository._get_trial_hash(hyperparams.to_flat_as_dict_primitive())
    file_name = str(float(FIRST_SCORE_FOR_TRIAL)).replace('.', ',') + '_' + trial_hash + '.json'
    path = os.path.join(tmpdir, file_name)
    with open(path) as f:
        trial_json = json.load(f)
    assert trial_json['hyperparams'] == hyperparams.to_flat_as_dict_primitive()
    assert trial_json['score'] == FIRST_SCORE_FOR_TRIAL
def test_hyperparams_repository_should_create_new_trial(tmpdir):
    hyperparams_json_repository = HyperparamsJSONRepository(tmpdir)
    hyperparams = HyperparameterSamples(HYPERPARAMS)

    hyperparams_json_repository.create_new_trial(hyperparams)

    trial_hash = hyperparams_json_repository._get_trial_hash(hyperparams.to_flat_as_dict_primitive())
    file_name = 'NEW_' + trial_hash + '.json'
    path = os.path.join(tmpdir, file_name)
    with open(path) as f:
        trial_json = json.load(f)
    assert trial_json['hyperparams'] == hyperparams.to_flat_as_dict_primitive()
    assert trial_json['score'] is None
def test_hyperparams_repository_should_save_failed_trial(tmpdir):
    hyperparams_json_repository = HyperparamsJSONRepository(tmpdir)
    hyperparams = HyperparameterSamples(HYPERPARAMS)

    hyperparams_json_repository._save_failed_trial_json(hyperparams, Exception('trial failed'))

    trial_hash = hyperparams_json_repository._get_trial_hash(hyperparams.to_flat_as_dict_primitive())
    file_name = 'FAILED_' + trial_hash + '.json'
    path = os.path.join(tmpdir, file_name)
    with open(path) as f:
        trial_json = json.load(f)
    assert trial_json['hyperparams'] == hyperparams.to_flat_as_dict_primitive()
    assert 'exception' in trial_json.keys()
    assert trial_json['score'] is None
def test_hyperparams_repository_should_load_all_trials(tmpdir):
    tmpdir = os.path.join(tmpdir, "__json__")
    os.mkdir(tmpdir)
    hyperparams_json_repository = HyperparamsJSONRepository(tmpdir)
    n_trials = 3
    for i in range(n_trials):
        hyperparams = HyperparameterSamples({'learning_rate': 0.01 + i * 0.01})
        hyperparams_json_repository.save_score_for_success_trial(hyperparams, i)

    trials = hyperparams_json_repository.load_all_trials()

    assert len(trials) == n_trials
    for i in range(n_trials):
        assert trials[i].hyperparams == HyperparameterSamples(
            {'learning_rate': 0.01 + i * 0.01}).to_flat_as_dict_primitive(), (i, str(trials))
def test_automl_sequential_wrapper(tmpdir):
    # Given
    data_inputs = np.array(range(100))
    expected_outputs = np.array(range(100, 200))

    hyperparameter_space = HyperparameterSpace({
        'multiplication_1__multiply_by': RandInt(1, 3),
        'multiplication_2__multiply_by': RandInt(1, 3),
        'multiplication_3__multiply_by': RandInt(1, 3),
    })

    pipeline = Pipeline([
        ('multiplication_1', MultiplyByN()),
        ('multiplication_2', MultiplyByN()),
        ('multiplication_3', MultiplyByN())
    ], cache_folder=tmpdir).set_hyperparams_space(hyperparameter_space)

    auto_ml = RandomSearch(
        pipeline,
        hyperparams_repository=HyperparamsJSONRepository(tmpdir),
        n_iter=100
    )

    # When
    auto_ml: AutoMLSequentialWrapper = auto_ml.fit(data_inputs, expected_outputs)
    best_model: Pipeline = auto_ml.get_best_model()
    predicted_outputs = best_model.transform(data_inputs)

    # Then
    actual_mse = ((predicted_outputs - expected_outputs) ** 2).mean()
    assert actual_mse < 5000
def test_hyperparams_repository_should_load_all_trials(tmpdir):
    hyperparams_json_repository = HyperparamsJSONRepository(tmpdir)
    for i in range(2):
        hyperparams = HyperparameterSamples({'learning_rate': 0.01 + i * 0.01})
        hyperparams_json_repository.save_score_for_success_trial(hyperparams, i)

    trials = hyperparams_json_repository.load_all_trials()

    assert len(trials) == 2
    assert trials[0].hyperparams == HyperparameterSamples({
        'learning_rate': 0.01 + 0 * 0.01
    }).to_flat_as_dict_primitive()
    assert trials[1].hyperparams == HyperparameterSamples({
        'learning_rate': 0.01 + 1 * 0.01
    }).to_flat_as_dict_primitive()
def test_automl_savebestmodel_callback(tmpdir):
    # Given
    hp_repository = HyperparamsJSONRepository(cache_folder=str(tmpdir))
    validation_splitter = ValidationSplitter(0.20)
    auto_ml = AutoML(
        pipeline=Pipeline([
            MultiplyByN(2).set_hyperparams_space(HyperparameterSpace({
                'multiply_by': FixedHyperparameter(2)
            })),
            NumpyReshape(new_shape=(-1, 1)),
            linear_model.LinearRegression()
        ]),
        validation_splitter=validation_splitter,
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        scoring_callback=ScoringCallback(mean_squared_error, higher_score_is_better=False),
        callbacks=[
            BestModelCheckpoint()
        ],
        n_trials=1,
        epochs=10,
        refit_trial=False,
        print_func=print,
        hyperparams_repository=hp_repository,
        continue_loop_on_error=False
    )

    data_inputs = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    expected_outputs = data_inputs * 4

    # When
    auto_ml.fit(data_inputs=data_inputs, expected_outputs=expected_outputs)

    # Then
    trials: Trials = hp_repository.load_all_trials()
    best_trial = trials.get_best_trial()
    best_trial_score = best_trial.get_validation_score()
    best_trial.cache_folder = hp_repository.cache_folder
    best_model = best_trial.get_model('best')
    _, _, valid_inputs, valid_outputs = ValidationSplitter(0.20).split(data_inputs, expected_outputs)
    predicted_output = best_model.predict(valid_inputs)
    score = mean_squared_error(valid_outputs, predicted_output)

    assert best_trial_score == score
def test_hyperparams_json_repository_should_be_observable_in_memory():
    # todo: make a test that asserts that an observer can receive updates from the HyperparamsJSONRepository
    # todo: given a trial, a repo, and an observer
    repo: HyperparamsJSONRepository = HyperparamsJSONRepository()
    # todo: when repo.subscribe(observer)
    # todo: when repo.save_trial(trial)
    # todo: then observer.events[0] == trial
    pass
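# Below is a minimal sketch of how the in-memory observer test above could be completed. It assumes
# the repository exposes `subscribe(observer)` and `save_trial(trial)` as the todo comments suggest,
# and that saving a trial notifies subscribers with that trial; the `_RecordingObserver` helper and
# its `on_next` callback name are hypothetical and shown only for illustration.
class _RecordingObserver:
    """Collect every value the observed repository pushes to its subscribers."""

    def __init__(self):
        self.events = []

    def on_next(self, value):
        self.events.append(value)


def _sketch_hyperparams_repository_observation_in_memory(tmpdir, trial):
    repo = HyperparamsJSONRepository(cache_folder=str(tmpdir))
    observer = _RecordingObserver()
    repo.subscribe(observer)  # assumed observable API
    repo.save_trial(trial)  # assumed to notify subscribers with the saved trial
    assert observer.events[0] == trial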
def test_hyperparams_json_repository_should_be_observable_with_file_system_changes():
    # todo: make a test that asserts that an observer can receive updates from the HyperparamsJSONRepository
    # todo: given a trial, a repo, and an observer
    repo: HyperparamsJSONRepository = HyperparamsJSONRepository()
    # todo: when repo.subscribe_to_cache_folder_changes(observer)
    # todo: when repo.save_trial(trial)
    # todo: then observer.events[0] == trial
    pass
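# A companion sketch for the file-system variant above, reusing the hypothetical `_RecordingObserver`
# helper; `subscribe_to_cache_folder_changes(observer)` is taken from the todo comments and is an
# assumed, unconfirmed API, so the final assertion is only indicative of the intended behaviour.
def _sketch_hyperparams_repository_observation_on_disk(tmpdir, trial):
    repo = HyperparamsJSONRepository(cache_folder=str(tmpdir))
    observer = _RecordingObserver()
    repo.subscribe_to_cache_folder_changes(observer)  # assumed file-watching API
    repo.save_trial(trial)  # writes the trial JSON into the watched cache folder
    assert observer.events[0] == trial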
def _make_autoML_loop(tmpdir, p: Pipeline):
    hp_repository = HyperparamsJSONRepository(cache_folder=tmpdir)
    # hp_repository = InMemoryHyperparamsRepository(cache_folder=str(tmpdir) + "_hp")
    n_epochs = 1
    return AutoML(
        pipeline=p,
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        validation_splitter=ValidationSplitter(0.20),
        scoring_callback=ScoringCallback(mean_squared_error, higher_score_is_better=False),
        n_trials=1,
        refit_trial=True,
        epochs=n_epochs,
        hyperparams_repository=hp_repository,
        cache_folder_when_no_handle=str(tmpdir),
        continue_loop_on_error=False
    )
def test_logger_automl(self, tmpdir):
    # Given
    context = ExecutionContext()
    self.tmpdir = str(tmpdir)
    hp_repository = HyperparamsJSONRepository(cache_folder=self.tmpdir)
    n_epochs = 2
    n_trials = 4
    auto_ml = AutoML(
        pipeline=Pipeline([
            MultiplyByN(2).set_hyperparams_space(HyperparameterSpace({
                'multiply_by': FixedHyperparameter(2)
            })),
            NumpyReshape(new_shape=(-1, 1)),
            LoggingStep()
        ]),
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        validation_splitter=ValidationSplitter(0.20),
        scoring_callback=ScoringCallback(mean_squared_error, higher_score_is_better=False),
        n_trials=n_trials,
        refit_trial=True,
        epochs=n_epochs,
        hyperparams_repository=hp_repository,
        continue_loop_on_error=False
    )

    # When
    data_container = DataContainer(
        data_inputs=np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
        expected_outputs=np.array([10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0])
    )
    auto_ml.handle_fit(data_container, context)

    # Then
    file_paths = [
        os.path.join(hp_repository.cache_folder, f"trial_{i}.log")
        for i in range(n_trials)
    ]
    assert len(file_paths) == n_trials

    for file_path in file_paths:
        assert os.path.exists(file_path)

    for file_path in file_paths:
        with open(file_path, 'r') as log_file:
            log = log_file.readlines()
            assert len(log) == 36
def _test_within_auto_ml_loop(tmpdir, pipeline):
    X_train = np.random.random((25, 50)).astype(np.float32)
    Y_train = np.random.random((25,)).astype(np.float32)

    validation_splitter = KFoldCrossValidationSplitter(3)
    scoring_callback = ScoringCallback(median_absolute_error, higher_score_is_better=False)

    auto_ml = AutoML(
        pipeline=pipeline,
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        validation_splitter=validation_splitter,
        scoring_callback=scoring_callback,
        n_trials=2,
        epochs=1,
        hyperparams_repository=HyperparamsJSONRepository(cache_folder=tmpdir),
        refit_trial=True,
        continue_loop_on_error=False
    )

    auto_ml.fit(X_train, Y_train)
def test_automl_sequential_wrapper_with_validation_split_wrapper(tmpdir):
    # Given
    data_inputs = np.array(range(100))
    expected_outputs = np.array(range(100, 200))

    hyperparameter_space = HyperparameterSpace({
        'multiplication_1__multiply_by': RandInt(1, 3),
        'multiplication_2__multiply_by': RandInt(1, 3),
        'multiplication_3__multiply_by': RandInt(1, 3),
    })

    pipeline = Pipeline([
        ('multiplication_1', MultiplyByN()),
        ('multiplication_2', MultiplyByN()),
        ('multiplication_3', MultiplyByN())
    ], cache_folder=tmpdir).set_hyperparams_space(hyperparameter_space)

    random_search = RandomSearch(
        pipeline,
        validation_technique=ValidationSplitWrapper(
            test_size=0.2,
            scoring_function=mean_squared_error,
            run_validation_split_in_test_mode=False
        ),
        hyperparams_repository=HyperparamsJSONRepository(tmpdir),
        higher_score_is_better=False,
        n_iter=100
    )

    # When
    mse_before = ((data_inputs - expected_outputs) ** 2).mean()
    random_search: AutoMLSequentialWrapper = random_search.fit(data_inputs, expected_outputs)
    best_model: Pipeline = random_search.get_best_model()
    predicted_outputs = best_model.transform(data_inputs)

    # Then
    actual_mse = ((predicted_outputs - expected_outputs) ** 2).mean()
    assert actual_mse < mse_before
def test_automl_with_kfold(tmpdir):
    # Given
    hp_repository = HyperparamsJSONRepository(cache_folder=str(tmpdir))
    auto_ml = AutoML(
        pipeline=Pipeline([
            MultiplyByN(2).set_hyperparams_space(HyperparameterSpace({
                'multiply_by': FixedHyperparameter(2)
            })),
            NumpyReshape(new_shape=(-1, 1)),
            linear_model.LinearRegression()
        ]),
        validation_splitter=ValidationSplitter(0.20),
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        scoring_callback=ScoringCallback(mean_squared_error, higher_score_is_better=False),
        callbacks=[
            MetricCallback('mse', metric_function=mean_squared_error, higher_score_is_better=False),
        ],
        n_trials=1,
        epochs=10,
        refit_trial=True,
        print_func=print,
        hyperparams_repository=hp_repository,
        continue_loop_on_error=False
    )

    data_inputs = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    expected_outputs = data_inputs * 4

    # When
    auto_ml.fit(data_inputs=data_inputs, expected_outputs=expected_outputs)

    # Then
    p = auto_ml.get_best_model()
    outputs = p.transform(data_inputs)
    mse = mean_squared_error(expected_outputs, outputs)

    assert mse < 1000
def main():
    # Define classification models, and hyperparams.
    # See also HyperparameterSpace documentation:
    # https://www.neuraxle.org/stable/api/neuraxle.hyperparams.space.html#neuraxle.hyperparams.space.HyperparameterSpace

    decision_tree_classifier = SKLearnWrapper(
        DecisionTreeClassifier(),
        HyperparameterSpace({
            'criterion': Choice(['gini', 'entropy']),
            'splitter': Choice(['best', 'random']),
            'min_samples_leaf': RandInt(2, 5),
            'min_samples_split': RandInt(2, 4)
        }))

    extra_tree_classifier = SKLearnWrapper(
        ExtraTreeClassifier(),
        HyperparameterSpace({
            'criterion': Choice(['gini', 'entropy']),
            'splitter': Choice(['best', 'random']),
            'min_samples_leaf': RandInt(2, 5),
            'min_samples_split': RandInt(2, 4)
        }))

    ridge_classifier = Pipeline([
        OutputTransformerWrapper(NumpyRavel()),
        SKLearnWrapper(
            RidgeClassifier(),
            HyperparameterSpace({
                'alpha': Choice([0.0, 1.0, 10.0, 100.0]),
                'fit_intercept': Boolean(),
                'normalize': Boolean()
            }))
    ]).set_name('RidgeClassifier')

    logistic_regression = Pipeline([
        OutputTransformerWrapper(NumpyRavel()),
        SKLearnWrapper(
            LogisticRegression(),
            HyperparameterSpace({
                'C': LogUniform(0.01, 10.0),
                'fit_intercept': Boolean(),
                'penalty': Choice(['none', 'l2']),
                'max_iter': RandInt(20, 200)
            }))
    ]).set_name('LogisticRegression')

    random_forest_classifier = Pipeline([
        OutputTransformerWrapper(NumpyRavel()),
        SKLearnWrapper(
            RandomForestClassifier(),
            HyperparameterSpace({
                'n_estimators': RandInt(50, 600),
                'criterion': Choice(['gini', 'entropy']),
                'min_samples_leaf': RandInt(2, 5),
                'min_samples_split': RandInt(2, 4),
                'bootstrap': Boolean()
            }))
    ]).set_name('RandomForestClassifier')

    # Define a classification pipeline that lets the AutoML loop choose one of the classifiers.
    # See also ChooseOneStepOf documentation:
    # https://www.neuraxle.org/stable/api/neuraxle.steps.flow.html#neuraxle.steps.flow.ChooseOneStepOf
    pipeline = Pipeline([
        ChooseOneStepOf([
            decision_tree_classifier,
            extra_tree_classifier,
            ridge_classifier,
            logistic_regression,
            random_forest_classifier
        ])
    ])

    # Create the AutoML loop object.
    # See also AutoML documentation:
    # https://www.neuraxle.org/stable/api/neuraxle.metaopt.auto_ml.html#neuraxle.metaopt.auto_ml.AutoML
    auto_ml = AutoML(
        pipeline=pipeline,
        hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(),
        validation_splitter=ValidationSplitter(test_size=0.20),
        scoring_callback=ScoringCallback(accuracy_score, higher_score_is_better=True),
        n_trials=7,
        epochs=1,
        hyperparams_repository=HyperparamsJSONRepository(cache_folder='cache'),
        refit_trial=True,
        continue_loop_on_error=False)

    # Load data, and launch the AutoML loop!
    X_train, y_train, X_test, y_test = generate_classification_data()
    auto_ml = auto_ml.fit(X_train, y_train)

    # Get the model from the best trial, and make predictions using predict.
    # See also predict documentation:
    # https://www.neuraxle.org/stable/api/neuraxle.base.html#neuraxle.base.BaseStep.predict
    best_pipeline = auto_ml.get_best_model()
    y_pred = best_pipeline.predict(X_test)

    accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
    print("Test accuracy score:", accuracy)

    shutil.rmtree('cache')