def test_automl_early_stopping_callback(tmpdir): # TODO: fix this unit test # Given hp_repository = InMemoryHyperparamsRepository(cache_folder=str(tmpdir)) n_epochs = 60 auto_ml = AutoML( pipeline=Pipeline([ FitTransformCallbackStep().set_name('callback'), MultiplyByN(2).set_hyperparams_space( HyperparameterSpace({'multiply_by': FixedHyperparameter(2)})), NumpyReshape(new_shape=(-1, 1)), linear_model.LinearRegression() ]), hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(), validation_splitter=ValidationSplitter(0.20), scoring_callback=ScoringCallback(mean_squared_error, higher_score_is_better=False), callbacks=[ MetricCallback('mse', metric_function=mean_squared_error, higher_score_is_better=False), ], n_trials=1, refit_trial=True, epochs=n_epochs, hyperparams_repository=hp_repository) # When data_inputs = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) expected_outputs = data_inputs * 2 auto_ml = auto_ml.fit(data_inputs=data_inputs, expected_outputs=expected_outputs) # Then p = auto_ml.get_best_model()
def test_tpe(expected_output_mult, pipeline, tmpdir): # set random seed np.random.seed(77) # Given tpe_scores = Parallel(n_jobs=-2)(delayed(_test_trial_scores)( expected_output_mult, pipeline, TreeParzenEstimatorHyperparameterSelectionStrategy( number_of_initial_random_step=10, quantile_threshold=0.3, number_good_trials_max_cap=25, number_possible_hyperparams_candidates=100, prior_weight=0., use_linear_forgetting_weights=False, number_recent_trial_at_full_weights=25), os.path.join(tmpdir, 'tpe', str(i))) for i in range(4)) random_scores = Parallel(n_jobs=-2)(delayed(_test_trial_scores)( expected_output_mult, pipeline, RandomSearchHyperparameterSelectionStrategy(), os.path.join(tmpdir, 'random', str(i))) for i in range(4)) mean_tpe_score = np.array(tpe_scores).flatten().mean(axis=-1) mean_random_score = np.array(random_scores).flatten().mean(axis=-1) assert mean_tpe_score < mean_random_score
def _make_autoML_loop(tmpdir, p: Pipeline): hp_repository = InMemoryHyperparamsRepository(cache_folder=str(tmpdir) + "_hp") n_epochs = 1 return AutoML( pipeline=p, hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(), validation_splitter=ValidationSplitter(0.20), scoring_callback=ScoringCallback(mean_squared_error, higher_score_is_better=False), n_trials=1, refit_trial=True, epochs=n_epochs, hyperparams_repository=hp_repository, cache_folder_when_no_handle=str(tmpdir), continue_loop_on_error=False )
def __init__(self, wrapped: BaseStep = None, hyperparams_repository: HyperparamsRepository = None, higher_score_is_better=True, n_iter: int = 10, cache_folder_when_no_handle=None): AutoMLSequentialWrapper.__init__( self, wrapped=wrapped, auto_ml_algorithm=AutoMLAlgorithm( hyperparameter_optimizer= RandomSearchHyperparameterSelectionStrategy(), higher_score_is_better=higher_score_is_better, cache_folder_when_no_handle=cache_folder_when_no_handle), hyperparams_repository=hyperparams_repository, n_iters=n_iter)
def test_logger_automl(self, tmpdir): # Given context = ExecutionContext() self.tmpdir = str(tmpdir) hp_repository = HyperparamsJSONRepository(cache_folder=self.tmpdir) n_epochs = 2 n_trials = 4 auto_ml = AutoML( pipeline=Pipeline([ MultiplyByN(2).set_hyperparams_space( HyperparameterSpace( {'multiply_by': FixedHyperparameter(2)})), NumpyReshape(new_shape=(-1, 1)), LoggingStep() ]), hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy( ), validation_splitter=ValidationSplitter(0.20), scoring_callback=ScoringCallback(mean_squared_error, higher_score_is_better=False), n_trials=n_trials, refit_trial=True, epochs=n_epochs, hyperparams_repository=hp_repository, continue_loop_on_error=False) # When data_container = DataContainer( data_inputs=np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]), expected_outputs=np.array([10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0])) auto_ml.handle_fit(data_container, context) # Then file_paths = [ os.path.join(hp_repository.cache_folder, f"trial_{i}.log") for i in range(n_trials) ] assert len(file_paths) == n_trials for f in file_paths: assert os.path.exists(f) for f in file_paths: with open(f, 'r') as f: log = f.readlines() assert len(log) == 36
def __init__(self, number_of_initial_random_step: int = 40, quantile_threshold: float = 0.3, number_good_trials_max_cap: int = 25, number_possible_hyperparams_candidates: int = 100, prior_weight: float = 0., use_linear_forgetting_weights: bool = False, number_recent_trial_at_full_weights: int = 25): super().__init__() self.initial_auto_ml_algo: RandomSearchHyperparameterSelectionStrategy = RandomSearchHyperparameterSelectionStrategy( ) self.number_of_initial_random_step: int = number_of_initial_random_step self.quantile_threshold: float = quantile_threshold self.number_good_trials_max_cap: int = number_good_trials_max_cap self.number_possible_hyperparams_candidates: int = number_possible_hyperparams_candidates self.prior_weight: float = prior_weight self.use_linear_forgetting_weights: bool = use_linear_forgetting_weights self.number_recent_trial_at_full_weights: int = number_recent_trial_at_full_weights
def test_automl_savebestmodel_callback(tmpdir): # Given hp_repository = HyperparamsJSONRepository(cache_folder=str('caching')) validation_splitter = ValidationSplitter(0.20) auto_ml = AutoML( pipeline=Pipeline([ MultiplyByN(2).set_hyperparams_space(HyperparameterSpace({ 'multiply_by': FixedHyperparameter(2) })), NumpyReshape(new_shape=(-1, 1)), linear_model.LinearRegression() ]), validation_splitter=validation_splitter, hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(), scoring_callback=ScoringCallback(mean_squared_error, higher_score_is_better=False), callbacks=[ BestModelCheckpoint() ], n_trials=1, epochs=10, refit_trial=False, print_func=print, hyperparams_repository=hp_repository, continue_loop_on_error=False ) data_inputs = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) expected_outputs = data_inputs * 4 # When auto_ml.fit(data_inputs=data_inputs, expected_outputs=expected_outputs) #Then trials: Trials = hp_repository.load_all_trials() best_trial = trials.get_best_trial() best_trial_score = best_trial.get_validation_score() best_trial.cache_folder = hp_repository.cache_folder best_model = best_trial.get_model('best') _, _, valid_inputs, valid_outputs = ValidationSplitter(0.20).split(data_inputs, expected_outputs) predicted_output = best_model.predict(valid_inputs) score = mean_squared_error(valid_outputs, predicted_output) assert best_trial_score == score
def _test_within_auto_ml_loop(tmpdir, pipeline): X_train = np.random.random((25, 50)).astype(np.float32) Y_train = np.random.random((25, )).astype(np.float32) validation_splitter = KFoldCrossValidationSplitter(3) scoring_callback = ScoringCallback(median_absolute_error, higher_score_is_better=False) auto_ml = AutoML( pipeline=pipeline, hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(), validation_splitter=validation_splitter, scoring_callback=scoring_callback, n_trials=2, epochs=1, hyperparams_repository=HyperparamsJSONRepository(cache_folder=tmpdir), refit_trial=True, continue_loop_on_error=False) auto_ml.fit(X_train, Y_train)
def test_automl_with_kfold(tmpdir): # Given hp_repository = HyperparamsJSONRepository(cache_folder=str('caching')) auto_ml = AutoML( pipeline=Pipeline([ MultiplyByN(2).set_hyperparams_space( HyperparameterSpace({'multiply_by': FixedHyperparameter(2)})), NumpyReshape(new_shape=(-1, 1)), linear_model.LinearRegression() ]), validation_splitter=ValidationSplitter(0.20), hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(), scoring_callback=ScoringCallback(mean_squared_error, higher_score_is_better=False), callbacks=[ MetricCallback('mse', metric_function=mean_squared_error, higher_score_is_better=False), ], n_trials=1, epochs=10, refit_trial=True, print_func=print, hyperparams_repository=hp_repository, continue_loop_on_error=False) data_inputs = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) expected_outputs = data_inputs * 4 # When auto_ml.fit(data_inputs=data_inputs, expected_outputs=expected_outputs) # Then p = auto_ml.get_best_model() outputs = p.transform(data_inputs) mse = mean_squared_error(expected_outputs, outputs) assert mse < 1000
def test_automl_early_stopping_callback(tmpdir): # Given hp_repository = InMemoryHyperparamsRepository(cache_folder=str(tmpdir)) n_epochs = 10 max_epochs_without_improvement = 3 auto_ml = AutoML( pipeline=Pipeline([ MultiplyByN(2).set_hyperparams_space( HyperparameterSpace({'multiply_by': FixedHyperparameter(2)})), NumpyReshape(new_shape=(-1, 1)), ]), hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(), validation_splitter=ValidationSplitter(0.20), scoring_callback=ScoringCallback(mean_squared_error, higher_score_is_better=False), callbacks=[ MetricCallback('mse', metric_function=mean_squared_error, higher_score_is_better=False), EarlyStoppingCallback(max_epochs_without_improvement) ], n_trials=1, refit_trial=True, epochs=n_epochs, hyperparams_repository=hp_repository, continue_loop_on_error=False) # When data_inputs = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) expected_outputs = data_inputs * 2 auto_ml.fit(data_inputs=data_inputs, expected_outputs=expected_outputs) # Then trial = hp_repository.trials[0] assert len(trial.validation_splits) == 1 validation_scores = trial.validation_splits[0].get_validation_scores() nepochs_executed = len(validation_scores) assert nepochs_executed == max_epochs_without_improvement + 1
def main(tmpdir, sleep_time: float = 0.001, n_iter: int = 10): DATA_INPUTS = np.array(range(100)) EXPECTED_OUTPUTS = np.array(range(100, 200)) HYPERPARAMETER_SPACE = HyperparameterSpace({ 'multiplication_1__multiply_by': RandInt(1, 2), 'multiplication_2__multiply_by': RandInt(1, 2), 'multiplication_3__multiply_by': RandInt(1, 2), }) print('Classic Pipeline:') classic_pipeline_folder = os.path.join(str(tmpdir), 'classic') pipeline = Pipeline([ ('multiplication_1', MultiplyByN()), ('sleep_1', ForEachDataInput(Sleep(sleep_time))), ('multiplication_2', MultiplyByN()), ('sleep_2', ForEachDataInput(Sleep(sleep_time))), ('multiplication_3', MultiplyByN()), ], cache_folder=classic_pipeline_folder).set_hyperparams_space(HYPERPARAMETER_SPACE) time_a = time.time() auto_ml = AutoML( pipeline, refit_trial=True, n_trials=n_iter, cache_folder_when_no_handle=classic_pipeline_folder, validation_splitter=ValidationSplitter(0.2), hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(), scoring_callback=ScoringCallback(mean_squared_error, higher_score_is_better=False), callbacks=[ MetricCallback('mse', metric_function=mean_squared_error, higher_score_is_better=False) ], ) auto_ml = auto_ml.fit(DATA_INPUTS, EXPECTED_OUTPUTS) outputs = auto_ml.get_best_model().predict(DATA_INPUTS) time_b = time.time() actual_score = mean_squared_error(EXPECTED_OUTPUTS, outputs) print('{0} seconds'.format(time_b - time_a)) print('output: {0}'.format(outputs)) print('smallest mse: {0}'.format(actual_score)) print('best hyperparams: {0}'.format(pipeline.get_hyperparams())) assert isinstance(actual_score, float) print('Resumable Pipeline:') resumable_pipeline_folder = os.path.join(str(tmpdir), 'resumable') pipeline = ResumablePipeline([ ('multiplication_1', MultiplyByN()), ('ForEach(sleep_1)', ForEachDataInput(Sleep(sleep_time))), ('checkpoint1', ExpandDim(DefaultCheckpoint())), ('multiplication_2', MultiplyByN()), ('sleep_2', ForEachDataInput(Sleep(sleep_time))), ('checkpoint2', ExpandDim(DefaultCheckpoint())), ('multiplication_3', MultiplyByN()) ], cache_folder=resumable_pipeline_folder).set_hyperparams_space(HYPERPARAMETER_SPACE) time_a = time.time() auto_ml = AutoML( pipeline, refit_trial=True, n_trials=n_iter, cache_folder_when_no_handle=resumable_pipeline_folder, validation_splitter=ValidationSplitter(0.2), hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(), scoring_callback=ScoringCallback(mean_squared_error, higher_score_is_better=False), callbacks=[ MetricCallback('mse', metric_function=mean_squared_error, higher_score_is_better=False) ] ) auto_ml = auto_ml.fit(DATA_INPUTS, EXPECTED_OUTPUTS) outputs = auto_ml.get_best_model().predict(DATA_INPUTS) time_b = time.time() pipeline.flush_all_cache() actual_score = mean_squared_error(EXPECTED_OUTPUTS, outputs) print('{0} seconds'.format(time_b - time_a)) print('output: {0}'.format(outputs)) print('smallest mse: {0}'.format(actual_score)) print('best hyperparams: {0}'.format(pipeline.get_hyperparams())) assert isinstance(actual_score, float)
def main(): def accuracy(data_inputs, expected_outputs): return np.mean( np.argmax(np.array(data_inputs), axis=1) == np.argmax( np.array(expected_outputs), axis=1)) # load the dataset df = read_csv('data/winequality-white.csv', sep=';') data_inputs = df.values data_inputs[:, -1] = data_inputs[:, -1] - 1 n_features = data_inputs.shape[1] - 1 n_classes = 10 p = Pipeline([ TrainOnlyWrapper(DataShuffler()), ColumnTransformerInputOutput( input_columns=[( [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], ToNumpy(np.float32) )], output_columns=[(11, Identity())] ), OutputTransformerWrapper(PlotDistribution(column=-1)), MiniBatchSequentialPipeline([ Tensorflow2ModelStep( create_model=create_model, create_loss=create_loss, create_optimizer=create_optimizer ) \ .set_hyperparams(HyperparameterSamples({ 'n_dense_layers': 2, 'input_dim': n_features, 'optimizer': 'adam', 'activation': 'relu', 'kernel_initializer': 'he_uniform', 'learning_rate': 0.01, 'hidden_dim': 20, 'n_classes': 3 })).set_hyperparams_space(HyperparameterSpace({ 'n_dense_layers': RandInt(2, 4), 'hidden_dim_layer_multiplier': Uniform(0.30, 1), 'input_dim': FixedHyperparameter(n_features), 'optimizer': Choice([ OPTIMIZERS.ADAM.value, OPTIMIZERS.SGD.value, OPTIMIZERS.ADAGRAD.value ]), 'activation': Choice([ ACTIVATIONS.RELU.value, ACTIVATIONS.TANH.value, ACTIVATIONS.SIGMOID.value, ACTIVATIONS.ELU.value, ]), 'kernel_initializer': Choice([ KERNEL_INITIALIZERS.GLOROT_NORMAL.value, KERNEL_INITIALIZERS.GLOROT_UNIFORM.value, KERNEL_INITIALIZERS.HE_UNIFORM.value ]), 'learning_rate': LogUniform(0.005, 0.01), 'hidden_dim': RandInt(3, 80), 'n_classes': FixedHyperparameter(n_classes) })) ], batch_size=33), OutputTransformerWrapper(Pipeline([ ExpandDim(), OneHotEncoder(nb_columns=n_classes, name='classes') ])) ]) auto_ml = AutoML( pipeline=p, hyperparams_repository=InMemoryHyperparamsRepository( cache_folder='trials'), hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(), validation_splitter=ValidationSplitter(test_size=0.30), scoring_callback=ScoringCallback(accuracy, higher_score_is_better=True), callbacks=[ MetricCallback( name='classification_report_imbalanced_metric', metric_function=classificaiton_report_imbalanced_metric, higher_score_is_better=True), MetricCallback(name='f1', metric_function=f1_score_weighted, higher_score_is_better=True), MetricCallback(name='recall', metric_function=recall_score_weighted, higher_score_is_better=True), MetricCallback(name='precision', metric_function=precision_score_weighted, higher_score_is_better=True), EarlyStoppingCallback(max_epochs_without_improvement=3) ], n_trials=200, refit_trial=True, epochs=75) auto_ml = auto_ml.fit(data_inputs=data_inputs)
def main(): # Define classification models, and hyperparams. # See also HyperparameterSpace documentation : https://www.neuraxle.org/stable/api/neuraxle.hyperparams.space.html#neuraxle.hyperparams.space.HyperparameterSpace decision_tree_classifier = SKLearnWrapper( DecisionTreeClassifier(), HyperparameterSpace({ 'criterion': Choice(['gini', 'entropy']), 'splitter': Choice(['best', 'random']), 'min_samples_leaf': RandInt(2, 5), 'min_samples_split': RandInt(2, 4) })) extra_tree_classifier = SKLearnWrapper( ExtraTreeClassifier(), HyperparameterSpace({ 'criterion': Choice(['gini', 'entropy']), 'splitter': Choice(['best', 'random']), 'min_samples_leaf': RandInt(2, 5), 'min_samples_split': RandInt(2, 4) })) ridge_classifier = Pipeline([ OutputTransformerWrapper(NumpyRavel()), SKLearnWrapper( RidgeClassifier(), HyperparameterSpace({ 'alpha': Choice([0.0, 1.0, 10.0, 100.0]), 'fit_intercept': Boolean(), 'normalize': Boolean() })) ]).set_name('RidgeClassifier') logistic_regression = Pipeline([ OutputTransformerWrapper(NumpyRavel()), SKLearnWrapper( LogisticRegression(), HyperparameterSpace({ 'C': LogUniform(0.01, 10.0), 'fit_intercept': Boolean(), 'penalty': Choice(['none', 'l2']), 'max_iter': RandInt(20, 200) })) ]).set_name('LogisticRegression') random_forest_classifier = Pipeline([ OutputTransformerWrapper(NumpyRavel()), SKLearnWrapper( RandomForestClassifier(), HyperparameterSpace({ 'n_estimators': RandInt(50, 600), 'criterion': Choice(['gini', 'entropy']), 'min_samples_leaf': RandInt(2, 5), 'min_samples_split': RandInt(2, 4), 'bootstrap': Boolean() })) ]).set_name('RandomForestClassifier') # Define a classification pipeline that lets the AutoML loop choose one of the classifier. # See also ChooseOneStepOf documentation : https://www.neuraxle.org/stable/api/neuraxle.steps.flow.html#neuraxle.steps.flow.ChooseOneStepOf pipeline = Pipeline([ ChooseOneStepOf([ decision_tree_classifier, extra_tree_classifier, ridge_classifier, logistic_regression, random_forest_classifier ]) ]) # Create the AutoML loop object. # See also AutoML documentation : https://www.neuraxle.org/stable/api/neuraxle.metaopt.auto_ml.html#neuraxle.metaopt.auto_ml.AutoML auto_ml = AutoML( pipeline=pipeline, hyperparams_optimizer=RandomSearchHyperparameterSelectionStrategy(), validation_splitter=ValidationSplitter(test_size=0.20), scoring_callback=ScoringCallback(accuracy_score, higher_score_is_better=True), n_trials=7, epochs=1, hyperparams_repository=HyperparamsJSONRepository(cache_folder='cache'), refit_trial=True, continue_loop_on_error=False) # Load data, and launch AutoML loop ! X_train, y_train, X_test, y_test = generate_classification_data() auto_ml = auto_ml.fit(X_train, y_train) # Get the model from the best trial, and make predictions using predict. # See also predict documentation : https://www.neuraxle.org/stable/api/neuraxle.base.html#neuraxle.base.BaseStep.predict best_pipeline = auto_ml.get_best_model() y_pred = best_pipeline.predict(X_test) accuracy = accuracy_score(y_true=y_test, y_pred=y_pred) print("Test accuracy score:", accuracy) shutil.rmtree('cache')