def _fit_dummy(self):
    if self.dummy_estimator is not None:
        logger.info("Running Dummy Estimator...")
        try:
            if isinstance(self._validation_X, np.ndarray):
                if len(self._validation_X.shape) > 2:
                    logger.info(
                        "Skipping dummy estimator because of too many dimensions"
                    )
                    self.result_object.dummy_results = None
                    return
            # the dummy estimator ignores its feature input, so the reshaped targets
            # simply act as a placeholder X
            dummy_y = np.reshape(self._validation_y, (-1, 1))
            self.dummy_estimator.fit(dummy_y, self._validation_y)
            train_scores = InnerFoldManager.score(
                self.dummy_estimator,
                self._validation_X,
                self._validation_y,
                metrics=self.optimization_info.metrics,
            )

            # fill result tree with fold information
            inner_fold = MDBInnerFold()
            inner_fold.training = train_scores

            if self.cross_validaton_info.eval_final_performance:
                test_scores = InnerFoldManager.score(
                    self.dummy_estimator,
                    self._test_X,
                    self._test_y,
                    metrics=self.optimization_info.metrics,
                )
                print_metrics("DUMMY", test_scores.metrics)
                inner_fold.validation = test_scores

            self.result_object.dummy_results = inner_fold

            # performance constraints: DummyEstimator
            if self.constraint_objects is not None:
                dummy_constraint_objs = [
                    opt
                    for opt in self.constraint_objects
                    if isinstance(opt, DummyPerformance)
                ]
                if dummy_constraint_objs:
                    for dummy_constraint_obj in dummy_constraint_objs:
                        dummy_constraint_obj.set_dummy_performance(
                            self.result_object.dummy_results
                        )

            return inner_fold
        except Exception as e:
            logger.error(e)
            logger.info("Skipping dummy because of error..")
            return None
    else:
        logger.info("Skipping dummy ..")
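# Illustrative sketch, not part of the class above: the dummy estimator serves as a naive
# baseline to compare real configurations against. scikit-learn's dummy estimators ignore
# their feature input entirely, which is why fitting on reshaped targets as placeholder X
# (as done in _fit_dummy) is harmless. All names and data below are invented for the example.
def _dummy_baseline_example():
    import numpy as np
    from sklearn.dummy import DummyClassifier
    from sklearn.metrics import balanced_accuracy_score

    rng = np.random.default_rng(42)
    X_train, y_train = rng.normal(size=(100, 5)), rng.integers(0, 2, size=100)
    X_test, y_test = rng.normal(size=(40, 5)), rng.integers(0, 2, size=40)

    dummy = DummyClassifier(strategy="most_frequent")
    dummy.fit(X_train, y_train)  # X is ignored internally
    return balanced_accuracy_score(y_test, dummy.predict(X_test))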
def test_get_optimum_config_outer_folds(self):
    my_pipe_optimizer = Hyperpipe.Optimization(
        "grid_search", {}, [], "balanced_accuracy", None
    )

    outer_fold_list = list()
    for i in range(10):
        outer_fold = MDBOuterFold()
        outer_fold.best_config = MDBConfig()
        outer_fold.best_config.best_config_score = MDBInnerFold()
        outer_fold.best_config.best_config_score.validation = MDBScoreInformation()
        # again fold 5 wins
        if i == 5:
            outer_fold.best_config.best_config_score.validation.metrics = {
                "balanced_accuracy": 0.99
            }
        else:
            outer_fold.best_config.best_config_score.validation.metrics = {
                "balanced_accuracy": 0.5
            }
        outer_fold_list.append(outer_fold)

    best_config_outer_folds = my_pipe_optimizer.get_optimum_config_outer_folds(
        outer_fold_list
    )
    self.assertEqual(
        best_config_outer_folds.best_config_score.validation.metrics[
            "balanced_accuracy"
        ],
        0.99,
    )
    self.assertIs(best_config_outer_folds, outer_fold_list[5].best_config)
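# Simplified sketch of the selection rule the test above exercises, assuming the metric is
# maximized: return the best_config whose stored validation score is highest. This is an
# illustration only, not the library's actual get_optimum_config_outer_folds implementation.
def _pick_best_outer_fold_config(outer_fold_list, metric="balanced_accuracy"):
    return max(
        (fold.best_config for fold in outer_fold_list),
        key=lambda cfg: cfg.best_config_score.validation.metrics[metric],
    )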
def update_config_item_with_inner_fold(config_item, fold_cnt, curr_train_fold, curr_test_fold,
                                       time_monitor, feature_importances, learning_curves):
    # fill result tree with fold information
    inner_fold = MDBInnerFold()
    inner_fold.fold_nr = fold_cnt
    inner_fold.training = curr_train_fold
    inner_fold.validation = curr_test_fold
    inner_fold.number_samples_validation = len(curr_test_fold.indices)
    inner_fold.number_samples_training = len(curr_train_fold.indices)
    inner_fold.time_monitor = time_monitor
    inner_fold.feature_importances = feature_importances
    inner_fold.learning_curves = learning_curves
    # save all inner folds to the tree under the config item
    config_item.inner_folds.append(inner_fold)
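# Hypothetical usage of the helper above; every concrete value is invented and only meant
# to show the expected shapes (score objects carrying an indices list, a config item with
# an inner_folds list).
def _update_config_item_example():
    config_item = MDBConfig()
    config_item.inner_folds = []
    train_fold, test_fold = MDBScoreInformation(), MDBScoreInformation()
    train_fold.indices, test_fold.indices = list(range(80)), list(range(80, 100))
    update_config_item_with_inner_fold(
        config_item,
        fold_cnt=1,
        curr_train_fold=train_fold,
        curr_test_fold=test_fold,
        time_monitor={},  # assumed: whatever timing structure the caller tracked
        feature_importances=[],
        learning_curves=[],
    )
    assert len(config_item.inner_folds) == 1
    return config_item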
def setUp(self):
    """Set default start settings for all tests."""
    self.constraint_object = PhotonBaseConstraint(
        strategy="first", metric="mean_squared_error", margin=0.1
    )

    metrics_list = ["f1_score", "mean_squared_error"]

    self.dummy_config_item = MDBConfig()
    self.dummy_config_item.inner_folds = []
    for i in range(5):
        inner_fold = MDBInnerFold()
        inner_fold.validation = MDBScoreInformation()
        for metric in metrics_list:
            # np.random.randint(0, 1) always returns 0 (the upper bound is exclusive),
            # so every metric value here is effectively the constant 0.0001
            inner_fold.validation.metrics[metric] = (
                np.random.randint(0, 1) / 2 + 0.0001
            )
        self.dummy_config_item.inner_folds.append(inner_fold)

    self.dummy_linear_config_item = MDBConfig()
    self.dummy_linear_config_item.inner_folds = []
    for i in range(5):
        inner_fold = MDBInnerFold()
        inner_fold.validation = MDBScoreInformation()
        for metric in metrics_list:
            inner_fold.validation.metrics[metric] = i / 4
        self.dummy_linear_config_item.inner_folds.append(inner_fold)
def fit(self, X, y=None, **kwargs):
    logger.photon_system_log('')
    logger.photon_system_log(
        '***************************************************************************************************************'
    )
    logger.photon_system_log('Outer Cross validation Fold {}'.format(
        self.cross_validaton_info.outer_folds[self.outer_fold_id].fold_nr))
    logger.photon_system_log(
        '***************************************************************************************************************'
    )

    self._prepare_data(X, y, **kwargs)
    self._fit_dummy()
    self._generate_inner_folds()
    self._prepare_optimization()

    outer_fold_fit_start_time = datetime.datetime.now()
    self.best_metric_yet = None
    self.tested_config_counter = 0

    # distribute number of folds to encapsulated child hyperpipes
    # self.__distribute_cv_info_to_hyperpipe_children(num_of_folds=num_folds,
    #                                                 outer_fold_counter=outer_fold_counter)

    if self.cross_validaton_info.calculate_metrics_per_fold:
        self.fold_operation = FoldOperations.MEAN
    else:
        self.fold_operation = FoldOperations.RAW

    self.max_nr_of_configs = ''
    if hasattr(self.optimizer, 'n_configurations'):
        self.max_nr_of_configs = str(self.optimizer.n_configurations)

    if isinstance(self.optimizer, PhotonMasterOptimizer):
        self.optimizer.optimize()
    else:
        # do the optimizing
        for current_config in self.optimizer.ask:
            self.objective_function(current_config)

    logger.clean_info(
        '---------------------------------------------------------------------------------------------------------------'
    )
    logger.info('Hyperparameter Optimization finished. Now finding best configuration .... ')
    print(self.tested_config_counter)

    # now go on with the best config found
    if self.tested_config_counter > 0:
        best_config_outer_fold = self.optimization_info.get_optimum_config(
            self.result_object.tested_config_list, self.fold_operation)

        if not best_config_outer_fold:
            raise Exception("No best config was found!")

        # ... and create optimal pipeline
        optimum_pipe = self.copy_pipe_fnc()
        if self.cache_updater is not None:
            self.cache_updater(optimum_pipe, self.cache_folder, "fixed_fold_id")
        optimum_pipe.caching = False
        # set self to best config
        optimum_pipe.set_params(**best_config_outer_fold.config_dict)

        # Todo: set all children to best config and inform to NOT optimize again, ONLY fit
        # for child_name, child_config in best_config_outer_fold_mdb.children_config_dict.items():
        #     if child_config:
        #         # in case we have a pipeline stacking we need to identify the particular subhyperpipe
        #         splitted_name = child_name.split('__')
        #         if len(splitted_name) > 1:
        #             stacking_element = self.optimum_pipe.named_steps[splitted_name[0]]
        #             pipe_element = stacking_element.elements[splitted_name[1]]
        #         else:
        #             pipe_element = self.optimum_pipe.named_steps[child_name]
        #         pipe_element.set_params(**child_config)
        #         pipe_element.is_final_fit = True

        # self.__distribute_cv_info_to_hyperpipe_children(reset=True)

        logger.debug('Fitting model with best configuration of outer fold...')
        optimum_pipe.fit(self._validation_X, self._validation_y,
                         **self._validation_kwargs)

        self.result_object.best_config = best_config_outer_fold

        # save test performance
        best_config_performance_mdb = MDBInnerFold()
        best_config_performance_mdb.fold_nr = -99
        best_config_performance_mdb.number_samples_training = self._validation_y.shape[0]
        best_config_performance_mdb.number_samples_validation = self._test_y.shape[0]
        best_config_performance_mdb.feature_importances = optimum_pipe.feature_importances_

        if self.cross_validaton_info.eval_final_performance:
            # Todo: generate mean and std over outer folds as well. Move these items to the top.
            logger.info('Calculating best model performance on test set...')

            logger.debug('...scoring test data')
            test_score_mdb = InnerFoldManager.score(
                optimum_pipe,
                self._test_X,
                self._test_y,
                indices=self.cross_validaton_info.outer_folds[
                    self.outer_fold_id].test_indices,
                metrics=self.optimization_info.metrics,
                **self._test_kwargs)

            logger.debug('... scoring training data')
            train_score_mdb = InnerFoldManager.score(
                optimum_pipe,
                self._validation_X,
                self._validation_y,
                indices=self.cross_validaton_info.outer_folds[
                    self.outer_fold_id].train_indices,
                metrics=self.optimization_info.metrics,
                training=True,
                **self._validation_kwargs)

            best_config_performance_mdb.training = train_score_mdb
            best_config_performance_mdb.validation = test_score_mdb
            print_double_metrics(train_score_mdb.metrics, test_score_mdb.metrics)
        else:

            def _copy_inner_fold_means(metric_dict):
                # We copy all mean values from validation to the best config
                # training
                train_item_metrics = {}
                for m in metric_dict:
                    if m.operation == str(self.fold_operation):
                        train_item_metrics[m.metric_name] = m.value
                train_item = MDBScoreInformation()
                train_item.metrics_copied_from_inner = True
                train_item.metrics = train_item_metrics
                return train_item

            # training
            best_config_performance_mdb.training = _copy_inner_fold_means(
                best_config_outer_fold.metrics_train)
            # validation
            best_config_performance_mdb.validation = _copy_inner_fold_means(
                best_config_outer_fold.metrics_test)

        # write best config performance to best config item
        self.result_object.best_config.best_config_score = best_config_performance_mdb

    logger.info('Computations in outer fold {} took {} minutes.'.format(
        self.cross_validaton_info.outer_folds[self.outer_fold_id].fold_nr,
        (datetime.datetime.now() - outer_fold_fit_start_time).total_seconds() / 60))
def fit(self, X, y=None, **kwargs):
    logger.photon_system_log("")
    logger.photon_system_log(
        "********************************************************"
    )
    logger.photon_system_log(
        "Outer Cross validation Fold {}".format(
            self.cross_validaton_info.outer_folds[self.outer_fold_id].fold_nr
        )
    )
    logger.photon_system_log(
        "********************************************************"
    )

    self._prepare_data(X, y, **kwargs)
    self._fit_dummy()
    self._generate_inner_folds()
    self._prepare_optimization()

    outer_fold_fit_start_time = datetime.datetime.now()
    best_metric_yet = None
    tested_config_counter = 0

    # distribute number of folds to encapsulated child hyperpipes
    # self.__distribute_cv_info_to_hyperpipe_children(num_of_folds=num_folds,
    #                                                 outer_fold_counter=outer_fold_counter)

    if self.cross_validaton_info.calculate_metrics_per_fold:
        fold_operation = FoldOperations.MEAN
    else:
        fold_operation = FoldOperations.RAW

    max_nr_of_configs = ""
    if hasattr(self.optimizer, "n_configurations"):
        max_nr_of_configs = str(self.optimizer.n_configurations)

    # do the optimizing
    for current_config in self.optimizer.ask:
        if current_config is None:
            continue

        logger.clean_info(
            "---------------------------------------------------------------------------------------------------------------"
        )
        tested_config_counter += 1

        if hasattr(self.optimizer, "ask_for_pipe"):
            pipe_ctor = self.optimizer.ask_for_pipe()
        else:
            pipe_ctor = self.copy_pipe_fnc

        # self.__distribute_cv_info_to_hyperpipe_children(reset=True, config_counter=tested_config_counter)

        hp = InnerFoldManager(
            pipe_ctor,
            current_config,
            self.optimization_info,
            self.cross_validaton_info,
            self.outer_fold_id,
            self.constraint_objects,
            cache_folder=self.cache_folder,
            cache_updater=self.cache_updater,
        )

        # Test the configuration cross validated by inner_cv object
        current_config_mdb = hp.fit(
            self._validation_X, self._validation_y, **self._validation_kwargs
        )
        current_config_mdb.config_nr = tested_config_counter

        if not current_config_mdb.config_failed:
            metric_train = MDBHelper.get_metric(
                current_config_mdb,
                fold_operation,
                self.optimization_info.best_config_metric,
            )
            metric_test = MDBHelper.get_metric(
                current_config_mdb,
                fold_operation,
                self.optimization_info.best_config_metric,
                train=False,
            )

            if metric_train is None or metric_test is None:
                raise Exception(
                    "Config did not fail, but did not get any metrics either....!!?"
                )

            config_performance = (metric_train, metric_test)
            if best_metric_yet is None:
                best_metric_yet = config_performance
                self.current_best_config = current_config_mdb
            else:
                # check if we have the next superstar around that exceeds any old performance
                if self.optimization_info.maximize_metric:
                    if metric_test > best_metric_yet[1]:
                        best_metric_yet = config_performance
                        self.current_best_config.save_memory()
                        self.current_best_config = current_config_mdb
                    else:
                        current_config_mdb.save_memory()
                else:
                    if metric_test < best_metric_yet[1]:
                        best_metric_yet = config_performance
                        self.current_best_config.save_memory()
                        self.current_best_config = current_config_mdb
                    else:
                        current_config_mdb.save_memory()

            # Print Result for config
            computation_duration = (
                current_config_mdb.computation_end_time
                - current_config_mdb.computation_start_time
            )
            logger.info(
                "Computed configuration "
                + str(tested_config_counter)
                + "/"
                + max_nr_of_configs
                + " in "
                + str(computation_duration)
            )
            logger.info(
                "Performance: "
                + self.optimization_info.best_config_metric
                + " - Train: "
                + "%.4f" % config_performance[0]
                + ", Validation: "
                + "%.4f" % config_performance[1]
            )
            logger.info(
                "Best Performance So Far: "
                + self.optimization_info.best_config_metric
                + " - Train: "
                + "%.4f" % best_metric_yet[0]
                + ", Validation: "
                + "%.4f" % best_metric_yet[1]
            )
        else:
            config_performance = (-1, -1)
            # Print Result for config
            logger.debug("...failed:")
            logger.error(current_config_mdb.config_error)

        # add config to result tree
        self.result_object.tested_config_list.append(current_config_mdb)

        # 3. inform optimizer about performance
        logger.debug("Telling hyperparameter optimizer about recent performance.")
        self.optimizer.tell(current_config, config_performance)
        logger.debug("Asking hyperparameter optimizer for new config.")

    logger.clean_info(
        "---------------------------------------------------------------------------------------------------------------"
    )
    logger.info(
        "Hyperparameter Optimization finished. Now finding best configuration .... "
    )

    # now go on with the best config found
    if tested_config_counter > 0:
        best_config_outer_fold = self.optimization_info.get_optimum_config(
            self.result_object.tested_config_list, fold_operation
        )

        if not best_config_outer_fold:
            raise Exception("No best config was found!")

        # ... and create optimal pipeline
        optimum_pipe = self.copy_pipe_fnc()
        if self.cache_updater is not None:
            self.cache_updater(optimum_pipe, self.cache_folder, "fixed_fold_id")
        optimum_pipe.caching = False
        # set self to best config
        optimum_pipe.set_params(**best_config_outer_fold.config_dict)

        # Todo: set all children to best config and inform to NOT optimize again, ONLY fit
        # for child_name, child_config in best_config_outer_fold_mdb.children_config_dict.items():
        #     if child_config:
        #         # in case we have a pipeline stacking we need to identify the particular subhyperpipe
        #         splitted_name = child_name.split('__')
        #         if len(splitted_name) > 1:
        #             stacking_element = self.optimum_pipe.named_steps[splitted_name[0]]
        #             pipe_element = stacking_element.elements[splitted_name[1]]
        #         else:
        #             pipe_element = self.optimum_pipe.named_steps[child_name]
        #         pipe_element.set_params(**child_config)
        #         pipe_element.is_final_fit = True

        # self.__distribute_cv_info_to_hyperpipe_children(reset=True)

        logger.debug("Fitting model with best configuration of outer fold...")
        optimum_pipe.fit(
            self._validation_X, self._validation_y, **self._validation_kwargs
        )

        self.result_object.best_config = best_config_outer_fold

        # save test performance
        best_config_performance_mdb = MDBInnerFold()
        best_config_performance_mdb.fold_nr = -99
        best_config_performance_mdb.number_samples_training = self._validation_y.shape[0]
        best_config_performance_mdb.number_samples_validation = self._test_y.shape[0]
        best_config_performance_mdb.feature_importances = (
            optimum_pipe.feature_importances_
        )

        if self.cross_validaton_info.eval_final_performance:
            # Todo: generate mean and std over outer folds as well. Move these items to the top.
            logger.info("Calculating best model performance on test set...")

            logger.debug("...scoring test data")
            test_score_mdb = InnerFoldManager.score(
                optimum_pipe,
                self._test_X,
                self._test_y,
                indices=self.cross_validaton_info.outer_folds[
                    self.outer_fold_id
                ].test_indices,
                metrics=self.optimization_info.metrics,
                **self._test_kwargs
            )

            logger.debug("... scoring training data")
            train_score_mdb = InnerFoldManager.score(
                optimum_pipe,
                self._validation_X,
                self._validation_y,
                indices=self.cross_validaton_info.outer_folds[
                    self.outer_fold_id
                ].train_indices,
                metrics=self.optimization_info.metrics,
                training=True,
                **self._validation_kwargs
            )

            best_config_performance_mdb.training = train_score_mdb
            best_config_performance_mdb.validation = test_score_mdb
            print_double_metrics(train_score_mdb.metrics, test_score_mdb.metrics)
        else:

            def _copy_inner_fold_means(metric_dict):
                # We copy all mean values from validation to the best config
                # training
                train_item_metrics = {}
                for m in metric_dict:
                    if m.operation == str(fold_operation):
                        train_item_metrics[m.metric_name] = m.value
                train_item = MDBScoreInformation()
                train_item.metrics_copied_from_inner = True
                train_item.metrics = train_item_metrics
                return train_item

            # training
            best_config_performance_mdb.training = _copy_inner_fold_means(
                best_config_outer_fold.metrics_train
            )
            # validation
            best_config_performance_mdb.validation = _copy_inner_fold_means(
                best_config_outer_fold.metrics_test
            )

        # write best config performance to best config item
        self.result_object.best_config.best_config_score = (
            best_config_performance_mdb
        )

    logger.info(
        "Computations in outer fold {} took {} minutes.".format(
            self.cross_validaton_info.outer_folds[self.outer_fold_id].fold_nr,
            (datetime.datetime.now() - outer_fold_fit_start_time).total_seconds()
            / 60,
        )
    )
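# Standalone sketch of the "best configuration so far" rule used in the optimization loop
# above: configurations are compared on the validation entry of the (train, validation)
# performance tuple, with the comparison direction set by whether the optimized metric is
# maximized. This helper is illustrative only and not part of the class.
def _is_new_best(config_performance, best_metric_yet, maximize_metric):
    if best_metric_yet is None:
        return True
    if maximize_metric:
        return config_performance[1] > best_metric_yet[1]
    return config_performance[1] < best_metric_yet[1]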