def print_double_metrics(metric_dict_train, metric_dict_test, photon_system_log=True):
    t = PrettyTable(["METRIC", "PERFORMANCE TRAIN", "PERFORMANCE TEST"])
    for m_key, m_value in metric_dict_train.items():
        t.add_row([m_key, "%.4f" % m_value, "%.4f" % metric_dict_test[m_key]])
    if photon_system_log:
        logger.photon_system_log(t)
    else:
        logger.debug(t)
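# --- Illustrative usage (not part of the library) ---
# A minimal sketch of how print_double_metrics might be called: it expects two dicts
# mapping metric names to float values (e.g. the .metrics attribute of the score
# objects returned by InnerFoldManager.score). The metric names and numbers below are
# invented for illustration only:
#
#     train_metrics = {"accuracy": 0.91, "f1_score": 0.88}
#     test_metrics = {"accuracy": 0.84, "f1_score": 0.80}
#     print_double_metrics(train_metrics, test_metrics)
#
# With photon_system_log=False the table is written at debug level instead.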
def fit(self, X, y=None, **kwargs):
    logger.photon_system_log('')
    logger.photon_system_log(
        '***************************************************************************************************************'
    )
    logger.photon_system_log('Outer Cross validation Fold {}'.format(
        self.cross_validaton_info.outer_folds[self.outer_fold_id].fold_nr))
    logger.photon_system_log(
        '***************************************************************************************************************'
    )
    self._prepare_data(X, y, **kwargs)
    self._fit_dummy()
    self._generate_inner_folds()
    self._prepare_optimization()

    outer_fold_fit_start_time = datetime.datetime.now()
    self.best_metric_yet = None
    self.tested_config_counter = 0

    # distribute number of folds to encapsulated child hyperpipes
    # self.__distribute_cv_info_to_hyperpipe_children(num_of_folds=num_folds,
    #                                                 outer_fold_counter=outer_fold_counter)

    if self.cross_validaton_info.calculate_metrics_per_fold:
        self.fold_operation = FoldOperations.MEAN
    else:
        self.fold_operation = FoldOperations.RAW

    self.max_nr_of_configs = ''
    if hasattr(self.optimizer, 'n_configurations'):
        self.max_nr_of_configs = str(self.optimizer.n_configurations)

    if isinstance(self.optimizer, PhotonMasterOptimizer):
        self.optimizer.optimize()
    else:
        # do the optimizing
        for current_config in self.optimizer.ask:
            self.objective_function(current_config)

    logger.clean_info(
        '---------------------------------------------------------------------------------------------------------------'
    )
    logger.info(
        'Hyperparameter Optimization finished. Now finding best configuration .... '
    )
    print(self.tested_config_counter)

    # now go on with the best config found
    if self.tested_config_counter > 0:
        best_config_outer_fold = self.optimization_info.get_optimum_config(
            self.result_object.tested_config_list, self.fold_operation)

        if not best_config_outer_fold:
            raise Exception("No best config was found!")

        # ... and create optimal pipeline
        optimum_pipe = self.copy_pipe_fnc()
        if self.cache_updater is not None:
            self.cache_updater(optimum_pipe, self.cache_folder, "fixed_fold_id")
        optimum_pipe.caching = False
        # set self to best config
        optimum_pipe.set_params(**best_config_outer_fold.config_dict)

        # Todo: set all children to best config and inform to NOT optimize again, ONLY fit
        # for child_name, child_config in best_config_outer_fold_mdb.children_config_dict.items():
        #     if child_config:
        #         # in case we have a pipeline stacking we need to identify the particular subhyperpipe
        #         splitted_name = child_name.split('__')
        #         if len(splitted_name) > 1:
        #             stacking_element = self.optimum_pipe.named_steps[splitted_name[0]]
        #             pipe_element = stacking_element.elements[splitted_name[1]]
        #         else:
        #             pipe_element = self.optimum_pipe.named_steps[child_name]
        #         pipe_element.set_params(**child_config)
        #         pipe_element.is_final_fit = True
        # self.__distribute_cv_info_to_hyperpipe_children(reset=True)

        logger.debug('Fitting model with best configuration of outer fold...')
        optimum_pipe.fit(self._validation_X, self._validation_y,
                         **self._validation_kwargs)

        self.result_object.best_config = best_config_outer_fold

        # save test performance
        best_config_performance_mdb = MDBInnerFold()
        best_config_performance_mdb.fold_nr = -99
        best_config_performance_mdb.number_samples_training = self._validation_y.shape[0]
        best_config_performance_mdb.number_samples_validation = self._test_y.shape[0]
        best_config_performance_mdb.feature_importances = optimum_pipe.feature_importances_

        if self.cross_validaton_info.eval_final_performance:
            # Todo: generate mean and std over outer folds as well. Move these items to the top.
            logger.info('Calculating best model performance on test set...')
            logger.debug('...scoring test data')
            test_score_mdb = InnerFoldManager.score(
                optimum_pipe, self._test_X, self._test_y,
                indices=self.cross_validaton_info.outer_folds[
                    self.outer_fold_id].test_indices,
                metrics=self.optimization_info.metrics,
                **self._test_kwargs)

            logger.debug('... scoring training data')
            train_score_mdb = InnerFoldManager.score(
                optimum_pipe, self._validation_X, self._validation_y,
                indices=self.cross_validaton_info.outer_folds[
                    self.outer_fold_id].train_indices,
                metrics=self.optimization_info.metrics,
                training=True,
                **self._validation_kwargs)

            best_config_performance_mdb.training = train_score_mdb
            best_config_performance_mdb.validation = test_score_mdb

            print_double_metrics(train_score_mdb.metrics, test_score_mdb.metrics)
        else:

            def _copy_inner_fold_means(metric_dict):
                # We copy all mean values from validation to the best config
                train_item_metrics = {}
                for m in metric_dict:
                    if m.operation == str(self.fold_operation):
                        train_item_metrics[m.metric_name] = m.value
                train_item = MDBScoreInformation()
                train_item.metrics_copied_from_inner = True
                train_item.metrics = train_item_metrics
                return train_item

            # training
            best_config_performance_mdb.training = _copy_inner_fold_means(
                best_config_outer_fold.metrics_train)
            # validation
            best_config_performance_mdb.validation = _copy_inner_fold_means(
                best_config_outer_fold.metrics_test)

        # write best config performance to best config item
        self.result_object.best_config.best_config_score = best_config_performance_mdb

    logger.info('Computations in outer fold {} took {} minutes.'.format(
        self.cross_validaton_info.outer_folds[self.outer_fold_id].fold_nr,
        (datetime.datetime.now() - outer_fold_fit_start_time).total_seconds() / 60))
def print_metrics(header, metric_dict):
    t = PrettyTable(["PERFORMANCE " + header, ""])
    for m_key, m_value in metric_dict.items():
        t.add_row([m_key, "%.4f" % m_value])
    logger.photon_system_log(t)
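# --- Illustrative usage (not part of the library) ---
# print_metrics renders a single metric dict under a header column; the metric names
# and values below are invented for illustration only:
#
#     print_metrics("TEST", {"accuracy": 0.84, "f1_score": 0.80})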
def fit(self, X, y=None, **kwargs):
    logger.photon_system_log("")
    logger.photon_system_log(
        "********************************************************"
    )
    logger.photon_system_log(
        "Outer Cross validation Fold {}".format(
            self.cross_validaton_info.outer_folds[self.outer_fold_id].fold_nr
        )
    )
    logger.photon_system_log(
        "********************************************************"
    )
    self._prepare_data(X, y, **kwargs)
    self._fit_dummy()
    self._generate_inner_folds()
    self._prepare_optimization()

    outer_fold_fit_start_time = datetime.datetime.now()
    best_metric_yet = None
    tested_config_counter = 0

    # distribute number of folds to encapsulated child hyperpipes
    # self.__distribute_cv_info_to_hyperpipe_children(num_of_folds=num_folds,
    #                                                 outer_fold_counter=outer_fold_counter)

    if self.cross_validaton_info.calculate_metrics_per_fold:
        fold_operation = FoldOperations.MEAN
    else:
        fold_operation = FoldOperations.RAW

    max_nr_of_configs = ""
    if hasattr(self.optimizer, "n_configurations"):
        max_nr_of_configs = str(self.optimizer.n_configurations)

    # do the optimizing
    for current_config in self.optimizer.ask:
        if current_config is None:
            continue
        logger.clean_info(
            "---------------------------------------------------------------------------------------------------------------"
        )
        tested_config_counter += 1

        if hasattr(self.optimizer, "ask_for_pipe"):
            pipe_ctor = self.optimizer.ask_for_pipe()
        else:
            pipe_ctor = self.copy_pipe_fnc

        # self.__distribute_cv_info_to_hyperpipe_children(reset=True, config_counter=tested_config_counter)

        hp = InnerFoldManager(
            pipe_ctor,
            current_config,
            self.optimization_info,
            self.cross_validaton_info,
            self.outer_fold_id,
            self.constraint_objects,
            cache_folder=self.cache_folder,
            cache_updater=self.cache_updater,
        )

        # Test the configuration cross validated by inner_cv object
        current_config_mdb = hp.fit(
            self._validation_X, self._validation_y, **self._validation_kwargs
        )
        current_config_mdb.config_nr = tested_config_counter

        if not current_config_mdb.config_failed:
            metric_train = MDBHelper.get_metric(
                current_config_mdb,
                fold_operation,
                self.optimization_info.best_config_metric,
            )
            metric_test = MDBHelper.get_metric(
                current_config_mdb,
                fold_operation,
                self.optimization_info.best_config_metric,
                train=False,
            )

            if metric_train is None or metric_test is None:
                raise Exception(
                    "Config did not fail, but did not get any metrics either....!!?"
                )

            config_performance = (metric_train, metric_test)
            if best_metric_yet is None:
                best_metric_yet = config_performance
                self.current_best_config = current_config_mdb
            else:
                # check if we have the next superstar around that exceeds any old performance
                if self.optimization_info.maximize_metric:
                    if metric_test > best_metric_yet[1]:
                        best_metric_yet = config_performance
                        self.current_best_config.save_memory()
                        self.current_best_config = current_config_mdb
                    else:
                        current_config_mdb.save_memory()
                else:
                    if metric_test < best_metric_yet[1]:
                        best_metric_yet = config_performance
                        self.current_best_config.save_memory()
                        self.current_best_config = current_config_mdb
                    else:
                        current_config_mdb.save_memory()

            # Print Result for config
            computation_duration = (
                current_config_mdb.computation_end_time
                - current_config_mdb.computation_start_time
            )
            logger.info(
                "Computed configuration "
                + str(tested_config_counter)
                + "/"
                + max_nr_of_configs
                + " in "
                + str(computation_duration)
            )
            logger.info(
                "Performance: "
                + self.optimization_info.best_config_metric
                + " - Train: "
                + "%.4f" % config_performance[0]
                + ", Validation: "
                + "%.4f" % config_performance[1]
            )
            logger.info(
                "Best Performance So Far: "
                + self.optimization_info.best_config_metric
                + " - Train: "
                + "%.4f" % best_metric_yet[0]
                + ", Validation: "
                + "%.4f" % best_metric_yet[1]
            )
        else:
            config_performance = (-1, -1)
            # Print Result for config
            logger.debug("...failed:")
            logger.error(current_config_mdb.config_error)

        # add config to result tree
        self.result_object.tested_config_list.append(current_config_mdb)

        # 3. inform optimizer about performance
        logger.debug("Telling hyperparameter optimizer about recent performance.")
        self.optimizer.tell(current_config, config_performance)
        logger.debug("Asking hyperparameter optimizer for new config.")

    logger.clean_info(
        "---------------------------------------------------------------------------------------------------------------"
    )
    logger.info(
        "Hyperparameter Optimization finished. Now finding best configuration .... "
    )

    # now go on with the best config found
    if tested_config_counter > 0:
        best_config_outer_fold = self.optimization_info.get_optimum_config(
            self.result_object.tested_config_list, fold_operation
        )

        if not best_config_outer_fold:
            raise Exception("No best config was found!")

        # ... and create optimal pipeline
        optimum_pipe = self.copy_pipe_fnc()
        if self.cache_updater is not None:
            self.cache_updater(optimum_pipe, self.cache_folder, "fixed_fold_id")
        optimum_pipe.caching = False
        # set self to best config
        optimum_pipe.set_params(**best_config_outer_fold.config_dict)

        # Todo: set all children to best config and inform to NOT optimize again, ONLY fit
        # for child_name, child_config in best_config_outer_fold_mdb.children_config_dict.items():
        #     if child_config:
        #         # in case we have a pipeline stacking we need to identify the particular subhyperpipe
        #         splitted_name = child_name.split('__')
        #         if len(splitted_name) > 1:
        #             stacking_element = self.optimum_pipe.named_steps[splitted_name[0]]
        #             pipe_element = stacking_element.elements[splitted_name[1]]
        #         else:
        #             pipe_element = self.optimum_pipe.named_steps[child_name]
        #         pipe_element.set_params(**child_config)
        #         pipe_element.is_final_fit = True
        # self.__distribute_cv_info_to_hyperpipe_children(reset=True)

        logger.debug("Fitting model with best configuration of outer fold...")
        optimum_pipe.fit(
            self._validation_X, self._validation_y, **self._validation_kwargs
        )

        self.result_object.best_config = best_config_outer_fold

        # save test performance
        best_config_performance_mdb = MDBInnerFold()
        best_config_performance_mdb.fold_nr = -99
        best_config_performance_mdb.number_samples_training = self._validation_y.shape[0]
        best_config_performance_mdb.number_samples_validation = self._test_y.shape[0]
        best_config_performance_mdb.feature_importances = (
            optimum_pipe.feature_importances_
        )

        if self.cross_validaton_info.eval_final_performance:
            # Todo: generate mean and std over outer folds as well. Move these items to the top.
            logger.info("Calculating best model performance on test set...")
            logger.debug("...scoring test data")
            test_score_mdb = InnerFoldManager.score(
                optimum_pipe,
                self._test_X,
                self._test_y,
                indices=self.cross_validaton_info.outer_folds[
                    self.outer_fold_id
                ].test_indices,
                metrics=self.optimization_info.metrics,
                **self._test_kwargs
            )

            logger.debug("... scoring training data")
            train_score_mdb = InnerFoldManager.score(
                optimum_pipe,
                self._validation_X,
                self._validation_y,
                indices=self.cross_validaton_info.outer_folds[
                    self.outer_fold_id
                ].train_indices,
                metrics=self.optimization_info.metrics,
                training=True,
                **self._validation_kwargs
            )

            best_config_performance_mdb.training = train_score_mdb
            best_config_performance_mdb.validation = test_score_mdb

            print_double_metrics(train_score_mdb.metrics, test_score_mdb.metrics)
        else:

            def _copy_inner_fold_means(metric_dict):
                # We copy all mean values from validation to the best config
                train_item_metrics = {}
                for m in metric_dict:
                    if m.operation == str(fold_operation):
                        train_item_metrics[m.metric_name] = m.value
                train_item = MDBScoreInformation()
                train_item.metrics_copied_from_inner = True
                train_item.metrics = train_item_metrics
                return train_item

            # training
            best_config_performance_mdb.training = _copy_inner_fold_means(
                best_config_outer_fold.metrics_train
            )
            # validation
            best_config_performance_mdb.validation = _copy_inner_fold_means(
                best_config_outer_fold.metrics_test
            )

        # write best config performance to best config item
        self.result_object.best_config.best_config_score = (
            best_config_performance_mdb
        )

    logger.info(
        "Computations in outer fold {} took {} minutes.".format(
            self.cross_validaton_info.outer_folds[self.outer_fold_id].fold_nr,
            (datetime.datetime.now() - outer_fold_fit_start_time).total_seconds()
            / 60,
        )
    )
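# --- Call pattern (sketch; assumptions flagged) ---
# The constructor of the surrounding manager class is not shown in this excerpt, so the
# variable name below is a placeholder. Based only on the code above, fit() receives the
# data of one outer cross-validation fold; _prepare_data() derives the validation and
# test splits via cross_validaton_info, the optimizer loop evaluates each configuration
# on the validation split, and the best configuration is refit and, if
# eval_final_performance is set, scored on the held-out test split, with all results
# written to result_object:
#
#     outer_fold_manager.fit(X, y)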