def new_result(self, job, update_model=True):
    """
    Function to register finished runs.

    Every time a run has finished, this function should be called to
    register it with the result logger. If overwritten, make sure to call
    this method from the base class to ensure proper logging.

    Parameters
    ----------
    job: hpbandster.distributed.dispatcher.Job object
        contains all the info about the run
    """
    super().new_result(job)

    if job.result is None:
        # One could skip crashed results, but we decided to assign a +inf
        # loss and count them as bad configurations.
        loss = np.inf
    else:
        loss = job.result["loss"]

    budget = job.kwargs["budget"]

    if budget not in self.configs:
        self.configs[budget] = []
        self.losses[budget] = []

    if len(self.configs) == 1:
        min_num_points = 6
    else:
        min_num_points = self.min_points_in_model

    # skip model building if we already have a model for a bigger budget
    if max(list(self.kde_models.keys()) + [-np.inf]) > budget:
        return

    # We want a numerical representation of the configuration in the original space.
    conf = ConfigSpace.Configuration(self.configspace, job.kwargs["config"]).get_array().tolist()

    if conf in self.configs[budget]:
        i = self.configs[budget].index(conf)
        self.losses[budget][i].append(loss)
        print('-' * 50)
        print('ran config %s with loss %f again' % (conf, loss))
    else:
        self.configs[budget].append(conf)
        self.losses[budget].append([loss])

    # skip model building:
    # a) if not enough points are available
    if len(self.configs[budget]) < min_num_points:
        self.logger.debug(
            "Only %i run(s) for budget %f available, need at least %i -> can't build model!"
            % (len(self.configs[budget]), budget, min_num_points))
        return

    # b) during warm starting, when we feed in previous results and only update once at the end
    if not update_model:
        return

    if budget not in self.kde_models:
        self.kde_models[budget] = {
            'good': MultivariateKDE(self.configspace, min_bandwidth=self.min_bandwidth,
                                    fully_dimensional=self.fully_dimensional),
            'bad': MultivariateKDE(self.configspace, min_bandwidth=self.min_bandwidth,
                                   fully_dimensional=self.fully_dimensional)
        }

    num_configs = len(self.losses[budget])
    train_configs = np.array(self.configs[budget][-num_configs:])
    train_losses = np.array(list(map(np.mean, self.losses[budget][-num_configs:])))

    n_good = max(3, (num_configs * self.top_n_percent) // 100)
    n_bad = num_configs - n_good

    # refit the KDEs for the current budget
    idx = np.argsort(train_losses)

    train_data_good = self.impute_conditional_data(train_configs[idx[:n_good]])
    train_data_bad = self.impute_conditional_data(train_configs[idx[n_good:n_good + n_bad + 1]])

    self.kde_models[budget]['bad'].fit(train_data_bad, bw_estimator=self.bw_estimator)
    self.kde_models[budget]['good'].fit(train_data_good, bw_estimator=self.bw_estimator)

    # with fewer than 3 good points, MLCV bandwidths are unreliable, so reuse the 'bad' ones
    if self.bw_estimator in ['mlcv'] and n_good < 3:
        self.kde_models[budget]['good'].bandwidths[:] = self.kde_models[budget]['bad'].bandwidths

    print('=' * 50)
    print(self.kde_models[budget]['good'].bandwidths)
    print(self.kde_models[budget]['good'].data.mean(axis=0))
    print(self.kde_models[budget]['good'].data.std(axis=0))
    print((train_losses[idx])[:n_good])
    print(self.kde_models[budget]['bad'].bandwidths)

    # update probs for the categorical parameters for later sampling
    self.logger.debug(
        'done building a new model for budget %f based on %i/%i split\nBest loss for this budget:%f\n\n\n\n\n'
        % (budget, n_good, n_bad, np.min(train_losses)))
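# ----------------------------------------------------------------------------
# Illustrative sketch (not part of the original code) of how the 'good'/'bad'
# KDE pair fitted above is typically used to propose the next configuration,
# TPE-style: draw random candidates and keep the one maximizing the density
# ratio good(x)/bad(x). Only MultivariateKDE.pdf() is taken from this code
# base; the function name and num_samples are hypothetical. Assumes numpy is
# imported as np, as in the surrounding module.
def sample_from_model(configspace, kde_good, kde_bad, num_samples=64):
    # candidates in the same numerical representation used for fitting
    candidates = np.array([configspace.sample_configuration().get_array()
                           for _ in range(num_samples)])
    # rank candidates by the density ratio good(x)/bad(x); the small epsilon
    # guards against division by zero where the 'bad' KDE has almost no mass
    ratio = kde_good.pdf(candidates) / (kde_bad.pdf(candidates) + 1e-32)
    return candidates[np.argmax(ratio)]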
class BaseNdTest(object):
    n_train = 128
    n_test = 512

    def setUp(self):
        self.configspace = CS.ConfigurationSpace(42)
        self.add_hyperparameters()

        x_train_confs = [self.configspace.sample_configuration() for i in range(self.n_train)]
        self.x_train = np.array([c.get_array() for c in x_train_confs])

        x_test_confs = [self.configspace.sample_configuration() for i in range(self.n_test)]
        self.x_test = np.array([c.get_array() for c in x_test_confs])

        self.sm_x_train = self.sm_transform_data(self.x_train)
        self.sm_x_test = self.sm_transform_data(self.x_test)

        self.sm_kde = sm.nonparametric.KDEMultivariate(data=self.sm_x_train,
                                                       var_type=self.var_types, bw='cv_ml')
        self.sm_1d_kdes = [
            sm.nonparametric.KDEMultivariate(data=self.sm_x_train[:, i],
                                             var_type=self.var_types[i], bw='cv_ml')
            for i in range(len(self.var_types))]

        self.hp_kde_full = MultivariateKDE(self.configspace, fully_dimensional=True, fix_boundary=False)
        self.hp_kde_factor = MultivariateKDE(self.configspace, fully_dimensional=False, fix_boundary=False)
        self.hp_kde_full.fit(self.x_train, bw_estimator='mlcv')
        self.hp_kde_factor.fit(self.x_train, bw_estimator='mlcv')

    def sm_transform_data(self, data):
        return data

    def tearDown(self):
        self.configspace = None
        self.x_train = None
        self.x_test = None
        self.sm_kde = None
        self.sm_1d_kdes = None
        self.hp_kde_full = None
        self.hp_kde_factor = None

    @unittest.skipIf(rapid_development, "test skipped to accelerate developing new tests")
    def test_bandwidths_estimation(self):
        # This test sometimes fails, as statsmodels uses a different optimizer;
        # we therefore compare with a larger tolerance.
        for d in range(len(self.var_types)):
            self.assertAlmostEqual(self.sm_kde.bw[d], self.hp_kde_full.bandwidths[d], delta=5e-2)
            self.assertAlmostEqual(self.sm_1d_kdes[d].bw[0], self.hp_kde_factor.bandwidths[d], delta=5e-2)

    @unittest.skipIf(rapid_development, "test skipped to accelerate developing new tests")
    def test_pdfs(self):
        for bw in np.logspace(-0.5, -0.1, 5):
            self.sm_kde.bw = np.array([bw] * len(self.var_types))
            self.hp_kde_full.set_bandwidths(np.array([bw] * len(self.var_types)))
            self.hp_kde_factor.set_bandwidths(np.array([bw] * len(self.var_types)))

            p1 = self.sm_kde.pdf(self.sm_x_test)
            p2 = self.hp_kde_full.pdf(self.x_test)
            p3 = self.hp_kde_factor.pdf(self.x_test)

            p4_tmp = []
            for i, kde in enumerate(self.sm_1d_kdes):
                kde.bw = np.array([bw])
                p4_tmp.append(kde.pdf(self.sm_x_test[:, i]))
            # the factorized pdf is the product of the 1d marginal pdfs
            p4 = np.array(p4_tmp).prod(axis=0)

            self.assertTrue(np.allclose(p1, p2))
            self.assertTrue(np.allclose(p3, p4))

    @unittest.skipIf(rapid_development, "test skipped to accelerate developing new tests")
    def test_loo_likelihood(self):
        for bw in np.logspace(-1, -0.1, 5):
            self.sm_kde.bw = np.array([bw] * len(self.var_types))
            self.hp_kde_full.set_bandwidths(np.array([bw] * len(self.var_types)))
            self.hp_kde_factor.set_bandwidths(np.array([bw] * len(self.var_types)))

            sm_full_ll = self.sm_kde.loo_likelihood(bw=np.array([bw] * len(self.var_types)), func=np.log)
            hp_full_ll = self.hp_kde_full.loo_negloglikelihood()
            hp_factor_ll = self.hp_kde_factor.loo_negloglikelihood()

            sm_factor_ll = []
            for i, kde in enumerate(self.sm_1d_kdes):
                kde.bw = np.array([bw])
                sm_factor_ll.append(kde.loo_likelihood(bw=np.array([bw]), func=np.log))
            sm_factor_ll = np.array(sm_factor_ll)

            n = self.x_train.shape[0]

            # note: statsmodels' ll is not normalized, so we have to transform
            # our result to get the same number!
            delta = 1e-2 * np.abs((sm_full_ll + hp_full_ll) / 2)
            self.assertAlmostEqual(sm_full_ll, n * (hp_full_ll - np.log(n - 1)), delta=delta)

            # same here, but it is easier to apply the normalization to the SM KDE's likelihoods
            delta = 1e-2 * np.abs(hp_factor_ll)
            self.assertAlmostEqual(np.sum((sm_factor_ll / n) + np.log(n - 1)), hp_factor_ll, delta=delta)
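# ----------------------------------------------------------------------------
# Why the normalization in test_loo_likelihood works (explanatory sketch, not
# part of the original tests). statsmodels' loo_likelihood(func=np.log)
# returns the negative sum of logs of *unnormalized* leave-one-out kernel
# sums,
#     sm_ll = -sum_i log( sum_{j != i} K(x_i, x_j) ),
# while loo_negloglikelihood averages over points and normalizes each density
# by (n - 1):
#     hp_ll = -(1/n) * sum_i log( sum_{j != i} K(x_i, x_j) / (n - 1) ).
# Eliminating the kernel sums yields the identity asserted above:
#     sm_ll = n * (hp_ll - log(n - 1)).
# A tiny 1d numerical check with a Gaussian kernel (the helper is illustrative):
def _loo_identity_demo(x, bw=0.3):
    # full kernel matrix, then drop the i == j terms for leave-one-out
    K = np.exp(-0.5 * ((x[:, None] - x[None, :]) / bw) ** 2) / (bw * np.sqrt(2 * np.pi))
    np.fill_diagonal(K, 0.0)
    n = len(x)
    sm_ll = -np.sum(np.log(K.sum(axis=1)))
    hp_ll = -np.mean(np.log(K.sum(axis=1) / (n - 1)))
    assert np.isclose(sm_ll, n * (hp_ll - np.log(n - 1)))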
class Base1dTest(object):
    n_train = 128
    n_test = 1024

    def setUp(self):
        self.configspace = CS.ConfigurationSpace(42)
        self.add_hyperparameters()

        x_train_confs = [self.configspace.sample_configuration() for i in range(self.n_train)]
        self.x_train = np.array([c.get_array() for c in x_train_confs])

        x_test_confs = [self.configspace.sample_configuration() for i in range(self.n_test)]
        self.x_test = np.array([c.get_array() for c in x_test_confs])

        self.sm_x_train = self.sm_transform_data(self.x_train)
        self.sm_x_test = self.sm_transform_data(self.x_test)

        self.sm_kde = sm.nonparametric.KDEMultivariate(data=self.sm_x_train,
                                                       var_type=self.var_types, bw='cv_ml')

        self.hp_kde_full = MultivariateKDE(self.configspace, fully_dimensional=True, fix_boundary=False)
        self.hp_kde_factor = MultivariateKDE(self.configspace, fully_dimensional=False, fix_boundary=False)
        self.hp_kde_full.fit(self.x_train, bw_estimator='mlcv')
        self.hp_kde_factor.fit(self.x_train, bw_estimator='mlcv')

    def sm_transform_data(self, data):
        return data

    def tearDown(self):
        self.configspace = None
        self.x_train = None
        self.x_test = None
        self.sm_kde = None
        self.hp_kde_full = None
        self.hp_kde_factor = None

    @unittest.skipIf(rapid_development, "test skipped to accelerate developing new tests")
    def test_bandwidths_estimation(self):
        # This test sometimes fails, as statsmodels uses a different optimizer;
        # we therefore compare with a larger tolerance.
        self.assertAlmostEqual(self.sm_kde.bw[0], self.hp_kde_full.bandwidths[0], delta=2e-3)
        self.assertAlmostEqual(self.sm_kde.bw[0], self.hp_kde_factor.bandwidths[0], delta=2e-3)

    @unittest.skipIf(rapid_development, "test skipped to accelerate developing new tests")
    def test_pdfs(self):
        for bw in np.logspace(-0.5, -0.1, 5):
            self.sm_kde.bw = np.array([bw])
            self.hp_kde_full.set_bandwidths(np.array([bw]))
            self.hp_kde_factor.set_bandwidths(np.array([bw]))

            p1 = self.sm_kde.pdf(self.sm_x_test)
            p2 = self.hp_kde_full.pdf(self.x_test)
            p3 = self.hp_kde_factor.pdf(self.x_test)

            self.assertTrue(np.allclose(p1, p2))
            self.assertTrue(np.allclose(p1, p3))

    @unittest.skipIf(rapid_development, "test skipped to accelerate developing new tests")
    def test_loo_likelihood(self):
        for bw in np.logspace(-1, -0.1, 5):
            self.sm_kde.bw = np.array([bw])
            self.hp_kde_full.set_bandwidths(np.array([bw]))
            self.hp_kde_factor.set_bandwidths(np.array([bw]))

            sm_ll = self.sm_kde.loo_likelihood(bw=np.array([bw]), func=np.log)
            hp_full_ll = self.hp_kde_full.loo_negloglikelihood()
            hp_factor_ll = self.hp_kde_factor.loo_negloglikelihood()

            n = self.x_train.shape[0]
            delta = 1e-3 * np.abs((sm_ll + hp_full_ll) / 2)

            # note: statsmodels' ll is not normalized, so we have to transform
            # our result to get the same number!
            self.assertAlmostEqual(sm_ll, n * (hp_full_ll - np.log(n - 1)), delta=delta)
            self.assertAlmostEqual(sm_ll, n * (hp_factor_ll - np.log(n - 1)), delta=delta)
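# ----------------------------------------------------------------------------
# The Base*Test classes above are mixins, not TestCases: they provide
# setUp/tearDown and the tests, but expect subclasses to define var_types and
# add_hyperparameters(). A minimal sketch of a concrete test (the class name,
# hyperparameter name, and bounds are illustrative):
import ConfigSpace.hyperparameters as CSH

class Test1dContinuous(Base1dTest, unittest.TestCase):
    var_types = 'c'  # statsmodels notation: one continuous variable

    def add_hyperparameters(self):
        self.configspace.add_hyperparameter(
            CSH.UniformFloatHyperparameter('x1', lower=0, upper=1))

if __name__ == '__main__':
    unittest.main()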