def test_get_training_examples(self):
    env = Hovership()
    truth = SafetyTruth(env)
    vibly_file_path = '../data/ground_truth/from_vibly/hover_map.pickle'
    truth.from_vibly_file(vibly_file_path)

    train_x, train_y = truth.get_training_examples(n_examples=2000)
    self.assertEqual(train_x.shape[0], train_y.shape[0])
    self.assertEqual(train_x.shape[0], 2000)
    self.assertEqual(train_x.shape[1], truth.stateaction_space.index_dim)

    train_x, train_y = truth.get_training_examples(
        n_examples=2000, from_failure=True, viable_proportion=0.6
    )
    self.assertEqual(train_x.shape[0], train_y.shape[0])
    self.assertEqual(train_x.shape[0], 2000)
    self.assertEqual(train_x.shape[1], truth.stateaction_space.index_dim)
    # With viable_proportion=0.6, the first 0.6 * 2000 = 1200 examples are
    # viable (positive measure) and the remaining 800 come from the failure set
    self.assertTrue((train_y[:1200] > 0).all())
    self.assertTrue((train_y[1200:] == 0).all())
class HyperparametersSimulation(Simulation):
    def __init__(self, output_directory, name, max_samples, gamma_optimistic,
                 gamma_cautious, lambda_cautious, shape, ground_truth,
                 random_start=False, every=50):
        x_seed = np.array([1.45, 0.5])
        y_seed = np.array([.8])
        dynamics_parameters = {'shape': shape}
        self.env = Hovership(
            random_start=random_start,
            dynamics_parameters=dynamics_parameters,
            default_initial_state=x_seed[:1]
        )
        self.ground_truth = SafetyTruth(self.env)
        self.ground_truth.from_vibly_file(ground_truth)
        self.hyperparameters = {
            'outputscale_prior': (0.4, 2),
            'lengthscale_prior': (0.2, 0.2),
            'noise_prior': (0.001, 0.002)
        }
        self.agent = SafetyLearner(
            env=self.env,
            gamma_optimistic=gamma_optimistic,
            gamma_cautious=gamma_cautious,
            lambda_cautious=lambda_cautious,
            x_seed=x_seed,
            y_seed=y_seed,
            gp_params=self.hyperparameters,
        )
        self.agent.reset()
        plotters = {
            'Safety': SafetyPlotter(self.agent, self.ground_truth)
        }
        super(HyperparametersSimulation, self).__init__(
            output_directory, name, plotters
        )
        self.max_samples = max_samples
        self.every = every
        self.random_start = random_start

    def run(self):
        self.run_optim()
        self.run_learning()

    def run_optim(self):
        train_x, train_y = self.ground_truth.get_training_examples(
            n_examples=2000, from_viable=True, from_failure=False
        )
        self.agent.fit_models(train_x, train_y, epochs=20)

    def run_learning(self):
        # Anneal gamma_measure from its optimistic start value up to
        # gamma_cautious over the course of training
        gamma_optim_increment = (
            self.agent.gamma_cautious - self.agent.safety_model.gamma_measure
        ) / self.max_samples
        n_samples = 0
        self.save_figs(prefix='0')
        while n_samples < self.max_samples:
            failed = self.agent.failed
            n_steps = 0
            while not failed and n_steps < 50:
                n_samples += 1
                n_steps += 1
                old_state = self.agent.state
                new_state, reward, failed, _ = self.agent.step()
                action = self.agent.last_action
                self.on_run_iteration(
                    n_samples, old_state, action, new_state, reward, failed
                )
                if n_samples >= self.max_samples:
                    break
            if self.random_start:
                reset_state = np.atleast_1d(
                    np.random.choice(np.linspace(0, 1.5, 100))
                )
                self.agent.reset(reset_state)
            else:
                reset_state = self.agent.get_random_safe_state()
                if reset_state is None:
                    raise Exception('The whole measure is 0. There is no '
                                    'safe action.')
                self.agent.reset(reset_state)
            self.agent.safety_model.gamma_measure += gamma_optim_increment
        self.compile_gif()

    def on_run_iteration(self, n_samples, old_state, action, new_state,
                         reward, failed):
        super(HyperparametersSimulation, self).on_run_iteration(
            old_state, action, new_state, reward, failed
        )
        print(f'Step {n_samples}/{self.max_samples} - {old_state} '
              f'-> {action} -> {new_state} ({failed})')
        if n_samples % self.every == 0:
            self.save_figs(prefix=f'{n_samples}')
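# A minimal usage sketch (illustrative, not from the original source). The
# parameter values and the shape below are assumptions chosen only to show the
# call signature; the ground-truth path matches the one used elsewhere in this
# repository:
#
# sim = HyperparametersSimulation(
#     output_directory='results',
#     name='hyperparameters',
#     max_samples=500,
#     gamma_optimistic=0.6,
#     gamma_cautious=0.9,
#     lambda_cautious=0.05,
#     shape=(201, 151),
#     ground_truth='../data/ground_truth/from_vibly/hover_map.pickle',
# )
# sim.run()  # optimizes the GP hyperparameters, then runs safe learning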
class EpsCorlSimulation(ModelLearningSimulation):
    def __init__(self, name, max_samples, greed, step_size, discount_rate,
                 gamma_optimistic, gamma_cautious, lambda_cautious,
                 q_x_seed, q_y_seed, s_x_seed, s_y_seed,
                 shape, every, glie_start, s_epochs):
        self.s_epochs = s_epochs
        dynamics_parameters = {'shape': shape}
        self.env = LowGoalSlip(dynamics_parameters=dynamics_parameters)
        self.q_hyperparameters = {
            'outputscale_prior': (0.4, 2),
            'lengthscale_prior': (0.05, 0.1),
            'noise_prior': (0.001, 0.002)
        }
        self.s_hyperparameters = {
            'outputscale_prior': (0.4, 2),
            'lengthscale_prior': (0.2, 0.1),
            'noise_prior': (0.001, 0.002)
        }
        self.q_x_seed = q_x_seed
        self.q_y_seed = q_y_seed
        self.s_x_seed = s_x_seed
        self.s_y_seed = s_y_seed
        # Each of these parameters can be given either as a single value or as
        # a (start, end) pair to be interpolated over training
        self.gamma_optimistic_start, self.gamma_optimistic_end = \
            identity_or_duplicated_value(gamma_optimistic)
        self.gamma_cautious_start, self.gamma_cautious_end = \
            identity_or_duplicated_value(gamma_cautious)
        self.lambda_cautious_start, self.lambda_cautious_end = \
            identity_or_duplicated_value(lambda_cautious)
        self.gamma_optimistic = self.gamma_optimistic_start
        self.gamma_cautious = self.gamma_cautious_start
        self.lambda_cautious = self.lambda_cautious_start
        self.agent = EpsCorlLearner(
            self.env,
            greed=greed,
            step_size=step_size,
            discount_rate=discount_rate,
            q_x_seed=self.q_x_seed,
            q_y_seed=self.q_y_seed,
            gamma_optimistic=self.gamma_optimistic,
            gamma_cautious=self.gamma_cautious,
            lambda_cautious=self.lambda_cautious,
            s_x_seed=s_x_seed,
            s_y_seed=s_y_seed,
            q_gp_params=self.q_hyperparameters,
            s_gp_params=self.s_hyperparameters,
        )
        self.ground_truth = SafetyTruth(self.env)
        self.ground_truth.from_vibly_file(
            Path(__file__).parent.parent.parent / 'data' / 'ground_truth' /
            'from_vibly' / 'slip_map.pickle'
        )
        plotters = {
            'Q-Values_Safety': QValueAndSafetyPlotter(self.agent,
                                                      self.ground_truth)
        }
        # plotters = {}
        output_directory = Path(__file__).parent.resolve()
        super(EpsCorlSimulation, self).__init__(output_directory, name,
                                                plotters)
        self.max_samples = max_samples
        self.every = every
        # A float glie_start is interpreted as a fraction of max_samples
        if isinstance(glie_start, float):
            self.glie_start = int(glie_start * self.max_samples)
        else:
            self.glie_start = glie_start

    def get_models_to_save(self):
        # The keys must be the same as the actual names of the attributes:
        # this is used in load_models.
        # This is hacky and should be replaced.
        return {
            'Q_model': self.agent.Q_model,
            'safety_model': self.agent.safety_model
        }

    def load_models(self, skip_local=False):
        from edge.model.safety_models import MaternSafety
        from edge.model.value_models import GPQLearning
        models_names = list(self.get_models_to_save().keys())
        loaders = {
            'Q_model': lambda mpath: GPQLearning(
                mpath, self.env, self.q_x_seed, self.q_y_seed),
            'safety_model': lambda mpath: MaternSafety(
                mpath, self.env, self.gamma_optimistic,
                self.s_x_seed, self.s_y_seed),
        }
        for mname in models_names:
            if not skip_local:
                load_path = self.local_models_path / mname
            else:
                load_path = self.models_path / mname
            setattr(self.agent, mname, loaders[mname](load_path))

    def run(self):
        n_samples = 0
        self.save_figs(prefix='0')

        # Train the hyperparameters of the safety model, with a decreasing
        # learning rate
        print('Optimizing hyperparameters...')
        s_train_x, s_train_y = self.ground_truth.get_training_examples()
        for lr in [0.1, 0.01, 0.001]:
            self.agent.fit_models(
                s_epochs=self.s_epochs, s_train_x=s_train_x,
                s_train_y=s_train_y, s_optimizer_kwargs={'lr': lr}
            )
        print('Lengthscale:',
              self.agent.safety_model.gp.covar_module.base_kernel.lengthscale)
        print('Outputscale:',
              self.agent.safety_model.gp.covar_module.outputscale)
        print('Done.')

        print('Training...')
        while n_samples < self.max_samples:
            reset_state = self.agent.get_random_safe_state()
            self.agent.reset(reset_state)
            failed = self.agent.failed
            n_steps = 0
            while not failed and n_steps < 50:
                n_samples += 1
                n_steps += 1
                old_state = self.agent.state
                new_state, reward, failed = self.agent.step()
                action = self.agent.last_action
                # Start reducing eps after glie_start samples so the policy
                # converges to a greedy one (GLIE schedule)
                if self.glie_start is not None and \
                        n_samples > self.glie_start:
                    self.agent.greed *= (n_samples - self.glie_start) / (
                        n_samples - self.glie_start + 1)
                self.agent.gamma_optimistic = affine_interpolation(
                    n_samples / self.max_samples,
                    self.gamma_optimistic_start,
                    self.gamma_optimistic_end
                )
                self.agent.gamma_cautious = affine_interpolation(
                    n_samples / self.max_samples,
                    self.gamma_cautious_start,
                    self.gamma_cautious_end
                )
                self.agent.lambda_cautious = affine_interpolation(
                    n_samples / self.max_samples,
                    self.lambda_cautious_start,
                    self.lambda_cautious_end
                )
                color = None if not self.agent.has_explored \
                    else [0.3, 0.3, 0.9]
                self.on_run_iteration(n_samples, old_state, action, new_state,
                                      reward, failed, color=color)
                if n_samples >= self.max_samples:
                    break
            self.agent.reset()
        print('Done.')
        self.save_figs(prefix=f'{self.name}_final')
        self.compile_gif()

    def on_run_iteration(self, n_samples, *args, **kwargs):
        super(EpsCorlSimulation, self).on_run_iteration(*args, **kwargs)
        print(f'Iteration {n_samples}/{self.max_samples}')
        if n_samples % self.every == 0:
            self.save_figs(prefix=f'{n_samples}')
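# A minimal usage sketch (illustrative, not from the original source). All
# numeric values and seed arrays are assumptions; (start, end) tuples are
# interpolated over training via identity_or_duplicated_value and
# affine_interpolation:
#
# sim = EpsCorlSimulation(
#     name='eps_corl', max_samples=1000, greed=0.1, step_size=0.6,
#     discount_rate=0.9, gamma_optimistic=(0.6, 0.9),
#     gamma_cautious=(0.7, 0.95), lambda_cautious=(0., 0.05),
#     q_x_seed=np.array([[1.45, 0.5]]), q_y_seed=np.array([1.]),
#     s_x_seed=np.array([[1.45, 0.5]]), s_y_seed=np.array([1.]),
#     shape=(201, 151), every=50, glie_start=0.5, s_epochs=50,
# )
# sim.run()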
class OptimisticSimulation(Simulation):
    def __init__(self, max_samples, gamma_optimistic, gamma_cautious,
                 lambda_cautious, shape, every):
        self.x_seed = np.array([1.45, 0.5])
        self.y_seed = np.array([.8])
        dynamics_parameters = {'shape': shape}
        self.env = Hovership(
            random_start=True,
            dynamics_parameters=dynamics_parameters,
            default_initial_state=self.x_seed[:1]
        )
        self.ground_truth = SafetyTruth(self.env)
        self.ground_truth.from_vibly_file(
            '../data/ground_truth/from_vibly/hover_map.pickle'
        )
        self.hyperparameters = {
            'outputscale_prior': (0.4, 2),
            'lengthscale_prior': (0.1, 0.1),
            'noise_prior': (0.001, 0.002)
        }
        self.agent = SafetyLearner(
            env=self.env,
            gamma_optimistic=gamma_optimistic,
            gamma_cautious=gamma_cautious,
            lambda_cautious=lambda_cautious,
            x_seed=self.x_seed,
            y_seed=self.y_seed,
            gp_params=self.hyperparameters,
        )
        plotters = {
            'DetailedSafety': DetailedSafetyPlotter(self.agent,
                                                    self.ground_truth)
        }
        super(OptimisticSimulation, self).__init__(
            'results', 'optimistic', plotters
        )
        self.max_samples = max_samples
        self.every = every
        self.samples_path = self.output_directory / 'samples'
        self.samples_path.mkdir(parents=True, exist_ok=True)
        self.model_path = self.output_directory / 'model'
        self.model_path.mkdir(parents=True, exist_ok=True)
        # Keep every third state-action pair of the failure set for checking
        failure_indexes = np.argwhere(self.ground_truth.failure_set == 1)
        self.failure_set = np.array([
            self.ground_truth.stateaction_space[tuple(index)]
            for index in failure_indexes[::3]
        ])

    def run_optim(self):
        train_x, train_y = self.ground_truth.get_training_examples(
            n_examples=2000, from_viable=True, from_failure=False
        )
        self.agent.fit_models(train_x, train_y, epochs=20)

    def save_samples(self, name):
        self.agent.safety_model.save_samples(str(self.samples_path / name))

    def load_samples(self, name):
        self.agent.safety_model.load_samples(str(self.samples_path / name))

    def save_model(self):
        self.agent.safety_model.save(str(self.model_path))

    def load_model(self):
        self.agent.safety_model = MaternSafety.load(
            str(self.model_path),
            self.env,
            self.agent.safety_model.gamma_measure,
            self.x_seed,
            self.y_seed
        )

    def check_failure_set(self):
        # Probability that the measure is positive on the failure set: it
        # should stay below gamma_measure everywhere
        model = self.agent.safety_model
        measure_slice, covar_slice = model._query(
            self.failure_set, return_covar=True)
        level_value = norm.cdf(
            measure_slice / np.sqrt(covar_slice)
        )
        failure_levels = level_value > model.gamma_measure
        if failure_levels.any():
            print('Nonzero value in the failure set!')

    def run_learning(self):
        n_samples = 0
        self.save_figs(prefix='0')
        while n_samples < self.max_samples:
            failed = self.agent.failed
            n_steps = 0
            while not failed and n_steps < 50:
                # self.check_failure_set()
                n_samples += 1
                n_steps += 1
                old_state = self.agent.state
                new_state, reward, failed, _ = self.agent.step()
                action = self.agent.last_action
                self.on_run_iteration(
                    n_samples, old_state, action, new_state, reward, failed
                )
                if n_samples >= self.max_samples:
                    break
            reset_state = self.agent.get_random_safe_state()
            if reset_state is None:
                raise Exception('The whole measure is 0. There is no safe '
                                'action.')
            self.agent.reset(reset_state)

    def on_run_iteration(self, n_samples, old_state, action, new_state,
                         reward, failed):
        super(OptimisticSimulation, self).on_run_iteration(
            old_state, action, new_state, reward, failed
        )
        print(f'Step {n_samples}/{self.max_samples} - {old_state} '
              f'-> {action} -> {new_state} ({failed})')
        if n_samples % self.every == 0:
            self.save_figs(prefix=f'{n_samples}')
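# A minimal usage sketch (illustrative, not from the original source);
# parameter values are assumptions:
#
# sim = OptimisticSimulation(
#     max_samples=500, gamma_optimistic=0.6, gamma_cautious=0.9,
#     lambda_cautious=0.05, shape=(201, 151), every=50,
# )
# sim.run_optim()     # fit the GP on viable ground-truth examples
# sim.run_learning()  # then learn the safety measure online
# sim.save_model()    # persist the trained safety model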
class SoftHardSimulation(ModelLearningSimulation):
    def __init__(self, name, env_name, reward_threshold, control_frequency,
                 max_samples, max_steps, greed, step_size, discount_rate,
                 gamma_optimistic, gamma_hard, lambda_hard, gamma_soft,
                 q_x_seed, q_y_seed, s_x_seed, s_y_seed,
                 optimize_hyperparameters, dataset_type, dataset_params,
                 shape, every, glie_start, reset_in_safe_state,
                 plotter_smoothing_window_size):
        parameterization = {
            'env_name': env_name,
            'reward_threshold': reward_threshold,
            'control_frequency': control_frequency,
            'max_samples': max_samples,
            'greed': greed,
            'step_size': step_size,
            'discount_rate': discount_rate,
            'gamma_optimistic': gamma_optimistic,
            'gamma_hard': gamma_hard,
            'lambda_hard': lambda_hard,
            'gamma_soft': gamma_soft,
            'q_x_seed': q_x_seed,
            'q_y_seed': q_y_seed,
            's_x_seed': s_x_seed,
            's_y_seed': s_y_seed,
            'optimize_hyperparameters': optimize_hyperparameters,
            'dataset_type': dataset_type,
            'dataset_params': dataset_params,
            'shape': shape,
            'every': every,
            'glie_start': glie_start,
            'reset_in_safe_state': reset_in_safe_state,
            'plotter_smoothing_window_size': plotter_smoothing_window_size
        }
        dynamics_parameters = {'shape': shape}
        if env_name == 'slip':
            self.env = LowGoalSlip(dynamics_parameters=dynamics_parameters,
                                   reward_done_threshold=reward_threshold)
        elif env_name == 'hovership':
            self.env = LowGoalHovership(
                dynamics_parameters=dynamics_parameters,
                reward_done_threshold=reward_threshold)
        elif env_name == 'cartpole':
            self.env = CartPole(discretization_shape=shape,
                                control_frequency=control_frequency)
        elif env_name == 'lander':
            self.env = LunarLander(discretization_shape=shape)
        self.q_hyperparameters = {
            'outputscale_prior': (0.12, 0.01),
            'lengthscale_prior': (0.15, 0.05),
            'noise_prior': (0.001, 0.002),
            'dataset_type': dataset_type,
            'dataset_params': dataset_params,
        }
        self.s_hyperparameters = {
            'outputscale_prior': (0.12, 0.01),
            'lengthscale_prior': (0.15, 0.05),
            'noise_prior': (0.001, 0.002),
            'dataset_type': dataset_type,
            'dataset_params': dataset_params,
        }
        self.q_x_seed = q_x_seed
        self.q_y_seed = q_y_seed
        self.s_x_seed = s_x_seed
        self.s_y_seed = s_y_seed
        self.optimize_hyperparameters = optimize_hyperparameters
        # Each of these parameters can be given either as a single value or as
        # a (start, end) pair to be interpolated over training
        self.gamma_optimistic_start, self.gamma_optimistic_end = \
            identity_or_duplicated_value(gamma_optimistic)
        self.gamma_hard_start, self.gamma_hard_end = \
            identity_or_duplicated_value(gamma_hard)
        self.lambda_hard_start, self.lambda_hard_end = \
            identity_or_duplicated_value(lambda_hard)
        self.gamma_soft_start, self.gamma_soft_end = \
            identity_or_duplicated_value(gamma_soft)
        self.gamma_optimistic = self.gamma_optimistic_start
        self.gamma_hard = self.gamma_hard_start
        self.gamma_soft = self.gamma_soft_start
        self.lambda_hard = self.lambda_hard_start
        self.agent = SoftHardLearner(
            self.env,
            greed=greed,
            step_size=step_size,
            discount_rate=discount_rate,
            q_x_seed=self.q_x_seed,
            q_y_seed=self.q_y_seed,
            gamma_optimistic=self.gamma_optimistic,
            gamma_hard=self.gamma_hard,
            lambda_hard=self.lambda_hard,
            gamma_soft=self.gamma_soft,
            s_x_seed=s_x_seed,
            s_y_seed=s_y_seed,
            q_gp_params=self.q_hyperparameters,
            s_gp_params=self.s_hyperparameters,
        )
        if env_name == 'slip':
            truth_path = Path(__file__).parent.parent.parent / 'data' / \
                'ground_truth' / 'from_vibly' / 'slip_map.pickle'
        elif env_name == 'hovership':
            truth_path = Path(__file__).parent.parent.parent / 'data' / \
                'ground_truth' / 'from_vibly' / 'hover_map.pickle'
        else:
            truth_path = None
        if truth_path is not None:
            self.ground_truth = SafetyTruth(self.env)
            self.ground_truth.from_vibly_file(truth_path)
        else:
            self.ground_truth = None
        # A ground-truth safety plot is only available for these environments
        plottable_Q = ['slip', 'hovership']
        if env_name in plottable_Q:
            plotters = {
                'Q-Values_Safety': SoftHardPlotter(self.agent,
                                                   self.ground_truth,
                                                   ensure_in_dataset=True)
            }
        else:
            plotters = {}
        plotters.update({
            'RewardFailure': RewardFailurePlotter(
                agents_names=['Soft-hard'],
                window_size=plotter_smoothing_window_size,
                padding_value=1)
        })
        output_directory = Path(__file__).parent.resolve()
        super(SoftHardSimulation, self).__init__(output_directory, name,
                                                 plotters)
        self.max_samples = max_samples
        self.max_steps = max_steps
        self.every = every
        # A float glie_start is interpreted as a fraction of max_samples
        if isinstance(glie_start, float):
            self.glie_start = int(glie_start * self.max_samples)
        else:
            self.glie_start = glie_start
        self.reset_in_safe_state = reset_in_safe_state
        msg = ', '.join(f'{pname} = {pval}'
                        for pname, pval in parameterization.items())
        logging.info(config_msg(f'Simulation started with parameters: {msg}'))

    def get_models_to_save(self):
        # The keys must be the same as the actual names of the attributes:
        # this is used in load_models. This is hacky and should be replaced.
        return {
            'Q_model': self.agent.Q_model,
            'safety_model': self.agent.safety_model
        }

    def load_models(self, skip_local=False):
        from edge.model.safety_models import MaternSafety
        from edge.model.value_models import GPQLearning
        models_names = list(self.get_models_to_save().keys())
        loaders = {
            'Q_model': lambda mpath: GPQLearning(
                mpath, self.env, self.q_x_seed, self.q_y_seed),
            'safety_model': lambda mpath: MaternSafety(
                mpath, self.env, self.gamma_optimistic,
                self.s_x_seed, self.s_y_seed),
        }
        for mname in models_names:
            if not skip_local:
                load_path = self.local_models_path / mname
            else:
                load_path = self.models_path / mname
            setattr(self.agent, mname, loaders[mname](load_path))

    def run(self):
        n_samples = 0
        self.save_figs(prefix='0')
        if self.optimize_hyperparameters:
            # Train the hyperparameters of the safety model, with a decreasing
            # learning rate
            logging.info('Optimizing hyperparameters...')
            s_train_x, s_train_y = self.ground_truth.get_training_examples()
            for lr in [0.1, 0.01, 0.001]:
                self.agent.fit_models(s_epochs=50, s_train_x=s_train_x,
                                      s_train_y=s_train_y,
                                      s_optimizer_kwargs={'lr': lr})
            logging.info('Done.')
        else:
            logging.info('Hyperparameters were NOT optimized.')
        logging.info(config_msg(
            'Lengthscale: '
            f'{self.agent.safety_model.gp.covar_module.base_kernel.lengthscale}'
        ))
        logging.info(config_msg(
            'Outputscale: '
            f'{self.agent.safety_model.gp.covar_module.outputscale}'))

        logging.info('Training...')
        while n_samples < self.max_samples:
            if self.reset_in_safe_state:
                reset_state = self.agent.get_random_safe_state()
            else:
                reset_state = None
            self.agent.reset(reset_state)
            failed = self.agent.failed
            done = self.env.done
            n_steps = 0
            while not done and n_steps < self.max_steps:
                n_samples += 1
                n_steps += 1
                old_state = self.agent.state
                new_state, reward, failed, done = self.agent.step()
                action = self.agent.last_action
                # Start reducing the step size after glie_start samples so
                # Q-Learning converges
                if self.glie_start is not None and \
                        n_samples > self.glie_start:
                    self.agent.step_size *= (n_samples - self.glie_start) / (
                        n_samples - self.glie_start + 1)
                self.agent.gamma_optimistic = affine_interpolation(
                    n_samples / self.max_samples,
                    self.gamma_optimistic_start,
                    self.gamma_optimistic_end)
                self.agent.gamma_hard = affine_interpolation(
                    n_samples / self.max_samples,
                    self.gamma_hard_start,
                    self.gamma_hard_end)
                self.agent.lambda_hard = affine_interpolation(
                    n_samples / self.max_samples,
                    self.lambda_hard_start,
                    self.lambda_hard_end)
                self.agent.gamma_soft = affine_interpolation(
                    n_samples / self.max_samples,
                    self.gamma_soft_start,
                    self.gamma_soft_end)
                color = None if not self.agent.updated_safety \
                    else [0.3, 0.3, 0.9]
                self.on_run_iteration(n_samples=n_samples, state=old_state,
                                      action=action, new_state=new_state,
                                      reward=reward, failed=failed, done=done,
                                      color=color, aname='Soft-hard')
                if n_samples >= self.max_samples:
                    break
        logging.info('Done.')
        self.save_figs(prefix=f'{self.name}_final')
        self.compile_gif()

    def on_run_iteration(self, n_samples, *args, **kwargs):
        super(SoftHardSimulation, self).on_run_iteration(*args, **kwargs)
        logging.info(f'Iteration {n_samples}/{self.max_samples}')
        logging.info(f'# of Q-values training examples: '
                     f'{len(self.agent.Q_model.gp.train_x)}')
        logging.info(f'# of safety measure training examples: '
                     f'{len(self.agent.safety_model.gp.train_x)}')
        if kwargs['failed']:
            logging.info('Failed!')
        elif kwargs['done']:
            logging.info('Solved!')
        if n_samples % self.every == 0:
            self.save_figs(prefix=f'{n_samples}')
            self.env.render()
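# A minimal usage sketch (illustrative, not from the original source). All
# values and seed arrays are assumptions; env_name can be 'slip', 'hovership',
# 'cartpole', or 'lander', and reward_threshold / control_frequency are only
# used by some of these environments:
#
# sim = SoftHardSimulation(
#     name='soft_hard', env_name='hovership', reward_threshold=None,
#     control_frequency=None, max_samples=1000, max_steps=50, greed=0.1,
#     step_size=0.6, discount_rate=0.9, gamma_optimistic=(0.6, 0.9),
#     gamma_hard=(0.6, 0.9), lambda_hard=(0., 0.05),
#     gamma_soft=(0.7, 0.95),
#     q_x_seed=np.array([[1.45, 0.5]]), q_y_seed=np.array([1.]),
#     s_x_seed=np.array([[1.45, 0.5]]), s_y_seed=np.array([1.]),
#     optimize_hyperparameters=True, dataset_type=None, dataset_params=None,
#     shape=(201, 151), every=50, glie_start=0.5, reset_in_safe_state=True,
#     plotter_smoothing_window_size=10,
# )
# sim.run()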