def test_ddpg_trainer(self):
    """Train DDPG on the continuous gridworld for several epochs and report critic MAE.

    Builds positional training samples per minibatch (the trainer's legacy
    list-based input format) and prints the critic's mean absolute error
    after training.
    """
    environment = GridworldContinuous()
    samples = environment.generate_samples(200000, 1.0)
    epochs = 3
    trainer = DDPGTrainer(
        self.get_ddpg_parameters(),
        environment.normalization,
        environment.normalization_action,
    )
    evaluator = GridworldDDPGEvaluator(environment, True)
    tdps = environment.preprocess_samples(samples, self.minibatch_size)
    for epoch in range(epochs):
        print("On epoch {} of {}".format(epoch + 1, epochs))
        # Evaluate the critic before this epoch's training pass.
        critic_predictor = trainer.predictor()
        evaluator.evaluate_critic(critic_predictor)
        for tdp in tdps:
            # Positional layout expected by trainer.train(); None entries are
            # unused slots in this legacy interface.
            training_samples = [
                tdp.states,
                tdp.actions,
                tdp.rewards.flatten(),
                tdp.next_states,
                None,
                1 - tdp.not_terminals.flatten(),  # done
                None,
                None,
                [1] * len(tdp.states),  # time diff
            ]
            trainer.train(training_samples)
    critic_predictor = trainer.predictor()
    error = evaluator.evaluate_critic(critic_predictor)
    print("gridworld MAE: {0:.3f}".format(error))
def test_ddpg_trainer(self):
    """Run DDPG training on the continuous gridworld and verify both predictors."""
    env = GridworldContinuous()
    samples = env.generate_samples(500000, 0.25)
    trainer = DDPGTrainer(
        self.get_ddpg_parameters(),
        env.normalization,
        env.normalization_action,
        env.min_action_range,
        env.max_action_range,
    )
    evaluator = GridworldDDPGEvaluator(env, True, DISCOUNT, False, samples)
    tdps = env.preprocess_samples(samples, self.minibatch_size)

    # Pre-training critic evaluation.
    evaluator.evaluate_critic(trainer.predictor(actor=False))

    for tdp in tdps:
        tdp.rewards = tdp.rewards.flatten()
        tdp.not_terminals = tdp.not_terminals.flatten()
        trainer.train(tdp)

    # Make sure actor predictor works
    evaluator.evaluate_actor(trainer.predictor(actor=True))

    # Evaluate critic predicor for correctness
    error = evaluator.evaluate_critic(trainer.predictor(actor=False))
    print("gridworld MAE: {0:.3f}".format(error))
def test_evaluator_ground_truth(self):
    """Train SARSA with ground-truth values injected as the reward timeline."""
    env = GridworldContinuous()
    (states, actions, rewards, next_states, next_actions, is_terminal,
     possible_next_actions, _) = env.generate_samples(100000, 1.0)
    true_values = env.true_values_for_sample(states, actions, False)
    # Hijack the reward timeline to insert the ground truth
    reward_timelines = [{0: tv} for tv in true_values]
    trainer = self.get_sarsa_trainer(env)
    evaluator = Evaluator(trainer, DISCOUNT)
    tdps = env.preprocess_samples(
        states,
        actions,
        rewards,
        next_states,
        next_actions,
        is_terminal,
        possible_next_actions,
        reward_timelines,
        self.minibatch_size,
    )
    for tdp in tdps:
        trainer.train_numpy(tdp, evaluator)
    self.assertLess(evaluator.td_loss[-1], 0.05)
    self.assertLess(evaluator.mc_loss[-1], 0.12)
def test_trainer_sarsa(self):
    """SARSA training should reduce evaluation error from >0.15 to <0.05."""
    env = GridworldContinuous()
    (states, actions, rewards, next_states, next_actions, is_terminal,
     possible_next_actions, reward_timelines) = env.generate_samples(100000, 1.0)
    trainer = self.get_sarsa_trainer(env)
    predictor = trainer.predictor()
    evaluator = GridworldContinuousEvaluator(env, False)
    tdps = env.preprocess_samples(
        states,
        actions,
        rewards,
        next_states,
        next_actions,
        is_terminal,
        possible_next_actions,
        reward_timelines,
        self.minibatch_size,
    )
    # The untrained model should have noticeable error.
    self.assertGreater(evaluator.evaluate(predictor), 0.15)
    for tdp in tdps:
        trainer.train_numpy(tdp, None)
    evaluator.evaluate(predictor)
    self.assertLess(evaluator.evaluate(predictor), 0.05)
def test_trainer_maxq(self):
    """Max-Q learning variant should drive evaluation error below 0.15."""
    env = GridworldContinuous()
    sarsa_parameters = self.get_sarsa_parameters()
    # Reuse the SARSA training/knn config, switching on max-Q learning.
    maxq_parameters = ContinuousActionModelParameters(
        rl=RLParameters(
            gamma=DISCOUNT,
            target_update_rate=0.5,
            reward_burnin=10,
            maxq_learning=True,
        ),
        training=sarsa_parameters.training,
        knn=sarsa_parameters.knn,
    )
    maxq_trainer = ContinuousActionDQNTrainer(
        maxq_parameters,
        env.normalization,
        env.normalization_action,
    )
    samples = env.generate_samples(100000, 1.0)
    predictor = maxq_trainer.predictor()
    tdps = env.preprocess_samples(samples, self.minibatch_size)
    evaluator = GridworldContinuousEvaluator(env, True)
    # The untrained model should be noticeably off.
    self.assertGreater(evaluator.evaluate(predictor), 0.2)
    for _ in range(2):
        for tdp in tdps:
            maxq_trainer.train_numpy(tdp, None)
        evaluator.evaluate(predictor)
    self.assertLess(evaluator.evaluate(predictor), 0.15)
def test_gridworld_continuous_generate_samples(self):
    """Multi-step sample generation should pass the shared sample checks."""
    environment = GridworldContinuous()
    sample_count = 1000
    step_count = 5
    generated = environment.generate_samples(
        sample_count, epsilon=1.0, discount_factor=0.9, multi_steps=step_count
    )
    self._check_samples(generated, sample_count, step_count, True)
def test_evaluator_timeline(self):
    """TD and MC losses should end below 0.2 after SARSA training."""
    env = GridworldContinuous()
    samples = env.generate_samples(100000, 1.0)
    trainer = self.get_sarsa_trainer(env)
    evaluator = Evaluator(1)
    for tdp in env.preprocess_samples(samples, self.minibatch_size):
        trainer.train_numpy(tdp, evaluator)
    self.assertLess(evaluator.td_loss[-1], 0.2)
    self.assertLess(evaluator.mc_loss[-1], 0.2)
def test_gridworld_continuous_generate_samples(self):
    """Multi-step generation including shorter samples at both ends should validate."""
    environment = GridworldContinuous()
    sample_count = 1000
    step_count = 5
    generated = environment.generate_samples(
        sample_count,
        epsilon=1.0,
        discount_factor=0.9,
        multi_steps=step_count,
        include_shorter_samples_at_start=True,
        include_shorter_samples_at_end=True,
    )
    self._check_samples(generated, sample_count, step_count, True)
def test_evaluator_ground_truth(self):
    """MC loss should be small when episode values are set to the true values."""
    env = GridworldContinuous()
    samples = env.generate_samples(500000, 1.0, DISCOUNT)
    # Hijack the reward timeline to insert the ground truth
    samples.episode_values = env.true_values_for_sample(
        samples.states, samples.actions, False
    )
    trainer = self.get_sarsa_trainer(env)
    evaluator = Evaluator(None, 10, DISCOUNT, None, None)
    for tdp in env.preprocess_samples(samples, self.minibatch_size):
        trainer.train(tdp, evaluator)
    self.assertLess(evaluator.mc_loss[-1], 0.15)
def test_trainer_sarsa(self):
    """SARSA training should bring evaluation error under 0.15."""
    env = GridworldContinuous()
    samples = env.generate_samples(100000, 1.0)
    trainer = self.get_sarsa_trainer(env)
    predictor = trainer.predictor()
    evaluator = GridworldContinuousEvaluator(env, False, DISCOUNT, False, samples)
    for tdp in env.preprocess_samples(samples, self.minibatch_size):
        trainer.train_numpy(tdp, None)
    evaluator.evaluate(predictor)
    self.assertLess(evaluator.evaluate(predictor), 0.15)
def generate_samples(self, num_transitions, epsilon, discount_factor) -> Samples:
    """Generate gridworld samples, re-encoding states as enum-style features."""
    base = GridworldContinuous.generate_samples(
        self, num_transitions, epsilon, discount_factor
    )

    def _to_enum(state):
        # Encode the single active feature id as the value of feature 0.
        return {0: float(list(state.keys())[0])}

    return Samples(
        mdp_ids=base.mdp_ids,
        sequence_numbers=base.sequence_numbers,
        states=[_to_enum(s) for s in base.states],
        actions=base.actions,
        action_probabilities=base.action_probabilities,
        rewards=base.rewards,
        possible_actions=base.possible_actions,
        next_states=[_to_enum(s) for s in base.next_states],
        next_actions=base.next_actions,
        terminals=base.terminals,
        possible_next_actions=base.possible_next_actions,
        episode_values=base.episode_values,
    )
def true_values_for_sample(self, enum_states, actions, assume_optimal_policy: bool):
    """Decode enum-encoded states back to one-hot dicts, then delegate."""
    decoded_states = [
        {int(list(state.values())[0]): 1} for state in enum_states
    ]
    return GridworldContinuous.true_values_for_sample(
        self, decoded_states, actions, assume_optimal_policy
    )
def _test_trainer_sarsa(self, use_gpu=False, use_all_avail_gpus=False, modular=False):
    """Shared SARSA harness covering the modular and legacy trainer/exporter paths."""
    env = GridworldContinuous()
    evaluator = GridworldContinuousEvaluator(
        env,
        assume_optimal_policy=False,
        gamma=DISCOUNT,
        use_int_features=False,
    )
    if modular:
        # FIXME: the exporter should make a copy of the model; moving it to CPU inplace
        if use_gpu:
            self.run_pre_training_eval = False
        if use_all_avail_gpus:
            self.tolerance_threshold = 0.11
        trainer, exporter = self.get_modular_sarsa_trainer_exporter(
            env, None, use_gpu, use_all_avail_gpus
        )
    else:
        trainer, exporter = self.get_sarsa_trainer_exporter(
            env, None, use_gpu, use_all_avail_gpus
        )
    self.evaluate_gridworld(env, evaluator, trainer, exporter, use_gpu)
def _test_sac_trainer(self, use_2_q_functions=False, use_gpu=False):
    """SAC harness; also sanity-checks the actor predictor and its save/load."""
    env = GridworldContinuous()
    trainer = self.get_sac_trainer(
        env, self.get_sac_parameters(use_2_q_functions), use_gpu
    )
    evaluator = GridworldContinuousEvaluator(
        env,
        assume_optimal_policy=False,
        gamma=DISCOUNT,
        use_int_features=False,
    )
    exporter = self.get_critic_exporter(trainer, env)
    self.tolerance_threshold = 0.2
    if use_gpu:
        self.run_pre_training_eval = False
    self.evaluate_gridworld(env, evaluator, trainer, exporter, use_gpu)
    # Make sure actor predictor works
    actor_predictor = self.get_actor_predictor(trainer, env)
    # Just test that it doesn't blow up
    preds = actor_predictor.predict(evaluator.logged_states, None)
    self._test_save_load_actor(preds, actor_predictor, evaluator.logged_states)
def generate_samples(self, num_transitions, epsilon, with_possible=True) -> Samples:
    """Generate gridworld samples, re-encoding states as enum-style features."""
    base = GridworldContinuous.generate_samples(
        self, num_transitions, epsilon, with_possible
    )

    def _to_enum(state):
        # Encode the single active feature id as the value of feature 0.
        return {0: float(list(state.keys())[0])}

    return Samples(
        mdp_ids=base.mdp_ids,
        sequence_numbers=base.sequence_numbers,
        states=[_to_enum(s) for s in base.states],
        actions=base.actions,
        propensities=base.propensities,
        rewards=base.rewards,
        next_states=[_to_enum(s) for s in base.next_states],
        next_actions=base.next_actions,
        terminals=base.terminals,
        possible_next_actions=base.possible_next_actions,
        reward_timelines=base.reward_timelines,
    )
def test_trainer_sarsa_factorized(self):
    """Factorized SARSA network should also converge below 0.15 error."""
    env = GridworldContinuous()
    samples = env.generate_samples(500000, 1.0, DISCOUNT)
    trainer = self.get_sarsa_trainer(
        env, self.get_sarsa_parameters_factorized()
    )
    predictor = trainer.predictor()
    evaluator = GridworldContinuousEvaluator(env, False, DISCOUNT, False, samples)
    for tdp in env.preprocess_samples(samples, self.minibatch_size):
        trainer.train(tdp)
    # Refresh the predictor to pick up the trained weights.
    predictor = trainer.predictor()
    evaluator.evaluate(predictor)
    self.assertLess(evaluator.evaluate(predictor), 0.15)
def test_trainer_sarsa(self):
    """SARSA on flattened reward/terminal tensors should converge below 0.15."""
    env = GridworldContinuous()
    samples = env.generate_samples(150000, 1.0)
    trainer = self.get_sarsa_trainer(env)
    predictor = trainer.predictor()
    evaluator = GridworldContinuousEvaluator(env, False, DISCOUNT, False, samples)
    for tdp in env.preprocess_samples(samples, self.minibatch_size):
        tdp.rewards = tdp.rewards.flatten()
        tdp.not_terminals = tdp.not_terminals.flatten()
        trainer.train(tdp)
    # Refresh the predictor to pick up the trained weights.
    predictor = trainer.predictor()
    evaluator.evaluate(predictor)
    self.assertLess(evaluator.evaluate(predictor), 0.15)
def _test_trainer_sarsa(self, use_gpu=False, use_all_avail_gpus=False):
    """Build a SARSA trainer for the continuous gridworld and evaluate it."""
    env = GridworldContinuous()
    evaluator = GridworldContinuousEvaluator(
        env, assume_optimal_policy=False, gamma=DISCOUNT
    )
    trainer = self.get_trainer(env, None, use_gpu, use_all_avail_gpus)
    self.evaluate_gridworld(env, evaluator, trainer, use_gpu)
def test_evaluator_ground_truth(self):
    """Losses should be small when reward timelines hold the true values."""
    env = GridworldContinuous()
    samples = env.generate_samples(100000, 1.0)
    true_values = env.true_values_for_sample(samples.states, samples.actions, False)
    # Hijack the reward timeline to insert the ground truth
    samples.reward_timelines = [{0: tv} for tv in true_values]
    trainer = self.get_sarsa_trainer(env)
    evaluator = Evaluator(1)
    for tdp in env.preprocess_samples(samples, self.minibatch_size):
        trainer.train_numpy(tdp, evaluator)
    self.assertLess(evaluator.td_loss[-1], 0.05)
    self.assertLess(evaluator.mc_loss[-1], 0.12)
def test_evaluator_ground_truth(self):
    """MC loss should fall below 0.15 with ground-truth reward timelines."""
    env = GridworldContinuous()
    samples = env.generate_samples(200000, 1.0)
    true_values = env.true_values_for_sample(samples.states, samples.actions, False)
    # Hijack the reward timeline to insert the ground truth
    samples.reward_timelines = [{0: tv} for tv in true_values]
    trainer = self.get_sarsa_trainer(env)
    evaluator = Evaluator(None, 10, DISCOUNT)
    for tdp in env.preprocess_samples(samples, self.minibatch_size):
        tdp.rewards = tdp.rewards.flatten()
        tdp.not_terminals = tdp.not_terminals.flatten()
        trainer.train(tdp, evaluator)
    self.assertLess(evaluator.mc_loss[-1], 0.15)
def test_trainer_maxq(self):
    """Max-Q DQN trainer should reduce evaluation error from >0.4 to <0.1."""
    env = GridworldContinuous()
    sarsa_parameters = self.get_sarsa_parameters()
    # Reuse the SARSA training/knn config, switching on max-Q learning.
    maxq_parameters = ContinuousActionModelParameters(
        rl=RLParameters(
            gamma=DISCOUNT,
            target_update_rate=0.5,
            reward_burnin=10,
            maxq_learning=True,
        ),
        training=sarsa_parameters.training,
        knn=sarsa_parameters.knn,
    )
    maxq_trainer = ContinuousActionDQNTrainer(
        maxq_parameters,
        env.normalization,
        env.normalization_action,
    )
    (states, actions, rewards, next_states, next_actions, is_terminal,
     possible_next_actions, reward_timelines) = env.generate_samples(100000, 1.0)
    predictor = maxq_trainer.predictor()
    tdps = env.preprocess_samples(
        states,
        actions,
        rewards,
        next_states,
        next_actions,
        is_terminal,
        possible_next_actions,
        reward_timelines,
        self.minibatch_size,
    )
    evaluator = GridworldContinuousEvaluator(env, True)
    # The untrained model should be far off.
    self.assertGreater(evaluator.evaluate(predictor), 0.4)
    for _ in range(2):
        for tdp in tdps:
            maxq_trainer.stream_tdp(tdp)
        evaluator.evaluate(predictor)
    self.assertLess(evaluator.evaluate(predictor), 0.1)
def _test_trainer_sarsa_factorized(self, use_gpu=False, use_all_avail_gpus=False):
    """Factorized-SARSA harness; strict tolerance checking is disabled here."""
    self.check_tolerance = False
    self.tolerance_threshold = 0.15
    env = GridworldContinuous()
    trainer, exporter = self.get_sarsa_trainer_exporter(
        env,
        self.get_sarsa_parameters_factorized(),
        use_gpu,
        use_all_avail_gpus,
    )
    evaluator = GridworldContinuousEvaluator(env, False, DISCOUNT)
    self.evaluate_gridworld(env, evaluator, trainer, exporter, use_gpu)
def _test_trainer_sarsa(self, use_gpu=False, use_all_avail_gpus=False):
    """Modular SARSA harness; multi-GPU runs get a looser tolerance."""
    env = GridworldContinuous()
    evaluator = GridworldContinuousEvaluator(
        env, assume_optimal_policy=False, gamma=DISCOUNT
    )
    if use_all_avail_gpus:
        self.tolerance_threshold = 0.11
    trainer, exporter = self.get_modular_sarsa_trainer_exporter(
        env, None, use_gpu, use_all_avail_gpus
    )
    self.evaluate_gridworld(env, evaluator, trainer, exporter, use_gpu)
def test_evaluator_timeline(self):
    """TD and MC losses should end below 0.2 after one training pass."""
    env = GridworldContinuous()
    (states, actions, rewards, next_states, next_actions, is_terminal,
     possible_next_actions, reward_timelines) = env.generate_samples(100000, 1.0)
    trainer = self.get_sarsa_trainer(env)
    evaluator = Evaluator(trainer, DISCOUNT)
    tdps = env.preprocess_samples(
        states,
        actions,
        rewards,
        next_states,
        next_actions,
        is_terminal,
        possible_next_actions,
        reward_timelines,
        self.minibatch_size,
    )
    for tdp in tdps:
        trainer.train_numpy(tdp, evaluator)
    self.assertLess(evaluator.td_loss[-1], 0.2)
    self.assertLess(evaluator.mc_loss[-1], 0.2)
def _test_sac_trainer(self, use_gpu=False, **kwargs):
    """SAC harness; also exercises the actor predictor end to end."""
    env = GridworldContinuous()
    trainer = self.get_sac_trainer(env, use_gpu, **kwargs)
    evaluator = GridworldContinuousEvaluator(
        env, assume_optimal_policy=False, gamma=DISCOUNT
    )
    self.evaluate_gridworld(env, evaluator, trainer, use_gpu)
    # Make sure actor predictor works
    actor_predictor = self.get_actor_predictor(trainer, env)
    # Just test that it doesn't blow up
    preds = actor_predictor.predict(evaluator.logged_states)
    self._test_save_load_actor(preds, actor_predictor, evaluator.logged_states)
def _test_ddpg_trainer(self, use_gpu=False, use_all_avail_gpus=False):
    """DDPG harness; strict tolerance checking is disabled for this model."""
    self.check_tolerance = False
    self.tolerance_threshold = 1.0
    env = GridworldContinuous()
    trainer = DDPGTrainer(
        self.get_ddpg_parameters(),
        env.normalization,
        env.normalization_action,
        env.min_action_range,
        env.max_action_range,
        use_gpu=use_gpu,
        use_all_avail_gpus=use_all_avail_gpus,
    )
    evaluator = GridworldDDPGEvaluator(env, DISCOUNT)
    # The trainer is passed in the exporter slot as well.
    self.evaluate_gridworld(env, evaluator, trainer, trainer, use_gpu)
def preprocess_samples(
    self,
    states: List[Dict[str, float]],
    actions: List[Dict[str, float]],
    rewards: List[float],
    next_states: List[Dict[str, float]],
    next_actions: List[Dict[str, float]],
    is_terminals: List[bool],
    possible_next_actions: List[List[Dict[str, float]]],
    reward_timelines: List[Dict[int, float]],
) -> TrainingDataPage:
    """Preprocess samples, collapsing one-hot state matrices to index columns."""
    tdp = GridworldContinuous.preprocess_samples(
        self,
        states,
        actions,
        rewards,
        next_states,
        next_actions,
        is_terminals,
        possible_next_actions,
        reward_timelines,
    )

    def _one_hot_to_index(matrix):
        # Column index of the single 1.0 entry per row, as a float32 column vector.
        return np.where(matrix == 1.0)[1].reshape(-1, 1).astype(np.float32)

    tdp.states = _one_hot_to_index(tdp.states)
    tdp.next_states = _one_hot_to_index(tdp.next_states)
    return tdp
def generate_samples(
    self, num_transitions, epsilon, with_possible=True
) -> Tuple[List[Dict[str, float]], List[Dict[str, float]], List[float],
           List[Dict[str, float]], List[Dict[str, float]], List[bool],
           List[List[Dict[str, float]]], List[Dict[int, float]]]:
    """Generate samples, re-encoding each state as a {'0': feature_id} enum dict."""
    (states, actions, rewards, next_states, next_actions, is_terminals,
     possible_next_actions, reward_timelines) = GridworldContinuous.generate_samples(
        self, num_transitions, epsilon, with_possible
    )
    # Each state dict has a single key: the active feature id.
    enum_states = [{'0': float(list(s.keys())[0])} for s in states]
    enum_next_states = [{'0': float(list(s.keys())[0])} for s in next_states]
    return (
        enum_states,
        actions,
        rewards,
        enum_next_states,
        next_actions,
        is_terminals,
        possible_next_actions,
        reward_timelines,
    )
def _test_td3_trainer(self, use_gpu=False, **kwargs):
    """TD3 harness: evaluate each Q-network in turn, then the actor predictor."""
    env = GridworldContinuous()
    trainer = self.get_td3_trainer(env, self.get_td3_parameters(**kwargs), use_gpu)
    evaluator = GridworldContinuousEvaluator(
        env, assume_optimal_policy=False, gamma=DISCOUNT
    )
    self.current_predictor_network = trainer.q1_network
    self.evaluate_gridworld(env, evaluator, trainer, use_gpu)
    if trainer.q2_network is not None:
        self.current_predictor_network = trainer.q2_network
        self.evaluate_gridworld(env, evaluator, trainer, use_gpu)
    # Make sure actor predictor works
    actor_predictor = self.get_actor_predictor(trainer, env)
    preds = actor_predictor.predict(evaluator.logged_states)
    self._test_save_load_actor(preds, actor_predictor, evaluator.logged_states)
def _test_td3_trainer(self, use_gpu=False, **kwargs):
    """TD3 harness using an exporter per critic; also checks the actor predictor."""
    env = GridworldContinuous()
    trainer = self.get_td3_trainer(env, self.get_td3_parameters(**kwargs), use_gpu)
    evaluator = GridworldContinuousEvaluator(
        env, assume_optimal_policy=False, gamma=DISCOUNT
    )
    exporter1, exporter2 = self.get_critic_exporter(trainer, env)
    self.evaluate_gridworld(env, evaluator, trainer, exporter1, use_gpu)
    if exporter2:
        self.evaluate_gridworld(env, evaluator, trainer, exporter2, use_gpu)
    # Make sure actor predictor works
    actor_predictor = self.get_actor_predictor(trainer, env)
    preds = actor_predictor.predict(evaluator.logged_states)
    self._test_save_load_actor(preds, actor_predictor, evaluator.logged_states)