def test_trainer_sarsa(self):
    environment = Gridworld()
    samples = environment.generate_samples(150000, 1.0)
    evaluator = GridworldEvaluator(environment, False, DISCOUNT, False, samples)
    trainer = self.get_sarsa_trainer(environment)
    predictor = trainer.predictor()
    tdps = environment.preprocess_samples(samples, self.minibatch_size)

    evaluator.evaluate(predictor)
    print(
        "Pre-Training eval: ",
        evaluator.mc_loss[-1],
        evaluator.value_doubly_robust[-1],
    )

    for _ in range(2):
        for tdp in tdps:
            trainer.train_numpy(tdp, None)

    evaluator.evaluate(predictor)
    print(
        "Post-Training eval: ",
        evaluator.mc_loss[-1],
        evaluator.value_doubly_robust[-1],
    )
    self.assertLess(evaluator.mc_loss[-1], 0.1)
def test_evaluator_ground_truth(self):
    environment = Gridworld()
    states, actions, rewards, next_states, next_actions, is_terminal, \
        possible_next_actions, _ = environment.generate_samples(100000, 1.0)
    true_values = environment.true_values_for_sample(states, actions, False)
    # Hijack the reward timeline to insert the ground truth
    reward_timelines = []
    for tv in true_values:
        reward_timelines.append({0: tv})
    trainer = self.get_sarsa_trainer(environment)
    evaluator = Evaluator(trainer, DISCOUNT)
    tdps = environment.preprocess_samples(
        states,
        actions,
        rewards,
        next_states,
        next_actions,
        is_terminal,
        possible_next_actions,
        reward_timelines,
        self.minibatch_size,
    )
    for tdp in tdps:
        trainer.stream_tdp(tdp, evaluator)

    self.assertLess(evaluator.td_loss[-1], 0.05)
    self.assertLess(evaluator.mc_loss[-1], 0.05)
def test_trainer_sarsa(self):
    environment = Gridworld()
    states, actions, rewards, next_states, next_actions, is_terminal, \
        possible_next_actions, reward_timelines = \
        environment.generate_samples(100000, 1.0)
    evaluator = GridworldEvaluator(environment, False)
    trainer = self.get_sarsa_trainer(environment)
    predictor = trainer.predictor()
    tdps = environment.preprocess_samples(
        states,
        actions,
        rewards,
        next_states,
        next_actions,
        is_terminal,
        possible_next_actions,
        reward_timelines,
        self.minibatch_size,
    )

    # Before training, the predictor's error should be large.
    self.assertGreater(evaluator.evaluate(predictor), 0.15)

    for tdp in tdps:
        trainer.stream_tdp(tdp, None)

    # After a pass over the data, the evaluation error should be small.
    self.assertLess(evaluator.evaluate(predictor), 0.05)
def test_reward_boost(self):
    environment = Gridworld()
    reward_boost = {'L': 100, 'R': 200, 'U': 300, 'D': 400}
    trainer = self.get_sarsa_trainer_reward_boost(environment, reward_boost)
    predictor = trainer.predictor()
    states, actions, rewards, next_states, next_actions, is_terminal, \
        possible_next_actions, reward_timelines = \
        environment.generate_samples(100000, 1.0)
    # Subtract the boost from the logged rewards so that the trainer's
    # per-action reward boost restores the original reward signal.
    rewards_update = []
    for action, reward in zip(actions, rewards):
        rewards_update.append(reward - reward_boost[action])
    evaluator = GridworldEvaluator(environment, False)
    tdps = environment.preprocess_samples(
        states,
        actions,
        rewards_update,
        next_states,
        next_actions,
        is_terminal,
        possible_next_actions,
        reward_timelines,
        self.minibatch_size,
    )

    self.assertGreater(evaluator.evaluate(predictor), 0.15)

    for tdp in tdps:
        trainer.train_numpy(tdp, None)

    self.assertLess(evaluator.evaluate(predictor), 0.05)
def generate_samples(
    self, num_transitions, epsilon, with_possible=True
) -> Samples:
    samples = Gridworld.generate_samples(
        self, num_transitions, epsilon, with_possible
    )
    # Re-encode each sparse one-hot state dict as a single enum-valued
    # feature: feature id 0 holds the state index.
    enum_states = []
    for state in samples.states:
        enum_states.append({0: float(list(state.keys())[0])})
    enum_next_states = []
    for state in samples.next_states:
        enum_next_states.append({0: float(list(state.keys())[0])})
    return Samples(
        mdp_ids=samples.mdp_ids,
        sequence_numbers=samples.sequence_numbers,
        states=enum_states,
        actions=samples.actions,
        propensities=samples.propensities,
        rewards=samples.rewards,
        next_states=enum_next_states,
        next_actions=samples.next_actions,
        terminals=samples.terminals,
        possible_next_actions=samples.possible_next_actions,
        reward_timelines=samples.reward_timelines,
    )
def test_reward_boost(self): environment = Gridworld() reward_boost = {"L": 100, "R": 200, "U": 300, "D": 400} trainer = self.get_sarsa_trainer_reward_boost(environment, reward_boost) predictor = trainer.predictor() samples = environment.generate_samples(150000, 1.0) rewards_update = [] for action, reward in zip(samples.actions, samples.rewards): rewards_update.append(reward - reward_boost[action]) samples.rewards = rewards_update evaluator = GridworldEvaluator(environment, False, DISCOUNT, False, samples) tdps = environment.preprocess_samples(samples, self.minibatch_size) evaluator.evaluate(predictor) print( "Pre-Training eval: ", evaluator.mc_loss[-1], evaluator.value_doubly_robust[-1], ) for _ in range(2): for tdp in tdps: trainer.train_numpy(tdp, None) evaluator.evaluate(predictor) print( "Post-Training eval: ", evaluator.mc_loss[-1], evaluator.value_doubly_robust[-1], ) self.assertLess(evaluator.mc_loss[-1], 0.1)
def test_gridworld_generate_samples(self):
    env = Gridworld()
    num_samples = 1000
    num_steps = 5
    samples = env.generate_samples(
        num_samples, epsilon=1.0, discount_factor=0.9, multi_steps=num_steps
    )
    self._check_samples(samples, num_samples, num_steps, False)
def test_evaluator_timeline(self):
    environment = Gridworld()
    samples = environment.generate_samples(100000, 1.0)
    trainer = self.get_sarsa_trainer(environment)
    evaluator = Evaluator(1)
    tdps = environment.preprocess_samples(samples, self.minibatch_size)
    for tdp in tdps:
        trainer.train_numpy(tdp, evaluator)

    self.assertLess(evaluator.td_loss[-1], 0.2)
    self.assertLess(evaluator.mc_loss[-1], 0.2)
def test_sequential_doubly_robust(self):
    """Both the logged and the model policies are epsilon-greedy policies
    where greedy = optimal, but they use different epsilon values. We test
    a variety of epsilon pairs to check the estimator's ability to evaluate
    model policies that differ substantially from the logged policies that
    generated the data. By computing the true values of both epsilon
    policies, we can measure the estimator's percentage error.
    """
    environment = Gridworld()
    sequential_dr = SequentialDoublyRobustEstimator(DISCOUNT)
    epsilon_test_pairs = [
        [1.0, 0.05],
        [0.8, 0.2],
        [0.6, 0.4],
        [0.5, 0.5],
        [0.4, 0.6],
        [0.2, 0.8],
        [0.05, 1.0],
    ]
    for epsilon_pair in epsilon_test_pairs:
        epsilon_logged = epsilon_pair[0]
        epsilon_model = epsilon_pair[1]

        samples_logged = environment.generate_samples(
            10000, epsilon_logged, DISCOUNT
        )
        edp = self.create_edp(environment, samples_logged, epsilon_model)
        cpe_sequential_dr = sequential_dr.estimate(edp)

        true_logged_value = environment.true_q_epsilon_values(
            DISCOUNT, epsilon_logged
        )
        true_model_value = environment.true_q_epsilon_values(
            DISCOUNT, epsilon_model
        )
        ratio = true_model_value[0] / true_logged_value[0]
        percent_err = (cpe_sequential_dr.normalized - ratio) / ratio * 100

        logger.info(
            "Sequential DR: epsilon_pair = ({}, {});\n"
            "true ratio = {}, computed ratio = {}, percent error = {}.".format(
                epsilon_logged,
                epsilon_model,
                ratio,
                cpe_sequential_dr.normalized,
                percent_err,
            )
        )
        self.assertLessEqual(np.absolute(percent_err), 100)
        self.assertLessEqual(
            cpe_sequential_dr.normalized_std_error, cpe_sequential_dr.normalized
        )
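# Illustrative sketch (not part of the test suite): how the percent error in
# test_sequential_doubly_robust above is derived for a single epsilon pair.
# The helper name and the numbers below are hypothetical; in the test, the
# true values come from environment.true_q_epsilon_values(...) and the
# estimate from SequentialDoublyRobustEstimator.estimate(edp).normalized.
def _percent_error_sketch():
    true_logged_value = 2.0  # hypothetical true value of the logged (behavior) policy
    true_model_value = 2.5   # hypothetical true value of the model (target) policy
    estimated_ratio = 1.20   # hypothetical normalized estimate from the DR estimator

    # The test compares the estimator's normalized value against the
    # model-to-logged ratio of the true values.
    true_ratio = true_model_value / true_logged_value                # 1.25
    percent_err = (estimated_ratio - true_ratio) / true_ratio * 100  # -4.0
    return percent_err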
def test_gridworld_generate_samples(self):
    env = Gridworld()
    num_samples = 1000
    num_steps = 5
    samples = env.generate_samples(
        num_samples,
        epsilon=1.0,
        discount_factor=0.9,
        multi_steps=num_steps,
        include_shorter_samples_at_start=True,
        include_shorter_samples_at_end=True,
    )
    self._check_samples(samples, num_samples, num_steps, False)
def test_trainer_maxq(self):
    environment = Gridworld()
    maxq_sarsa_parameters = DiscreteActionModelParameters(
        actions=environment.ACTIONS,
        rl=RLParameters(
            gamma=DISCOUNT,
            target_update_rate=0.5,
            reward_burnin=10,
            maxq_learning=True,
        ),
        training=TrainingParameters(
            layers=[-1, 1],
            activations=["linear"],
            minibatch_size=self.minibatch_size,
            learning_rate=0.01,
            optimizer="ADAM",
        ),
    )
    # Construct a new trainer that uses max-Q learning.
    maxq_trainer = DiscreteActionTrainer(
        maxq_sarsa_parameters, environment.normalization
    )
    samples = environment.generate_samples(100000, 1.0)
    predictor = maxq_trainer.predictor()
    tdps = environment.preprocess_samples(samples, self.minibatch_size)
    evaluator = GridworldEvaluator(environment, True)

    evaluator.evaluate(predictor)
    print(
        "Pre-Training eval: ",
        evaluator.mc_loss[-1],
        evaluator.reward_doubly_robust[-1],
    )
    self.assertGreater(evaluator.mc_loss[-1], 0.3)

    for _ in range(5):
        for tdp in tdps:
            maxq_trainer.train_numpy(tdp, None)

    evaluator.evaluate(predictor)
    print(
        "Post-Training eval: ",
        evaluator.mc_loss[-1],
        evaluator.reward_doubly_robust[-1],
    )
    self.assertLess(evaluator.mc_loss[-1], 0.1)
    self.assertGreater(
        evaluator.reward_doubly_robust[-1], evaluator.reward_doubly_robust[-2]
    )
def _test_reward_boost(self, use_gpu=False, use_all_avail_gpus=False):
    environment = Gridworld()
    reward_boost = {"L": 100, "R": 200, "U": 300, "D": 400}
    trainer = self.get_sarsa_trainer_reward_boost(
        environment,
        reward_boost,
        False,
        use_gpu=use_gpu,
        use_all_avail_gpus=use_all_avail_gpus,
    )
    predictor = trainer.predictor()
    samples = environment.generate_samples(100000, 1.0, DISCOUNT)
    rewards_update = []
    for action, reward in zip(samples.actions, samples.rewards):
        rewards_update.append(reward - reward_boost[action])
    samples.rewards = rewards_update
    evaluator = GridworldEvaluator(environment, False, DISCOUNT, False, samples)
    tdps = environment.preprocess_samples(
        samples, self.minibatch_size, use_gpu=use_gpu
    )

    with tempfile.TemporaryDirectory() as tmpdirname:
        tmp_path = os.path.join(tmpdirname, "model")
        predictor.save(tmp_path, "minidb")
        new_predictor = DQNPredictor.load(tmp_path, "minidb", False)
        evaluator.evaluate(new_predictor)
    print(
        "Pre-Training eval: ",
        evaluator.mc_loss[-1],
        evaluator.value_doubly_robust[-1],
    )

    for tdp in tdps:
        trainer.train(tdp, None)

    predictor = trainer.predictor()
    with tempfile.TemporaryDirectory() as tmpdirname:
        tmp_path = os.path.join(tmpdirname, "model")
        predictor.save(tmp_path, "minidb")
        new_predictor = DQNPredictor.load(tmp_path, "minidb", False)
        evaluator.evaluate(new_predictor)
    print(
        "Post-Training eval: ",
        evaluator.mc_loss[-1],
        evaluator.value_doubly_robust[-1],
    )
    self.assertLess(evaluator.mc_loss[-1], 0.1)
def test_evaluator_ground_truth_no_dueling(self):
    environment = Gridworld()
    samples = environment.generate_samples(500000, 1.0, DISCOUNT)
    true_values = environment.true_values_for_sample(
        samples.states, samples.actions, False
    )
    # Hijack the episode values to insert the ground truth
    samples.episode_values = true_values
    trainer = self.get_sarsa_trainer(environment, False)
    evaluator = Evaluator(environment.ACTIONS, 10, DISCOUNT, None, None)
    tdps = environment.preprocess_samples(samples, self.minibatch_size)
    for tdp in tdps:
        trainer.train(tdp, evaluator)

    self.assertLess(evaluator.mc_loss[-1], 0.1)
def test_trainer_maxq(self):
    environment = Gridworld()
    maxq_sarsa_parameters = DiscreteActionModelParameters(
        actions=environment.ACTIONS,
        rl=RLParameters(
            gamma=DISCOUNT,
            target_update_rate=0.5,
            reward_burnin=10,
            maxq_learning=True,
        ),
        training=TrainingParameters(
            layers=[-1, 1],
            activations=['linear'],
            minibatch_size=self.minibatch_size,
            learning_rate=0.01,
            optimizer='ADAM',
        ),
    )
    # Construct a new trainer that uses max-Q learning.
    maxq_trainer = DiscreteActionTrainer(
        maxq_sarsa_parameters,
        environment.normalization,
    )
    states, actions, rewards, next_states, next_actions, is_terminal, \
        possible_next_actions, reward_timelines = \
        environment.generate_samples(100000, 1.0)
    predictor = maxq_trainer.predictor()
    tdps = environment.preprocess_samples(
        states,
        actions,
        rewards,
        next_states,
        next_actions,
        is_terminal,
        possible_next_actions,
        reward_timelines,
        self.minibatch_size,
    )
    evaluator = GridworldEvaluator(environment, True)

    # Evaluate once and reuse the score rather than re-running evaluation.
    pre_train_score = evaluator.evaluate(predictor)
    print("Pre-Training eval", pre_train_score)
    self.assertGreater(pre_train_score, 0.3)

    for _ in range(2):
        for tdp in tdps:
            maxq_trainer.stream_tdp(tdp, None)

    post_train_score = evaluator.evaluate(predictor)
    print("Post-Training eval", post_train_score)
    self.assertLess(post_train_score, 0.1)
def test_evaluator_ground_truth(self):
    environment = Gridworld()
    samples = environment.generate_samples(200000, 1.0)
    true_values = environment.true_values_for_sample(
        samples.states, samples.actions, False
    )
    # Hijack the reward timeline to insert the ground truth
    samples.reward_timelines = []
    for tv in true_values:
        samples.reward_timelines.append({0: tv})
    trainer = self.get_sarsa_trainer(environment)
    evaluator = Evaluator(environment.ACTIONS, 10, DISCOUNT, None, None)
    tdps = environment.preprocess_samples(samples, self.minibatch_size)
    for _ in range(2):
        for tdp in tdps:
            trainer.train_numpy(tdp, evaluator)

    self.assertLess(evaluator.mc_loss[-1], 0.1)
def generate_samples(
    self, num_transitions, epsilon, with_possible=True
) -> Tuple[
    List[Dict[int, float]],
    List[str],
    List[float],
    List[Dict[int, float]],
    List[str],
    List[bool],
    List[List[str]],
    List[Dict[int, float]],
]:
    states, actions, rewards, next_states, next_actions, is_terminals, \
        possible_next_actions, reward_timelines = Gridworld.generate_samples(
            self, num_transitions, epsilon, with_possible
        )
    enum_states = []
    for state in states:
        enum_states.append({0: float(list(state.keys())[0])})
    enum_next_states = []
    for state in next_states:
        enum_next_states.append({0: float(list(state.keys())[0])})
    return (
        enum_states,
        actions,
        rewards,
        enum_next_states,
        next_actions,
        is_terminals,
        possible_next_actions,
        reward_timelines,
    )
def test_evaluator_timeline(self):
    environment = Gridworld()
    states, actions, rewards, next_states, next_actions, is_terminal, \
        possible_next_actions, reward_timelines = \
        environment.generate_samples(100000, 1.0)
    trainer = self.get_sarsa_trainer(environment)
    evaluator = Evaluator(trainer, DISCOUNT)
    tdps = environment.preprocess_samples(
        states,
        actions,
        rewards,
        next_states,
        next_actions,
        is_terminal,
        possible_next_actions,
        reward_timelines,
        self.minibatch_size,
    )
    for tdp in tdps:
        trainer.stream_tdp(tdp, evaluator)

    self.assertLess(evaluator.td_loss[-1], 0.2)
    self.assertLess(evaluator.mc_loss[-1], 0.2)
def test_knn_dqn_trainer(self):
    environment = Gridworld()
    samples = environment.generate_samples(200000, 1.0, DISCOUNT)
    evaluator = GridworldEvaluator(environment, False, DISCOUNT, False, samples)

    parameters = self.get_parameters(environment)
    trainer = KNNDQNTrainer(parameters, environment.normalization)

    tdps = environment.preprocess_samples(
        samples, self.minibatch_size, one_hot_action=False
    )

    predictor = trainer.predictor(environment.ACTIONS)
    evaluator.evaluate(predictor)
    print(
        "Pre-Training eval: ",
        evaluator.mc_loss[-1],
        evaluator.value_doubly_robust[-1],
    )
    pre_train_loss = evaluator.mc_loss[-1]

    for tdp in tdps:
        tdp.rewards = tdp.rewards.flatten()
        tdp.not_terminals = tdp.not_terminals.flatten()
        trainer.train(tdp)

    predictor = trainer.predictor(environment.ACTIONS)
    evaluator.evaluate(predictor)
    print(
        "Post-Training eval: ",
        evaluator.mc_loss[-1],
        evaluator.value_doubly_robust[-1],
    )
    self.assertLess(evaluator.mc_loss[-1], pre_train_loss)
def generate_samples(
    self, num_transitions, epsilon, discount_factor
) -> Samples:
    samples = Gridworld.generate_samples(
        self, num_transitions, epsilon, discount_factor
    )
    enum_states = []
    for state in samples.states:
        enum_states.append({0: float(list(state.keys())[0])})
    enum_next_states = []
    for state in samples.next_states:
        enum_next_states.append({0: float(list(state.keys())[0])})
    return Samples(
        mdp_ids=samples.mdp_ids,
        sequence_numbers=samples.sequence_numbers,
        states=enum_states,
        actions=samples.actions,
        propensities=samples.propensities,
        rewards=samples.rewards,
        possible_actions=samples.possible_actions,
        next_states=enum_next_states,
        next_actions=samples.next_actions,
        terminals=samples.terminals,
        possible_next_actions=samples.possible_next_actions,
        episode_values=samples.episode_values,
    )
def test_gridworld_generate_samples(self):
    env = Gridworld()
    num_samples = 1000
    num_steps = 5
    samples = env.generate_samples(
        num_samples, epsilon=1.0, discount_factor=0.9, multi_steps=num_steps
    )

    for i in range(num_samples):
        if samples.terminals[i][0]:
            break
        if i < num_samples - 1:
            self.assertEqual(samples.mdp_ids[i], samples.mdp_ids[i + 1])
            self.assertEqual(
                samples.sequence_numbers[i] + 1, samples.sequence_numbers[i + 1]
            )
        # Within an episode, step j of the multi-step record at index i must
        # match step 0 of the record at index i + j.
        for j in range(len(samples.terminals[i])):
            self.assertEqual(samples.rewards[i][j], samples.rewards[i + j][0])
            self.assertDictEqual(
                samples.next_states[i][j], samples.next_states[i + j][0]
            )
            self.assertEqual(
                samples.next_actions[i][j], samples.next_actions[i + j][0]
            )
            self.assertEqual(samples.terminals[i][j], samples.terminals[i + j][0])
            self.assertListEqual(
                samples.possible_next_actions[i][j],
                samples.possible_next_actions[i + j][0],
            )
            if samples.terminals[i][j]:
                continue
            # For non-terminal steps, the j-th next state/action must match
            # the state/action of the record at index i + j + 1.
            self.assertDictEqual(
                samples.next_states[i][j], samples.states[i + j + 1]
            )
            self.assertEqual(
                samples.next_actions[i][j], samples.actions[i + j + 1]
            )
            self.assertListEqual(
                samples.possible_next_actions[i][j],
                samples.possible_actions[i + j + 1],
            )

    # The converted single-step samples must agree with step 0 of the
    # multi-step samples.
    single_step_samples = samples.to_single_step()
    for i in range(num_samples):
        if single_step_samples.terminals[i] is True:
            break
        self.assertEqual(single_step_samples.mdp_ids[i], samples.mdp_ids[i])
        self.assertEqual(
            single_step_samples.sequence_numbers[i], samples.sequence_numbers[i]
        )
        self.assertDictEqual(single_step_samples.states[i], samples.states[i])
        self.assertEqual(single_step_samples.actions[i], samples.actions[i])
        self.assertEqual(
            single_step_samples.action_probabilities[i],
            samples.action_probabilities[i],
        )
        self.assertEqual(single_step_samples.rewards[i], samples.rewards[i][0])
        self.assertListEqual(
            single_step_samples.possible_actions[i], samples.possible_actions[i]
        )
        self.assertDictEqual(
            single_step_samples.next_states[i], samples.next_states[i][0]
        )
        self.assertEqual(
            single_step_samples.next_actions[i], samples.next_actions[i][0]
        )
        self.assertEqual(single_step_samples.terminals[i], samples.terminals[i][0])
        self.assertListEqual(
            single_step_samples.possible_next_actions[i],
            samples.possible_next_actions[i][0],
        )