def test_pure_q_learning_all_cheat(self):
    q_learning_parameters = DiscreteActionModelParameters(
        actions=self._env.ACTIONS,
        rl=self._rl_parameters_all_cheat_maxq,
        training=TrainingParameters(
            layers=[self._env.width * self._env.height, 1],
            activations=['linear'],
            minibatch_size=self.minibatch_size,
            learning_rate=0.05,
            optimizer='SGD',
            lr_policy='fixed',
        )
    )
    trainer = DiscreteActionTrainer(
        q_learning_parameters,
        self._env.normalization,
    )
    predictor = trainer.predictor()

    # Collect an initial batch of transitions with an exploratory policy (epsilon=1).
    policy = _build_policy(self._env, predictor, 1)
    initial_state = self._env.reset()
    iteration_result = _collect_samples(
        self._env, policy, 20000, initial_state
    )
    num_iterations = 50
    for _ in range(num_iterations):
        tdps = self._env.preprocess_samples(
            iteration_result.states,
            iteration_result.actions,
            iteration_result.rewards,
            iteration_result.next_states,
            iteration_result.next_actions,
            iteration_result.is_terminals,
            iteration_result.possible_next_actions,
            None,
            self.minibatch_size,
        )
        for tdp in tdps:
            trainer.train_numpy(tdp, None)
        # Re-collect samples with an epsilon-greedy (epsilon=0.1) policy.
        initial_state = self._env.reset()
        policy = _build_policy(self._env, predictor, 0.1)
        iteration_result = _collect_samples(
            self._env, policy, 20000, initial_state
        )

    # Evaluate the greedy (epsilon=0) policy.
    policy = _build_policy(self._env, predictor, 0)
    initial_state = self._env.reset()
    iteration_result = _collect_samples(
        self._env, policy, 1000, initial_state
    )
    # 100% should be cheat. Will fix in the future.
    self.assertGreater(
        np.sum(np.array(iteration_result.actions) == 'C'), 800
    )
def test_trainer_maxq(self):
    environment = Gridworld()
    maxq_sarsa_parameters = DiscreteActionModelParameters(
        actions=environment.ACTIONS,
        rl=RLParameters(
            gamma=DISCOUNT,
            target_update_rate=0.5,
            reward_burnin=10,
            maxq_learning=True,
        ),
        training=TrainingParameters(
            layers=[-1, 1],
            activations=["linear"],
            minibatch_size=self.minibatch_size,
            learning_rate=0.01,
            optimizer="ADAM",
        ),
    )
    # Construct the new trainer that uses max-Q learning.
    maxq_trainer = DiscreteActionTrainer(
        maxq_sarsa_parameters, environment.normalization
    )

    samples = environment.generate_samples(100000, 1.0)
    predictor = maxq_trainer.predictor()
    tdps = environment.preprocess_samples(samples, self.minibatch_size)
    evaluator = GridworldEvaluator(environment, True)

    evaluator.evaluate(predictor)
    print(
        "Pre-Training eval: ",
        evaluator.mc_loss[-1],
        evaluator.reward_doubly_robust[-1],
    )
    self.assertGreater(evaluator.mc_loss[-1], 0.3)

    for _ in range(5):
        for tdp in tdps:
            maxq_trainer.train_numpy(tdp, None)

    evaluator.evaluate(predictor)
    print(
        "Post-Training eval: ",
        evaluator.mc_loss[-1],
        evaluator.reward_doubly_robust[-1],
    )
    self.assertLess(evaluator.mc_loss[-1], 0.1)
    self.assertGreater(
        evaluator.reward_doubly_robust[-1],
        evaluator.reward_doubly_robust[-2],
    )
def test_trainer_maxq(self):
    environment = Gridworld()
    maxq_sarsa_parameters = DiscreteActionModelParameters(
        actions=environment.ACTIONS,
        rl=RLParameters(
            gamma=DISCOUNT,
            target_update_rate=0.5,
            reward_burnin=10,
            maxq_learning=True,
        ),
        training=TrainingParameters(
            layers=[-1, 1],
            activations=['linear'],
            minibatch_size=self.minibatch_size,
            learning_rate=0.01,
            optimizer='ADAM',
        ),
    )
    # Construct the new trainer that uses max-Q learning.
    maxq_trainer = DiscreteActionTrainer(
        maxq_sarsa_parameters,
        environment.normalization,
    )
    states, actions, rewards, next_states, next_actions, is_terminal, \
        possible_next_actions, reward_timelines = \
        environment.generate_samples(100000, 1.0)
    predictor = maxq_trainer.predictor()
    tdps = environment.preprocess_samples(
        states,
        actions,
        rewards,
        next_states,
        next_actions,
        is_terminal,
        possible_next_actions,
        reward_timelines,
        self.minibatch_size,
    )
    evaluator = GridworldEvaluator(environment, True)

    print("Pre-Training eval", evaluator.evaluate(predictor))
    self.assertGreater(evaluator.evaluate(predictor), 0.3)

    for _ in range(2):
        for tdp in tdps:
            maxq_trainer.stream_tdp(tdp, None)
        evaluator.evaluate(predictor)

    print("Post-Training eval", evaluator.evaluate(predictor))
    self.assertLess(evaluator.evaluate(predictor), 0.1)
def test_pure_q_learning_all_cheat(self):
    q_learning_parameters = DiscreteActionModelParameters(
        actions=self._env.ACTIONS,
        rl=self._rl_parameters_all_cheat_maxq,
        training=TrainingParameters(
            layers=[self._env.width * self._env.height, 1],
            activations=['linear'],
            minibatch_size=32,
            learning_rate=0.05,
            optimizer='SGD',
            lr_policy='fixed',
        )
    )
    trainer = DiscreteActionTrainer(
        self._env.normalization, q_learning_parameters
    )
    predictor = trainer.predictor()

    # Collect an initial batch of transitions with an exploratory policy (epsilon=1).
    policy = _build_policy(self._env, predictor, 1)
    initial_state = self._env.reset()
    iteration_result = _collect_samples(
        self._env, policy, 10000, initial_state
    )
    num_iterations = 50
    for _ in range(num_iterations):
        policy = _build_policy(self._env, predictor, 0)
        tdp = self._env.preprocess_samples(
            iteration_result.states,
            iteration_result.actions,
            iteration_result.rewards,
            iteration_result.next_states,
            iteration_result.next_actions,
            iteration_result.is_terminals,
            iteration_result.possible_next_actions,
            None,
        )
        trainer.stream_tdp(tdp, None)
        initial_state = iteration_result.current_state
        initial_state = self._env.reset()
        iteration_result = _collect_samples(
            self._env, policy, 10000, initial_state
        )

    self.assertTrue(np.all(np.array(iteration_result.actions) == 'C'))
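# The tests above rely on two module-level helpers, _build_policy and
# _collect_samples, that are defined elsewhere in this test module. The
# sketch below is only a minimal illustration of what such helpers could
# look like: the names build_epsilon_greedy_policy / collect_samples, the
# SamplesResult container, and the env.step() / env.possible_actions() /
# predictor.predict() signatures are assumptions, not this code base's API.

import collections
import random

# Hypothetical container mirroring the fields the tests read off
# iteration_result; the real return type may differ.
SamplesResult = collections.namedtuple(
    "SamplesResult",
    [
        "states",
        "actions",
        "rewards",
        "next_states",
        "next_actions",
        "is_terminals",
        "possible_next_actions",
        "current_state",
    ],
)


def build_epsilon_greedy_policy(env, predictor, epsilon):
    """Return a state -> action function that explores with probability epsilon."""

    def policy(state):
        if random.random() < epsilon:
            return random.choice(env.ACTIONS)
        # Assumed: predict() returns one {action: q_value} dict per input state.
        q_values = predictor.predict([state])[0]
        return max(env.ACTIONS, key=lambda action: q_values[action])

    return policy


def collect_samples(env, policy, num_transitions, initial_state):
    """Roll the policy out, resetting the environment whenever an episode ends."""
    states, actions, rewards = [], [], []
    next_states, next_actions = [], []
    is_terminals, possible_next_actions = [], []

    state = initial_state
    action = policy(state)
    for _ in range(num_transitions):
        # Assumed env.step() signature: returns (next_state, reward, terminal).
        next_state, reward, terminal = env.step(action)
        next_action = policy(next_state)

        states.append(state)
        actions.append(action)
        rewards.append(reward)
        next_states.append(next_state)
        next_actions.append(next_action)
        is_terminals.append(terminal)
        possible_next_actions.append(env.possible_actions(next_state))

        if terminal:
            state = env.reset()
            action = policy(state)
        else:
            state, action = next_state, next_action

    return SamplesResult(
        states,
        actions,
        rewards,
        next_states,
        next_actions,
        is_terminals,
        possible_next_actions,
        state,
    )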