def test_no_soft_update(self):
    model = Model()
    target_model = copy.deepcopy(model)

    # deepcopy creates distinct parameter tensors, so the objects must differ
    for target_param, param in zip(target_model.parameters(), model.parameters()):
        self.assertIsNot(target_param, param)

    optimizer = torch.optim.Adam(model.parameters())
    x = torch.tensor([1, 2], dtype=torch.int64)
    emb = model(x)
    loss = emb.sum()
    loss.backward()
    optimizer.step()

    params = list(model.parameters())
    self.assertEqual(1, len(params))
    param = params[0].detach().numpy()

    trainer = RLTrainer(rl_parameters=RLParameters(), use_gpu=False)
    # With tau=1.0 the "soft" update degenerates to a hard copy, so the target
    # parameters must match the trained parameters exactly.
    trainer._soft_update(model, target_model, 1.0)

    target_params = list(target_model.parameters())
    self.assertEqual(1, len(target_params))
    target_param = target_params[0].detach().numpy()

    npt.assert_array_equal(target_param, param)
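# For reference, a minimal, self-contained sketch of the soft-update (Polyak
# averaging) rule the test above exercises:
#     target <- tau * source + (1 - tau) * target
# This is the conventional formulation, not necessarily the exact body of
# RLTrainer._soft_update.
import copy

import torch
import torch.nn as nn


def soft_update_sketch(source: nn.Module, target: nn.Module, tau: float) -> None:
    """Blend source parameters into target parameters in place."""
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.mul_(1.0 - tau).add_(tau * s_param)


_net = nn.Linear(4, 2)
_target_net = copy.deepcopy(_net)
soft_update_sketch(_net, _target_net, tau=1.0)  # tau=1.0 is a hard copy
assert all(
    torch.equal(t, s) for t, s in zip(_target_net.parameters(), _net.parameters())
)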
def get_sarsa_parameters(
    self, environment, reward_shape, dueling, categorical, quantile, clip_grad_norm
):
    rl_parameters = RLParameters(
        gamma=DISCOUNT,
        target_update_rate=1.0,
        maxq_learning=False,
        reward_boost=reward_shape,
    )
    training_parameters = TrainingParameters(
        layers=[-1, 128, -1] if dueling else [-1, -1],
        activations=["relu", "relu"] if dueling else ["linear"],
        minibatch_size=self.minibatch_size,
        learning_rate=0.05,
        optimizer="ADAM",
        clip_grad_norm=clip_grad_norm,
    )
    return DiscreteActionModelParameters(
        actions=environment.ACTIONS,
        rl=rl_parameters,
        training=training_parameters,
        rainbow=RainbowDQNParameters(
            double_q_learning=True,
            dueling_architecture=dueling,
            categorical=categorical,
            quantile=quantile,
            num_atoms=5,
        ),
    )
def get_sarsa_parameters(self) -> ParametricDQNTrainerParameters:
    return ParametricDQNTrainerParameters(  # type: ignore
        rl=RLParameters(gamma=DISCOUNT, target_update_rate=1.0, maxq_learning=False),
        minibatch_size=self.minibatch_size,
        optimizer=OptimizerParameters(learning_rate=0.05, optimizer="ADAM"),
        double_q_learning=True,
    )
def get_sarsa_parameters(self) -> ParametricDQNTrainerParameters:
    # pyre-fixme[28]: Unexpected keyword argument `rl`.
    return ParametricDQNTrainerParameters(
        rl=RLParameters(gamma=DISCOUNT, target_update_rate=1.0, maxq_learning=False),
        minibatch_size=self.minibatch_size,
        optimizer=OptimizerParameters(learning_rate=0.05, optimizer="ADAM"),
        double_q_learning=True,
    )
def test_minibatches_per_step(self):
    _epochs = self.epochs
    self.epochs = 2
    rl_parameters = RLParameters(
        gamma=0.95, target_update_rate=0.9, maxq_learning=True
    )
    rainbow_parameters = RainbowDQNParameters(
        double_q_learning=True, dueling_architecture=False
    )
    training_parameters1 = TrainingParameters(
        layers=self.layers,
        activations=self.activations,
        minibatch_size=1024,
        minibatches_per_step=1,
        learning_rate=0.25,
        optimizer="ADAM",
    )
    training_parameters2 = TrainingParameters(
        layers=self.layers,
        activations=self.activations,
        minibatch_size=128,
        minibatches_per_step=8,
        learning_rate=0.25,
        optimizer="ADAM",
    )
    env1 = Env(self.state_dims, self.action_dims)
    env2 = Env(self.state_dims, self.action_dims)
    model_parameters1 = DiscreteActionModelParameters(
        actions=env1.actions,
        rl=rl_parameters,
        rainbow=rainbow_parameters,
        training=training_parameters1,
    )
    model_parameters2 = DiscreteActionModelParameters(
        actions=env2.actions,
        rl=rl_parameters,
        rainbow=rainbow_parameters,
        training=training_parameters2,
    )

    # Dividing minibatch_size by 8 while multiplying minibatches_per_step by 8
    # should give the same result.
    logger.info("Training model 1")
    trainer1 = self._train(model_parameters1, env1)
    SummaryWriterContext._reset_globals()
    logger.info("Training model 2")
    trainer2 = self._train(model_parameters2, env2)

    weight1 = trainer1.q_network.fc.dnn[-2].weight.detach().numpy()
    weight2 = trainer2.q_network.fc.dnn[-2].weight.detach().numpy()

    # Due to numerical stability this tolerance has to be fairly high
    self.assertTrue(np.allclose(weight1, weight2, rtol=0.0, atol=1e-3))
    self.epochs = _epochs
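# A minimal sketch of the idea the test above checks: accumulating gradients over
# eight 128-sample minibatches before a single optimizer step matches one step on a
# single 1024-sample batch. SGD and explicit loss scaling are used here to make the
# equivalence exact; this illustrates the presumed semantics of
# minibatches_per_step rather than ReAgent's implementation.
import torch
import torch.nn as nn

torch.manual_seed(0)
x = torch.randn(1024, 8)
y = torch.randn(1024, 1)

model_a = nn.Linear(8, 1)
model_b = nn.Linear(8, 1)
model_b.load_state_dict(model_a.state_dict())

opt_a = torch.optim.SGD(model_a.parameters(), lr=0.25)
opt_b = torch.optim.SGD(model_b.parameters(), lr=0.25)
loss_fn = nn.MSELoss(reduction="mean")

# One step on the full 1024-sample batch.
opt_a.zero_grad()
loss_fn(model_a(x), y).backward()
opt_a.step()

# Eight 128-sample minibatches accumulated into a single step. Each minibatch loss
# is scaled by 1/8 so the accumulated gradient equals the full-batch mean gradient.
opt_b.zero_grad()
for xb, yb in zip(x.split(128), y.split(128)):
    (loss_fn(model_b(xb), yb) / 8).backward()
opt_b.step()

assert torch.allclose(model_a.weight, model_b.weight, atol=1e-6)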
def get_td3_parameters(self, use_2_q_functions=False):
    return TD3ModelParameters(
        rl=RLParameters(gamma=DISCOUNT, target_update_rate=0.01),
        training=TD3TrainingParameters(
            minibatch_size=self.minibatch_size,
            use_2_q_functions=use_2_q_functions,
            q_network_optimizer=OptimizerParameters(),
            actor_network_optimizer=OptimizerParameters(),
        ),
        q_network=FeedForwardParameters(
            layers=[128, 64], activations=["relu", "relu"]
        ),
        actor_network=FeedForwardParameters(
            layers=[128, 64], activations=["relu", "relu"]
        ),
    )
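# use_2_q_functions above toggles TD3's twin critics. A minimal sketch of the
# clipped double-Q target those critics are used for, per the standard TD3
# formulation (assumed here; not taken from ReAgent's trainer code):
import torch


def clipped_double_q_target(reward, not_done, gamma, q1_target, q2_target):
    """Bootstrap from the pessimistic (minimum) of the two target critics."""
    return reward + not_done * gamma * torch.min(q1_target, q2_target)


# Toy example with one transition:
target = clipped_double_q_target(
    reward=torch.tensor([1.0]),
    not_done=torch.tensor([1.0]),
    gamma=0.99,
    q1_target=torch.tensor([10.0]),
    q2_target=torch.tensor([9.5]),
)
assert torch.allclose(target, torch.tensor([1.0 + 0.99 * 9.5]))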
def test_trainer_maxq(self):
    env = Env(self.state_dims, self.action_dims)
    maxq_parameters = DiscreteActionModelParameters(
        actions=env.actions,
        rl=RLParameters(gamma=0.95, target_update_rate=0.9, maxq_learning=True),
        rainbow=RainbowDQNParameters(
            double_q_learning=True, dueling_architecture=False
        ),
        training=TrainingParameters(
            layers=self.layers,
            activations=self.activations,
            minibatch_size=1024,
            learning_rate=0.25,
            optimizer="ADAM",
        ),
    )

    # The Q-values should converge to very close to 20
    trainer = self._train(maxq_parameters, env)
    avg_q_value_after_training = torch.mean(trainer.all_action_scores)
    self.assertLess(avg_q_value_after_training, 22)
    self.assertGreater(avg_q_value_after_training, 18)
def get_sac_trainer(
    self,
    env,
    use_gpu,
    use_2_q_functions=False,
    logged_action_uniform_prior=True,
    constrain_action_sum=False,
    use_value_network=True,
    use_alpha_optimizer=True,
    entropy_temperature=None,
):
    q_network_params = FeedForwardParameters(
        layers=[128, 64], activations=["relu", "relu"]
    )
    value_network_params = FeedForwardParameters(
        layers=[128, 64], activations=["relu", "relu"]
    )
    actor_network_params = FeedForwardParameters(
        layers=[128, 64], activations=["relu", "relu"]
    )

    state_dim = get_num_output_features(env.normalization)
    action_dim = get_num_output_features(env.normalization_continuous_action)
    q1_network = FullyConnectedParametricDQN(
        state_dim, action_dim, q_network_params.layers, q_network_params.activations
    )
    q2_network = None
    if use_2_q_functions:
        q2_network = FullyConnectedParametricDQN(
            state_dim,
            action_dim,
            q_network_params.layers,
            q_network_params.activations,
        )

    if constrain_action_sum:
        actor_network = DirichletFullyConnectedActor(
            state_dim,
            action_dim,
            actor_network_params.layers,
            actor_network_params.activations,
        )
    else:
        actor_network = GaussianFullyConnectedActor(
            state_dim,
            action_dim,
            actor_network_params.layers,
            actor_network_params.activations,
        )

    value_network = None
    if use_value_network:
        value_network = FullyConnectedNetwork(
            [state_dim] + value_network_params.layers + [1],
            value_network_params.activations + ["linear"],
        )

    if use_gpu:
        q1_network.cuda()
        if q2_network:
            q2_network.cuda()
        if value_network:
            value_network.cuda()
        actor_network.cuda()

    parameters = SACTrainerParameters(
        rl=RLParameters(gamma=DISCOUNT, target_update_rate=0.5),
        minibatch_size=self.minibatch_size,
        q_network_optimizer=OptimizerParameters(),
        value_network_optimizer=OptimizerParameters(),
        actor_network_optimizer=OptimizerParameters(),
        alpha_optimizer=OptimizerParameters() if use_alpha_optimizer else None,
        entropy_temperature=entropy_temperature,
        logged_action_uniform_prior=logged_action_uniform_prior,
    )
    return SACTrainer(
        q1_network,
        actor_network,
        parameters,
        use_gpu=use_gpu,
        value_network=value_network,
        q2_network=q2_network,
    )
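# For context on use_value_network and entropy_temperature above: a minimal sketch
# of the entropy-regularized value target in the standard SAC formulation (assumed
# here; not lifted from ReAgent's SACTrainer). The value network regresses toward
# the pessimistic twin-Q estimate minus the entropy term.
import torch


def soft_value_target(q1, q2, log_prob, entropy_temperature):
    """Standard SAC soft value target: V(s) ~= min(Q1, Q2)(s, a) - alpha * log pi(a|s)."""
    return torch.min(q1, q2) - entropy_temperature * log_prob


# Toy example: pessimistic Q of 4.0 and log-probability -1.2 at alpha = 0.1.
v = soft_value_target(
    q1=torch.tensor([4.0]),
    q2=torch.tensor([4.5]),
    log_prob=torch.tensor([-1.2]),
    entropy_temperature=0.1,
)
assert torch.allclose(v, torch.tensor([4.12]))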