    def test_no_soft_update(self):
        model = Model()
        # Reuse the same module as its own target so the parameters are shared
        target_model = model

        for target_param, param in zip(target_model.parameters(),
                                       model.parameters()):
            self.assertIs(target_param, param)

        optimizer = torch.optim.Adam(model.parameters())

        x = torch.tensor([1, 2], dtype=torch.int64)
        emb = model(x)

        loss = emb.sum()

        loss.backward()
        optimizer.step()

        params = list(model.parameters())
        self.assertEqual(1, len(params))
        param = params[0].detach().numpy()

        trainer = RLTrainer(DiscreteActionModelParameters(rl=RLParameters()),
                            use_gpu=False)
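        # tau=0.1 soft update; since target_model shares its parameters with
        # model, the update leaves the weights unchanged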
        trainer._soft_update(model, target_model, 0.1)

        target_params = list(target_model.parameters())
        self.assertEqual(1, len(target_params))
        target_param = target_params[0].detach().numpy()

        npt.assert_array_equal(target_param, param)
Example #2
 def get_sarsa_parameters(self, environment, reward_shape, dueling,
                          categorical, quantile, clip_grad_norm):
     rl_parameters = RLParameters(
         gamma=DISCOUNT,
         target_update_rate=1.0,
         maxq_learning=False,
         reward_boost=reward_shape,
     )
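     # The -1 layer sizes are placeholders filled in from the environment's
     # state and action dimensions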
     training_parameters = TrainingParameters(
         layers=[-1, 128, -1] if dueling else [-1, -1],
         activations=["relu", "relu"] if dueling else ["linear"],
         minibatch_size=self.minibatch_size,
         learning_rate=0.05,
         optimizer="ADAM",
         clip_grad_norm=clip_grad_norm,
     )
     return DiscreteActionModelParameters(
         actions=environment.ACTIONS,
         rl=rl_parameters,
         training=training_parameters,
         rainbow=RainbowDQNParameters(
             double_q_learning=True,
             dueling_architecture=dueling,
             categorical=categorical,
             quantile=quantile,
             num_atoms=5,
         ),
     )
Example #3
 def get_sac_parameters(
     self,
     use_2_q_functions=False,
     logged_action_uniform_prior=True,
     constrain_action_sum=False,
 ):
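     # constrain_action_sum=True is meant for action vectors on the probability
     # simplex (components sum to 1); see the Dirichlet actor in the SAC trainer
     # example further down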
     return SACModelParameters(
         rl=RLParameters(gamma=DISCOUNT, target_update_rate=0.5),
         training=SACTrainingParameters(
             minibatch_size=self.minibatch_size,
             use_2_q_functions=use_2_q_functions,
             q_network_optimizer=OptimizerParameters(),
             value_network_optimizer=OptimizerParameters(),
             actor_network_optimizer=OptimizerParameters(),
             alpha_optimizer=OptimizerParameters(),
             logged_action_uniform_prior=logged_action_uniform_prior,
         ),
         q_network=FeedForwardParameters(layers=[128, 64],
                                         activations=["relu", "relu"]),
         value_network=FeedForwardParameters(layers=[128, 64],
                                             activations=["relu", "relu"]),
         actor_network=FeedForwardParameters(layers=[128, 64],
                                             activations=["relu", "relu"]),
         constrain_action_sum=constrain_action_sum,
     )
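
 # Parametric (state-action) DQN variant of the SARSA parameters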
 def get_sarsa_parameters(self) -> ParametricDQNTrainerParameters:
     return ParametricDQNTrainerParameters(  # type: ignore
         rl=RLParameters(gamma=DISCOUNT,
                         target_update_rate=1.0,
                         maxq_learning=False),
         minibatch_size=self.minibatch_size,
         optimizer=OptimizerParameters(learning_rate=0.05,
                                       optimizer="ADAM"),
         double_q_learning=True,
     )
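
    # Check that a smaller minibatch with proportionally more minibatches per
    # step trains to (nearly) the same weights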
    def test_minibatches_per_step(self):
        _epochs = self.epochs
        self.epochs = 2
        rl_parameters = RLParameters(gamma=0.95,
                                     target_update_rate=0.9,
                                     maxq_learning=True)
        rainbow_parameters = RainbowDQNParameters(double_q_learning=True,
                                                  dueling_architecture=False)
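        # Both configurations consume the same amount of data per training
        # step: 1024 x 1 vs 128 x 8 samples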
        training_parameters1 = TrainingParameters(
            layers=self.layers,
            activations=self.activations,
            minibatch_size=1024,
            minibatches_per_step=1,
            learning_rate=0.25,
            optimizer="ADAM",
        )
        training_parameters2 = TrainingParameters(
            layers=self.layers,
            activations=self.activations,
            minibatch_size=128,
            minibatches_per_step=8,
            learning_rate=0.25,
            optimizer="ADAM",
        )
        env1 = Env(self.state_dims, self.action_dims)
        env2 = Env(self.state_dims, self.action_dims)
        model_parameters1 = DiscreteActionModelParameters(
            actions=env1.actions,
            rl=rl_parameters,
            rainbow=rainbow_parameters,
            training=training_parameters1,
        )
        model_parameters2 = DiscreteActionModelParameters(
            actions=env2.actions,
            rl=rl_parameters,
            rainbow=rainbow_parameters,
            training=training_parameters2,
        )
        # minibatch_size / 8 combined with minibatches_per_step * 8 should
        # yield the same trained weights
        logger.info("Training model 1")
        trainer1 = self._train(model_parameters1, env1)
        SummaryWriterContext._reset_globals()
        logger.info("Training model 2")
        trainer2 = self._train(model_parameters2, env2)

        weight1 = trainer1.q_network.fc.layers[-1].weight.detach().numpy()
        weight2 = trainer2.q_network.fc.layers[-1].weight.detach().numpy()

        # Because of limited floating-point precision this tolerance has to be fairly loose
        self.assertTrue(np.allclose(weight1, weight2, rtol=0.0, atol=1e-3))
        self.epochs = _epochs
Example #6
 def get_td3_parameters(self, use_2_q_functions=False):
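     # target_update_rate is the soft-update coefficient (tau) for the target
     # networks; use_2_q_functions enables TD3's twin critics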
     return TD3ModelParameters(
         rl=RLParameters(gamma=DISCOUNT, target_update_rate=0.01),
         training=TD3TrainingParameters(
             minibatch_size=self.minibatch_size,
             use_2_q_functions=use_2_q_functions,
             q_network_optimizer=OptimizerParameters(),
             actor_network_optimizer=OptimizerParameters(),
         ),
         q_network=FeedForwardParameters(layers=[128, 64],
                                         activations=["relu", "relu"]),
         actor_network=FeedForwardParameters(layers=[128, 64],
                                             activations=["relu", "relu"]),
     )
Example #7
 def get_sarsa_parameters(self):
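     # maxq_learning=False gives SARSA-style targets that use the logged next
     # action instead of the max over actions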
     return ContinuousActionModelParameters(
         rl=RLParameters(gamma=DISCOUNT,
                         target_update_rate=1.0,
                         maxq_learning=False),
         training=TrainingParameters(
             layers=[-1, 256, 128, -1],
             activations=["relu", "relu", "linear"],
             minibatch_size=self.minibatch_size,
             learning_rate=0.05,
             optimizer="ADAM",
         ),
         rainbow=RainbowDQNParameters(double_q_learning=True,
                                      dueling_architecture=False),
     )
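
    # With maxq_learning=True the trainer should learn Q-values close to the
    # environment's optimal return (about 20 here)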
    def test_trainer_maxq(self):
        env = Env(self.state_dims, self.action_dims)
        maxq_parameters = DiscreteActionModelParameters(
            actions=env.actions,
            rl=RLParameters(gamma=0.95,
                            target_update_rate=0.9,
                            maxq_learning=True),
            rainbow=RainbowDQNParameters(double_q_learning=True,
                                         dueling_architecture=False),
            training=TrainingParameters(
                layers=self.layers,
                activations=self.activations,
                minibatch_size=1024,
                learning_rate=0.25,
                optimizer="ADAM",
            ),
        )

        # The average Q-value should converge to roughly 20
        trainer = self._train(maxq_parameters, env)
        avg_q_value_after_training = torch.mean(trainer.all_action_scores)
        self.assertLess(avg_q_value_after_training, 22)
        self.assertGreater(avg_q_value_after_training, 18)
Example #9
    def get_sac_trainer(
        self,
        env,
        use_gpu,
        use_2_q_functions=False,
        logged_action_uniform_prior=True,
        constrain_action_sum=False,
        use_value_network=True,
    ):
        q_network_params = FeedForwardParameters(layers=[128, 64],
                                                 activations=["relu", "relu"])
        value_network_params = FeedForwardParameters(
            layers=[128, 64], activations=["relu", "relu"])
        actor_network_params = FeedForwardParameters(
            layers=[128, 64], activations=["relu", "relu"])

        state_dim = get_num_output_features(env.normalization)
        action_dim = get_num_output_features(
            env.normalization_continuous_action)
        q1_network = FullyConnectedParametricDQN(state_dim, action_dim,
                                                 q_network_params.layers,
                                                 q_network_params.activations)
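        # Optional second Q-network; with two critics the trainer can apply the
        # usual clipped double-Q (min of two critics) trick to reduce
        # value overestimation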
        q2_network = None
        if use_2_q_functions:
            q2_network = FullyConnectedParametricDQN(
                state_dim,
                action_dim,
                q_network_params.layers,
                q_network_params.activations,
            )
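        # DirichletFullyConnectedActor samples actions on the probability
        # simplex (components sum to 1); for unconstrained actions a Gaussian
        # policy is used instead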
        if constrain_action_sum:
            actor_network = DirichletFullyConnectedActor(
                state_dim,
                action_dim,
                actor_network_params.layers,
                actor_network_params.activations,
            )
        else:
            actor_network = GaussianFullyConnectedActor(
                state_dim,
                action_dim,
                actor_network_params.layers,
                actor_network_params.activations,
            )

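        # The separate state-value network is optional; passing
        # use_value_network=False skips it entirely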
        value_network = None
        if use_value_network:
            value_network = FullyConnectedNetwork(
                [state_dim] + value_network_params.layers + [1],
                value_network_params.activations + ["linear"],
            )

        if use_gpu:
            q1_network.cuda()
            if q2_network:
                q2_network.cuda()
            if value_network:
                value_network.cuda()
            actor_network.cuda()

        parameters = SACTrainerParameters(
            rl=RLParameters(gamma=DISCOUNT, target_update_rate=0.5),
            minibatch_size=self.minibatch_size,
            q_network_optimizer=OptimizerParameters(),
            value_network_optimizer=OptimizerParameters(),
            actor_network_optimizer=OptimizerParameters(),
            alpha_optimizer=OptimizerParameters(),
            logged_action_uniform_prior=logged_action_uniform_prior,
        )

        return SACTrainer(
            q1_network,
            actor_network,
            parameters,
            use_gpu=use_gpu,
            value_network=value_network,
            q2_network=q2_network,
        )