Example #1
    def __init__(
        self,
        state_dim: int,
        action_dim: int,
        sizes: List[int],
        activations: List[str],
        use_batch_norm: bool = False,
        action_activation: str = "tanh",
        exploration_variance: Optional[float] = None,
    ):
        super().__init__()
        assert state_dim > 0, "state_dim must be > 0, got {}".format(state_dim)
        assert action_dim > 0, "action_dim must be > 0, got {}".format(
            action_dim)
        self.state_dim = state_dim
        self.action_dim = action_dim
        assert len(sizes) == len(
            activations
        ), "The numbers of sizes and activations must match; got {} vs {}".format(
            len(sizes), len(activations))
        self.action_activation = action_activation
        self.fc = FullyConnectedNetwork(
            [state_dim] + sizes + [action_dim],
            activations + [self.action_activation],
            use_batch_norm=use_batch_norm,
        )

        # Gaussian noise for exploration.
        self.exploration_variance = exploration_variance
        if exploration_variance is not None:
            assert exploration_variance > 0
            loc = torch.zeros(action_dim).float()
            scale = torch.ones(action_dim).float() * exploration_variance
            self.noise_dist = Normal(loc=loc, scale=scale)
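Example 1's constructor only builds the policy head and the noise distribution. A minimal standalone sketch of how such Gaussian exploration noise could be applied to a tanh action (the greedy/noisy split and the clamping are assumptions, not the repository's actual forward pass):

# Hypothetical sketch: apply exploration noise like the distribution built above.
import torch
from torch.distributions import Normal

action_dim = 2
noise_dist = Normal(loc=torch.zeros(action_dim), scale=torch.ones(action_dim) * 0.1)

greedy_action = torch.tanh(torch.randn(1, action_dim))  # stand-in for self.fc(state)
noisy_action = greedy_action + noise_dist.sample((greedy_action.shape[0],))
# Keep the perturbed action inside the tanh range [-1, 1].
noisy_action = torch.clamp(noisy_action, -1.0, 1.0)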
Example #2
    def test_forward_pass(self):
        torch.manual_seed(123)
        state_dim = 1
        action_dim = 2
        state = rlt.FeatureData(torch.tensor([[2.0]]))
        bcq_drop_threshold = 0.20

        q_network = FullyConnectedDQN(state_dim,
                                      action_dim,
                                      sizes=[2],
                                      activations=["relu"])
        init.constant_(q_network.fc.dnn[-2].bias, 3.0)
        imitator_network = FullyConnectedNetwork(
            layers=[state_dim, 2, action_dim], activations=["relu", "linear"])

        imitator_probs = torch.nn.functional.softmax(imitator_network(
            state.float_features),
                                                     dim=1)
        bcq_mask = imitator_probs < bcq_drop_threshold
        npt.assert_array_equal(bcq_mask.detach(), [[True, False]])

        model = BatchConstrainedDQN(
            state_dim=state_dim,
            q_network=q_network,
            imitator_network=imitator_network,
            bcq_drop_threshold=bcq_drop_threshold,
        )
        final_q_values = model(state)
        npt.assert_array_equal(final_q_values.detach(), [[-1e10, 3.0]])
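The test above exercises the batch-constrained (BCQ) masking: actions whose imitator probability falls below bcq_drop_threshold get their Q-value replaced by a large negative constant so they are never selected. An equivalent computation on plain tensors, independent of BatchConstrainedDQN:

import torch
import torch.nn.functional as F

q_values = torch.tensor([[1.0, 3.0]])
imitator_logits = torch.tensor([[-2.0, 2.0]])

imitator_probs = F.softmax(imitator_logits, dim=1)
bcq_mask = imitator_probs < 0.20          # True where the action is dropped
masked_q = q_values.masked_fill(bcq_mask, -1e10)
print(masked_q)  # tensor([[-1.0000e+10,  3.0000e+00]])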
Example #3
    def __init__(self, cnn_parameters, layers, activations) -> None:
        super().__init__()
        self.conv_dims = cnn_parameters.conv_dims
        self.conv_height_kernels = cnn_parameters.conv_height_kernels
        self.conv_width_kernels = cnn_parameters.conv_width_kernels
        self.conv_layers: nn.ModuleList = nn.ModuleList()
        self.pool_layers: nn.ModuleList = nn.ModuleList()

        for i, _ in enumerate(self.conv_dims[1:]):
            self.conv_layers.append(
                nn.Conv2d(
                    self.conv_dims[i],
                    self.conv_dims[i + 1],
                    kernel_size=(
                        self.conv_height_kernels[i],
                        self.conv_width_kernels[i],
                    ),
                ))
            nn.init.kaiming_normal_(self.conv_layers[i].weight)
            if cnn_parameters.pool_types[i] == "max":
                self.pool_layers.append(
                    nn.MaxPool2d(
                        kernel_size=cnn_parameters.pool_kernels_strides[i]))
            else:
                assert False, "Unknown pooling type".format(layers)

        input_size = (
            cnn_parameters.num_input_channels,
            cnn_parameters.input_height,
            cnn_parameters.input_width,
        )
        conv_out = self.conv_forward(torch.ones(1, *input_size))
        self.fc_input_dim = int(np.prod(conv_out.size()[1:]))
        layers[0] = self.fc_input_dim
        self.feed_forward = FullyConnectedNetwork(layers, activations)
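The dummy forward pass at the end of Example 3 exists only to measure the flattened size of the convolutional output so the first fully connected layer can be sized correctly. A self-contained sketch of that sizing trick with a toy conv stack (the toy dimensions are made up):

import numpy as np
import torch
import torch.nn as nn

# Toy conv stack; the real one is built from cnn_parameters.
conv = nn.Sequential(nn.Conv2d(3, 8, kernel_size=3), nn.MaxPool2d(2))

input_size = (3, 32, 32)                     # (channels, height, width)
conv_out = conv(torch.ones(1, *input_size))  # dummy batch of one
fc_input_dim = int(np.prod(conv_out.size()[1:]))
print(fc_input_dim)  # 8 * 15 * 15 = 1800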
Example #4
File: dqn.py  Project: zrion/ReAgent
 def __init__(
     self,
     state_dim,
     action_dim,
     sizes,
     activations,
     *,
     num_atoms: Optional[int] = None,
     use_batch_norm=False,
     dropout_ratio=0.0,
     normalized_output=False,
 ):
     super().__init__()
     assert state_dim > 0, "state_dim must be > 0, got {}".format(state_dim)
     assert action_dim > 0, "action_dim must be > 0, got {}".format(action_dim)
     self.state_dim = state_dim
     self.action_dim = action_dim
     assert len(sizes) == len(
         activations
     ), "The numbers of sizes and activations must match; got {} vs {}".format(
         len(sizes), len(activations)
     )
     self.num_atoms = num_atoms
     self.fc = FullyConnectedNetwork(
         [state_dim] + sizes + [action_dim * (num_atoms or 1)],
         activations + ["linear"],
         use_batch_norm=use_batch_norm,
         dropout_ratio=dropout_ratio,
         normalize_output=normalized_output,
     )
Example #5
    def __init__(
        self,
        state_dim,
        action_dim,
        num_atoms,
        qmin,
        qmax,
        sizes,
        activations,
        use_batch_norm=False,
        dropout_ratio=0.0,
        use_gpu=False,
    ):
        super().__init__()
        assert state_dim > 0, "state_dim must be > 0, got {}".format(state_dim)
        assert action_dim > 0, "action_dim must be > 0, got {}".format(
            action_dim)
        self.state_dim = state_dim
        self.action_dim = action_dim
        assert len(sizes) == len(
            activations
        ), "The numbers of sizes and activations must match; got {} vs {}".format(
            len(sizes), len(activations))
        self.fc = FullyConnectedNetwork(
            [state_dim] + sizes + [action_dim * num_atoms],
            activations + ["linear"],
            use_batch_norm=use_batch_norm,
            dropout_ratio=dropout_ratio,
        )

        self.num_atoms = num_atoms
        self.action_dim = action_dim
        self.support = torch.linspace(qmin, qmax, num_atoms)
        if use_gpu:
            self.support = self.support.cuda()
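Example 5 is the categorical (C51-style) head: action_dim * num_atoms logits plus a fixed support of num_atoms values between qmin and qmax. The forward pass is not shown, but Q-values are conventionally recovered by taking the expectation of the support under a per-action softmax; a sketch under that assumption:

import torch
import torch.nn.functional as F

batch, action_dim, num_atoms = 2, 3, 11
qmin, qmax = -10.0, 10.0

logits = torch.randn(batch, action_dim * num_atoms)   # stand-in for self.fc(state)
logits = logits.reshape(batch, action_dim, num_atoms)
probs = F.softmax(logits, dim=2)                       # distribution over atoms
support = torch.linspace(qmin, qmax, num_atoms)
q_values = (probs * support).sum(dim=2)                # expected return per action
print(q_values.shape)  # torch.Size([2, 3])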
Example #6
    def __init__(self, seq2reward_network: Seq2RewardNetwork,
                 params: Seq2RewardTrainerParameters):
        super().__init__()
        self.seq2reward_network = seq2reward_network
        self.params = params

        # Turning off Q value output during training:
        self.view_q_value = params.view_q_value
        # permutations used to do planning
        self.all_permut = gen_permutations(params.multi_steps,
                                           len(self.params.action_names))
        self.mse_loss = nn.MSELoss(reduction="mean")

        # Predict how many steps are remaining from the current step
        self.step_predict_network = FullyConnectedNetwork(
            [
                self.seq2reward_network.state_dim,
                self.params.step_predict_net_size,
                self.params.step_predict_net_size,
                self.params.multi_steps,
            ],
            ["relu", "relu", "linear"],
            use_layer_norm=False,
        )
        self.step_loss = nn.CrossEntropyLoss(reduction="mean")
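step_predict_network is a small classifier over params.multi_steps classes (how many steps remain from the current step), trained with CrossEntropyLoss. A minimal sketch of that loss call, with made-up dimensions and a plain nn.Sequential standing in for FullyConnectedNetwork:

import torch
import torch.nn as nn

multi_steps, state_dim, batch = 4, 6, 8
step_predict_network = nn.Sequential(
    nn.Linear(state_dim, 16), nn.ReLU(), nn.Linear(16, multi_steps))
step_loss = nn.CrossEntropyLoss(reduction="mean")

state = torch.randn(batch, state_dim)
remaining_steps = torch.randint(0, multi_steps, (batch,))  # class index per sample
loss = step_loss(step_predict_network(state), remaining_steps)
loss.backward()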
Example #7
 def __init__(
     self,
     state_dim,
     action_dim,
     sizes,
     activations,
     use_batch_norm=False,
     use_layer_norm=False,
     output_dim=1,
 ):
     super().__init__()
     assert state_dim > 0, "state_dim must be > 0, got {}".format(state_dim)
     assert action_dim > 0, "action_dim must be > 0, got {}".format(action_dim)
     self.state_dim = state_dim
     self.action_dim = action_dim
     assert len(sizes) == len(
         activations
     ), "The numbers of sizes and activations must match; got {} vs {}".format(
         len(sizes), len(activations)
     )
     self.fc = FullyConnectedNetwork(
         [state_dim + action_dim] + sizes + [output_dim],
         activations + ["linear"],
         use_batch_norm=use_batch_norm,
         use_layer_norm=use_layer_norm,
     )
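Example 7 is a parametric (state-action) Q-network: the input width is state_dim + action_dim, so state and action are concatenated before the MLP. A sketch of that call pattern on plain tensors, without ReAgent's FeatureData wrappers:

import torch
import torch.nn as nn

state_dim, action_dim, output_dim = 4, 2, 1
# Stand-in for FullyConnectedNetwork([state_dim + action_dim, 8, output_dim], ...).
fc = nn.Sequential(
    nn.Linear(state_dim + action_dim, 8), nn.ReLU(), nn.Linear(8, output_dim))

state = torch.randn(5, state_dim)
action = torch.randn(5, action_dim)
q_value = fc(torch.cat((state, action), dim=1))
print(q_value.shape)  # torch.Size([5, 1])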
Example #8
 def build_slate_ranking_network(self,
                                 state_dim,
                                 candidate_dim,
                                 _candidate_size=None,
                                 _slate_size=None) -> ModelBase:
     # pointwise MLP
     input_dim = state_dim + candidate_dim
     output_dim = 1
     layers = [input_dim, *self.hidden_layers, output_dim]
     activations = [
         *self.activations,
         # identity, but we'll add our own final layer
         "linear",
     ]
     mlp = FullyConnectedNetwork(
         layers=layers,
         activations=activations,
         use_batch_norm=self.use_batch_norm,
         min_std=self.min_std,
         dropout_ratio=self.dropout_ratio,
         use_layer_norm=self.use_layer_norm,
         normalize_output=self.normalize_output,
         orthogonal_init=self.orthogonal_init,
     )
     mlp = nn.Sequential(*[
         mlp,
         self.final_layer.get(),
     ])
     return MLPScorer(mlp=mlp, has_user_feat=self.has_user_feat)
Example #9
    def __init__(self,
                 state_dim,
                 action_dim,
                 sizes,
                 activations,
                 use_batch_norm=False):
        """
        AKA the multivariate beta distribution. Used in cases where actor's action
        must sum to 1.
        """
        super().__init__()
        assert state_dim > 0, "state_dim must be > 0, got {}".format(state_dim)
        assert action_dim > 0, "action_dim must be > 0, got {}".format(
            action_dim)
        self.state_dim = state_dim
        self.action_dim = action_dim
        assert len(sizes) == len(
            activations
        ), "The numbers of sizes and activations must match; got {} vs {}".format(
            len(sizes), len(activations))

        # The last layer gives the concentration of the distribution.
        self.fc = FullyConnectedNetwork(
            [state_dim] + sizes + [action_dim],
            activations + ["linear"],
            use_batch_norm=use_batch_norm,
        )
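Example 9's head emits the concentration parameters of a Dirichlet (multivariate beta) distribution, which guarantees the sampled action components are positive and sum to 1. A standalone sketch of that property (the softplus used here to keep the concentration positive is an assumption about the omitted forward pass):

import torch
import torch.nn.functional as F
from torch.distributions import Dirichlet

raw = torch.randn(1, 3)                 # stand-in for self.fc(state)
concentration = F.softplus(raw) + 1e-6  # Dirichlet needs strictly positive params
action = Dirichlet(concentration).rsample()
print(action, action.sum(dim=1))        # components in (0, 1), summing to 1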
Example #10
 def build_value_network(self,
                         state_normalization_data: NormalizationData,
                         output_dim: int = 1) -> torch.nn.Module:
     state_dim = get_num_output_features(
         state_normalization_data.dense_normalization_parameters)
     return FullyConnectedNetwork(
         [state_dim] + self.sizes + [output_dim],
         self.activations + ["linear"],
         use_layer_norm=self.use_layer_norm,
     )
Example #11
    def test_forward_pass(self):
        state_dim = 1
        action_dim = 2
        input = PreprocessedState.from_tensor(state=torch.tensor([[2.0]]))
        bcq_drop_threshold = 0.20

        q_network = FullyConnectedDQN(state_dim,
                                      action_dim,
                                      sizes=[2],
                                      activations=["relu"])
        # Set weights of q-network to make it deterministic
        q_net_layer_0_w = torch.tensor([[1.2], [0.9]])
        q_network.state_dict()["fc.layers.0.weight"].data.copy_(
            q_net_layer_0_w)
        q_net_layer_0_b = torch.tensor([0.0, 0.0])
        q_network.state_dict()["fc.layers.0.bias"].data.copy_(q_net_layer_0_b)
        q_net_layer_1_w = torch.tensor([[0.5, -0.5], [1.0, 1.0]])
        q_network.state_dict()["fc.layers.1.weight"].data.copy_(
            q_net_layer_1_w)
        q_net_layer_1_b = torch.tensor([0.0, 0.0])
        q_network.state_dict()["fc.layers.1.bias"].data.copy_(q_net_layer_1_b)

        imitator_network = FullyConnectedNetwork(
            layers=[state_dim, 2, action_dim], activations=["relu", "linear"])
        # Set weights of imitator network to make it deterministic
        im_net_layer_0_w = torch.tensor([[1.2], [0.9]])
        imitator_network.state_dict()["layers.0.weight"].data.copy_(
            im_net_layer_0_w)
        im_net_layer_0_b = torch.tensor([0.0, 0.0])
        imitator_network.state_dict()["layers.0.bias"].data.copy_(
            im_net_layer_0_b)
        im_net_layer_1_w = torch.tensor([[0.5, 1.5], [1.0, 2.0]])
        imitator_network.state_dict()["layers.1.weight"].data.copy_(
            im_net_layer_1_w)
        im_net_layer_1_b = torch.tensor([0.0, 0.0])
        imitator_network.state_dict()["layers.1.bias"].data.copy_(
            im_net_layer_1_b)

        imitator_probs = torch.nn.functional.softmax(imitator_network(
            input.state.float_features),
                                                     dim=1)
        bcq_mask = imitator_probs < bcq_drop_threshold
        assert bcq_mask[0][0] == 1
        assert bcq_mask[0][1] == 0

        model = BatchConstrainedDQN(
            state_dim=state_dim,
            q_network=q_network,
            imitator_network=imitator_network,
            bcq_drop_threshold=bcq_drop_threshold,
        )
        final_q_values = model(input)
        assert final_q_values.q_values[0][0] == -1e10
        assert abs(final_q_values.q_values[0][1] - 4.2) < 0.0001
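A quick check of the expected 4.2: with the weights set above and state 2.0, the hidden layer is relu([1.2 * 2.0, 0.9 * 2.0]) = [2.4, 1.8], so the second Q output is 1.0 * 2.4 + 1.0 * 1.8 = 4.2, while the imitator softmax puts about 0.11 < 0.2 probability on the first action, which is why only that action is masked to -1e10. The same arithmetic on plain tensors:

import torch
import torch.nn.functional as F

state = torch.tensor([[2.0]])
hidden = F.relu(state @ torch.tensor([[1.2], [0.9]]).t())       # [[2.4, 1.8]]
q = hidden @ torch.tensor([[0.5, -0.5], [1.0, 1.0]]).t()        # [[0.3, 4.2]]
imitator = hidden @ torch.tensor([[0.5, 1.5], [1.0, 2.0]]).t()  # [[3.9, 6.0]]
probs = F.softmax(imitator, dim=1)                               # ~[[0.11, 0.89]]
print(q, probs)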
Example #12
    def __init__(
        self,
        state_dim: int,
        action_dim: int,
        sizes: List[int],
        activations: List[str],
        scale: float = 0.05,
        use_batch_norm: bool = False,
        use_layer_norm: bool = False,
        use_l2_normalization: bool = False,
    ):
        """
        Args:
            use_l2_normalization: if True, divides action by l2 norm.
        """
        super().__init__()
        assert state_dim > 0, "state_dim must be > 0, got {}".format(state_dim)
        assert action_dim > 0, "action_dim must be > 0, got {}".format(
            action_dim)
        self.state_dim = state_dim
        self.action_dim = action_dim
        assert len(sizes) == len(
            activations
        ), "The numbers of sizes and activations must match; got {} vs {}".format(
            len(sizes), len(activations))
        # The last layer is mean & scale for reparameterization trick
        self.fc = FullyConnectedNetwork(
            [state_dim] + sizes + [action_dim * 2],
            activations + ["linear"],
            use_batch_norm=use_batch_norm,
            use_layer_norm=use_layer_norm,
        )
        self.use_layer_norm = use_layer_norm
        if self.use_layer_norm:
            self.loc_layer_norm = torch.nn.LayerNorm(action_dim)
            self.scale_layer_norm = torch.nn.LayerNorm(action_dim)

        self.use_l2_normalization = use_l2_normalization

        # used to calculate log-prob
        self.const = math.log(math.sqrt(2 * math.pi))
        self.eps = 1e-6
        self._log_min_max = (-20.0, 2.0)
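Example 12's last layer emits 2 * action_dim values that are split into a mean and a (log-)scale for the reparameterization trick; _log_min_max bounds the log-scale, and const/eps appear in the tanh-squashed log-probability. A sketch of that pipeline under those assumptions (the forward pass itself is not shown above):

import math
import torch

action_dim = 2
loc_scale = torch.randn(1, action_dim * 2)               # stand-in for self.fc(state)
loc = loc_scale[:, :action_dim]
log_scale = loc_scale[:, action_dim:].clamp(-20.0, 2.0)  # self._log_min_max
scale = log_scale.exp()

u = loc + scale * torch.randn_like(loc)                  # reparameterized sample
action = torch.tanh(u)                                   # squash into [-1, 1]

# Gaussian log-prob with the tanh change-of-variables correction.
const = math.log(math.sqrt(2 * math.pi))                 # self.const
eps = 1e-6                                               # self.eps
log_prob = (-((u - loc) ** 2) / (2 * scale ** 2) - log_scale - const).sum(dim=1)
log_prob = log_prob - torch.log(1 - action ** 2 + eps).sum(dim=1)
print(action.shape, log_prob.shape)  # torch.Size([1, 2]) torch.Size([1])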
Example #13
 def test_save_load(self):
     state_dim = 8
     action_dim = 4
     q_network = FullyConnectedDQN(state_dim,
                                   action_dim,
                                   sizes=[8, 4],
                                   activations=["relu", "relu"])
     imitator_network = FullyConnectedNetwork(
         layers=[state_dim, 8, 4, action_dim],
         activations=["relu", "relu", "linear"])
     model = BatchConstrainedDQN(
         state_dim=state_dim,
         q_network=q_network,
         imitator_network=imitator_network,
         bcq_drop_threshold=0.05,
     )
     # 6 for DQN + 6 for Imitator Network + 2 for BCQ constants
     expected_num_params, expected_num_inputs, expected_num_outputs = 14, 1, 1
     check_save_load(self, model, expected_num_params, expected_num_inputs,
                     expected_num_outputs)
Example #14
    def test_basic(self):
        state_dim = 8
        action_dim = 4
        q_network = FullyConnectedDQN(state_dim,
                                      action_dim,
                                      sizes=[8, 4],
                                      activations=["relu", "relu"])
        imitator_network = FullyConnectedNetwork(
            layers=[state_dim, 8, 4, action_dim],
            activations=["relu", "relu", "linear"])
        model = BatchConstrainedDQN(
            state_dim=state_dim,
            q_network=q_network,
            imitator_network=imitator_network,
            bcq_drop_threshold=0.05,
        )

        input = model.input_prototype()
        self.assertEqual((1, state_dim), input.state.float_features.shape)
        q_values = model(input)
        self.assertEqual((1, action_dim), q_values.q_values.shape)
Example #15
    def __init__(
        self, seq2reward_network: Seq2RewardNetwork, params: Seq2RewardTrainerParameters
    ):
        self.seq2reward_network = seq2reward_network
        self.params = params
        self.mse_optimizer = torch.optim.Adam(
            self.seq2reward_network.parameters(), lr=params.learning_rate
        )
        self.minibatch_size = self.params.batch_size
        self.loss_reporter = NoOpLossReporter()

        # PageHandler must use this to activate evaluator:
        self.calc_cpe_in_training = True
        # Turning off Q value output during training:
        self.view_q_value = params.view_q_value
        # permutations used to do planning
        self.all_permut = gen_permutations(
            params.multi_steps, len(self.params.action_names)
        )
        self.mse_loss = nn.MSELoss(reduction="mean")

        # Predict how many steps are remaining from the current step
        self.step_predict_network = FullyConnectedNetwork(
            [
                self.seq2reward_network.state_dim,
                self.params.step_predict_net_size,
                self.params.step_predict_net_size,
                self.params.multi_steps,
            ],
            ["relu", "relu", "linear"],
            use_layer_norm=False,
        )
        self.step_loss = nn.CrossEntropyLoss(reduction="mean")
        self.step_optimizer = torch.optim.Adam(
            self.step_predict_network.parameters(), lr=params.learning_rate
        )
Example #16
    def __init__(
        self,
        state_dim,
        action_dim,
        sizes,
        activations,
        scale=0.05,
        use_batch_norm=False,
        use_layer_norm=False,
    ):
        super().__init__()
        assert state_dim > 0, "state_dim must be > 0, got {}".format(state_dim)
        assert action_dim > 0, "action_dim must be > 0, got {}".format(action_dim)
        self.state_dim = state_dim
        self.action_dim = action_dim
        assert len(sizes) == len(
            activations
        ), "The numbers of sizes and activations must match; got {} vs {}".format(
            len(sizes), len(activations)
        )
        # The last layer is mean & scale for reparameterization trick
        self.fc = FullyConnectedNetwork(
            [state_dim] + sizes + [action_dim * 2],
            activations + ["linear"],
            use_batch_norm=use_batch_norm,
            use_layer_norm=use_layer_norm,
        )
        self.use_layer_norm = use_layer_norm
        if self.use_layer_norm:
            self.loc_layer_norm = nn.LayerNorm(action_dim)
            self.scale_layer_norm = nn.LayerNorm(action_dim)

        # used to calculate log-prob
        self.const = math.log(math.sqrt(2 * math.pi))
        self.eps = 1e-6
        self._log_min_max = (-20.0, 2.0)
Example #17
def get_sac_trainer(
    env: OpenAIGymEnvironment,
    rl_parameters: RLParameters,
    trainer_parameters: SACTrainerParameters,
    critic_training: FeedForwardParameters,
    actor_training: FeedForwardParameters,
    sac_value_training: Optional[FeedForwardParameters],
    use_gpu: bool,
) -> SACTrainer:
    assert rl_parameters == trainer_parameters.rl
    state_dim = get_num_output_features(env.normalization)
    action_dim = get_num_output_features(env.normalization_action)
    q1_network = FullyConnectedParametricDQN(state_dim, action_dim,
                                             critic_training.layers,
                                             critic_training.activations)
    q2_network = None
    # TODO:
    # if trainer_parameters.use_2_q_functions:
    #     q2_network = FullyConnectedParametricDQN(
    #         state_dim,
    #         action_dim,
    #         critic_training.layers,
    #         critic_training.activations,
    #     )
    value_network = None
    if sac_value_training:
        value_network = FullyConnectedNetwork(
            [state_dim] + sac_value_training.layers + [1],
            sac_value_training.activations + ["linear"],
        )
    actor_network = GaussianFullyConnectedActor(state_dim, action_dim,
                                                actor_training.layers,
                                                actor_training.activations)

    min_action_range_tensor_training = torch.full((1, action_dim), -1 + 1e-6)
    max_action_range_tensor_training = torch.full((1, action_dim), 1 - 1e-6)
    min_action_range_tensor_serving = (
        torch.from_numpy(env.action_space.low).float().unsqueeze(
            dim=0)  # type: ignore
    )
    max_action_range_tensor_serving = (
        torch.from_numpy(env.action_space.high).float().unsqueeze(
            dim=0)  # type: ignore
    )

    if use_gpu:
        q1_network.cuda()
        if q2_network:
            q2_network.cuda()
        if value_network:
            value_network.cuda()
        actor_network.cuda()

        min_action_range_tensor_training = min_action_range_tensor_training.cuda()
        max_action_range_tensor_training = max_action_range_tensor_training.cuda()
        min_action_range_tensor_serving = min_action_range_tensor_serving.cuda()
        max_action_range_tensor_serving = max_action_range_tensor_serving.cuda()

    return SACTrainer(
        q1_network,
        actor_network,
        trainer_parameters,
        use_gpu=use_gpu,
        value_network=value_network,
        q2_network=q2_network,
        min_action_range_tensor_training=min_action_range_tensor_training,
        max_action_range_tensor_training=max_action_range_tensor_training,
        min_action_range_tensor_serving=min_action_range_tensor_serving,
        max_action_range_tensor_serving=max_action_range_tensor_serving,
    )
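Example 17 carries two action ranges: a training range just inside (-1, 1) for the tanh policy and a serving range taken from the Gym action space. The trainer's exact use of these tensors is not shown here, but paired ranges like this conventionally imply a linear rescaling between the two; a sketch under that assumption, with a made-up serving range:

import torch

action_dim = 2
min_train = torch.full((1, action_dim), -1 + 1e-6)
max_train = torch.full((1, action_dim), 1 - 1e-6)
min_serve = torch.tensor([[0.0, -5.0]])   # hypothetical env.action_space.low
max_serve = torch.tensor([[10.0, 5.0]])   # hypothetical env.action_space.high

raw_action = torch.tanh(torch.randn(1, action_dim))          # policy output in (-1, 1)
scaled = (raw_action - min_train) / (max_train - min_train)  # map to [0, 1]
serving_action = scaled * (max_serve - min_serve) + min_serve
print(serving_action)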
Example #18
    def get_sac_trainer(
        self,
        env,
        use_gpu,
        use_2_q_functions=False,
        logged_action_uniform_prior=True,
        constrain_action_sum=False,
        use_value_network=True,
        use_alpha_optimizer=True,
        entropy_temperature=None,
    ):
        q_network_params = FeedForwardParameters(layers=[128, 64],
                                                 activations=["relu", "relu"])
        value_network_params = FeedForwardParameters(
            layers=[128, 64], activations=["relu", "relu"])
        actor_network_params = FeedForwardParameters(
            layers=[128, 64], activations=["relu", "relu"])

        state_dim = get_num_output_features(env.normalization)
        action_dim = get_num_output_features(
            env.normalization_continuous_action)
        q1_network = FullyConnectedParametricDQN(state_dim, action_dim,
                                                 q_network_params.layers,
                                                 q_network_params.activations)
        q2_network = None
        if use_2_q_functions:
            q2_network = FullyConnectedParametricDQN(
                state_dim,
                action_dim,
                q_network_params.layers,
                q_network_params.activations,
            )
        if constrain_action_sum:
            actor_network = DirichletFullyConnectedActor(
                state_dim,
                action_dim,
                actor_network_params.layers,
                actor_network_params.activations,
            )
        else:
            actor_network = GaussianFullyConnectedActor(
                state_dim,
                action_dim,
                actor_network_params.layers,
                actor_network_params.activations,
            )

        value_network = None
        if use_value_network:
            value_network = FullyConnectedNetwork(
                [state_dim] + value_network_params.layers + [1],
                value_network_params.activations + ["linear"],
            )

        if use_gpu:
            q1_network.cuda()
            if q2_network:
                q2_network.cuda()
            if value_network:
                value_network.cuda()
            actor_network.cuda()

        parameters = SACTrainerParameters(
            rl=RLParameters(gamma=DISCOUNT, target_update_rate=0.5),
            minibatch_size=self.minibatch_size,
            q_network_optimizer=OptimizerParameters(),
            value_network_optimizer=OptimizerParameters(),
            actor_network_optimizer=OptimizerParameters(),
            alpha_optimizer=OptimizerParameters()
            if use_alpha_optimizer else None,
            entropy_temperature=entropy_temperature,
            logged_action_uniform_prior=logged_action_uniform_prior,
        )

        return SACTrainer(
            q1_network,
            actor_network,
            parameters,
            use_gpu=use_gpu,
            value_network=value_network,
            q2_network=q2_network,
        )