Example #1
    def __init__(self,
                 action_space,
                 dueling=False,
                 dueling_value_layer_hidden_size=None,
                 **kwargs):
        """Initializes the policy

        Args:
            action_space: The action space of the environments; only
                'Discrete' is supported for DQN policies
            dueling: Whether to use a dueling architecture (Parallel advantage
                and value layers). Dueling does not add an additional hidden
                layer to the model; rather, the last layer of the model will
                be treated as the hidden 'advantage layer', while an
                additional 'state-value' layer will be added here in parallel
                to that last layer of the model.
            dueling_value_layer_hidden_size: Size of the hidden value-layer FC
                module. When dueling=True the last layer of the model will be
                the 'advantage hidden layer', while this value defines the
                size of the parallel state-value layer. By default (None) it
                will be auto-set to the same size as the last layer in the
                model (i.e. the 'advantage hidden layer').
                Note that if the last layer of the model is an LSTM and not an
                FC layer, the value-hidden-layer here will still be an FC
                layer, while the 'advantage layer' will be that LSTM layer
                from the model, which may or may not be the intention. If that
                is not the intention, add an FC layer to the model after the
                LSTM layer, for example a CNN->LSTM->FC or CNN->FC->LSTM->FC
                model.
        """
        super().__init__(**kwargs)
        # Some sub-classes need multiple outputs per action (e.g.
        # distributional DQN)
        out_size = action_space.n * self._outputs_per_action()

        # The final output FC layer generating the action-Q values
        self.out_layer = linear(self.model.out_size, out_size)

        if dueling:
            # For dueling architecture, we inject the value estimation layer
            # before the last layer of the model, in parallel to it
            inner_layer_size = int(np.prod(self.model.get_layer_in_shape(-1)))
            if not dueling_value_layer_hidden_size:
                # Auto-set the dueling value-layer hidden size to be the same
                # as the parallel action/advantage layer hidden size
                dueling_value_layer_hidden_size = \
                    int(np.prod(self.model.get_layer_out_shape(-1)))
            logging.getLogger().info(
                "Dueling value hidden layer size: "
                f"{inner_layer_size}x{dueling_value_layer_hidden_size}")

            self.value_hidden_layer = linear(inner_layer_size,
                                             dueling_value_layer_hidden_size)
            self.value_layer = linear(dueling_value_layer_hidden_size,
                                      self._outputs_per_action())
        else:
            self.value_layer = None
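
The snippet above only builds the dueling layers; a minimal sketch of how the
two streams are typically combined into Q-values during the forward pass is
shown below. The combination step is an assumption based on the standard
dueling-DQN formulation, not code from this repository.

import torch

def combine_dueling_streams(advantage, value):
    # advantage: [batch, n_actions] produced by the advantage/out layer
    # value:     [batch, 1] produced by the parallel state-value layer
    # Subtracting the mean advantage keeps V and A identifiable:
    # Q = V + (A - mean(A))
    return value + advantage - advantage.mean(dim=-1, keepdim=True)
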
Example #2
    def __init__(self,
                 model_config,
                 observation_space,
                 action_space,
                 critic_separate_model=False):
        """Initialize an actor-critic policy

        Args:
            model_config: The configuration for creating the model
            observation_space: The observation space defining the model input
            action_space: The action space for outputting actions
            critic_separate_model: Whether to use a duplicate/separate model
                for the critic (Otherwise, the same model is used for both
                actor and critic)
        """
        super().__init__(model_config, observation_space)

        # Optionally use a separate identical model for the critic
        if not critic_separate_model:
            self.value_model = None
        else:
            self.value_model = self._create_model_from_config(
                model_config, observation_space)

        # Create the actor distribution and critic layers
        self.actor = get_dist_layer_class(action_space)(action_space,
                                                        self.model.out_size)
        self.critic = linear(self.model.out_size, 1)
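
For context, here is a hedged sketch of how the heads created above are
typically used on a forward pass; the wrapper function and the assumption that
the actor layer returns a torch distribution are illustrative, not taken from
this repository.

def actor_critic_forward(model, value_model, actor, critic, obs):
    features = model(obs)
    dist = actor(features)                        # action distribution head
    # Use the separate critic model when one was created, otherwise share
    # the actor's features
    critic_features = value_model(obs) if value_model is not None else features
    value = critic(critic_features).squeeze(-1)   # state-value estimate
    return dist, value
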
Example #3
    def __init__(self, action_space, input_size):
        super().__init__(action_space, input_size)
        assert isinstance(action_space, gym.spaces.Box)
        self.actions_shape = action_space.shape

        flat_size = int(np.prod(action_space.shape))

        self.mean_layer = linear(input_size, flat_size)
        self.logstd = torch.nn.Parameter(torch.zeros(flat_size))
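
A hedged sketch of how a diagonal-Gaussian head like this one is commonly
sampled; only mean_layer, logstd and actions_shape come from the snippet
above, the sampling wrapper itself is an assumption.

import torch

def sample_box_action(mean_layer, logstd, actions_shape, features):
    mean = mean_layer(features)                     # [batch, flat_size]
    dist = torch.distributions.Normal(mean, logstd.exp())
    action = dist.sample()                          # [batch, flat_size]
    return action.view(-1, *actions_shape)          # reshape to the Box shape
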
Example #4
    def __init__(self, *args, embedding_dim=64, num_sampling_quantiles=32,
                 injection_layer=-1, **kwargs):
        """Initializes an IQN policy

        Args:
            embedding_dim: The embedding size to use in the quantile layer
                (Defaults to 64 as in the paper)
            num_sampling_quantiles: How many quantile samples to use on each
                forward pass (This value is used for all forward passes,
                including actor action-selection and target+training values,
                i.e. N and N' from the paper. There is no option at the moment
                to define them separately).
                The default of 32 is used as it seems good enough, also
                according to the paper (Which uses 64 for N and N' and 32 for
                actor action-selection).
                If the quantile layer is injected before an LSTM layer, 32
                causes a big slowdown in training time; 8 is recommended in
                that case and should also be good enough according to the
                paper, though this wasn't verified
            injection_layer: Before which model layer to inject the quantile
                layer and convert the batch to a multi-sample one. The
                default (-1) means to inject it before the last layer. For
                example for a CNN->FC model it will behave like in the paper
                (After the CNN layer). For a CNN->LSTM->FC model it will
                inject it between the LSTM and FC layers (Alternatively,
                setting injection_layer=1 will always inject it immediately
                after the CNN layer)
        """
        super().__init__(*args, **kwargs)

        self.num_sampling_quantiles = num_sampling_quantiles
        self.embedding_dim = embedding_dim

        # For IQN we need to 'inject' ourselves in the middle of the model,
        # typically after layer0 (The CNN layer) / before the last layer
        inner_layer_shape = self.model.set_layer_preprocessor(
            injection_layer, self._apply_quantile_layer)
        quantile_layer_size = int(np.prod(inner_layer_shape))
        logging.getLogger().info(f"IQN Layer size: {quantile_layer_size}")

        # The quantile layer uses an embedding of random quantile samples and
        # merges it with the 'state representation' output of the model,
        # usually after the CNN or LSTM layer, depending on the model and the
        # 'injection_layer' option (See _apply_quantile_layer).
        self.quantile_layer = linear(embedding_dim, quantile_layer_size)

        # The embedding range used in _apply_quantile_layer; create it only
        # once (This should have requires_grad=False by default so it
        # shouldn't change during backprop)
        # register_buffer ensures it's moved to GPU/CPU etc together with the
        # whole policy/model
        self.register_buffer(
            "embedding_range",
            torch.arange(1, self.embedding_dim+1, dtype=torch.float32))
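
The preprocessor _apply_quantile_layer itself is not shown here; the following
is a hedged sketch of what such a hook typically does in IQN: sample
quantiles, embed them with a cosine basis, and merge them multiplicatively
with the state representation. The shapes and the relu on the quantile
features follow the paper, but this is an assumption, not the repository's
implementation.

import math
import torch
import torch.nn.functional as F

def apply_quantile_layer(features, quantile_layer, embedding_range,
                         num_quantiles):
    # features: [batch, feat] state representation at the injection point
    batch = features.shape[0]
    taus = torch.rand(batch, num_quantiles, 1, device=features.device)
    # Cosine embedding of the sampled quantiles: cos(pi * i * tau), i=1..dim
    embedding = torch.cos(math.pi * taus * embedding_range)
    quantile_features = F.relu(quantile_layer(embedding))  # [batch, N, feat]
    # Merge by broadcasting: [batch, 1, feat] * [batch, N, feat]
    merged = features.unsqueeze(1) * quantile_features
    # Fold the quantile dimension into the batch for the rest of the model
    return merged.view(batch * num_quantiles, -1), taus
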
Example #5
    def __init__(self, action_space, input_size):
        super().__init__(action_space, input_size)
        assert isinstance(action_space, gym.spaces.Discrete)
        self.logits_layer = linear(input_size, action_space.n)
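
Finally, a hedged sketch of how a discrete logits head like this one is
typically turned into an action distribution; the wrapper below is
illustrative, only logits_layer comes from the snippet above.

import torch

def sample_discrete_action(logits_layer, features):
    logits = logits_layer(features)                  # [batch, n_actions]
    dist = torch.distributions.Categorical(logits=logits)
    action = dist.sample()
    return action, dist.log_prob(action)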