Example #1
class RewardAuxiliaryObjective(AuxiliaryObjective):
    def __init__(self,
                 env_spec,
                 embedding_dim,
                 head_config,
                 output_nonlinearity=None,
                 output_w_init=torch.nn.init.xavier_normal_,
                 output_b_init=torch.nn.init.zeros_):

        self._env_spec = env_spec
        self._head_config = head_config
        self._output_nonlinearity = output_nonlinearity
        self._output_w_init = output_w_init
        self._output_b_init = output_b_init
        assert "dense_sizes" in head_config

        action_dim = self._env_spec.action_space.flat_dim

        self.net = MLPModule(input_dim=embedding_dim,
                             output_dim=action_dim,
                             hidden_sizes=self._head_config["dense_sizes"],
                             output_nonlinearity=self._output_nonlinearity,
                             output_w_init=self._output_w_init,
                             output_b_init=self._output_b_init
                             )  # fully-connected head: one predicted reward per action
        self.net.to(device)

        logger.log(f"Reward net: {self.net}")

    def compute_loss(self, embedding, rewards, actions):
        # Predict a reward for every action, then pick out the prediction for
        # the action that was actually taken (actions is expected to be a
        # one-hot / mask tensor over the action space).
        preds = self.net(embedding)
        selected_predicted_rewards = torch.sum(preds * actions, dim=1)
        loss_func = torch.nn.SmoothL1Loss()
        return loss_func(selected_predicted_rewards, rewards)
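
import torch
import torch.nn.functional as F

# Minimal, self-contained sketch (batch size, action count, and the one-hot
# action encoding are illustrative assumptions, not values from the source) of
# what compute_loss does: the elementwise multiply-and-sum picks out the
# predicted reward for the action actually taken, which is then regressed
# against the observed reward with a Huber (SmoothL1) loss.
preds = torch.tensor([[0.1, 0.5, -0.2],
                      [0.3, 0.0,  0.9],
                      [1.2, 0.4,  0.1],
                      [0.0, 0.2,  0.8]])                 # net(embedding): one reward per action
actions = F.one_hot(torch.tensor([1, 2, 0, 2]), num_classes=3).float()
rewards = torch.tensor([0.4, 1.0, 1.0, 0.5])             # rewards observed in the environment

selected = torch.sum(preds * actions, dim=1)             # -> tensor([0.5, 0.9, 1.2, 0.8])
loss = torch.nn.SmoothL1Loss()(selected, rewards)
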
Example #2
    def __init__(self,
                 env_spec,
                 embedding_dim,
                 head_config,
                 output_nonlinearity=None,
                 output_w_init=torch.nn.init.xavier_normal_,
                 output_b_init=torch.nn.init.zeros_):

        self._env_spec = env_spec
        self._head_config = head_config
        self._output_nonlinearity = output_nonlinearity
        self._output_w_init = output_w_init
        self._output_b_init = output_b_init
        assert "dense_sizes" in head_config

        action_dim = self._env_spec.action_space.flat_dim

        self.net = MLPModule(input_dim=embedding_dim,
                             output_dim=action_dim,
                             hidden_sizes=self._head_config["dense_sizes"],
                             output_nonlinearity=self._output_nonlinearity,
                             output_w_init=self._output_w_init,
                             output_b_init=self._output_b_init
                             )  # fully-connected head: one predicted reward per action
        self.net.to(device)

        logger.log(f"Reward net: {self.net}")
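
# Sketch of the head_config contract the assertion above enforces: only
# "dense_sizes" is required, and it becomes MLPModule's hidden_sizes. The
# concrete widths below are illustrative assumptions, not values from the source.
head_config = {"dense_sizes": (128, 64)}   # hidden layer widths of the reward head
assert "dense_sizes" in head_config        # same check the constructor performs
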
Example #3
    def __init__(self,
                 input_dim,
                 output_dim,
                 hidden_sizes=(32, 32),
                 hidden_nonlinearity=torch.tanh,
                 hidden_w_init=nn.init.xavier_uniform_,
                 hidden_b_init=nn.init.zeros_,
                 output_nonlinearity=None,
                 output_w_init=nn.init.xavier_uniform_,
                 output_b_init=nn.init.zeros_,
                 learn_std=True,
                 init_std=1.0,
                 min_std=1e-6,
                 max_std=None,
                 std_parameterization='exp',
                 layer_normalization=False,
                 normal_distribution_cls=Normal):
        super().__init__(input_dim=input_dim,
                         output_dim=output_dim,
                         hidden_sizes=hidden_sizes,
                         hidden_nonlinearity=hidden_nonlinearity,
                         hidden_w_init=hidden_w_init,
                         hidden_b_init=hidden_b_init,
                         output_nonlinearity=output_nonlinearity,
                         output_w_init=output_w_init,
                         output_b_init=output_b_init,
                         learn_std=learn_std,
                         init_std=init_std,
                         min_std=min_std,
                         max_std=max_std,
                         std_parameterization=std_parameterization,
                         layer_normalization=layer_normalization,
                         normal_distribution_cls=normal_distribution_cls)

        self._mean_module = MLPModule(
            input_dim=self._input_dim,
            output_dim=self._action_dim,
            hidden_sizes=self._hidden_sizes,
            hidden_nonlinearity=self._hidden_nonlinearity,
            hidden_w_init=self._hidden_w_init,
            hidden_b_init=self._hidden_b_init,
            output_nonlinearity=self._output_nonlinearity,
            output_w_init=self._output_w_init,
            output_b_init=self._output_b_init,
            layer_normalization=self._layer_normalization)
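
import torch
from torch.distributions import Normal

# Illustrative sketch (the dims and the forward logic below are assumptions;
# the module's actual forward pass is defined elsewhere) of how a Gaussian MLP
# head typically combines a mean network with an 'exp'-parameterized std.
batch, input_dim, action_dim = 8, 16, 4
mean_module = torch.nn.Linear(input_dim, action_dim)    # stands in for self._mean_module
mean = mean_module(torch.randn(batch, input_dim))        # (batch, action_dim) means
log_std = torch.nn.Parameter(torch.zeros(action_dim))    # init_std=1.0 -> log(1.0) = 0
dist = Normal(mean, log_std.exp())                        # one independent Gaussian per action dim
sample = dist.rsample()                                   # reparameterized action sample
log_prob = dist.log_prob(sample).sum(dim=-1)              # joint log-likelihood over action dims
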
Example #4
    def __init__(self,
                 env_spec,
                 hidden_sizes=(32, 32),
                 hidden_nonlinearity=torch.tanh,
                 hidden_w_init=nn.init.xavier_uniform_,
                 hidden_b_init=nn.init.zeros_,
                 output_nonlinearity=None,
                 output_w_init=nn.init.xavier_uniform_,
                 output_b_init=nn.init.zeros_,
                 layer_normalization=False):
        super().__init__()

        self._obs_dim = env_spec.observation_space.flat_dim
        self._action_dim = env_spec.action_space.flat_dim

        self._input_dim = self._obs_dim
        self._hidden_sizes = hidden_sizes
        self._hidden_nonlinearity = hidden_nonlinearity
        self._hidden_w_init = hidden_w_init
        self._hidden_b_init = hidden_b_init
        self._output_nonlinearity = output_nonlinearity
        self._output_w_init = output_w_init
        self._output_b_init = output_b_init
        self._layer_normalization = layer_normalization

        self.module = MLPModule(input_dim=self._input_dim,
                                output_dim=self._action_dim,
                                hidden_sizes=self._hidden_sizes,
                                hidden_nonlinearity=self._hidden_nonlinearity,
                                hidden_w_init=self._hidden_w_init,
                                hidden_b_init=self._hidden_b_init,
                                output_nonlinearity=self._output_nonlinearity,
                                output_w_init=self._output_w_init,
                                output_b_init=self._output_b_init,
                                layer_normalization=self._layer_normalization)
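
import torch
from torch import nn

# Self-contained sketch of what the wrapper above assembles: an MLP mapping
# flattened observations to flattened actions. The dims are assumptions; the
# (32, 32) hidden sizes and tanh mirror the defaults, and output_nonlinearity=None
# leaves the final layer linear.
obs_dim, action_dim = 10, 3                  # env_spec.observation_space/.action_space flat_dim
net = nn.Sequential(
    nn.Linear(obs_dim, 32), nn.Tanh(),
    nn.Linear(32, 32), nn.Tanh(),
    nn.Linear(32, action_dim),               # no output nonlinearity
)
out = net(torch.randn(5, obs_dim))           # batch of 5 observations -> (5, action_dim)
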
Example #5
    def __init__(self,
                 input_dim,
                 output_dim,
                 single_agent_action_dim=None,  # single agent's action dim, used by the centralized (multi-agent) case
                 hidden_sizes=(32, 32),
                 hidden_nonlinearity=torch.tanh,
                 hidden_w_init=nn.init.xavier_uniform_,
                 hidden_b_init=nn.init.zeros_,
                 output_nonlinearity=None,
                 output_w_init=nn.init.xavier_uniform_,
                 output_b_init=nn.init.zeros_,
                 learn_std=True,
                 # duplicate_std_copies=None,
                 share_std=False,
                 init_std=1.0,
                 min_std=1e-6,
                 max_std=None,
                 std_hidden_sizes=(32, 32),
                 std_hidden_nonlinearity=torch.tanh,
                 std_hidden_w_init=nn.init.xavier_uniform_,
                 std_hidden_b_init=nn.init.zeros_,
                 std_output_nonlinearity=None,
                 std_output_w_init=nn.init.xavier_uniform_,
                 std_parameterization='exp',
                 layer_normalization=False):
        super().__init__()

        self._input_dim = input_dim
        self._hidden_sizes = hidden_sizes
        self._action_dim = output_dim  # can be the joint multi-agent action dim in the centralized case
        self._single_agent_action_dim = single_agent_action_dim
        self._learn_std = learn_std
        # self._duplicate_std_copies = duplicate_std_copies # n agents, for centralized
        self._share_std = share_std
        self._std_hidden_sizes = std_hidden_sizes
        self._min_std = min_std
        self._max_std = max_std
        self._std_hidden_nonlinearity = std_hidden_nonlinearity
        self._std_hidden_w_init = std_hidden_w_init
        self._std_hidden_b_init = std_hidden_b_init
        self._std_output_nonlinearity = std_output_nonlinearity
        self._std_output_w_init = std_output_w_init
        self._std_parameterization = std_parameterization
        self._hidden_nonlinearity = hidden_nonlinearity
        self._hidden_w_init = hidden_w_init
        self._hidden_b_init = hidden_b_init
        self._output_nonlinearity = output_nonlinearity
        self._output_w_init = output_w_init
        self._output_b_init = output_b_init
        self._layer_normalization = layer_normalization

        if self._std_parameterization not in ('exp', 'softplus'):
            raise NotImplementedError(
                f"Unsupported std_parameterization "
                f"{self._std_parameterization!r}; expected 'exp' or 'softplus'.")

        if share_std:
            init_std_param = torch.Tensor([init_std]).log()
        else:
            if single_agent_action_dim is not None:
                init_std_param = torch.Tensor([init_std] * single_agent_action_dim).log()
            else:
                init_std_param = torch.Tensor([init_std] * self._action_dim).log()

        if self._learn_std:
            self._init_std = torch.nn.Parameter(init_std_param)
        else:
            self._init_std = init_std_param

        self._min_std_param = self._max_std_param = None
        if min_std is not None:
            self._min_std_param = torch.Tensor([min_std]).log()
        if max_std is not None:
            self._max_std_param = torch.Tensor([max_std]).log()

        self._mean_module = MLPModule(
            input_dim=self._input_dim,
            output_dim=self._action_dim,
            hidden_sizes=self._hidden_sizes,
            hidden_nonlinearity=self._hidden_nonlinearity,
            hidden_w_init=self._hidden_w_init,
            hidden_b_init=self._hidden_b_init,
            output_nonlinearity=self._output_nonlinearity,
            output_w_init=self._output_w_init,
            output_b_init=self._output_b_init,
            layer_normalization=self._layer_normalization)
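
import torch

# Sketch (the module's forward pass lives elsewhere; the tensors below mirror
# the attributes set above) of how the log-space std machinery is typically
# used: clamp the learned parameter to [log(min_std), log(max_std)], then
# exponentiate for std_parameterization='exp'. A 'softplus' parameterization
# would instead map the parameter through log(1 + exp(.)).
init_std, min_std, max_std = 1.0, 1e-6, 2.0
std_param = torch.nn.Parameter(torch.Tensor([init_std] * 4).log())  # per-dim log-std (share_std=False)
min_param = torch.Tensor([min_std]).log()
max_param = torch.Tensor([max_std]).log()

std = std_param.clamp(min=min_param.item(), max=max_param.item()).exp()  # (4,), one std per action dim
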