class RewardAuxiliaryObjective(AuxiliaryObjective):
    """Auxiliary objective that predicts per-action rewards from an embedding.

    A fully-connected head maps an embedding to one predicted reward per
    action; the loss compares the prediction for the taken action against
    the observed reward with a Smooth L1 (Huber) loss.
    """

    def __init__(self,
                 env_spec,
                 embedding_dim,
                 head_config,
                 output_nonlinearity=None,
                 output_w_init=torch.nn.init.xavier_normal_,
                 output_b_init=torch.nn.init.zeros_):
        """Build the reward-prediction head.

        Args:
            env_spec: Environment specification; only
                ``action_space.flat_dim`` is read here.
            embedding_dim (int): Dimensionality of the input embedding.
            head_config (dict): Must contain ``"dense_sizes"``, the hidden
                layer sizes for the MLP head.
            output_nonlinearity: Nonlinearity applied to the head's output.
            output_w_init: Weight initializer for the output layer.
            output_b_init: Bias initializer for the output layer.

        Raises:
            ValueError: If ``head_config`` lacks ``"dense_sizes"``.
        """
        self._env_spec = env_spec
        self._head_config = head_config
        self._output_nonlinearity = output_nonlinearity
        self._output_w_init = output_w_init
        self._output_b_init = output_b_init
        # Validate with a real exception: `assert` is stripped under -O.
        if "dense_sizes" not in head_config:
            raise ValueError('head_config must contain "dense_sizes"')
        action_dim = self._env_spec.action_space.flat_dim
        # fully-connected 1 x num_actions outputs
        self.net = MLPModule(
            input_dim=embedding_dim,
            output_dim=action_dim,
            hidden_sizes=self._head_config["dense_sizes"],
            output_nonlinearity=self._output_nonlinearity,
            output_w_init=self._output_w_init,
            output_b_init=self._output_b_init)
        self.net.to(device)  # NOTE(review): `device` is a module-level global
        logger.log(f"Reward net: {self.net}")

    def compute_loss(self, embedding, rewards, actions):
        """Smooth L1 loss between predicted and observed rewards.

        Args:
            embedding: Batch of embeddings fed to the head.
            rewards: Observed rewards, shape (batch,).
            actions: Action encodings of shape (batch, num_actions) used as
                a mask to pick each row's prediction — presumably one-hot;
                TODO confirm against callers.

        Returns:
            Scalar loss tensor.
        """
        preds = self.net(embedding)
        # Select the prediction for the taken action via the action mask.
        # Use the canonical `dim=` keyword (`axis=` is a NumPy-compat alias).
        selected_predicted_rewards = torch.sum(preds * actions, dim=1)
        loss_func = torch.nn.SmoothL1Loss()
        return loss_func(selected_predicted_rewards, rewards)
def __init__(self, env_spec, embedding_dim, head_config,
             output_nonlinearity=None,
             output_w_init=torch.nn.init.xavier_normal_,
             output_b_init=torch.nn.init.zeros_):
    """Construct the reward-prediction head for the auxiliary objective.

    Reads ``head_config["dense_sizes"]`` for the hidden layer sizes and
    builds an MLP producing one output per flat action dimension.
    """
    assert "dense_sizes" in head_config
    self._env_spec = env_spec
    self._head_config = head_config
    self._output_nonlinearity = output_nonlinearity
    self._output_w_init = output_w_init
    self._output_b_init = output_b_init
    n_actions = env_spec.action_space.flat_dim
    # fully-connected 1 x num_actions outputs
    self.net = MLPModule(input_dim=embedding_dim,
                         output_dim=n_actions,
                         hidden_sizes=head_config["dense_sizes"],
                         output_nonlinearity=output_nonlinearity,
                         output_w_init=output_w_init,
                         output_b_init=output_b_init)
    self.net.to(device)
    logger.log(f"Reward net: {self.net}")
def __init__(self,
             input_dim,
             output_dim,
             hidden_sizes=(32, 32),
             hidden_nonlinearity=torch.tanh,
             hidden_w_init=nn.init.xavier_uniform_,
             hidden_b_init=nn.init.zeros_,
             output_nonlinearity=None,
             output_w_init=nn.init.xavier_uniform_,
             output_b_init=nn.init.zeros_,
             learn_std=True,
             init_std=1.0,
             min_std=1e-6,
             max_std=None,
             std_parameterization='exp',
             layer_normalization=False,
             normal_distribution_cls=Normal):
    """Initialize the Gaussian MLP module.

    Every configuration argument is forwarded to the base class, which
    stores it; the mean network is then built from the attributes the
    base-class initializer set up.
    """
    base_kwargs = dict(
        input_dim=input_dim,
        output_dim=output_dim,
        hidden_sizes=hidden_sizes,
        hidden_nonlinearity=hidden_nonlinearity,
        hidden_w_init=hidden_w_init,
        hidden_b_init=hidden_b_init,
        output_nonlinearity=output_nonlinearity,
        output_w_init=output_w_init,
        output_b_init=output_b_init,
        learn_std=learn_std,
        init_std=init_std,
        min_std=min_std,
        max_std=max_std,
        std_parameterization=std_parameterization,
        layer_normalization=layer_normalization,
        normal_distribution_cls=normal_distribution_cls)
    super(GaussianMLPModule, self).__init__(**base_kwargs)
    # Mean head: maps inputs to the mean of the action distribution,
    # using the attributes stored by the base-class __init__.
    self._mean_module = MLPModule(
        input_dim=self._input_dim,
        output_dim=self._action_dim,
        hidden_sizes=self._hidden_sizes,
        hidden_nonlinearity=self._hidden_nonlinearity,
        hidden_w_init=self._hidden_w_init,
        hidden_b_init=self._hidden_b_init,
        output_nonlinearity=self._output_nonlinearity,
        output_w_init=self._output_w_init,
        output_b_init=self._output_b_init,
        layer_normalization=self._layer_normalization)
def __init__(self,
             env_spec,
             hidden_sizes=(32, 32),
             hidden_nonlinearity=torch.tanh,
             hidden_w_init=nn.init.xavier_uniform_,
             hidden_b_init=nn.init.zeros_,
             output_nonlinearity=None,
             output_w_init=nn.init.xavier_uniform_,
             output_b_init=nn.init.zeros_,
             layer_normalization=False):
    """Build an MLP mapping flat observations to flat actions.

    Args:
        env_spec: Environment specification; ``observation_space.flat_dim``
            and ``action_space.flat_dim`` size the network.
        hidden_sizes (tuple): Hidden layer sizes for the MLP.
        hidden_nonlinearity: Activation for hidden layers.
        hidden_w_init: Weight initializer for hidden layers.
        hidden_b_init: Bias initializer for hidden layers.
        output_nonlinearity: Activation for the output layer.
        output_w_init: Weight initializer for the output layer.
        output_b_init: Bias initializer for the output layer.
        layer_normalization (bool): Whether to apply layer normalization.
    """
    super().__init__()
    self._obs_dim = env_spec.observation_space.flat_dim
    self._action_dim = env_spec.action_space.flat_dim
    # The network consumes flattened observations directly.
    self._input_dim = self._obs_dim
    self._hidden_sizes = hidden_sizes
    # (removed: redundant no-op `self._action_dim = self._action_dim`)
    self._hidden_nonlinearity = hidden_nonlinearity
    self._hidden_w_init = hidden_w_init
    self._hidden_b_init = hidden_b_init
    self._output_nonlinearity = output_nonlinearity
    self._output_w_init = output_w_init
    self._output_b_init = output_b_init
    self._layer_normalization = layer_normalization
    self.module = MLPModule(
        input_dim=self._input_dim,
        output_dim=self._action_dim,
        hidden_sizes=self._hidden_sizes,
        hidden_nonlinearity=self._hidden_nonlinearity,
        hidden_w_init=self._hidden_w_init,
        hidden_b_init=self._hidden_b_init,
        output_nonlinearity=self._output_nonlinearity,
        output_w_init=self._output_w_init,
        output_b_init=self._output_b_init,
        layer_normalization=self._layer_normalization)
def __init__(self,
             input_dim,
             output_dim,
             single_agent_action_dim=None,  # used for centralized
             hidden_sizes=(32, 32),
             hidden_nonlinearity=torch.tanh,
             hidden_w_init=nn.init.xavier_uniform_,
             hidden_b_init=nn.init.zeros_,
             output_nonlinearity=None,
             output_w_init=nn.init.xavier_uniform_,
             output_b_init=nn.init.zeros_,
             learn_std=True,
             # duplicate_std_copies=None,
             share_std=False,
             init_std=1.0,
             min_std=1e-6,
             max_std=None,
             std_hidden_sizes=(32, 32),
             std_hidden_nonlinearity=torch.tanh,
             std_hidden_w_init=nn.init.xavier_uniform_,
             std_hidden_b_init=nn.init.zeros_,
             std_output_nonlinearity=None,
             std_output_w_init=nn.init.xavier_uniform_,
             std_parameterization='exp',
             layer_normalization=False):
    """Gaussian module with a learned (optionally shared) std parameter.

    Args:
        input_dim (int): Input dimensionality.
        output_dim (int): Action dimensionality; may be the concatenated
            multi-agent action dim for a centralized module.
        single_agent_action_dim (int or None): Per-agent action dim, used
            to size the std vector for the centralized case.
        share_std (bool): If True, a single scalar std is shared across
            all action dimensions.
        learn_std (bool): If True, the initial std becomes a trainable
            ``torch.nn.Parameter``; otherwise it stays a fixed tensor.
        init_std / min_std / max_std (float): Initial and clamp values for
            the std; stored in log-space.
        std_parameterization (str): 'exp' or 'softplus'.
        (Remaining arguments configure the mean/std MLPs and are stored
        verbatim for later network construction.)

    Raises:
        NotImplementedError: If ``std_parameterization`` is not one of
            'exp' or 'softplus'.
    """
    super().__init__()
    self._input_dim = input_dim
    self._hidden_sizes = hidden_sizes
    self._action_dim = output_dim  # can be multiagent action dim for centralized
    self._single_agent_action_dim = single_agent_action_dim
    self._learn_std = learn_std
    # self._duplicate_std_copies = duplicate_std_copies  # n agents, for centralized
    self._share_std = share_std
    self._std_hidden_sizes = std_hidden_sizes
    self._min_std = min_std
    self._max_std = max_std
    self._std_hidden_nonlinearity = std_hidden_nonlinearity
    self._std_hidden_w_init = std_hidden_w_init
    self._std_hidden_b_init = std_hidden_b_init
    self._std_output_nonlinearity = std_output_nonlinearity
    self._std_output_w_init = std_output_w_init
    self._std_parameterization = std_parameterization
    self._hidden_nonlinearity = hidden_nonlinearity
    self._hidden_w_init = hidden_w_init
    self._hidden_b_init = hidden_b_init
    self._output_nonlinearity = output_nonlinearity
    self._output_w_init = output_w_init
    self._output_b_init = output_b_init
    self._layer_normalization = layer_normalization
    if self._std_parameterization not in ('exp', 'softplus'):
        # Same exception type as before, now with a diagnostic message.
        raise NotImplementedError(
            "std_parameterization must be 'exp' or 'softplus', got "
            f"{self._std_parameterization!r}")
    # Length of the (log-space) initial std vector: a single shared scalar,
    # the per-agent dim (centralized case), or the full action dim.
    if share_std:
        std_len = 1
    elif single_agent_action_dim is not None:
        std_len = single_agent_action_dim
    else:
        std_len = self._action_dim
    init_std_param = torch.Tensor([init_std] * std_len).log()
    if self._learn_std:
        self._init_std = torch.nn.Parameter(init_std_param)
    else:
        self._init_std = init_std_param
    # Clamp bounds for the log-std, if configured.
    self._min_std_param = self._max_std_param = None
    if min_std is not None:
        self._min_std_param = torch.Tensor([min_std]).log()
    if max_std is not None:
        self._max_std_param = torch.Tensor([max_std]).log()
    self._mean_module = MLPModule(
        input_dim=self._input_dim,
        output_dim=self._action_dim,
        hidden_sizes=self._hidden_sizes,
        hidden_nonlinearity=self._hidden_nonlinearity,
        hidden_w_init=self._hidden_w_init,
        hidden_b_init=self._hidden_b_init,
        output_nonlinearity=self._output_nonlinearity,
        output_w_init=self._output_w_init,
        output_b_init=self._output_b_init,
        layer_normalization=self._layer_normalization)