Example No. 1
    def evaluate(self, decision_requests: DecisionSteps,
                 global_agent_ids: List[str]) -> Dict[str, Any]:
        """
        Evaluates policy for the agent experiences provided.
        :param global_agent_ids: The global ids of the agents requesting a decision,
            used to retrieve their memories.
        :param decision_requests: DecisionSteps object containing inputs.
        :return: Outputs from network as defined by self.inference_dict.
        """
        vec_vis_obs, masks = self._split_decision_step(decision_requests)
        vec_obs = [torch.as_tensor(vec_vis_obs.vector_observations)]
        vis_obs = [
            torch.as_tensor(vis_ob)
            for vis_ob in vec_vis_obs.visual_observations
        ]
        memories = torch.as_tensor(
            self.retrieve_memories(global_agent_ids)).unsqueeze(0)

        run_out = {}
        with torch.no_grad():
            action, log_probs, entropy, memories = self.sample_actions(
                vec_obs, vis_obs, masks=masks, memories=memories)
        run_out["action"] = ModelUtils.to_numpy(action)
        run_out["pre_action"] = ModelUtils.to_numpy(action)
        # TODO: make pre_action differ from action (they are currently identical)
        run_out["log_probs"] = ModelUtils.to_numpy(log_probs)
        run_out["entropy"] = ModelUtils.to_numpy(entropy)
        run_out["learning_rate"] = 0.0
        if self.use_recurrent:
            run_out["memory_out"] = ModelUtils.to_numpy(memories).squeeze(0)
        return run_out
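A quick shape sketch for the memory handling in the example above, assuming retrieve_memories returns a (num_agents, memory_size) numpy array (an assumption for illustration): unsqueeze(0) adds the sequence dimension the recurrent network expects, and squeeze(0) removes it again before the memories are written back out.

# Toy illustration of the unsqueeze/squeeze round-trip; the shapes are assumptions.
import numpy as np
import torch

stored = np.zeros((3, 128), dtype=np.float32)     # (num_agents, memory_size)
memories = torch.as_tensor(stored).unsqueeze(0)   # (1, num_agents, memory_size)
memory_out = memories.squeeze(0).numpy()          # back to (num_agents, memory_size)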
Example No. 2
    def evaluate(self, decision_requests: DecisionSteps,
                 global_agent_ids: List[str]) -> Dict[str, Any]:
        """
        Evaluates policy for the agent experiences provided.
        :param global_agent_ids: The global ids of the agents requesting a decision,
            used to retrieve their memories.
        :param decision_requests: DecisionSteps object containing inputs.
        :return: Outputs from network as defined by self.inference_dict.
        """
        obs = decision_requests.obs
        masks = self._extract_masks(decision_requests)
        tensor_obs = [torch.as_tensor(np_ob) for np_ob in obs]

        memories = torch.as_tensor(
            self.retrieve_memories(global_agent_ids)).unsqueeze(0)

        run_out = {}
        with torch.no_grad():
            action, log_probs, entropy, memories = self.sample_actions(
                tensor_obs, masks=masks, memories=memories)
        action_tuple = action.to_action_tuple()
        run_out["action"] = action_tuple
        # This is the clipped action which is not saved to the buffer
        # but is exclusively sent to the environment.
        env_action_tuple = action.to_action_tuple(clip=self._clip_action)
        run_out["env_action"] = env_action_tuple
        run_out["log_probs"] = log_probs.to_log_probs_tuple()
        run_out["entropy"] = ModelUtils.to_numpy(entropy)
        run_out["learning_rate"] = 0.0
        if self.use_recurrent:
            run_out["memory_out"] = ModelUtils.to_numpy(memories).squeeze(0)
        return run_out
Example No. 3
def _compare_two_policies(policy1: TorchPolicy, policy2: TorchPolicy) -> None:
    """
    Make sure two policies have the same output for the same input.
    """
    decision_step, _ = mb.create_steps_from_behavior_spec(
        policy1.behavior_spec, num_agents=1)
    vec_vis_obs, masks = policy1._split_decision_step(decision_step)
    vec_obs = [torch.as_tensor(vec_vis_obs.vector_observations)]
    vis_obs = [
        torch.as_tensor(vis_ob) for vis_ob in vec_vis_obs.visual_observations
    ]
    memories = torch.as_tensor(
        policy1.retrieve_memories(list(decision_step.agent_id))).unsqueeze(0)

    with torch.no_grad():
        _, log_probs1, _, _, _ = policy1.sample_actions(vec_obs,
                                                        vis_obs,
                                                        masks=masks,
                                                        memories=memories,
                                                        all_log_probs=True)
        _, log_probs2, _, _, _ = policy2.sample_actions(vec_obs,
                                                        vis_obs,
                                                        masks=masks,
                                                        memories=memories,
                                                        all_log_probs=True)

    np.testing.assert_array_equal(log_probs1, log_probs2)
Example No. 4
 def compute_gradient_magnitude(self, policy_batch: AgentBuffer,
                                expert_batch: AgentBuffer) -> torch.Tensor:
     """
      Gradient penalty from https://arxiv.org/pdf/1704.00028. Adds stability, especially
      for off-policy training. Computes gradients w.r.t. a randomly interpolated input.
     """
     policy_inputs = self.get_state_inputs(policy_batch)
     expert_inputs = self.get_state_inputs(expert_batch)
     interp_inputs = []
     for policy_input, expert_input in zip(policy_inputs, expert_inputs):
         obs_epsilon = torch.rand(policy_input.shape)
         interp_input = obs_epsilon * policy_input + (
             1 - obs_epsilon) * expert_input
         interp_input.requires_grad = True  # For gradient calculation
         interp_inputs.append(interp_input)
     if self._settings.use_actions:
         policy_action = self.get_action_input(policy_batch)
         expert_action = self.get_action_input(expert_batch)
         action_epsilon = torch.rand(policy_action.shape)
         policy_dones = torch.as_tensor(policy_batch[BufferKey.DONE],
                                        dtype=torch.float).unsqueeze(1)
         expert_dones = torch.as_tensor(expert_batch[BufferKey.DONE],
                                        dtype=torch.float).unsqueeze(1)
         dones_epsilon = torch.rand(policy_dones.shape)
         action_inputs = torch.cat(
             [
                 action_epsilon * policy_action +
                 (1 - action_epsilon) * expert_action,
                 dones_epsilon * policy_dones +
                 (1 - dones_epsilon) * expert_dones,
             ],
             dim=1,
         )
         action_inputs.requires_grad = True
         hidden, _ = self.encoder(interp_inputs, action_inputs)
         encoder_input = tuple(interp_inputs + [action_inputs])
     else:
         hidden, _ = self.encoder(interp_inputs)
         encoder_input = tuple(interp_inputs)
     if self._settings.use_vail:
         use_vail_noise = True
         z_mu = self._z_mu_layer(hidden)
         hidden = z_mu + torch.randn_like(
             z_mu) * self._z_sigma * use_vail_noise
     estimate = self._estimator(hidden).squeeze(1).sum()
     gradient = torch.autograd.grad(estimate,
                                    encoder_input,
                                    create_graph=True)[0]
     # Norm's gradient could be NaN at 0. Use our own safe_norm
     safe_norm = (torch.sum(gradient**2, dim=1) + self.EPSILON).sqrt()
     gradient_mag = torch.mean((safe_norm - 1)**2)
     return gradient_mag
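For reference, a minimal, self-contained sketch of the same WGAN-GP-style gradient penalty on toy tensors; the discriminator, shapes, and epsilon below are illustrative assumptions rather than the ML-Agents modules.

import torch

def toy_gradient_penalty(policy_obs, expert_obs, discriminator, epsilon=1e-7):
    # Random per-sample interpolation between policy and expert observations.
    alpha = torch.rand(policy_obs.shape[0], 1)
    interp = alpha * policy_obs + (1 - alpha) * expert_obs
    interp.requires_grad = True
    estimate = discriminator(interp).sum()
    gradient = torch.autograd.grad(estimate, interp, create_graph=True)[0]
    # Same "safe norm" trick as above: avoids a NaN gradient of the norm at 0.
    safe_norm = (torch.sum(gradient ** 2, dim=1) + epsilon).sqrt()
    return torch.mean((safe_norm - 1) ** 2)

disc = torch.nn.Sequential(torch.nn.Linear(4, 8), torch.nn.ReLU(), torch.nn.Linear(8, 1))
penalty = toy_gradient_penalty(torch.randn(16, 4), torch.randn(16, 4), disc)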
Example No. 5
 def get_state_encoding(self, mini_batch: AgentBuffer) -> torch.Tensor:
     """
     Creates the observation input.
     """
     n_vis = len(self._state_encoder.visual_processors)
     hidden, _ = self._state_encoder.forward(
         vec_inputs=[
             torch.as_tensor(mini_batch["vector_obs"], dtype=torch.float)
         ],
         vis_inputs=[
             torch.as_tensor(mini_batch["visual_obs%d" % i],
                             dtype=torch.float) for i in range(n_vis)
         ],
     )
     return hidden
 def compute_estimate(self,
                      mini_batch: AgentBuffer,
                      use_vail_noise: bool = False
                      ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
      """
      Given a mini_batch, computes the estimate (how much the discriminator believes
      the data was sampled from the demonstration data) and, when using VAIL, the code mean.
      :param mini_batch: The AgentBuffer of data
      :param use_vail_noise: Only used with VAIL: if True, samples the code; if
      False, returns the mean of the code.
     """
     vec_inputs, vis_inputs = self.get_state_inputs(mini_batch)
     if self._settings.use_actions:
         actions = self.get_action_input(mini_batch)
         dones = torch.as_tensor(mini_batch["done"],
                                 dtype=torch.float).unsqueeze(1)
         action_inputs = torch.cat([actions, dones], dim=1)
         hidden, _ = self.encoder(vec_inputs, vis_inputs, action_inputs)
     else:
         hidden, _ = self.encoder(vec_inputs, vis_inputs)
     z_mu: Optional[torch.Tensor] = None
     if self._settings.use_vail:
         z_mu = self._z_mu_layer(hidden)
         hidden = torch.normal(z_mu, self._z_sigma * use_vail_noise)
     estimate = self._estimator(hidden)
     return estimate, z_mu
 def get_action_input(self, mini_batch: AgentBuffer) -> torch.Tensor:
     """
      Creates the action Tensor. In the continuous case, this corresponds to the action. In
      the discrete case, it is the concatenation of one-hot action Tensors.
     """
     return self._action_flattener.forward(
         torch.as_tensor(mini_batch["actions"], dtype=torch.float))
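A toy illustration of the VAIL code-sampling step in compute_estimate above: multiplying the fixed sigma by the boolean use_vail_noise switches sampling on or off, since a zero standard deviation returns the mean exactly. The shapes below are made up.

import torch

z_mu = torch.zeros(4, 3)
z_sigma = torch.full((4, 3), 0.5)
with_noise = torch.normal(z_mu, z_sigma * True)      # sampled around z_mu
without_noise = torch.normal(z_mu, z_sigma * False)  # exactly z_mu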
Example No. 8
def _compare_two_policies(policy1: TorchPolicy, policy2: TorchPolicy) -> None:
    """
    Make sure two policies have the same output for the same input.
    """
    policy1.actor = policy1.actor.to(default_device())
    policy2.actor = policy2.actor.to(default_device())

    decision_step, _ = mb.create_steps_from_behavior_spec(
        policy1.behavior_spec, num_agents=1)
    np_obs = decision_step.obs
    masks = policy1._extract_masks(decision_step)
    memories = torch.as_tensor(
        policy1.retrieve_memories(list(decision_step.agent_id))).unsqueeze(0)
    tensor_obs = [ModelUtils.list_to_tensor(obs) for obs in np_obs]

    with torch.no_grad():
        _, log_probs1, _, _ = policy1.sample_actions(tensor_obs,
                                                     masks=masks,
                                                     memories=memories)
        _, log_probs2, _, _ = policy2.sample_actions(tensor_obs,
                                                     masks=masks,
                                                     memories=memories)
    np.testing.assert_array_equal(
        ModelUtils.to_numpy(log_probs1.all_discrete_tensor),
        ModelUtils.to_numpy(log_probs2.all_discrete_tensor),
    )
Example No. 9
 def list_to_tensor(ndarray_list: List[np.ndarray],
                    dtype: Optional[torch.dtype] = None) -> torch.Tensor:
     """
     Converts a list of numpy arrays into a tensor. MUCH faster than
     calling as_tensor on the list directly.
     """
     return torch.as_tensor(np.asanyarray(ndarray_list), dtype=dtype)
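A small usage sketch of the helper above: stacking the list into a single ndarray first means one bulk copy into the tensor, rather than torch.as_tensor iterating over a Python list element by element. The observation shape is arbitrary.

import numpy as np
import torch

obs_list = [np.random.rand(8).astype(np.float32) for _ in range(64)]
batch = torch.as_tensor(np.asanyarray(obs_list))   # one (64, 8) tensor, built in a single copy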
Example No. 10
 def update_normalization(self, vector_obs: np.ndarray) -> None:
     """
     If this policy normalizes vector observations, this will update the norm values in the graph.
     :param vector_obs: The vector observations to add to the running estimate of the distribution.
     """
     vector_obs = [torch.as_tensor(vector_obs)]
     if self.use_vec_obs and self.normalize:
         self.actor_critic.update_normalization(vector_obs)
Example No. 11
 def _extract_masks(self, decision_requests: DecisionSteps) -> Optional[torch.Tensor]:
     mask = None
     if self.behavior_spec.action_spec.discrete_size > 0:
         mask = torch.ones([len(decision_requests), np.sum(self.act_size)])
         if decision_requests.action_mask is not None:
             mask = torch.as_tensor(
                 1 - np.concatenate(decision_requests.action_mask, axis=1)
             )
     return mask
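A toy illustration of the mask flip above, under the assumption that the environment marks disallowed discrete actions with 1 while the model expects 1 for allowed ones; the branch sizes are made up.

import numpy as np
import torch

# One agent, two discrete branches of size 3 and 2; 1 = disallowed in the incoming mask.
action_mask = [np.array([[0, 1, 0]]), np.array([[1, 0]])]
mask = torch.as_tensor(1 - np.concatenate(action_mask, axis=1))   # tensor([[1, 0, 1, 0, 1]])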
Example No. 12
    def compute_gradient_magnitude(self, policy_batch: AgentBuffer,
                                   expert_batch: AgentBuffer) -> torch.Tensor:
        """
        Gradient penalty from https://arxiv.org/pdf/1704.00028. Adds stability, especially
        for off-policy training. Computes gradients w.r.t. a randomly interpolated input.
        """
        policy_obs = self.get_state_encoding(policy_batch)
        expert_obs = self.get_state_encoding(expert_batch)
        obs_epsilon = torch.rand(policy_obs.shape)
        encoder_input = obs_epsilon * policy_obs + (1 -
                                                    obs_epsilon) * expert_obs
        if self._settings.use_actions:
            policy_action = self.get_action_input(policy_batch)
            expert_action = self.get_action_input(expert_batch)
            action_epsilon = torch.rand(policy_action.shape)
            policy_dones = torch.as_tensor(policy_batch["done"],
                                           dtype=torch.float).unsqueeze(1)
            expert_dones = torch.as_tensor(expert_batch["done"],
                                           dtype=torch.float).unsqueeze(1)
            dones_epsilon = torch.rand(policy_dones.shape)
            encoder_input = torch.cat(
                [
                    encoder_input,
                    action_epsilon * policy_action +
                    (1 - action_epsilon) * expert_action,
                    dones_epsilon * policy_dones +
                    (1 - dones_epsilon) * expert_dones,
                ],
                dim=1,
            )
        hidden = self.encoder(encoder_input)
        if self._settings.use_vail:
            use_vail_noise = True
            z_mu = self._z_mu_layer(hidden)
            hidden = torch.normal(z_mu, self._z_sigma * use_vail_noise)
        estimate = self._estimator(hidden).squeeze(1).sum()

        gradient = torch.autograd.grad(estimate,
                                       encoder_input,
                                       create_graph=True)[0]
        # Norm's gradient could be NaN at 0. Use our own safe_norm
        safe_norm = (torch.sum(gradient**2, dim=1) + self.EPSILON).sqrt()
        gradient_mag = torch.mean((safe_norm - 1)**2)
        return gradient_mag
Example No. 13
 def list_to_tensor_list(
     ndarray_list: List[np.ndarray], dtype: Optional[torch.dtype] = torch.float32
 ) -> List[torch.Tensor]:
     """
     Converts a list of numpy arrays into a list of tensors. MUCH faster than
     calling as_tensor on the list directly.
     """
     return [
         torch.as_tensor(np.asanyarray(_arr), dtype=dtype) for _arr in ndarray_list
     ]
Example No. 14
 def forward(self, action: torch.Tensor) -> torch.Tensor:
     if self._specs.is_action_continuous():
         return action
     else:
         return torch.cat(
             ModelUtils.actions_to_onehot(
                 torch.as_tensor(action, dtype=torch.long),
                 self._specs.discrete_action_branches,
             ),
             dim=1,
         )
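A sketch of what the discrete branch above produces, using torch.nn.functional.one_hot as a stand-in for ModelUtils.actions_to_onehot (assumed to one-hot each branch separately before concatenation); the branch sizes are illustrative.

import torch
import torch.nn.functional as F

action = torch.tensor([[1, 0], [2, 1]])   # two agents, two discrete branches
branches = (3, 2)                         # branch sizes
flat = torch.cat(
    [F.one_hot(action[:, i], num_classes=n).float() for i, n in enumerate(branches)],
    dim=1,
)                                         # shape (2, 5): one-hot per branch, concatenated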
Example No. 15
 def _split_decision_step(
     self, decision_requests: DecisionSteps
 ) -> Tuple[SplitObservations, Optional[torch.Tensor]]:
     vec_vis_obs = SplitObservations.from_observations(
         decision_requests.obs)
     mask = None
     if not self.use_continuous_act:
         mask = torch.ones([len(decision_requests), np.sum(self.act_size)])
         if decision_requests.action_mask is not None:
             mask = torch.as_tensor(
                 1 - np.concatenate(decision_requests.action_mask, axis=1))
     return vec_vis_obs, mask
Example No. 16
    def __init__(
        self,
        observation_specs: List[ObservationSpec],
        network_settings: NetworkSettings,
        action_spec: ActionSpec,
    ):
        super().__init__()
        self.normalize = network_settings.normalize
        self.use_lstm = network_settings.memory is not None
        self.h_size = network_settings.hidden_units
        self.m_size = (network_settings.memory.memory_size
                       if network_settings.memory is not None else 0)
        self.action_spec = action_spec
        self.observation_encoder = ObservationEncoder(
            observation_specs,
            self.h_size,
            network_settings.vis_encode_type,
            self.normalize,
        )
        self.processors = self.observation_encoder.processors

        # Modules for multi-agent self-attention
        obs_only_ent_size = self.observation_encoder.total_enc_size
        q_ent_size = (obs_only_ent_size +
                      sum(self.action_spec.discrete_branches) +
                      self.action_spec.continuous_size)

        attention_embeding_size = self.h_size
        self.obs_encoder = EntityEmbedding(obs_only_ent_size, None,
                                           attention_embeding_size)
        self.obs_action_encoder = EntityEmbedding(q_ent_size, None,
                                                  attention_embeding_size)

        self.self_attn = ResidualSelfAttention(attention_embeding_size)

        self.linear_encoder = LinearEncoder(
            attention_embeding_size,
            network_settings.num_layers,
            self.h_size,
            kernel_gain=(0.125 / self.h_size)**0.5,
        )

        if self.use_lstm:
            self.lstm = LSTM(self.h_size, self.m_size)
        else:
            self.lstm = None  # type: ignore
        self._current_max_agents = torch.nn.Parameter(torch.as_tensor(1),
                                                      requires_grad=False)
Example No. 17
 def forward(self, action: AgentAction) -> torch.Tensor:
     """
      Returns a tensor corresponding to the flattened action
     :param action: An AgentAction object
     """
     action_list: List[torch.Tensor] = []
     if self._specs.continuous_size > 0:
         action_list.append(action.continuous_tensor)
     if self._specs.discrete_size > 0:
         flat_discrete = torch.cat(
             ModelUtils.actions_to_onehot(
                 torch.as_tensor(action.discrete_tensor, dtype=torch.long),
                 self._specs.discrete_branches,
             ),
             dim=1,
         )
         action_list.append(flat_discrete)
     return torch.cat(action_list, dim=1)
Example No. 18
    def forward(
        self,
        obs_only: List[List[torch.Tensor]],
        obs: List[List[torch.Tensor]],
        actions: List[AgentAction],
        memories: Optional[torch.Tensor] = None,
        sequence_length: int = 1,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Returns the self-attention encoding of the provided observations (and actions).
        If memory is enabled, returns the updated memories as well.
        :param obs_only: Observations to be processed that do not have corresponding actions.
            These are encoded with the obs_encoder.
        :param obs: Observations to be processed that do have corresponding actions.
            After concatenation with actions, these are processed with obs_action_encoder.
        :param actions: After concatenation with obs, these are processed with obs_action_encoder.
        :param memories: If using memory, a Tensor of initial memories.
        :param sequence_length: If using memory, the sequence length.
        """
        self_attn_masks = []
        self_attn_inputs = []
        concat_f_inp = []
        if obs:
            obs_attn_mask = self._get_masks_from_nans(obs)
            obs = self._copy_and_remove_nans_from_obs(obs, obs_attn_mask)
            for inputs, action in zip(obs, actions):
                encoded = self.observation_encoder(inputs)
                cat_encodes = [
                    encoded,
                    action.to_flat(self.action_spec.discrete_branches),
                ]
                concat_f_inp.append(torch.cat(cat_encodes, dim=1))
            f_inp = torch.stack(concat_f_inp, dim=1)
            self_attn_masks.append(obs_attn_mask)
            self_attn_inputs.append(self.obs_action_encoder(None, f_inp))

        concat_encoded_obs = []
        if obs_only:
            obs_only_attn_mask = self._get_masks_from_nans(obs_only)
            obs_only = self._copy_and_remove_nans_from_obs(
                obs_only, obs_only_attn_mask)
            for inputs in obs_only:
                encoded = self.observation_encoder(inputs)
                concat_encoded_obs.append(encoded)
            g_inp = torch.stack(concat_encoded_obs, dim=1)
            self_attn_masks.append(obs_only_attn_mask)
            self_attn_inputs.append(self.obs_encoder(None, g_inp))

        encoded_entity = torch.cat(self_attn_inputs, dim=1)
        encoded_state = self.self_attn(encoded_entity, self_attn_masks)

        flipped_masks = 1 - torch.cat(self_attn_masks, dim=1)
        num_agents = torch.sum(flipped_masks, dim=1, keepdim=True)
        if torch.max(num_agents).item() > self._current_max_agents:
            self._current_max_agents = torch.nn.Parameter(torch.as_tensor(
                torch.max(num_agents).item()),
                                                          requires_grad=False)

        # Rescale the agent count into roughly [-1, 1]: the current maximum maps to +1,
        # and counts far below a large maximum approach -1.
        num_agents = num_agents * 2.0 / self._current_max_agents - 1

        encoding = self.linear_encoder(encoded_state)
        if self.use_lstm:
            # Resize to (batch, sequence length, encoding size)
            encoding = encoding.reshape([-1, sequence_length, self.h_size])
            encoding, memories = self.lstm(encoding, memories)
            encoding = encoding.reshape([-1, self.m_size // 2])
        encoding = torch.cat([encoding, num_agents], dim=1)
        return encoding, memories
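A quick numeric check of the agent-count rescaling above, with an assumed current maximum of 4 agents: counts 1 through 4 map to -0.5, 0.0, 0.5 and 1.0, so only counts far below a large maximum approach -1.

import torch

num_agents = torch.tensor([[1.0], [2.0], [3.0], [4.0]])
print(num_agents * 2.0 / 4 - 1)   # tensor([[-0.5000], [ 0.0000], [ 0.5000], [ 1.0000]])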
Example No. 19
 def update_normalization(self, buffer: AgentBuffer) -> None:
     obs = ObsUtil.from_buffer(buffer, len(self.processors))
     for vec_input, enc in zip(obs, self.processors):
         if isinstance(enc, VectorInput):
             enc.update_normalization(torch.as_tensor(vec_input))
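For context, a minimal running-normalizer sketch of the kind of statistics a VectorInput-style update_normalization typically maintains; this is an assumption for illustration, not the ML-Agents implementation.

import torch

class RunningNorm:
    """Tracks a running mean and variance with Chan's parallel update."""

    def __init__(self, size: int):
        self.mean = torch.zeros(size)
        self.var = torch.ones(size)
        self.count = 1e-4   # avoids division by zero before the first update

    def update(self, batch: torch.Tensor) -> None:
        b_mean = batch.mean(dim=0)
        b_var = batch.var(dim=0, unbiased=False)
        b_count = batch.shape[0]
        delta = b_mean - self.mean
        total = self.count + b_count
        self.mean = self.mean + delta * b_count / total
        # Combine the two (population) variances.
        m_a = self.var * self.count
        m_b = b_var * b_count
        self.var = (m_a + m_b + delta ** 2 * self.count * b_count / total) / total
        self.count = total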
Example No. 20
    def __init__(self, policy: TorchPolicy, trainer_params: TrainerSettings):
        super().__init__(policy, trainer_params)
        hyperparameters: SACSettings = cast(SACSettings,
                                            trainer_params.hyperparameters)
        self.tau = hyperparameters.tau
        self.init_entcoef = hyperparameters.init_entcoef

        self.policy = policy
        policy_network_settings = policy.network_settings

        self.tau = hyperparameters.tau
        self.burn_in_ratio = 0.0

        # Non-exposed SAC parameters
        self.discrete_target_entropy_scale = 0.2  # Roughly equal to e-greedy 0.05
        self.continuous_target_entropy_scale = 1.0

        self.stream_names = list(self.reward_signals.keys())
        # Use to reduce "survivor bonus" when using Curiosity or GAIL.
        self.gammas = [
            _val.gamma for _val in trainer_params.reward_signals.values()
        ]
        self.use_dones_in_backup = {
            name: int(not self.reward_signals[name].ignore_done)
            for name in self.stream_names
        }
        self._action_spec = self.policy.behavior_spec.action_spec

        self.value_network = TorchSACOptimizer.PolicyValueNetwork(
            self.stream_names,
            self.policy.behavior_spec.sensor_specs,
            policy_network_settings,
            self._action_spec,
        )

        self.target_network = ValueNetwork(
            self.stream_names,
            self.policy.behavior_spec.sensor_specs,
            policy_network_settings,
        )
        ModelUtils.soft_update(self.policy.actor_critic.critic,
                               self.target_network, 1.0)

        # We create one entropy coefficient per action, whether discrete or continuous.
        _disc_log_ent_coef = torch.nn.Parameter(
            torch.log(
                torch.as_tensor([self.init_entcoef] *
                                len(self._action_spec.discrete_branches))),
            requires_grad=True,
        )
        _cont_log_ent_coef = torch.nn.Parameter(torch.log(
            torch.as_tensor([self.init_entcoef])),
                                                requires_grad=True)
        self._log_ent_coef = TorchSACOptimizer.LogEntCoef(
            discrete=_disc_log_ent_coef, continuous=_cont_log_ent_coef)
        _cont_target = (
            -1 * self.continuous_target_entropy_scale *
            np.prod(self._action_spec.continuous_size).astype(np.float32))
        _disc_target = [
            self.discrete_target_entropy_scale * np.log(i).astype(np.float32)
            for i in self._action_spec.discrete_branches
        ]
        self.target_entropy = TorchSACOptimizer.TargetEntropy(
            continuous=_cont_target, discrete=_disc_target)
        policy_params = list(
            self.policy.actor_critic.network_body.parameters()) + list(
                self.policy.actor_critic.action_model.parameters())
        value_params = list(self.value_network.parameters()) + list(
            self.policy.actor_critic.critic.parameters())

        logger.debug("value_vars")
        for param in value_params:
            logger.debug(param.shape)
        logger.debug("policy_vars")
        for param in policy_params:
            logger.debug(param.shape)

        self.decay_learning_rate = ModelUtils.DecayedValue(
            hyperparameters.learning_rate_schedule,
            hyperparameters.learning_rate,
            1e-10,
            self.trainer_settings.max_steps,
        )
        self.policy_optimizer = torch.optim.Adam(
            policy_params, lr=hyperparameters.learning_rate)
        self.value_optimizer = torch.optim.Adam(
            value_params, lr=hyperparameters.learning_rate)
        self.entropy_optimizer = torch.optim.Adam(
            self._log_ent_coef.parameters(), lr=hyperparameters.learning_rate)
        self._move_to_device(default_device())
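As a worked example of the target-entropy values set up above, assume a hypothetical action spec with 2 continuous dimensions and discrete branches of size (3, 2), together with the default scales from the constructor.

import numpy as np

continuous_target_entropy_scale = 1.0
discrete_target_entropy_scale = 0.2
cont_target = -1 * continuous_target_entropy_scale * np.prod(2).astype(np.float32)
disc_target = [discrete_target_entropy_scale * np.log(i).astype(np.float32) for i in (3, 2)]
print(cont_target, disc_target)   # -2.0 and approximately [0.2197, 0.1386]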