Example #1
    def compute_actions(self,
                        obs_batch,
                        state_batches=None,
                        prev_action_batch=None,
                        prev_reward_batch=None,
                        info_batch=None,
                        episodes=None,
                        **kwargs):
        obs_batch, action_mask = self._unpack_observation(obs_batch)
        assert len(state_batches) == self.n_agents, state_batches
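        # Stack the per-agent RNN states into shape [batch_size, n_agents, state_size].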
        state_batches = np.stack(state_batches, axis=1)

        # Compute actions
        with th.no_grad():
            q_values, hiddens = _mac(self.model, th.from_numpy(obs_batch),
                                     th.from_numpy(state_batches))
            avail = th.from_numpy(action_mask).float()
            masked_q_values = q_values.clone()
            masked_q_values[avail == 0.0] = -float("inf")
            # epsilon-greedy action selector
            random_numbers = th.rand_like(q_values[:, :, 0])
            pick_random = (random_numbers < self.cur_epsilon).long()
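            # Sample uniformly among the currently available (unmasked) actions.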
            random_actions = Categorical(avail).sample().long()
            actions = (pick_random * random_actions +
                       (1 - pick_random) * masked_q_values.max(dim=2)[1])
            actions = var_to_np(actions)
            hiddens = var_to_np(hiddens)

        return (TupleActions(list(actions.transpose([1, 0]))),
                hiddens.transpose([1, 0, 2]), {})
Example #2
 def compute_actions(self, obs, state, is_training=False):
     assert not state, "RNN not supported"
     with self.lock:
         ob = torch.from_numpy(np.array(obs)).float()
         logits, values = self._model(ob)
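         # Sample one action per observation from the categorical (softmax) distribution.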
         samples = F.softmax(logits, dim=1).multinomial(1).squeeze(0)
         return var_to_np(samples), [], {"vf_preds": var_to_np(values)}
Example #3
 def compute_action(self, ob, *args):
     """Should take in a SINGLE ob"""
     with self.lock:
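         # Legacy (pre-0.4) PyTorch API: wrap the observation in a Variable and add a batch dimension.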
         ob = Variable(torch.from_numpy(ob).float().unsqueeze(0))
         logits, values = self._model(ob)
         samples = self._model.probs(logits).multinomial().squeeze()
         values = values.squeeze(0)
         return var_to_np(samples), {"value": var_to_np(values)}
Example #4
 def compute(self, ob, *args):
     """Should take in a SINGLE ob"""
     with self.lock:
         ob = Variable(torch.from_numpy(ob).float().unsqueeze(0))
         logits, values = self._model(ob)
         samples = self._model.probs(logits).multinomial().squeeze()
         values = values.squeeze(0)
         return var_to_np(samples), {"vf_preds": var_to_np(values)}
Example #5
 def compute(self, ob, *args):
     """Should take in a SINGLE ob"""
     with self.lock:
         ob = torch.from_numpy(ob).float().unsqueeze(0)
         logits, values = self._model(ob)
         # TODO(alok): Support non-categorical distributions. Multinomial
         # is only for categorical.
         sampled_actions = F.softmax(logits, dim=1).multinomial(1).squeeze()
         values = values.squeeze()
         return var_to_np(sampled_actions), {"vf_preds": var_to_np(values)}
Example #6
 def _value(self, obs):
     with self.lock:
         obs = torch.from_numpy(obs).float().unsqueeze(0)
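         # Run the shared hidden layers, then the value branch, to get the state-value estimate.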
         res = self.model.hidden_layers(obs)
         res = self.model.value_branch(res)
         res = res.squeeze()
         return var_to_np(res)
Example #7
 def value(self, ob, *args):
     with self.lock:
         ob = Variable(torch.from_numpy(ob).float().unsqueeze(0))
         res = self._model.hidden_layers(ob)
         res = self._model.value_branch(res)
         res = res.squeeze(0)
         return var_to_np(res)
Example #8
 def compute_actions(
         self, obs_batch, state_batches=None, is_training=False):
     if state_batches:
         raise NotImplementedError("Torch RNN support")
     with self.lock:
         with torch.no_grad():
             ob = torch.from_numpy(np.array(obs_batch)).float()
             model_out = self._model(ob)
             logits = model_out[0]  # assume the first output is the logits
             actions = F.softmax(logits, dim=1).multinomial(1).squeeze(0)
             return var_to_np(actions), [], self.extra_action_out(model_out)
Example #9
 def compute_gradients(self, postprocessed_batch):
     with self.lock:
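         # Assemble the loss inputs from the postprocessed sample batch, in the order the loss expects.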
         loss_in = []
         for key in self._loss_inputs:
             loss_in.append(torch.from_numpy(postprocessed_batch[key]))
         loss_out = self._loss(*loss_in)
         self._optimizer.zero_grad()
         loss_out.backward()
         # Note that return values are just references;
         # calling zero_grad will modify the values
         grads = [var_to_np(p.grad.data) for p in self._model.parameters()]
         return grads, {}
Example #10
 def extra_action_out(self, model_out):
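     # Expose the value prediction (the model's second output) under the "vf_preds" key.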
     return {"vf_preds": var_to_np(model_out[1])}
Example #11
 def compute_logits(self, ob, *args):
     with self.lock:
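         # Forward pass through the hidden layers only, returning the raw policy logits.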
         ob = Variable(torch.from_numpy(ob).float().unsqueeze(0))
         res = self._model.hidden_layers(ob)
         return var_to_np(self._model.logits(res))