def test_categorical():
    probs = torch.as_tensor([0.2, 0.6, 0.2])
    dist = torch.distributions.Categorical(probs=probs)
    mode = mode_of_distribution(dist)
    assert mode.tolist() == 1

    probs = torch.as_tensor([0.6, 0.2, 0.2])
    dist = torch.distributions.Categorical(probs=probs)
    mode = mode_of_distribution(dist)
    assert mode.tolist() == 0

    probs = torch.as_tensor([[0.6, 0.2, 0.2], [0.2, 0.2, 0.6]])
    dist = torch.distributions.Categorical(probs=probs)
    mode = mode_of_distribution(dist)
    assert mode.tolist() == [0, 2]

def test_independent_normal():
    loc = torch.as_tensor([[0.3, 0.7], [0.2, 0.4]])
    scale = torch.as_tensor([[0.1, 0.2], [0.3, 0.8]])
    dist = torch.distributions.Independent(
        torch.distributions.Normal(loc, scale), 1)
    mode = mode_of_distribution(dist)
    torch_assert_allclose(mode, loc)

def _batch_act_train(self, batch_obs):
    assert self.training
    statevar = self.batch_states(batch_obs, self.device, self.phi)

    if self.t == 0:
        with torch.no_grad():
            pout, _ = self.model(statevar)
            action = pout.sample()
        self._flush_storage(statevar.shape, action)

    self.states[self.t - self.t_start] = statevar

    # NOTE: update after accumulating update_steps * 2 transitions
    if self.t - self.t_start == self.update_steps * 2:
        self.update()

    # NOTE: inference during training
    with torch.no_grad():
        pout, value = self.model(statevar)
        # epsilon-greedy action selection
        epsilon = self.compute_epsilon(self.t)
        if np.random.rand() < epsilon:
            # random: sample from the policy distribution
            action = pout.sample()
        else:
            # greedy: take the mode of the policy distribution
            action = mode_of_distribution(pout)

    self.actions[self.t - self.t_start] = action.reshape(
        -1, *self.action_shape)
    self.value_preds[self.t - self.t_start] = value[:, 0]
    return action.cpu().numpy()

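# `compute_epsilon` is called above but not defined in this excerpt. Below is
# a minimal standalone sketch of a linearly decaying schedule it could
# implement; the parameter names and default values are assumptions, not
# taken from the original code.
def compute_epsilon(t, start_epsilon=1.0, end_epsilon=0.05, decay_steps=10 ** 5):
    # Interpolate linearly from start_epsilon down to end_epsilon over
    # decay_steps steps, then stay at end_epsilon for the rest of training.
    if t >= decay_steps:
        return end_epsilon
    return start_epsilon + (t / decay_steps) * (end_epsilon - start_epsilon)
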
def batch_select_greedy_action(self, batch_obs, deterministic=False):
    with torch.no_grad(), pfrl.utils.evaluating(self.policy):
        batch_xs = self.batch_states(batch_obs, self.device, self.phi)
        policy_out = self.policy(batch_xs)
        if deterministic:
            batch_action = mode_of_distribution(policy_out).cpu().numpy()
        else:
            batch_action = policy_out.sample().cpu().numpy()
    return batch_action

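# Hypothetical usage of batch_select_greedy_action in a Gym-style evaluation
# loop; `env` and `agent` are assumed here and are not part of the original
# code.
obs = env.reset()
done = False
while not done:
    action = agent.batch_select_greedy_action([obs], deterministic=True)[0]
    obs, reward, done, info = env.step(action)
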
def _batch_act_eval(self, batch_obs):
    assert not self.training
    statevar = self.batch_states(batch_obs, self.device, self.phi)
    with torch.no_grad():
        pout, _ = self.model(statevar)
        if self.act_deterministically:
            action = mode_of_distribution(pout)
        else:
            action = pout.sample()
    return action.cpu().numpy()

def _act_eval(self, obs):
    # Use the process-local model for acting
    with torch.no_grad(), pfrl.utils.evaluating(self.model):
        statevar = self.batch_states([obs], self.device, self.phi)
        if self.recurrent:
            (pout, _), self.test_recurrent_states = one_step_forward(
                self.model, statevar, self.test_recurrent_states)
        else:
            pout, _ = self.model(statevar)
        if self.act_deterministically:
            return mode_of_distribution(pout).cpu().numpy()[0]
        else:
            return pout.sample().cpu().numpy()[0]

def _act_eval(self, obs):
    with torch.no_grad():
        batch_obs = self.batch_states([obs], self.device, self.phi)
        if self.recurrent:
            action_distrib, self.test_recurrent_states = one_step_forward(
                self.model, batch_obs, self.test_recurrent_states
            )
        else:
            action_distrib = self.model(batch_obs)
        if self.act_deterministically:
            return mode_of_distribution(action_distrib).cpu().numpy()[0]
        else:
            return action_distrib.sample().cpu().numpy()[0]

def _act_eval(self, obs):
    # Use the process-local model for acting
    with torch.no_grad():
        statevar = batch_states([obs], self.device, self.phi)
        if self.recurrent:
            (action_distrib, _, _), self.test_recurrent_states = one_step_forward(
                self.model, statevar, self.test_recurrent_states)
        else:
            action_distrib, _, _ = self.model(statevar)
        # Move to CPU before converting to numpy, in case self.device is a GPU
        if self.act_deterministically:
            return mode_of_distribution(action_distrib).cpu().numpy()[0]
        else:
            return action_distrib.sample().cpu().numpy()[0]

def batch_select_onpolicy_action(self, batch_obs, deterministic=False):
    with torch.no_grad(), pfrl.utils.evaluating(self.policy):
        batch_xs = self.batch_states(batch_obs, self.device, self.phi)
        batch_distribution = self.policy(batch_xs)
        if deterministic and self.add_entropy:
            batch_action = mode_of_distribution(batch_distribution)
        else:
            batch_action = batch_distribution.sample()
        batch_action = self.scale * batch_action
        batch_action = batch_action.cpu().numpy()
    return list(batch_action)

def _batch_act_eval(self, batch_obs):
    assert not self.training
    b_state = self.batch_states(batch_obs, self.device, self.phi)

    if self.obs_normalizer:
        b_state = self.obs_normalizer(b_state, update=False)

    with torch.no_grad(), pfrl.utils.evaluating(self.model):
        if self.recurrent:
            (action_distrib, _), self.test_recurrent_states = one_step_forward(
                self.model, b_state, self.test_recurrent_states
            )
        else:
            action_distrib, _ = self.model(b_state)
        if self.act_deterministically:
            action = mode_of_distribution(action_distrib).cpu().numpy()
        else:
            action = action_distrib.sample().cpu().numpy()

    return action

def test_transform():
    base_dist = torch.distributions.Normal(loc=2, scale=1)
    dist = torch.distributions.TransformedDistribution(
        base_dist, [torch.distributions.transforms.TanhTransform()])
    mode = mode_of_distribution(dist)
    torch_assert_allclose(mode.tolist(), math.tanh(2))

def test_multivariate_normal():
    loc = torch.as_tensor([0.3, 0.7])
    cov = torch.as_tensor([[0.1, 0.0], [0.0, 0.9]])
    dist = torch.distributions.MultivariateNormal(loc, cov)
    mode = mode_of_distribution(dist)
    torch_assert_allclose(mode, loc)

def test_normal():
    loc = torch.as_tensor([0.3, 0.5])
    scale = torch.as_tensor([0.1, 0.9])
    dist = torch.distributions.Normal(loc, scale)
    mode = mode_of_distribution(dist)
    torch_assert_allclose(mode, loc)
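
# A minimal sketch of the `mode_of_distribution` helper exercised throughout
# this file, reconstructed from the behavior the tests above assert; the real
# implementation may differ. Note that for TransformedDistribution it applies
# the transforms to the mode of the base distribution, which matches
# test_transform but is not the true density mode in general (the Jacobian of
# the transform can shift the argmax).
import torch


def mode_of_distribution(distrib):
    if isinstance(distrib, torch.distributions.Categorical):
        # Mode of a categorical is the index of the largest probability.
        return distrib.probs.argmax(dim=-1)
    elif isinstance(
        distrib,
        (torch.distributions.Normal, torch.distributions.MultivariateNormal),
    ):
        # For (multivariate) Gaussians the mode coincides with the mean.
        return distrib.mean
    elif isinstance(distrib, torch.distributions.Independent):
        # Independent only reinterprets batch dims; delegate to the base.
        return mode_of_distribution(distrib.base_dist)
    elif isinstance(distrib, torch.distributions.TransformedDistribution):
        # Push the base mode through the transforms (see caveat above).
        x = mode_of_distribution(distrib.base_dist)
        for transform in distrib.transforms:
            x = transform(x)
        return x
    else:
        raise NotImplementedError(type(distrib))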