def convert_to_tensor(self, arr) -> Tensor:
    """Convert an array to a PyTorch tensor on this policy's device.

    Args:
        arr (array_like): object which can be converted using `np.asarray`
    """
    return convert_to_tensor(arr, self.device)
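# Hedged sketch (not the actual library helper): the module-level
# convert_to_tensor called above is assumed to wrap the input with
# torch.as_tensor and place the result on the requested device.
import numpy as np
import torch


def convert_to_tensor_sketch(arr, device):
    """Illustrative stand-in for the convert_to_tensor helper."""
    return torch.as_tensor(np.asarray(arr), device=torch.device(device))


# Example: convert_to_tensor_sketch([[0.0, 1.0]], "cpu") -> a CPU tensor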
def get_exploration_action(self, *, action_distribution, timestep, explore=True):
    # pylint:disable=unused-argument
    if explore:
        obs = action_distribution.inputs["obs"]
        # Sample one action uniformly from the action space for each
        # observation in the batch.
        acts = ptu.convert_to_tensor(
            [self.action_space.sample() for _ in range(obs.size(0))], obs.device
        )
        # Log-density of a uniform distribution over the Box action space:
        # -log(volume) = -sum(log(high - low)), repeated for every batch entry.
        logp = ptu.convert_to_tensor(
            [-np.log(self.action_space.high - self.action_space.low).sum(axis=-1)]
            * obs.size(0),
            obs.device,
        )
        return acts, logp
    return action_distribution.deterministic_sample()
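# Worked example of the constant log-probability returned while exploring:
# uniform sampling over a Box has density 1 / volume(Box), so
# logp = -sum(log(high - low)). The space below is a hypothetical example,
# not one taken from the surrounding code.
import numpy as np
from gym.spaces import Box

space = Box(low=-1.0, high=1.0, shape=(3,), dtype=np.float32)
logp = -np.log(space.high - space.low).sum(axis=-1)
print(logp)  # 3 independent dims of width 2 -> -3 * log(2) ≈ -2.079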
def get_actor_outputs(module, row, n_samples):
    obs = ptu.convert_to_tensor(row[SampleBatch.CUR_OBS], "cpu")
    with torch.no_grad():
        acts, logp = module.sample(obs, (n_samples,))
        deterministic, _ = module.deterministic(obs)
        deterministic.unsqueeze_(0)
    # Differentiable log-probabilities of the sampled actions; the entropy
    # estimate is their negative mean, and its gradient with respect to the
    # module's parameters is flattened into a single vector.
    log_prob = module.log_prob(obs, acts)
    entropy = -log_prob.mean()
    nll_grad = ptu.flat_grad(entropy, module.parameters())
    return {
        "acts": acts,
        "logp": logp,
        "det": deterministic,
        "log_prob": log_prob.detach(),
        "entropy": entropy.detach(),
        "nll_grad": nll_grad,
    }
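# ptu.flat_grad is not shown here; a minimal sketch of what such a helper is
# assumed to do: differentiate a scalar with respect to the given parameters
# and concatenate the per-parameter gradients into one flat vector.
import torch


def flat_grad_sketch(scalar, parameters):
    """Illustrative stand-in for a flat-gradient helper."""
    params = list(parameters)
    grads = torch.autograd.grad(scalar, params)
    return torch.cat([g.reshape(-1) for g in grads])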
def stat_to_tensor_dict(self, info: StatDict) -> TensorDict:
    return {k: convert_to_tensor(v, self.device) for k, v in info.items()}
def get_model_samples(module, row, n_samples):
    obs = ptu.convert_to_tensor(row[SampleBatch.CUR_OBS], "cpu")
    act = ptu.convert_to_tensor(row[SampleBatch.ACTIONS], "cpu")
    # Draw n_samples next-observation samples from the model for this row.
    new_obs, _ = module.sample(obs, act, (n_samples,))
    return new_obs.detach().numpy()
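# Hypothetical stand-in illustrating the interface get_model_samples expects
# from `module`: a stochastic dynamics model whose sample(obs, act, shape)
# returns next observations and their log-probabilities. The Gaussian below
# is only an assumption for demonstration, not the actual model.
import torch
from torch import nn


class DummyDynamicsModel(nn.Module):
    """Gaussian next-obs centered on the current obs, ignoring the action."""

    def sample(self, obs, act, sample_shape=torch.Size()):
        dist = torch.distributions.Normal(loc=obs, scale=torch.ones_like(obs))
        new_obs = dist.sample(sample_shape)
        return new_obs, dist.log_prob(new_obs).sum(dim=-1)


# Hypothetical usage: get_model_samples(DummyDynamicsModel(), row, n_samples=10)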