def update_context(self, timestep):
    """Append single transition to the current context.

    Args:
        timestep (metarl._dtypes.TimeStep): Timestep containing transition
            information to be added to context.

    """
    o = torch.as_tensor(timestep.observation[None, None, ...],
                        device=global_device()).float()
    a = torch.as_tensor(timestep.action[None, None, ...],
                        device=global_device()).float()
    r = torch.as_tensor(np.array([timestep.reward])[None, None, ...],
                        device=global_device()).float()
    no = torch.as_tensor(timestep.next_observation[None, None, ...],
                         device=global_device()).float()

    if self._use_next_obs:
        data = torch.cat([o, a, r, no], dim=2)
    else:
        data = torch.cat([o, a, r], dim=2)

    if self._context is None:
        self._context = data
    else:
        self._context = torch.cat([self._context, data], dim=1)
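
# Illustrative shape sketch (standalone, not part of the class above): update_context
# keeps a running context tensor of shape (1, T, O + A + 1) -- or (1, T, O + A + 1 + O)
# when next observations are included -- and appends one transition per call by
# concatenating along the time dimension (dim=1). The sizes below are made up.
import torch

obs_dim, act_dim = 3, 2
context = None
for _ in range(4):
    o = torch.randn(1, 1, obs_dim)   # observation, shaped (task, time, features)
    a = torch.randn(1, 1, act_dim)   # action
    r = torch.randn(1, 1, 1)         # reward
    data = torch.cat([o, a, r], dim=2)
    context = data if context is None else torch.cat([context, data], dim=1)
print(context.shape)  # torch.Size([1, 4, 6])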
def test_utils_set_gpu_mode():
    """Test that setting the GPU mode selects the expected global device."""
    if torch.cuda.is_available():
        set_gpu_mode(mode=True)
        assert global_device() == torch.device('cuda:0')
        assert tu._USE_GPU
    else:
        set_gpu_mode(mode=False)
        assert global_device() == torch.device('cpu')
        assert not tu._USE_GPU
    assert not tu._GPU_ID
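
# Minimal usage sketch of the utilities exercised by the test above, assuming
# metarl exposes them as metarl.torch.set_gpu_mode / global_device (mirroring
# garage's layout); adjust the import path if the module differs.
import torch
from metarl.torch import global_device, set_gpu_mode

set_gpu_mode(torch.cuda.is_available())
device = global_device()
x = torch.zeros(4, device=device)  # lands on cuda:0 when a GPU was selected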
def adapt_policy(self, exploration_policy, exploration_trajectories):
    """Produce a policy adapted for a task.

    Args:
        exploration_policy (metarl.Policy): A policy which was returned
            from get_exploration_policy(), and which generated
            exploration_trajectories by interacting with an environment.
            The caller may not use this object after passing it into this
            method.
        exploration_trajectories (metarl.TrajectoryBatch): Trajectories to
            adapt to, generated by exploration_policy exploring the
            environment.

    Returns:
        metarl.Policy: A policy adapted to the task represented by the
            exploration_trajectories.

    """
    total_steps = sum(exploration_trajectories.lengths)
    o = exploration_trajectories.observations
    a = exploration_trajectories.actions
    r = exploration_trajectories.rewards.reshape(total_steps, 1)
    ctxt = np.hstack((o, a, r)).reshape(1, total_steps, -1)
    context = torch.as_tensor(ctxt, device=global_device()).float()
    self._policy.infer_posterior(context)

    return self._policy
def get_action(self, observation):
    r"""Get a single action given an observation.

    Args:
        observation (np.ndarray): Observation from the environment.
            Shape is :math:`env_spec.observation_space`.

    Returns:
        tuple:
            * np.ndarray: Predicted action. Shape is
                :math:`env_spec.action_space`.
            * dict:
                * np.ndarray[float]: Mean of the distribution.
                * np.ndarray[float]: Log of standard deviation of the
                    distribution.

    """
    with torch.no_grad():
        if not isinstance(observation, torch.Tensor):
            observation = torch.as_tensor(observation).float().to(
                global_device())
        observation = observation.unsqueeze(0)
        dist = self.forward(observation)
        return (dist.rsample().squeeze(0).cpu().numpy(),
                dict(mean=dist.mean.squeeze(0).cpu().numpy(),
                     log_std=(dist.variance**.5).log().squeeze(
                         0).cpu().numpy()))
def get_actions(self, observations):
    r"""Get actions given observations.

    Args:
        observations (np.ndarray): Observations from the environment.
            Shape is :math:`batch_dim \bullet env_spec.observation_space`.

    Returns:
        tuple:
            * np.ndarray: Predicted actions. Shape is
                :math:`batch_dim \bullet env_spec.action_space`.
            * dict:
                * np.ndarray[float]: Mean of the distribution.
                * np.ndarray[float]: Log of standard deviation of the
                    distribution.

    """
    with torch.no_grad():
        if not isinstance(observations, torch.Tensor):
            observations = torch.as_tensor(observations).float().to(
                global_device())
        dist = self.forward(observations)
        return (dist.rsample().cpu().numpy(),
                dict(mean=dist.mean.cpu().numpy(),
                     log_std=dist.variance.sqrt().log().cpu().numpy()))
def test_to():
    """Test the torch function that moves modules to GPU.

    Test that the policy and Q-functions are moved to GPU if a GPU is
    available.
    """
    env_names = ['CartPole-v0', 'CartPole-v1']
    task_envs = [MetaRLEnv(env_name=name) for name in env_names]
    env = MultiEnvWrapper(task_envs, sample_strategy=round_robin_strategy)
    deterministic.set_seed(0)
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[1, 1],
        hidden_nonlinearity=torch.nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[1, 1],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[1, 1],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6))

    num_tasks = 2
    buffer_batch_size = 2
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  gradient_steps_per_itr=150,
                  max_path_length=150,
                  eval_env=env,
                  env_spec=env.spec,
                  num_tasks=num_tasks,
                  steps_per_epoch=5,
                  replay_buffer=replay_buffer,
                  min_buffer_size=1e3,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=buffer_batch_size)

    set_gpu_mode(torch.cuda.is_available())
    mtsac.to()
    device = global_device()
    for param in mtsac._qf1.parameters():
        assert param.device == device
    for param in mtsac._qf2.parameters():
        assert param.device == device
    for param in mtsac._policy.parameters():
        assert param.device == device
    assert mtsac._log_alpha.device == device
def to(self, device=None):
    """Put all the networks within the model on device.

    Args:
        device (str): ID of GPU or CPU.

    """
    device = device or global_device()
    for net in self.networks:
        net.to(device)
def compute_kl_div(self):
    r"""Compute :math:`KL(q(z|c) \| p(z))`.

    Returns:
        float: :math:`KL(q(z|c) \| p(z))`.

    """
    prior = torch.distributions.Normal(
        torch.zeros(self._latent_dim).to(global_device()),
        torch.ones(self._latent_dim).to(global_device()))
    posteriors = [
        torch.distributions.Normal(mu, torch.sqrt(var)) for mu, var in zip(
            torch.unbind(self.z_means), torch.unbind(self.z_vars))
    ]
    kl_divs = [
        torch.distributions.kl.kl_divergence(post, prior)
        for post in posteriors
    ]
    kl_div_sum = torch.sum(torch.stack(kl_divs))
    return kl_div_sum
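
# Standalone check (not from the source): the terms summed by compute_kl_div are
# the analytic KL between a diagonal Gaussian posterior q(z|c) = N(mu, var) and
# the standard normal prior p(z) = N(0, I).
import torch

latent_dim = 5
mu = torch.randn(latent_dim)
var = torch.rand(latent_dim) + 0.1

prior = torch.distributions.Normal(torch.zeros(latent_dim),
                                   torch.ones(latent_dim))
posterior = torch.distributions.Normal(mu, torch.sqrt(var))
kl_torch = torch.distributions.kl.kl_divergence(posterior, prior).sum()

# Closed form: KL(N(mu, var) || N(0, 1)) = 0.5 * sum(var + mu^2 - 1 - log(var))
kl_manual = 0.5 * torch.sum(var + mu**2 - 1. - torch.log(var))
assert torch.allclose(kl_torch, kl_manual, atol=1e-5)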
def _sample_data(self, indices):
    """Sample batch of training data from a list of tasks.

    Args:
        indices (list): List of task indices to sample from.

    Returns:
        torch.Tensor: Observations, with shape :math:`(X, N, O^*)` where X
            is the number of tasks. N is batch size.
        torch.Tensor: Actions, with shape :math:`(X, N, A^*)`.
        torch.Tensor: Rewards, with shape :math:`(X, N, 1)`.
        torch.Tensor: Next observations, with shape :math:`(X, N, O^*)`.
        torch.Tensor: Dones, with shape :math:`(X, N, 1)`.

    """
    # transitions sampled randomly from replay buffer
    initialized = False
    for idx in indices:
        batch = self._replay_buffers[idx].sample_transitions(
            self._batch_size)
        if not initialized:
            o = batch['observations'][np.newaxis]
            a = batch['actions'][np.newaxis]
            r = batch['rewards'][np.newaxis]
            no = batch['next_observations'][np.newaxis]
            d = batch['dones'][np.newaxis]
            initialized = True
        else:
            o = np.vstack((o, batch['observations'][np.newaxis]))
            a = np.vstack((a, batch['actions'][np.newaxis]))
            r = np.vstack((r, batch['rewards'][np.newaxis]))
            no = np.vstack((no, batch['next_observations'][np.newaxis]))
            d = np.vstack((d, batch['dones'][np.newaxis]))

    o = torch.as_tensor(o, device=global_device()).float()
    a = torch.as_tensor(a, device=global_device()).float()
    r = torch.as_tensor(r, device=global_device()).float()
    no = torch.as_tensor(no, device=global_device()).float()
    d = torch.as_tensor(d, device=global_device()).float()

    return o, a, r, no, d
def reset_belief(self, num_tasks=1):
    r"""Reset :math:`q(z \| c)` to the prior and sample a new z from the prior.

    Args:
        num_tasks (int): Number of tasks.

    """
    # reset distribution over z to the prior
    mu = torch.zeros(num_tasks, self._latent_dim).to(global_device())
    if self._use_information_bottleneck:
        var = torch.ones(num_tasks, self._latent_dim).to(global_device())
    else:
        var = torch.zeros(num_tasks, self._latent_dim).to(global_device())
    self.z_means = mu
    self.z_vars = var
    # sample a new z from the prior
    self.sample_from_belief()
    # reset the context collected so far
    self._context = None
    # reset any hidden state in the encoder network (relevant for RNN)
    self._context_encoder.reset()
def to(self, device=None):
    """Put all the networks within the model on device.

    Args:
        device (str): ID of GPU or CPU.

    """
    if device is None:
        device = global_device()
    for net in self.networks:
        net.to(device)
    if not self._use_automatic_entropy_tuning:
        self._log_alpha = torch.Tensor([self._fixed_alpha]).log().to(device)
    else:
        self._log_alpha = torch.Tensor(
            [self._initial_log_entropy]).to(device).requires_grad_()
        self._alpha_optimizer = self._optimizer([self._log_alpha],
                                                lr=self._policy_lr)
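
# Illustrative note (not from the source): `_log_alpha` is re-assigned above, so
# the alpha optimizer must be (re-)constructed afterwards; an optimizer built
# against the previous tensor object would no longer update the tensor that the
# loss actually uses. A minimal sketch of the pitfall, with made-up names:
import torch

log_alpha = torch.zeros(1, requires_grad=True)
opt = torch.optim.Adam([log_alpha], lr=3e-4)  # tracks this tensor object
log_alpha = torch.zeros(1).requires_grad_()   # re-assignment: a new leaf tensor
opt = torch.optim.Adam([log_alpha], lr=3e-4)  # so the optimizer is rebuilt too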
def _sample_context(self, indices):
    """Sample batch of context from a list of tasks.

    Args:
        indices (list): List of task indices to sample from.

    Returns:
        torch.Tensor: Context data, with shape :math:`(X, N, C)`. X is the
            number of tasks. N is batch size. C is the combined size of
            observation, action, reward, and next observation if next
            observation is used in context. Otherwise, C is the combined
            size of observation, action, and reward.

    """
    # make method work given a single task index
    if not hasattr(indices, '__iter__'):
        indices = [indices]

    initialized = False
    for idx in indices:
        batch = self._context_replay_buffers[idx].sample_transitions(
            self._embedding_batch_size)
        o = batch['observations']
        a = batch['actions']
        r = batch['rewards']
        context = np.hstack((o, a, r))
        if self._use_next_obs_in_context:
            context = np.hstack((context, batch['next_observations']))

        if not initialized:
            final_context = context[np.newaxis]
            initialized = True
        else:
            final_context = np.vstack((final_context, context[np.newaxis]))

    final_context = torch.as_tensor(final_context,
                                    device=global_device()).float()
    if len(indices) == 1:
        final_context = final_context.unsqueeze(0)

    return final_context
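
# Illustrative shape sketch (standalone, made-up sizes): _sample_context stacks
# per-task context batches into an (X, N, C) array, where X = len(indices),
# N = embedding batch size, and C = O + A + 1 (+ O with next observations).
import numpy as np

num_tasks, batch_size, obs_dim, act_dim = 3, 16, 4, 2
contexts = [
    np.hstack((np.random.randn(batch_size, obs_dim),
               np.random.randn(batch_size, act_dim),
               np.random.randn(batch_size, 1)))
    for _ in range(num_tasks)
]
final_context = np.stack(contexts)  # same result as the vstack-of-newaxis loop
print(final_context.shape)          # (3, 16, 7)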
def get_action(self, obs):
    """Sample action from the policy, conditioned on the task embedding.

    Args:
        obs (torch.Tensor): Observation values, with shape :math:`(1, O)`.
            O is the size of the flattened observation space.

    Returns:
        torch.Tensor: Output action value, with shape :math:`(1, A)`.
            A is the size of the flattened action space.
        dict:
            * np.ndarray[float]: Mean of the distribution.
            * np.ndarray[float]: Log of standard deviation of the
                distribution.

    """
    z = self.z
    obs = torch.as_tensor(obs[None], device=global_device()).float()
    obs_in = torch.cat([obs, z], dim=1)
    action, info = self._policy.get_action(obs_in)
    action = np.squeeze(action, axis=0)
    info['mean'] = np.squeeze(info['mean'], axis=0)
    return action, info