Example #1
    def update_context(self, timestep):
        """Append a single transition [o, a, r, s(, no)] to the context.

        Context layout (task, timestep, feature):
            [[[o1, o2, ..., a1, a2, ...],   -> timestep 1
              [o1, o2, ..., a1, a2, ...]]]  -> timestep 2
        """
        o = torch.as_tensor(timestep.state[None, None, ...],
                            device=tu.global_device()).float()
        a = torch.as_tensor(timestep.action[None, None, ...],
                            device=tu.global_device()).float()
        r = torch.as_tensor(np.array([timestep.env_reward])[None, None, ...],
                            device=tu.global_device()).float()
        s = torch.as_tensor(np.array([timestep.skill])[None, None, ...],
                            device=tu.global_device()).float()
        no = torch.as_tensor(timestep.next_state[None, None, ...],
                             device=tu.global_device()).float()

        if self._use_next_obs:
            data = torch.cat([o, a, r, s, no], dim=2)
        else:
            data = torch.cat([o, a, r, s], dim=2)

        if self._context is None:
            self._context = data
        else:
            self._context = torch.cat([self._context, data], dim=1)
Example #2
    def update_context(self, timestep):
        """Append single transition to the current context.

        Args:
            timestep (garage._dtypes.TimeStep): Timestep containing transition
                information to be added to context.

        """
        o = torch.as_tensor(timestep.observation[None, None, ...],
                            device=tu.global_device()).float()
        a = torch.as_tensor(timestep.action[None, None, ...],
                            device=tu.global_device()).float()
        r = torch.as_tensor(np.array([timestep.reward])[None, None, ...],
                            device=tu.global_device()).float()
        no = torch.as_tensor(timestep.next_observation[None, None, ...],
                             device=tu.global_device()).float()

        if self._use_next_obs:
            data = torch.cat([o, a, r, no], dim=2)
        else:
            data = torch.cat([o, a, r], dim=2)

        if self._context is None:
            self._context = data
        else:
            self._context = torch.cat([self._context, data], dim=1)
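A quick shape note for the two update_context variants above: each call builds a (1, 1, feature_dim) tensor and concatenates it along dim 1, so the stored context grows to (1, T, feature_dim) after T transitions. A minimal standalone sketch (the dimensions below are illustrative, not taken from garage):

import torch

obs_dim, act_dim = 4, 2
context = None
for _ in range(3):
    o = torch.zeros(1, 1, obs_dim)      # observation
    a = torch.zeros(1, 1, act_dim)      # action
    r = torch.zeros(1, 1, 1)            # reward
    data = torch.cat([o, a, r], dim=2)  # one transition: (1, 1, obs_dim + act_dim + 1)
    context = data if context is None else torch.cat([context, data], dim=1)
print(context.shape)                    # torch.Size([1, 3, 7])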
Example #3
    def _skills_reason_optimize_policy(self):
        self._controller.reset_belief()

        # data shape is (task, batch, feat)
        obs, actions, rewards, skills, next_obs, terms, context = (
            self._sample_skill_path())

        # skills_pred is distribution
        policy_outputs, skills_pred, task_z = self._controller(obs, context)

        _, policy_mean, policy_log_std, policy_log_pi = policy_outputs[:4]

        self.context_optimizer.zero_grad()
        if self._use_information_bottleneck:
            kl_div = self._controller.compute_kl_div()
            kl_loss = self._kl_lambda * kl_div
            kl_loss.backward(retain_graph=True)

        skills_target = skills.clone().detach().requires_grad_(True)\
            .to(tu.global_device())
        skills_pred = skills_pred.to(tu.global_device())

        policy_loss = F.mse_loss(skills_pred.flatten(), skills_target.flatten())\
                      * self._skills_reason_reward_scale

        mean_reg_loss = self._policy_mean_reg_coeff * (policy_mean**2).mean()
        std_reg_loss = self._policy_std_reg_coeff * (policy_log_std**2).mean()

        # took away the pre-activation regularization term
        policy_reg_loss = mean_reg_loss + std_reg_loss
        policy_loss = policy_loss + policy_reg_loss

        self._controller_optimizer.zero_grad()
        policy_loss.backward()
        self._controller_optimizer.step()
Example #4
    def forward(self, states, actions, skills):
        """Return Q-value(s)."""
        if not isinstance(states, torch.Tensor):
            states = torch.from_numpy(states).float().to(tu.global_device())
        if not isinstance(actions, torch.Tensor):
            actions = torch.from_numpy(actions).float().to(tu.global_device())
        if not isinstance(skills, torch.Tensor):
            skills = torch.from_numpy(skills).float().to(tu.global_device())

        return super().forward(torch.cat([states, skills, actions], 1))
Example #5
def test_utils_set_gpu_mode():
    """Test setting gpu mode to False to force CPU."""
    if torch.cuda.is_available():
        tu.set_gpu_mode(mode=True)
        assert tu.global_device() == torch.device('cuda:0')
        assert tu._USE_GPU
    else:
        tu.set_gpu_mode(mode=False)
        assert tu.global_device() == torch.device('cpu')
        assert not tu._USE_GPU
    assert not tu._GPU_ID
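The pattern exercised by this test runs through every example on this page: tu.set_gpu_mode picks the global device once, and tu.global_device() is then used when creating tensors or moving modules. A minimal usage sketch (the import path for tu is an assumption; the tensor is only illustrative):

import numpy as np
import torch
from garage.torch import utils as tu  # assumed import path

tu.set_gpu_mode(torch.cuda.is_available())
device = tu.global_device()  # torch.device('cuda:0') or torch.device('cpu')
x = torch.as_tensor(np.zeros(3), device=device).float()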
Example #6
    def forward(self, states, skills):
        if not isinstance(states, torch.Tensor):
            states = torch.from_numpy(states).float().to(tu.global_device())
            if len(states.shape) == 1:
                states = states.unsqueeze(0)
        if not isinstance(skills, torch.Tensor):
            skills = torch.from_numpy(skills).float().to(tu.global_device())
            if len(skills.shape) == 1:
                skills = skills.unsqueeze(0)
        states = states.to(tu.global_device())
        skills = skills.to(tu.global_device())
     # print("in tanh_gaussian_mlp_policy")
     # print(states.size())
     # print(skills.size())
        return super().forward(torch.cat((states, skills), 1))
Example #7
    def get_actions(self, observations):
        r"""Get actions given observations.

        Args:
            observations (np.ndarray): Observations from the environment.
                Shape is :math:`batch_dim \bullet env_spec.observation_space`.

        Returns:
            tuple:
                * np.ndarray: Predicted actions.
                    :math:`batch_dim \bullet env_spec.action_space`.
                * dict:
                    * np.ndarray[float]: Mean of the distribution.
                    * np.ndarray[float]: Log standard deviation of the
                        distribution.

        """
        with torch.no_grad():
            if not isinstance(observations, torch.Tensor):
                observations = torch.as_tensor(observations).float().to(
                    tu.global_device())
            dist = self.forward(observations)
            return (dist.rsample().numpy(),
                    dict(mean=dist.mean.numpy(),
                         log_std=(dist.variance.sqrt()).log().numpy()))
Example #8
File: kant.py Project: fangqyi/garage
    def _sample_path_context(self, indices):
        if not hasattr(indices, '__iter__'):
            indices = [indices]

        initialized = False
        for idx in indices:
            path = self._context_replay_buffers[idx].sample_path()
            o = path['states']
            a = path['actions']
            r = path['env_rewards']
            z = path['skills_onehot']
            context = np.hstack((o, a, r, z))
            if self._use_next_obs_in_context:
                context = np.hstack((context, path['next_states']))

            if not initialized:
                final_context = context[np.newaxis]
                initialized = True
            else:
                final_context = np.vstack((final_context, context[np.newaxis]))

        final_context = torch.as_tensor(final_context,
                                        device=tu.global_device()).float()
        if len(indices) == 1:
            final_context = final_context.unsqueeze(0)

        return final_context
Example #9
File: pearl.py Project: bhaprayan/garage
    def adapt_policy(self, exploration_policy, exploration_trajectories):
        """Produce a policy adapted for a task.

        Args:
            exploration_policy (garage.Policy): A policy which was returned
                from get_exploration_policy(), and which generated
                exploration_trajectories by interacting with an environment.
                The caller may not use this object after passing it into this
                method.
            exploration_trajectories (garage.TrajectoryBatch): Trajectories to
                adapt to, generated by exploration_policy exploring the
                environment.

        Returns:
            garage.Policy: A policy adapted to the task represented by the
                exploration_trajectories.

        """
        total_steps = sum(exploration_trajectories.lengths)
        o = exploration_trajectories.observations
        a = exploration_trajectories.actions
        r = exploration_trajectories.rewards.reshape(total_steps, 1)
        ctxt = np.hstack((o, a, r)).reshape(1, total_steps, -1)
        context = torch.as_tensor(ctxt, device=tu.global_device()).float()
        self._policy.infer_posterior(context)

        return self._policy
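The context assembled here stacks per-step observations, actions, and scalar rewards column-wise, then adds a leading task dimension. A quick shape illustration with made-up sizes (not taken from the source):

import numpy as np

total_steps, obs_dim, act_dim = 5, 3, 2
o = np.zeros((total_steps, obs_dim))
a = np.zeros((total_steps, act_dim))
r = np.zeros((total_steps, 1))
ctxt = np.hstack((o, a, r)).reshape(1, total_steps, -1)
print(ctxt.shape)  # (1, 5, 6)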
Example #10
    def get_action(self, observation):
        r"""Get a single action given an observation.

        Args:
            observation (np.ndarray): Observation from the environment.
                Shape is :math:`env_spec.observation_space`.

        Returns:
            tuple:
                * np.ndarray: Predicted action. Shape is
                    :math:`env_spec.action_space`.
                * dict:
                * np.ndarray[float]: Mean of the distribution.
                * np.ndarray[float]: Log standard deviation of the
                    distribution.

        """
        with torch.no_grad():
            if not isinstance(observation, torch.Tensor):
                observation = torch.as_tensor(observation).float().to(
                    tu.global_device())
            observation = observation.unsqueeze(0)
            dist = self.forward(observation)
            return (dist.rsample().squeeze(0).numpy(),
                    dict(mean=dist.mean.squeeze(0).numpy(),
                         log_std=(dist.variance**.5).log().squeeze(0).numpy()))
Example #11
    def get_action(self, obs):
        z = self.z
        obs = torch.as_tensor(obs[None], device=tu.global_device()).float()
        obs_in = torch.cat([obs, z], dim=1)
        skill_choice, info = self._controller_policy.get_action(obs_in)
        skill_z = torch.eye(self._num_skills)[skill_choice]
        action, _ = self._sub_actor.get_action(obs, skill_z)
        return action, skill_choice, info
Example #12
    def forward(self, states):
        if not isinstance(states, torch.Tensor):
            states = torch.from_numpy(states).float().to(tu.global_device())
     # print("in forward")
     # print(states.size())
     # states = torch.from_numpy(np.array([1, 2, 3])).float().to(tu.global_device())
        x = super().forward(states)
        return torch.softmax(x, dim=-1)
Example #13
def test_to():
    """Test the torch function that moves modules to GPU.

        Test that the policy and qfunctions are moved to gpu if gpu is
        available.

    """
    env_names = ['CartPole-v0', 'CartPole-v1']
    task_envs = [GarageEnv(env_name=name) for name in env_names]
    env = MultiEnvWrapper(task_envs, sample_strategy=round_robin_strategy)
    deterministic.set_seed(0)
    policy = TanhGaussianMLPPolicy(
        env_spec=env.spec,
        hidden_sizes=[1, 1],
        hidden_nonlinearity=torch.nn.ReLU,
        output_nonlinearity=None,
        min_std=np.exp(-20.),
        max_std=np.exp(2.),
    )

    qf1 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[1, 1],
                                 hidden_nonlinearity=F.relu)

    qf2 = ContinuousMLPQFunction(env_spec=env.spec,
                                 hidden_sizes=[1, 1],
                                 hidden_nonlinearity=F.relu)
    replay_buffer = PathBuffer(capacity_in_transitions=int(1e6), )

    num_tasks = 2
    buffer_batch_size = 2
    mtsac = MTSAC(policy=policy,
                  qf1=qf1,
                  qf2=qf2,
                  gradient_steps_per_itr=150,
                  max_path_length=150,
                  eval_env=env,
                  env_spec=env.spec,
                  num_tasks=num_tasks,
                  steps_per_epoch=5,
                  replay_buffer=replay_buffer,
                  min_buffer_size=1e3,
                  target_update_tau=5e-3,
                  discount=0.99,
                  buffer_batch_size=buffer_batch_size)

    tu.set_gpu_mode(torch.cuda.is_available())
    mtsac.to()
    device = tu.global_device()
    for param in mtsac._qf1.parameters():
        assert param.device == device
    for param in mtsac._qf2.parameters():
        assert param.device == device
    for param in mtsac._policy.parameters():
        assert param.device == device
    assert mtsac._log_alpha.device == device
Example #14
File: kant.py Project: fangqyi/garage
    def _sample_skill_path(self):
        path = self._skills_replay_buffer.sample_path()
        # TODO: trim or extend batch to the same size
        o = path['states']
        a = path['actions']
        r = path['env_rewards']
        z = path['skills_onehot']
        context = np.hstack((o, a, r, z))
        if self._use_next_obs_in_context:
            context = np.hstack((context, path['next_states']))

        context = context[np.newaxis]
        o = path['states'][np.newaxis]
        a = path['actions'][np.newaxis]
        r = path['env_rewards'][np.newaxis]
        z = path['skills_onehot'][np.newaxis]
        no = path['next_states'][np.newaxis]
        d = path['dones'][np.newaxis]

        o = torch.as_tensor(o, device=tu.global_device()).float()
        a = torch.as_tensor(a, device=tu.global_device()).float()
        r = torch.as_tensor(r, device=tu.global_device()).float()
        z = torch.as_tensor(z, device=tu.global_device()).float()
        no = torch.as_tensor(no, device=tu.global_device()).float()
        d = torch.as_tensor(d, device=tu.global_device()).float()
        context = torch.as_tensor(context, device=tu.global_device()).float()
        context = context.unsqueeze(0)

        return o, a, r, z, no, d, context
Example #15
    def get_actions(self, states):
        with torch.no_grad():
            if not isinstance(states, torch.Tensor):
                states = torch.from_numpy(states).float().to(
                    tu.global_device())
            states = states.to(tu.global_device())
            dist = self.forward(states).to('cpu').detach()
            probs = dist.numpy()
            actions = np.array([np.random.choice(self._action_dim,
                                                 p=probs[idx])
                                for idx in range(probs.shape[0])])
            ret_mean = np.mean(probs)
            ret_log_std = np.log(np.std(probs))

            batch_size = actions.shape[0]
            ret_log_pi = np.log(dist[np.arange(batch_size), actions])

            return (actions, dict(mean=ret_mean, log_std=ret_log_std,
                                  log_pi=ret_log_pi, dist=dist))
Example #16
    def get_action(self, state):
        with torch.no_grad():
            if not isinstance(state, torch.Tensor):
                state = torch.from_numpy(state).float().to(
                    tu.global_device())

            state = state.to(tu.global_device())
            dist = self.forward(state.unsqueeze(0)).squeeze(0).to('cpu').detach()

            action = np.array([np.random.choice(self._action_dim,
                                                p=dist.numpy())])

            ret_mean = np.mean(dist.numpy())
            ret_log_std = np.log(np.std(dist.numpy()))
            ret_log_pi = np.log(dist[..., list(action)])

            return (action, dict(mean=ret_mean, log_std=ret_log_std,
                                 log_pi=ret_log_pi, dist=dist))
Example #17
File: pearl.py Project: bhaprayan/garage
    def to(self, device=None):
        """Put all the networks within the model on device.

        Args:
            device (str): ID of GPU or CPU.

        """
        device = device or tu.global_device()
        for net in self.networks:
            net.to(device)
Example #18
    def adapt_policy(self, exploration_policy, exploration_trajectories):
        total_steps = sum(exploration_trajectories.lengths)
        o = exploration_trajectories.states
        a = exploration_trajectories.actions
        r = exploration_trajectories.env_rewards.reshape(total_steps, 1)
        s = exploration_trajectories.skills_onehot
        ctxt = np.hstack((o, a, r, s)).reshape(1, total_steps, -1)
        context = torch.as_tensor(ctxt, device=tu.global_device()).float()
        self._controller.infer_posterior(context)

        return self._controller
Example #19
    def compute_kl_div(self):
        r"""Compute :math:`KL(q(z|c) \| p(z))`.

        Returns:
            float: :math:`KL(q(z|c) \| p(z))`.

        """
        prior = torch.distributions.Normal(
            torch.zeros(self._latent_dim).to(tu.global_device()),
            torch.ones(self._latent_dim).to(tu.global_device()))
        posteriors = [
            torch.distributions.Normal(mu, torch.sqrt(var)) for mu, var in zip(
                torch.unbind(self.z_means), torch.unbind(self.z_vars))
        ]
        kl_divs = [
            torch.distributions.kl.kl_divergence(post, prior)
            for post in posteriors
        ]
        kl_div_sum = torch.sum(torch.stack(kl_divs))
        return kl_div_sum
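For the diagonal Gaussians used here, torch.distributions.kl.kl_divergence matches the per-dimension closed form KL(N(mu, var) || N(0, 1)) = 0.5 * (var + mu^2 - 1 - log(var)). A small numerical check (the numbers are arbitrary, not from the source):

import torch

mu = torch.tensor([0.3, -0.1])
var = torch.tensor([0.5, 2.0])
posterior = torch.distributions.Normal(mu, torch.sqrt(var))
prior = torch.distributions.Normal(torch.zeros(2), torch.ones(2))
kl_lib = torch.distributions.kl.kl_divergence(posterior, prior)
kl_closed = 0.5 * (var + mu ** 2 - 1 - torch.log(var))
assert torch.allclose(kl_lib, kl_closed)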
Example #20
File: pearl.py Project: bhaprayan/garage
    def _sample_data(self, indices):
        """Sample batch of training data from a list of tasks.

        Args:
            indices (list): List of task indices to sample from.

        Returns:
            torch.Tensor: Observations, with shape :math:`(X, N, O^*)` where X
                is the number of tasks. N is batch size.
            torch.Tensor: Actions, with shape :math:`(X, N, A^*)`.
            torch.Tensor: Rewards, with shape :math:`(X, N, 1)`.
            torch.Tensor: Next observations, with shape :math:`(X, N, O^*)`.
            torch.Tensor: Dones, with shape :math:`(X, N, 1)`.

        """
        # transitions sampled randomly from replay buffer
        initialized = False
        for idx in indices:
            batch = self._replay_buffers[idx].sample_transitions(
                self._batch_size)
            if not initialized:
                o = batch['observations'][np.newaxis]
                a = batch['actions'][np.newaxis]
                r = batch['rewards'][np.newaxis]
                no = batch['next_observations'][np.newaxis]
                d = batch['dones'][np.newaxis]
                initialized = True
            else:
                o = np.vstack((o, batch['observations'][np.newaxis]))
                a = np.vstack((a, batch['actions'][np.newaxis]))
                r = np.vstack((r, batch['rewards'][np.newaxis]))
                no = np.vstack((no, batch['next_observations'][np.newaxis]))
                d = np.vstack((d, batch['dones'][np.newaxis]))

        o = torch.as_tensor(o, device=tu.global_device()).float()
        a = torch.as_tensor(a, device=tu.global_device()).float()
        r = torch.as_tensor(r, device=tu.global_device()).float()
        no = torch.as_tensor(no, device=tu.global_device()).float()
        d = torch.as_tensor(d, device=tu.global_device()).float()

        return o, a, r, no, d
Example #21
    def _sample_task_path(self, indices):
        if not hasattr(indices, '__iter__'):
            indices = [indices]

        initialized = False
        for idx in indices:
            path = self._replay_buffers[idx].sample_path()
            # TODO: trim or extend batch to the same size
            if not initialized:
                o = path['states'][np.newaxis]
                a = path['actions'][np.newaxis]
                r = path['env_rewards'][np.newaxis]
                z = path['skills_onehot'][np.newaxis]
                no = path['next_states'][np.newaxis]
                d = path['dones'][np.newaxis]
                initialized = True
            else:
                o = np.vstack((o, path['states'][np.newaxis]))
                a = np.vstack((a, path['actions'][np.newaxis]))
                r = np.vstack((r, path['env_rewards'][np.newaxis]))
                z = np.vstack((z, path['skills_onehot'][np.newaxis]))
                no = np.vstack((no, path['next_states'][np.newaxis]))
                d = np.vstack((d, path['dones'][np.newaxis]))

        o = torch.as_tensor(o, device=tu.global_device()).float()
        a = torch.as_tensor(a, device=tu.global_device()).float()
        r = torch.as_tensor(r, device=tu.global_device()).float()
        z = torch.as_tensor(z, device=tu.global_device()).float()
        no = torch.as_tensor(no, device=tu.global_device()).float()
        d = torch.as_tensor(d, device=tu.global_device()).float()

        return o, a, r, z, no, d
Example #22
File: diayn.py Project: fangqyi/garage
    def _discriminator_objective(self, samples_data):
        states = samples_data['next_state']

        discriminator_pred = self._discriminator(states)
        # cross_entropy expects integer class targets on the same device as
        # the predictions, so avoid the hard-coded torch.cuda.FloatTensor cast.
        discriminator_target = samples_data['skill'].long().to(
            tu.global_device())

        discriminator_loss = F.cross_entropy(discriminator_pred,
                                             discriminator_target.flatten())

        return discriminator_loss
Example #23
    def forward(self, observations, skills):
        if not isinstance(observations, torch.Tensor):
            observations = torch.from_numpy(observations).float().to(
                tu.global_device())
            if len(observations.shape) == 1:
                observations = observations.unsqueeze(0)
        if not isinstance(skills, torch.Tensor):
            skills = torch.from_numpy(skills).float().to(tu.global_device())
            if len(skills.shape) == 1:
                skills = skills.unsqueeze(0)
        inputs = torch.cat([observations, skills], dim=1).to(tu.global_device())

        log_p_x_t, reg_loss_t, x_t, log_ws_t, mus_t, log_sigs_t = \
            self.distribution.get_p_params(inputs)
        raw_actions = x_t.detach().cpu().numpy()
        actions = np.tanh(raw_actions) if self._squash else raw_actions

        return actions, dict(log_p_x_t=log_p_x_t,
                             reg_loss_t=reg_loss_t,
                             x_t=x_t,
                             log_ws_t=log_ws_t,
                             mus_t=mus_t,
                             log_sigs_t=log_sigs_t)
Example #24
    def reset_belief(self, num_tasks=1):
        r"""Reset :math:`q(z \| c)` to the prior and sample a new z from the prior.

        Args:
            num_tasks (int): Number of tasks.

        """
        # reset distribution over z to the prior
        mu = torch.zeros(num_tasks, self._latent_dim).to(tu.global_device())
        if self._use_information_bottleneck:
            var = torch.ones(num_tasks,
                             self._latent_dim).to(tu.global_device())
        else:
            var = torch.zeros(num_tasks,
                              self._latent_dim).to(tu.global_device())
        self.z_means = mu
        self.z_vars = var
        # sample a new z from the prior
        self.sample_from_belief()
        # reset the context collected so far
        self._context = None
        # reset any hidden state in the encoder network (relevant for RNN)
        self._context_encoder.reset(num_tasks)
Example #25
File: gmm.py Project: fangqyi/garage
    def get_p_params(self, inputs):
        log_ws_t, xz_mus_t, xz_log_sigs_t = self.get_p_xz_params(inputs)
        # (N x K), (N x K x Dx), (N x K x Dx)
        N = log_ws_t.shape[0]
        xz_sigs_t = torch.exp(xz_log_sigs_t)

        # Sample the latent code
        z_t = torch.multinomial(torch.exp(log_ws_t), num_samples=1)  # N*1

        # Choose mixture component corresponding to the latent
        mask_t = torch.eye(self._K)[z_t[:, 0]].to(tu.global_device())
        mask_t = mask_t.ge(1)  # turn into boolean
        xz_mu_t = torch.masked_select(xz_mus_t, mask_t)
        xz_sig_t = torch.masked_select(xz_sigs_t, mask_t)

        # Sample x
        x_t = xz_mu_t + xz_sig_t * torch.normal(mean=torch.zeros(
            (N, self._Dx)).to(tu.global_device()),
                                                std=1.0)

        if not self._reparameterize:
            x_t = x_t.detach().cpu().numpy()

        # log p(x|z)
        log_p_xz_t = self._create_log_gaussian(xz_mus_t, xz_log_sigs_t,
                                               x_t[:, None, :])
        # N*K

        # log p(x)
        log_p_x_t = torch.logsumexp(log_p_xz_t + log_ws_t, dim=1)
        log_p_x_t -= torch.logsumexp(log_ws_t, dim=1)

        reg_loss_t = 0
        reg_loss_t += self._reg * 0.5 * torch.mean(xz_log_sigs_t**2)
        reg_loss_t += self._reg * 0.5 * torch.mean(xz_mus_t**2)

        return log_p_x_t, reg_loss_t, x_t, log_ws_t, xz_mus_t, xz_log_sigs_t
Example #26
File: kant.py Project: fangqyi/garage
    def _sample_task_path(self, indices):
        if not hasattr(indices, '__iter__'):
            indices = [indices]

        initialized = False
        for idx in indices:
            path = self._context_replay_buffers[idx].sample_path()
            # should be replay_buffers[]
            # TODO: trim or extend batch to the same size

            context_o = path['states']
            context_a = path['actions']
            context_r = path['env_rewards']
            context_z = path['skills_onehot']
            context = np.hstack((context_o, context_a, context_r, context_z))
            if self._use_next_obs_in_context:
                context = np.hstack((context, path['next_states']))

            if not initialized:
                final_context = context[np.newaxis]
                o = path['states'][np.newaxis]
                a = path['actions'][np.newaxis]
                r = path['env_rewards'][np.newaxis]
                z = path['skills_onehot'][np.newaxis]
                no = path['next_states'][np.newaxis]
                d = path['dones'][np.newaxis]
                initialized = True
            else:
                o = np.vstack((o, path['states'][np.newaxis]))
                a = np.vstack((a, path['actions'][np.newaxis]))
                r = np.vstack((r, path['env_rewards'][np.newaxis]))
                z = np.vstack((z, path['skills_onehot'][np.newaxis]))
                no = np.vstack((no, path['next_states'][np.newaxis]))
                d = np.vstack((d, path['dones'][np.newaxis]))
                final_context = np.vstack((final_context, context[np.newaxis]))

        o = torch.as_tensor(o, device=tu.global_device()).float()
        a = torch.as_tensor(a, device=tu.global_device()).float()
        r = torch.as_tensor(r, device=tu.global_device()).float()
        z = torch.as_tensor(z, device=tu.global_device()).float()
        no = torch.as_tensor(no, device=tu.global_device()).float()
        d = torch.as_tensor(d, device=tu.global_device()).float()
        final_context = torch.as_tensor(final_context,
                                        device=tu.global_device()).float()
        if len(indices) == 1:
            final_context = final_context.unsqueeze(0)

        return o, a, r, z, no, d, final_context
Example #27
File: sac.py Project: yus-nas/garage
    def to(self, device=None):
        """Put all the networks within the model on device.

        Args:
            device (str): ID of GPU or CPU.

        """
        if device is None:
            device = tu.global_device()
        for net in self.networks:
            net.to(device)
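        # Note: .to() on a leaf tensor that requires grad would return a new
        # non-leaf tensor, so log_alpha is re-created as a fresh leaf on the
        # target device and the alpha optimizer below is rebuilt around it.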
        self.log_alpha = torch.Tensor([self._initial_log_entropy
                                       ]).to(device).requires_grad_()
        if self.use_automatic_entropy_tuning:
            self.alpha_optimizer = self._optimizer([self.log_alpha],
                                                   lr=self.policy_lr)
Example #28
File: mtsac.py Project: fangqyi/garage
    def to(self, device=None):
        """Put all the networks within the model on device.

        Args:
            device (str): ID of GPU or CPU.

        """
        super().to(device)
        if device is None:
            device = tu.global_device()
        if not self._use_automatic_entropy_tuning:
            self._log_alpha = torch.Tensor([self._fixed_alpha] *
                                           self._num_tasks).log().to(device)
        else:
            self._log_alpha = torch.Tensor(
                [self._initial_log_entropy] *
                self._num_tasks).to(device).requires_grad_()
            self._alpha_optimizer = self._optimizer([self._log_alpha],
                                                    lr=self._policy_lr)
Example #29
File: pearl.py Project: bhaprayan/garage
    def _sample_context(self, indices):
        """Sample batch of context from a list of tasks.

        Args:
            indices (list): List of task indices to sample from.

        Returns:
            torch.Tensor: Context data, with shape :math:`(X, N, C)`. X is the
                number of tasks. N is batch size. C is the combined size of
                observation, action, reward, and next observation if next
                observation is used in context. Otherwise, C is the combined
                size of observation, action, and reward.

        """
        # make method work given a single task index
        if not hasattr(indices, '__iter__'):
            indices = [indices]

        initialized = False
        for idx in indices:
            batch = self._context_replay_buffers[idx].sample_transitions(
                self._embedding_batch_size)
            o = batch['observations']
            a = batch['actions']
            r = batch['rewards']
            context = np.hstack((o, a, r))
            if self._use_next_obs_in_context:
                context = np.hstack((context, batch['next_observations']))

            if not initialized:
                final_context = context[np.newaxis]
                initialized = True
            else:
                final_context = np.vstack((final_context, context[np.newaxis]))

        final_context = torch.as_tensor(final_context,
                                        device=tu.global_device()).float()
        if len(indices) == 1:
            final_context = final_context.unsqueeze(0)

        return final_context
Example #30
    def get_action(self, obs):
        """Sample action from the policy, conditioned on the task embedding.

        Args:
            obs (torch.Tensor): Observation values, with shape :math:`(1, O)`.
                O is the size of the flattened observation space.

        Returns:
            torch.Tensor: Output action value, with shape :math:`(1, A)`.
                A is the size of the flattened action space.
            dict:
                * np.ndarray[float]: Mean of the distribution.
                * np.ndarray[float]: Log standard deviation of the
                    distribution.

        """
        z = self.z
        obs = torch.as_tensor(obs[None], device=tu.global_device()).float()
        obs_in = torch.cat([obs, z], dim=1)
        action, info = self._policy.get_action(obs_in)
        action = np.squeeze(action, axis=0)
        info['mean'] = np.squeeze(info['mean'], axis=0)
        return action, info