Example #1
def all_steps(policy,
              qf1,
              qf2,
              env,
              num_curriculum_eps=1,
              curr_k=10,
              curr_beta=0.9,
              curr_ep_length=300,
              bootstrap_value=True,
              use_cuda=True):
    if use_cuda:
        device = 'cuda'
    else:
        device = 'cpu'
    logger.log('\nStarting adaptive curriculum.\n')
    p = {}
    det_policy = MakeDeterministic(policy)
    for k, v in env.curr_grid.items():
        capabilities = []
        for i, init in enumerate(v):
            logger.log('Initialization - Curriculum {}. Iter {} -> {}'.format(
                k, i, init))
            # Accumulate one capability estimate per evaluation episode
            accum_c = []
            for e in range(num_curriculum_eps):
                o, d, ep_ret, ep_len = env.reset(curr_init=init,
                                                 init_strategy='adaptative',
                                                 curr_var=k), False, 0, 0
                c = 0
                while not (d or (ep_len == curr_ep_length)):
                    # Take deterministic actions at test time
                    a, _ = det_policy.get_action(o)
                    o, r, d, _ = env.step(a)
                    if bootstrap_value:
                        # Bootstrap the capability with a value estimate
                        obs_t = torch.Tensor(o).to(device)
                        dist = policy(obs_t.view(1, -1))
                        new_obs_actions, _ = dist.rsample_and_logprob()
                        q_new_actions = torch.min(
                            qf1(obs_t.view(1, -1), new_obs_actions),
                            qf2(obs_t.view(1, -1), new_obs_actions),
                        )
                        # Estimate the state value as the mean of the min-Q
                        v = q_new_actions.mean()
                        if use_cuda:
                            c += v.detach().cpu().numpy()
                        else:
                            c += v.detach().numpy()
                    else:
                        # Uses returns instead
                        ep_ret += r
                        c = ep_ret
                    ep_len += 1
                accum_c.append(c)
            capabilities.append(np.mean(accum_c))
        # Weight each initialization by how close its normalized capability is
        # to the target fraction curr_beta, then normalize to probabilities
        max_capability = np.max(capabilities)
        f = np.exp(-curr_k *
                   np.abs(np.array(capabilities) / max_capability - curr_beta))
        p[k] = f / f.sum()
    return p
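
The dictionary returned above maps each curriculum variable to a probability vector over the candidate initializations in env.curr_grid, weighting each initialization by how close its estimated capability is to the fraction curr_beta of the best one. Below is a minimal usage sketch; it assumes policy, qf1, qf2 and env already exist (e.g. from a SAC setup), and the sample_curriculum_inits helper is hypothetical, not part of the original code.

import numpy as np

# all_steps returns {curr_var: probabilities over env.curr_grid[curr_var]}
p = all_steps(policy, qf1, qf2, env,
              num_curriculum_eps=3,   # average capability over 3 eval episodes
              curr_k=10,
              curr_beta=0.9,
              use_cuda=False)

# Hypothetical helper: draw one initialization per curriculum variable
# according to the adaptive probabilities.
def sample_curriculum_inits(env, p, rng=None):
    rng = rng or np.random.default_rng()
    inits = {}
    for k, probs in p.items():
        idx = rng.choice(len(env.curr_grid[k]), p=probs)
        inits[k] = env.curr_grid[k][idx]
    return inits

print(sample_curriculum_inits(env, p))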
Example #2
def fill_buffer(buffer,
                meta_env,
                expert,
                expert_policy_specs,
                task_params_sampler,
                num_rollouts_per_task,
                max_path_length,
                no_terminal=False,
                policy_is_scripted=False,
                render=False,
                check_for_success=False,
                wrap_absorbing=False,
                subsample_factor=1,
                deterministic=True):
    expert_uses_pixels = expert_policy_specs['policy_uses_pixels']
    expert_uses_task_params = expert_policy_specs['policy_uses_task_params']
    # Older expert specs may not define this key, so default to False.
    concat_task_params_to_policy_obs = expert_policy_specs.get(
        'concat_task_params_to_policy_obs', False)

    # this is something for debugging few shot fetch demos
    # first_complete_list = []

    for task_params, obs_task_params in task_params_sampler:
        # print('Doing Task {}...'.format(task_params))

        debug_stats = []
        meta_env.reset(task_params=task_params,
                       obs_task_params=obs_task_params)
        task_id = meta_env.task_identifier

        num_rollouts_completed = 0
        while num_rollouts_completed < num_rollouts_per_task:
            cur_rollout_rewards = 0
            print('\tRollout %d...' % num_rollouts_completed)
            cur_path_builder = PathBuilder()

            observation = meta_env.reset(task_params=task_params,
                                         obs_task_params=obs_task_params)
            if policy_is_scripted:
                policy = expert
                policy.reset(meta_env)
            else:
                if isinstance(meta_env, AntLinearClassifierEnv):
                    policy = expert.get_exploration_policy(
                        meta_env.targets[meta_env.true_label])
                    # print(meta_env.true_label)
                    if deterministic: policy.deterministic = True
                elif isinstance(meta_env, Walker2DRandomDynamicsEnv):
                    # print('WalkerEnv')
                    policy = expert.get_exploration_policy(obs_task_params)
                    if deterministic:
                        # print('deterministic')
                        policy = MakeDeterministic(policy)
                else:
                    policy = expert.get_exploration_policy(obs_task_params)
                    if deterministic: policy.deterministic = True
            terminal = False

            subsample_mod = randint(0, subsample_factor - 1)
            step_num = 0

            rollout_debug = []
            while (not terminal) and step_num < max_path_length:
                if render: meta_env.render()
                if isinstance(meta_env.observation_space, Dict):
                    if expert_uses_pixels:
                        agent_obs = observation['pixels']
                    else:
                        agent_obs = observation['obs']
                        if isinstance(meta_env, AntLinearClassifierEnv):
                            if meta_env.use_relative_pos:
                                agent_obs = np.concatenate([
                                    agent_obs[:-12],
                                    meta_env.get_body_com("torso").flat
                                ]).copy()
                            else:
                                agent_obs = agent_obs[:-12]
                else:
                    agent_obs = observation
                if expert_uses_task_params:
                    if concat_task_params_to_policy_obs:
                        agent_obs = np.concatenate(
                            (agent_obs, obs_task_params), -1)
                    # else:
                    # agent_obs = {'obs': agent_obs, 'obs_task_params': obs_task_params}

                if policy_is_scripted:
                    action, agent_info = policy.get_action(
                        agent_obs, meta_env, len(cur_path_builder))
                else:
                    action, agent_info = policy.get_action(agent_obs)

                next_ob, raw_reward, terminal, env_info = (
                    meta_env.step(action))
                # raw_reward = -1.0 * env_info['run_cost']
                # raw_reward = env_info['vel']
                cur_rollout_rewards += raw_reward
                # if step_num < 200: cur_rollout_rewards += raw_reward

                # rollout_debug.append(env_info['l2_dist'])

                if no_terminal: terminal = False
                if wrap_absorbing:
                    terminal_array = np.array([False])
                else:
                    terminal_array = np.array([terminal])

                reward = raw_reward
                reward = np.array([reward])

                if step_num % subsample_factor == subsample_mod:
                    cur_path_builder.add_all(observations=observation,
                                             actions=action,
                                             rewards=reward,
                                             next_observations=next_ob,
                                             terminals=terminal_array,
                                             absorbing=np.array([0.0, 0.0]),
                                             agent_infos=agent_info,
                                             env_infos=env_info)
                observation = next_ob
                step_num += 1

            if terminal and wrap_absorbing:
                '''
                If we wrap absorbing states, two additional transitions
                must be added: (s_T, s_abs) and (s_abs, s_abs). In the
                Discriminator-Actor-Critic paper, s_abs is a vector of
                zeros with the last dim set to 1. Here we add:
                ([next_ob, 0], random_action, [next_ob, 1]) and
                ([next_ob, 1], random_action, [next_ob, 1]),
                so we can handle varying types of terminal states.
                '''
                # next_ob is the absorbing state
                # for now we just reuse the last action rather than
                # sampling a random one
                cur_path_builder.add_all(
                    observations=next_ob,
                    actions=action,
                    # the reward doesn't matter
                    rewards=0.0,
                    next_observations=next_ob,
                    terminals=np.array([False]),
                    absorbing=np.array([0.0, 1.0]),
                    agent_infos=agent_info,
                    env_infos=env_info)
                cur_path_builder.add_all(
                    observations=next_ob,
                    actions=action,
                    # the reward doesn't matter
                    rewards=0.0,
                    next_observations=next_ob,
                    terminals=np.array([False]),
                    absorbing=np.array([1.0, 1.0]),
                    agent_infos=agent_info,
                    env_infos=env_info)

            # if necessary check if it was successful
            if check_for_success:
                was_successful = np.sum([
                    e_info['is_success']
                    for e_info in cur_path_builder['env_infos']
                ]) > 0
                if was_successful:
                    print('\t\tSuccessful')
                else:
                    print('\t\tNot Successful')
            if (check_for_success
                    and was_successful) or (not check_for_success):
                for timestep in range(len(cur_path_builder)):
                    buffer.add_sample(
                        cur_path_builder['observations'][timestep],
                        cur_path_builder['actions'][timestep],
                        cur_path_builder['rewards'][timestep],
                        cur_path_builder['terminals'][timestep],
                        cur_path_builder['next_observations'][timestep],
                        task_id,
                        agent_info=cur_path_builder['agent_infos'][timestep],
                        env_info=cur_path_builder['env_infos'][timestep],
                        absorbing=cur_path_builder['absorbing'][timestep])
                buffer.terminate_episode(task_id)
                num_rollouts_completed += 1
                print('\t\tReturn: %.2f' % (cur_rollout_rewards))
                debug_stats.append(cur_rollout_rewards)

                # print('Min L2: %.3f' % np.min(rollout_debug))

            # print(policy.first_time_all_complete)
            # first_complete_list.append(expert_policy.first_time_all_complete)
    # print(np.histogram(first_complete_list, bins=100))
        print('%.1f +/- %.1f' % (np.mean(debug_stats), np.std(debug_stats)))
        print('\n\n')
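
A minimal call sketch for fill_buffer follows. The buffer, meta-environment, expert, and task-params sampler are assumed to come from the surrounding rlkit-style codebase; every concrete name and value below is an illustrative assumption rather than part of the original snippet.

# Assumed to be provided by the surrounding codebase:
#   demo_buffer          - replay buffer with add_sample(...) / terminate_episode(task_id)
#   meta_env             - meta-RL env exposing reset(task_params=..., obs_task_params=...)
#   expert               - object providing get_exploration_policy(obs_task_params)
#   task_params_sampler  - iterable of (task_params, obs_task_params) pairs
expert_policy_specs = {
    'policy_uses_pixels': False,
    'policy_uses_task_params': True,
    'concat_task_params_to_policy_obs': True,
}

fill_buffer(
    demo_buffer,
    meta_env,
    expert,
    expert_policy_specs,
    task_params_sampler,
    num_rollouts_per_task=10,
    max_path_length=100,
    subsample_factor=4,        # keep every 4th transition of each rollout
    check_for_success=True,    # only keep rollouts that reached the goal
    deterministic=True,
)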
Example #3
class SmacAgent(nn.Module):
    def __init__(self,
                 latent_dim,
                 context_encoder,
                 policy,
                 reward_predictor,
                 use_next_obs_in_context=False,
                 _debug_ignore_context=False,
                 _debug_do_not_sqrt=False,
                 _debug_use_ground_truth_context=False):
        super().__init__()
        self.latent_dim = latent_dim

        self.context_encoder = context_encoder
        self.policy = policy
        self.reward_predictor = reward_predictor
        self.deterministic_policy = MakeDeterministic(self.policy)
        self._debug_ignore_context = _debug_ignore_context
        self._debug_use_ground_truth_context = _debug_use_ground_truth_context

        # self.recurrent = kwargs['recurrent']
        # self.use_ib = kwargs['use_information_bottleneck']
        # self.sparse_rewards = kwargs['sparse_rewards']
        self.use_next_obs_in_context = use_next_obs_in_context

        # initialize buffers for z dist and z
        # use buffers so latent context can be saved along with model weights
        self.register_buffer('z', torch.zeros(1, latent_dim))
        self.register_buffer('z_means', torch.zeros(1, latent_dim))
        self.register_buffer('z_vars', torch.zeros(1, latent_dim))

        # Assigning None to a registered buffer is allowed; clear_z()
        # re-initializes the posterior parameters during rollouts.
        self.z_means = None
        self.z_vars = None
        self.context = None
        self.z = None

        # rp = reward predictor
        # TODO: add back in reward predictor code
        self.z_means_rp = None
        self.z_vars_rp = None
        self.z_rp = None
        self.context_encoder_rp = context_encoder
        self._use_context_encoder_snapshot_for_reward_pred = False

        self.latent_prior = torch.distributions.Normal(
            ptu.zeros(self.latent_dim), ptu.ones(self.latent_dim))

        self._debug_do_not_sqrt = _debug_do_not_sqrt

    def clear_z(self, num_tasks=1):
        '''
        reset q(z|c) to the prior
        '''
        # reset the distribution over z to the prior
        mu = ptu.zeros(num_tasks, self.latent_dim)
        var = ptu.ones(num_tasks, self.latent_dim)
        self.z_means = mu
        self.z_vars = var

    @property
    def use_context_encoder_snapshot_for_reward_pred(self):
        return self._use_context_encoder_snapshot_for_reward_pred

    @use_context_encoder_snapshot_for_reward_pred.setter
    def use_context_encoder_snapshot_for_reward_pred(self, value):
        if value and not self.use_context_encoder_snapshot_for_reward_pred:
            # copy context encoder on switch
            self.context_encoder_rp = copy.deepcopy(self.context_encoder)
            self.context_encoder_rp.to(ptu.device)
            self.reward_predictor = copy.deepcopy(self.reward_predictor)
            self.reward_predictor.to(ptu.device)
        self._use_context_encoder_snapshot_for_reward_pred = value

    def detach_z(self):
        ''' disable backprop through z '''
        self.z = self.z.detach()
        # self.recurrent is only set for recurrent encoders (see the commented
        # kwargs in __init__), so default to False when it is absent.
        recurrent = getattr(self, 'recurrent', False)
        if recurrent:
            self.context_encoder.hidden = self.context_encoder.hidden.detach()

        self.z_rp = self.z_rp.detach()
        if recurrent:
            self.context_encoder_rp.hidden = (
                self.context_encoder_rp.hidden.detach())

    def update_context(self, context, inputs):
        ''' append single transition to the current context '''
        if self._debug_use_ground_truth_context:
            return context
        o, a, r, no, d, info = inputs
        o = ptu.from_numpy(o[None, None, ...])
        a = ptu.from_numpy(a[None, None, ...])
        r = ptu.from_numpy(np.array([r])[None, None, ...])
        no = ptu.from_numpy(no[None, None, ...])

        if self.use_next_obs_in_context:
            data = torch.cat([o, a, r, no], dim=2)
        else:
            data = torch.cat([o, a, r], dim=2)
        if context is None:
            context = data
        else:
            try:
                context = torch.cat([context, data], dim=1)
            except Exception as e:
                import ipdb
                ipdb.set_trace()
        return context

    def compute_kl_div(self):
        ''' compute KL( q(z|c) || r(z) ) '''
        prior = torch.distributions.Normal(ptu.zeros(self.latent_dim),
                                           ptu.ones(self.latent_dim))
        posteriors = [
            torch.distributions.Normal(mu, torch.sqrt(var)) for mu, var in zip(
                torch.unbind(self.z_means), torch.unbind(self.z_vars))
        ]
        kl_divs = [
            torch.distributions.kl.kl_divergence(post, prior)
            for post in posteriors
        ]
        kl_div_sum = torch.sum(torch.stack(kl_divs))
        return kl_div_sum

    def batched_latent_prior(self, batch_size):
        return torch.distributions.Normal(
            ptu.zeros(batch_size, self.latent_dim),
            ptu.ones(batch_size, self.latent_dim))

    def latent_posterior(self,
                         context,
                         squeeze=False,
                         for_reward_prediction=False):
        ''' compute q(z|c) from the input context and return the posterior distribution '''
        if isinstance(context, np.ndarray):
            context = ptu.from_numpy(context)
        if self._debug_use_ground_truth_context:
            if squeeze:
                context = context.squeeze(dim=0)
            return Delta(context)
        if for_reward_prediction:
            context_encoder = self.context_encoder_rp
        else:
            context_encoder = self.context_encoder
        params = context_encoder(context)
        params = params.view(context.size(0), -1, context_encoder.output_size)
        mu = params[..., :self.latent_dim]
        sigma_squared = F.softplus(params[..., self.latent_dim:])
        z_params = [
            _product_of_gaussians(m, s)
            for m, s in zip(torch.unbind(mu), torch.unbind(sigma_squared))
        ]
        z_means = torch.stack([p[0] for p in z_params])
        z_vars = torch.stack([p[1] for p in z_params])
        if squeeze:
            z_means = z_means.squeeze(dim=0)
            z_vars = z_vars.squeeze(dim=0)
        if self._debug_do_not_sqrt:
            return torch.distributions.Normal(z_means, z_vars)
        else:
            return torch.distributions.Normal(z_means, torch.sqrt(z_vars))

    def get_action(self, obs, z, deterministic=False):
        ''' sample action from the policy, conditioned on the task embedding '''
        obs = ptu.from_numpy(obs[None])
        if self._debug_ignore_context:
            z = ptu.from_numpy(z[None]) * 0
        else:
            z = ptu.from_numpy(z[None])
        if len(obs.shape) != len(z.shape):
            import ipdb
            ipdb.set_trace()
        in_ = torch.cat([obs, z], dim=1)[0]
        if deterministic:
            return self.deterministic_policy.get_action(in_)
        else:
            return self.policy.get_action(in_)

    def set_num_steps_total(self, n):
        self.policy.set_num_steps_total(n)

    def forward(
        self,
        obs,
        context,
        return_task_z=False,
        return_latent_posterior=False,
        return_latent_posterior_and_task_z=False,
    ):
        ''' given context, return the current policy's action distribution for a batch of observations '''
        context_distrib = self.latent_posterior(context)
        task_z = context_distrib.rsample()

        t, b, _ = obs.size()
        obs = obs.view(t * b, -1)
        task_z = [z.repeat(b, 1) for z in task_z]
        task_z = torch.cat(task_z, dim=0)

        # run policy, get log probs and new actions
        in_ = torch.cat([obs, task_z.detach()], dim=1)
        action_distribution = self.policy(in_)
        # policy_outputs = self.policy(in_, reparameterize=True, return_log_prob=True)
        if return_latent_posterior_and_task_z:
            return action_distribution, context_distrib, task_z
        if return_latent_posterior:
            return action_distribution, context_distrib
        if return_task_z:
            return action_distribution, task_z
        else:
            return action_distribution

        # return policy_outputs, task_z

    def infer_reward(self, obs, action, z):
        obs = ptu.from_numpy(obs[None])
        action = ptu.from_numpy(action[None])
        z = ptu.from_numpy(z[None])
        reward = self.reward_predictor(obs, action, z)
        return ptu.get_numpy(reward)[0]

    def log_diagnostics(self, eval_statistics):
        '''
        adds logging data about encodings to eval_statistics
        '''
        z_mean = np.mean(np.abs(ptu.get_numpy(self.z_means[0])))
        z_sig = np.mean(ptu.get_numpy(self.z_vars[0]))
        eval_statistics['Z mean eval'] = z_mean
        eval_statistics['Z variance eval'] = z_sig

        # z_mean_rp = np.mean(np.abs(ptu.get_numpy(self.z_means_rp[0])))
        # z_sig_rp = np.mean(ptu.get_numpy(self.z_vars_rp[0]))
        # eval_statistics['Z rew-pred mean eval'] = z_mean_rp
        # eval_statistics['Z rew-pred variance eval'] = z_sig_rp

    @property
    def networks(self):
        if self.context_encoder is self.context_encoder_rp:
            return [self.context_encoder, self.policy]
        else:
            return [self.context_encoder, self.context_encoder_rp, self.policy]
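
A minimal rollout sketch showing how the agent's belief over the task variable z is updated online. The environment, the encoder/policy/reward-predictor networks, and max_path_length are assumed to come from the surrounding SMAC/PEARL-style training code; the loop below is an illustration under those assumptions, not the original training loop.

agent = SmacAgent(
    latent_dim=5,
    context_encoder=context_encoder,     # maps (o, a, r[, o']) to 2 * latent_dim params
    policy=policy,                       # conditioned on [obs, z]
    reward_predictor=reward_predictor,
)

agent.clear_z()          # reset q(z|c) to the prior
context = None
o = env.reset()
for _ in range(max_path_length):
    # Sample z from the prior until we have context, then from q(z|c).
    if context is None:
        z = ptu.get_numpy(agent.latent_prior.sample())
    else:
        z = ptu.get_numpy(
            agent.latent_posterior(context, squeeze=True).sample())
    a, _ = agent.get_action(o, z, deterministic=True)
    next_o, r, d, info = env.step(a)
    context = agent.update_context(context, (o, a, r, next_o, d, info))
    o = next_o
    if d:
        break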