Example #1
    def postprocess_trajectory(self,
                               sample_batch,
                               other_agent_batches=None,
                               episode=None):
        sample_batch = super().postprocess_trajectory(sample_batch)
        return compute_gae_for_sample_batch(self, sample_batch,
                                            other_agent_batches, episode)
Example #2
def postprocess_trajectory(
    policy: Policy,
    sample_batch: SampleBatch,
    other_agent_batches: Optional[Dict[AgentID, SampleBatch]] = None,
    episode: Optional[Episode] = None,
) -> SampleBatch:
    """Postprocesses a trajectory and returns the processed trajectory.

    The trajectory contains only data from one episode and from one agent.
    - If `config.batch_mode=truncate_episodes` (default), sample_batch may
    contain a truncated (at-the-end) episode, in case the
    `config.rollout_fragment_length` was reached by the sampler.
    - If `config.batch_mode=complete_episodes`, sample_batch will contain
    exactly one episode (no matter how long).
    New columns can be added to sample_batch and existing ones may be altered.

    Args:
        policy (Policy): The Policy used to generate the trajectory
            (`sample_batch`)
        sample_batch (SampleBatch): The SampleBatch to postprocess.
        other_agent_batches (Optional[Dict[AgentID, SampleBatch]]): Optional
            dict of AgentIDs mapping to other agents' trajectory data (from the
            same episode). NOTE: The other agents use the same policy.
        episode (Optional[Episode]): Optional multi-agent episode
            object in which the agents operated.

    Returns:
        SampleBatch: The postprocessed, modified SampleBatch (or a new one).
    """
    if not policy.config["vtrace"]:
        sample_batch = compute_gae_for_sample_batch(policy, sample_batch,
                                                    other_agent_batches,
                                                    episode)

    return sample_batch
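
Note: the examples on this page all call compute_gae_for_sample_batch (from ray.rllib.evaluation.postprocessing), which fills in the batch's advantage and value-target columns. For readers unfamiliar with GAE, the recursion involved can be sketched in plain NumPy. The function below is a simplified illustration of the math (assuming GAE is enabled), not RLlib's implementation, and its name and signature are invented for this sketch; in RLlib itself, the bootstrap value for a truncated rollout comes from the policy's value function before the advantages are computed.

import numpy as np


def gae_sketch(rewards, vf_preds, last_value=0.0, gamma=0.99, lambda_=0.95):
    """Illustrative GAE: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t) and
    A_t = delta_t + gamma * lambda * A_{t+1}; value targets are A_t + V(s_t).

    `last_value` is the bootstrap value of the state after the final step
    (use 0.0 if the episode terminated there).
    """
    rewards = np.asarray(rewards, dtype=np.float64)
    values = np.append(np.asarray(vf_preds, dtype=np.float64), last_value)
    advantages = np.zeros_like(rewards)
    gae = 0.0
    # Walk the trajectory backwards, accumulating the discounted TD errors.
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        gae = delta + gamma * lambda_ * gae
        advantages[t] = gae
    value_targets = advantages + values[:-1]
    return advantages, value_targets
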
Example #3
def postprocess_ppo_gae(
        policy: Policy,
        sample_batch: SampleBatch,
        other_agent_batches: Optional[Dict[AgentID, SampleBatch]] = None,
        episode: Optional[Episode] = None) -> SampleBatch:

    return compute_gae_for_sample_batch(policy, sample_batch,
                                        other_agent_batches, episode)
Example #4
def add_advantages(policy: Policy,
                   sample_batch: SampleBatch,
                   other_agent_batches: Optional[Dict[PolicyID,
                                                      SampleBatch]] = None,
                   episode: Optional[MultiAgentEpisode] = None) -> SampleBatch:

    return compute_gae_for_sample_batch(policy, sample_batch,
                                        other_agent_batches, episode)
Example #5
    def postprocess_trajectory(
        self,
        sample_batch: SampleBatch,
        other_agent_batches: Optional[Dict[AgentID, SampleBatch]] = None,
        episode: Optional[Episode] = None,
    ):
        sample_batch = super().postprocess_trajectory(sample_batch)
        return compute_gae_for_sample_batch(self, sample_batch,
                                            other_agent_batches, episode)
Example #6
    def postprocess_trajectory(self,
                               sample_batch,
                               other_agent_batches=None,
                               episode=None):
        # Do all post-processing always with no_grad().
        # Not using this here will introduce a memory leak
        # in torch (issue #6962).
        # TODO: no_grad still necessary?
        with torch.no_grad():
            return compute_gae_for_sample_batch(self, sample_batch,
                                                other_agent_batches, episode)
Example #7
        def postprocess_trajectory(
            self,
            sample_batch: SampleBatch,
            other_agent_batches: Optional[SampleBatch] = None,
            episode: Optional["Episode"] = None,
        ):
            if not self.config["vtrace"]:
                sample_batch = compute_gae_for_sample_batch(
                    self, sample_batch, other_agent_batches, episode)

            return sample_batch
Example #8
    def test_ppo_free_log_std(self):
        """Tests the free log std option works."""
        config = (
            ppo.PPOConfig()
            .rollouts(
                num_rollout_workers=0,
            )
            .training(
                gamma=0.99,
                model=dict(
                    fcnet_hiddens=[10],
                    fcnet_activation="linear",
                    free_log_std=True,
                    vf_share_layers=True,
                ),
            )
        )

        for fw, sess in framework_iterator(config, session=True):
            trainer = ppo.PPO(config=config, env="CartPole-v0")
            policy = trainer.get_policy()

            # Check the free log std var is created.
            if fw == "torch":
                matching = [
                    v for (n, v) in policy.model.named_parameters() if "log_std" in n
                ]
            else:
                matching = [
                    v for v in policy.model.trainable_variables() if "log_std" in str(v)
                ]
            assert len(matching) == 1, matching
            log_std_var = matching[0]

            def get_value():
                if fw == "tf":
                    return policy.get_session().run(log_std_var)[0]
                elif fw == "torch":
                    return log_std_var.detach().cpu().numpy()[0]
                else:
                    return log_std_var.numpy()[0]

            # Check the variable is initially zero.
            init_std = get_value()
            assert init_std == 0.0, init_std
            batch = compute_gae_for_sample_batch(policy, FAKE_BATCH.copy())
            if fw == "torch":
                batch = policy._lazy_tensor_dict(batch)
            policy.learn_on_batch(batch)

            # Check the variable is updated.
            post_std = get_value()
            assert post_std != 0.0, post_std
            trainer.stop()
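
The FAKE_BATCH fixture used above is defined elsewhere in RLlib's PPO test module; the test runs it through compute_gae_for_sample_batch and then learn_on_batch. As a purely hypothetical stand-in (shapes and values invented for illustration, not the real fixture), a minimal single-episode batch carrying the columns the GAE postprocessing reads could be built like this:

import numpy as np
from ray.rllib.policy.sample_batch import SampleBatch

# Hypothetical stand-in for the FAKE_BATCH fixture: a three-step trajectory
# with the columns the GAE postprocessing reads (rewards, dones, value
# predictions). All values and shapes here are invented.
fake_batch = SampleBatch({
    SampleBatch.OBS: np.random.random((3, 4)).astype(np.float32),
    SampleBatch.ACTIONS: np.array([0, 1, 1]),
    SampleBatch.REWARDS: np.array([1.0, -1.0, 0.5], dtype=np.float32),
    SampleBatch.DONES: np.array([False, False, True]),
    SampleBatch.VF_PREDS: np.array([0.5, 0.6, 0.7], dtype=np.float32),
})
# Because the last step is marked done, no bootstrap value is needed, and
# compute_gae_for_sample_batch(policy, fake_batch) can fill in the
# Postprocessing.ADVANTAGES and Postprocessing.VALUE_TARGETS columns.
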
Example #9
def postprocess_advantages(policy,
                           sample_batch,
                           other_agent_batches=None,
                           episode=None):

    # Stub serving backward compatibility.
    deprecation_warning(
        old="rllib.agents.a3c.a3c_tf_policy.postprocess_advantages",
        new="rllib.evaluation.postprocessing.compute_gae_for_sample_batch",
        error=False)

    return compute_gae_for_sample_batch(policy, sample_batch,
                                        other_agent_batches, episode)
Example #10
    def postprocess_trajectory(self,
                               sample_batch,
                               other_agent_batches=None,
                               episode=None):
        '''
        Calculate GAE in postprocess
        '''
        with torch.no_grad():
            # Call super's postprocess_trajectory first.
            # sample_batch = super().postprocess_trajectory(
            #     sample_batch, other_agent_batches, episode)
            return compute_gae_for_sample_batch(self, sample_batch,
                                                other_agent_batches, episode)
Example #11
def postprocess_ppo_gae(
        policy: Policy,
        sample_batch: SampleBatch,
        other_agent_batches: Optional[Dict[AgentID, SampleBatch]] = None,
        episode: Optional[MultiAgentEpisode] = None) -> SampleBatch:

    # Stub serving backward compatibility.
    deprecation_warning(
        old="rllib.agents.ppo.ppo_tf_policy.postprocess_ppo_gae",
        new="rllib.evaluation.postprocessing.compute_gae_for_sample_batch",
        error=False)

    return compute_gae_for_sample_batch(policy, sample_batch,
                                        other_agent_batches, episode)
Example #12
def add_advantages(policy: Policy,
                   sample_batch: SampleBatch,
                   other_agent_batches: Optional[Dict[PolicyID,
                                                      SampleBatch]] = None,
                   episode: Optional[MultiAgentEpisode] = None) -> SampleBatch:

    # Stub serving backward compatibility.
    deprecation_warning(
        old="rllib.agents.a3c.a3c_torch_policy.add_advantages",
        new="rllib.evaluation.postprocessing.compute_gae_for_sample_batch",
        error=False)

    return compute_gae_for_sample_batch(policy, sample_batch,
                                        other_agent_batches, episode)
Example #13
    def test_ppo_free_log_std(self):
        """Tests the free log std option works."""
        config = copy.deepcopy(ppo.DEFAULT_CONFIG)
        config["num_workers"] = 0  # Run locally.
        config["gamma"] = 0.99
        config["model"]["fcnet_hiddens"] = [10]
        config["model"]["fcnet_activation"] = "linear"
        config["model"]["free_log_std"] = True
        config["model"]["vf_share_layers"] = True

        for fw, sess in framework_iterator(config, session=True):
            trainer = ppo.PPOTrainer(config=config, env="CartPole-v0")
            policy = trainer.get_policy()

            # Check the free log std var is created.
            if fw == "torch":
                matching = [
                    v for (n, v) in policy.model.named_parameters()
                    if "log_std" in n
                ]
            else:
                matching = [
                    v for v in policy.model.trainable_variables()
                    if "log_std" in str(v)
                ]
            assert len(matching) == 1, matching
            log_std_var = matching[0]

            def get_value():
                if fw == "tf":
                    return policy.get_session().run(log_std_var)[0]
                elif fw == "torch":
                    return log_std_var.detach().cpu().numpy()[0]
                else:
                    return log_std_var.numpy()[0]

            # Check the variable is initially zero.
            init_std = get_value()
            assert init_std == 0.0, init_std
            batch = compute_gae_for_sample_batch(policy, FAKE_BATCH.copy())
            if fw == "torch":
                batch = policy._lazy_tensor_dict(batch)
            policy.learn_on_batch(batch)

            # Check the variable is updated.
            post_std = get_value()
            assert post_std != 0.0, post_std
            trainer.stop()
Example #14
    def postprocess_trajectory(
        self,
        sample_batch: SampleBatch,
        other_agent_batches: Optional[Dict[Any, SampleBatch]] = None,
        episode: Optional["Episode"] = None,
    ):
        # Call super's postprocess_trajectory first.
        sample_batch = super().postprocess_trajectory(sample_batch,
                                                      other_agent_batches,
                                                      episode)
        if not self.config["vtrace"]:
            # Do all post-processing always with no_grad().
            # Not using this here will introduce a memory leak
            # in torch (issue #6962).
            with torch.no_grad():
                sample_batch = compute_gae_for_sample_batch(
                    self, sample_batch, other_agent_batches, episode)
        return sample_batch
Example #15
    def test_ppo_loss_function(self):
        """Tests the PPO loss function math."""
        config = copy.deepcopy(ppo.DEFAULT_CONFIG)
        config["num_workers"] = 0  # Run locally.
        config["gamma"] = 0.99
        config["model"]["fcnet_hiddens"] = [10]
        config["model"]["fcnet_activation"] = "linear"
        config["model"]["vf_share_layers"] = True

        for fw, sess in framework_iterator(config, session=True):
            trainer = ppo.PPOTrainer(config=config, env="CartPole-v0")
            policy = trainer.get_policy()

            # Check no free log std var by default.
            if fw == "torch":
                matching = [
                    v for (n, v) in policy.model.named_parameters()
                    if "log_std" in n
                ]
            else:
                matching = [
                    v for v in policy.model.trainable_variables()
                    if "log_std" in str(v)
                ]
            assert len(matching) == 0, matching

            # Post-process (calculate simple (non-GAE) advantages) and attach
            # to train_batch dict.
            # A = [0.99^2 * 0.5 + 0.99 * -1.0 + 1.0, 0.99 * 0.5 - 1.0, 0.5] =
            # [0.50005, -0.505, 0.5]
            train_batch = compute_gae_for_sample_batch(policy,
                                                       FAKE_BATCH.copy())
            if fw == "torch":
                train_batch = policy._lazy_tensor_dict(train_batch)

            # Check Advantage values.
            check(train_batch[Postprocessing.VALUE_TARGETS],
                  [0.50005, -0.505, 0.5])

            # Calculate actual PPO loss.
            if fw in ["tf2", "tfe"]:
                ppo_surrogate_loss_tf(policy, policy.model, Categorical,
                                      train_batch)
            elif fw == "torch":
                ppo_surrogate_loss_torch(policy, policy.model,
                                         TorchCategorical, train_batch)

            vars = policy.model.variables() if fw != "torch" else \
                list(policy.model.parameters())
            if fw == "tf":
                vars = policy.get_session().run(vars)
            expected_shared_out = fc(train_batch[SampleBatch.CUR_OBS],
                                     vars[0 if fw != "torch" else 2],
                                     vars[1 if fw != "torch" else 3],
                                     framework=fw)
            expected_logits = fc(expected_shared_out,
                                 vars[2 if fw != "torch" else 0],
                                 vars[3 if fw != "torch" else 1],
                                 framework=fw)
            expected_value_outs = fc(expected_shared_out,
                                     vars[4],
                                     vars[5],
                                     framework=fw)

            kl, entropy, pg_loss, vf_loss, overall_loss = \
                self._ppo_loss_helper(
                    policy, policy.model,
                    Categorical if fw != "torch" else TorchCategorical,
                    train_batch,
                    expected_logits, expected_value_outs,
                    sess=sess
                )
            if sess:
                policy_sess = policy.get_session()
                k, e, pl, v, tl = policy_sess.run(
                    [
                        policy._mean_kl,
                        policy._mean_entropy,
                        policy._mean_policy_loss,
                        policy._mean_vf_loss,
                        policy._total_loss,
                    ],
                    feed_dict=policy._get_loss_inputs_dict(train_batch,
                                                           shuffle=False))
                check(k, kl)
                check(e, entropy)
                check(pl, np.mean(-pg_loss))
                check(v, np.mean(vf_loss), decimals=4)
                check(tl, overall_loss, decimals=4)
            else:
                check(policy._mean_kl, kl)
                check(policy._mean_entropy, entropy)
                check(policy._mean_policy_loss, np.mean(-pg_loss))
                check(policy._mean_vf_loss, np.mean(vf_loss), decimals=4)
                check(policy._total_loss, overall_loss, decimals=4)
            trainer.stop()
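
For reference, the hand-computed advantage values in the comment above ([0.50005, -0.505, 0.5]) are just the discounted returns of the rewards [1.0, -1.0, 0.5] implied by that comment, with gamma = 0.99. A small stand-alone check of the arithmetic:

import numpy as np

# Discounted returns R_t = r_t + gamma * R_{t+1}, computed backwards,
# reproduce the value targets checked above.
rewards = np.array([1.0, -1.0, 0.5])
gamma = 0.99
returns = np.zeros_like(rewards)
running = 0.0
for t in reversed(range(len(rewards))):
    running = rewards[t] + gamma * running
    returns[t] = running
print(returns)  # -> approximately [0.50005, -0.505, 0.5]
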
Example #16
    def learn_on_batch(self, train_batch):
        # print(type(train_batch))
        # Turn the values into tensors
        # train_batch_tensor = self._lazy_tensor_dict(train_batch)
        # train_batch_tensor = train_batch_tensor
        # restore_original_dimensions()
        # print(train_batch_tensor.keys())
        # update the skill dynamics

        # Set Model to train mode.
        if self.model:
            self.model.train()
        if self.dynamics:
            self.dynamics.train()

        stats = defaultdict(int)
        if self.use_dynamics:
            c = 0
            for ep in range(self.dynamics_epochs):
                for mb in minibatches(
                        train_batch, self.minibatch_size
                ):  # minibatches(train_batch.copy(), self.minibatch_size)
                    c += 1
                    mb["is_training"] = True
                    minibatch = self._lazy_tensor_dict(mb)

                    obs = _unpack_obs(minibatch['obs'],
                                      self.model.options['orig_obs_space'],
                                      torch)
                    next_obs = _unpack_obs(
                        minibatch['new_obs'],
                        self.model.options['orig_obs_space'], torch)
                    dynamics_obs = obs['dynamics_obs']
                    next_dynamics_obs = next_obs['dynamics_obs'] - obs[
                        'dynamics_obs']
                    z = obs['z']

                    log_prob = self.dynamics.get_log_prob(dynamics_obs,
                                                          z,
                                                          next_dynamics_obs,
                                                          training=True)
                    dynamics_loss = -torch.mean(log_prob)
                    orth_loss = self.dynamics.orthogonal_regularization()
                    l2_loss = self.dynamics.l2_regularization()
                    if self.config['dynamics_orth_reg']:
                        dynamics_loss += orth_loss
                    if self.config['dynamics_l2_reg'] and not self.config[
                            'dynamics_spectral_norm']:
                        dynamics_loss += l2_loss
                    self.dynamics_opt.zero_grad()
                    dynamics_loss.backward()
                    if self.config['grad_clip']:
                        grad_norm = nn.utils.clip_grad_norm_(
                            self.dynamics.parameters(),
                            self.config['grad_clip'])
                    self.dynamics_opt.step()
                    stats['dynamics_loss'] += dynamics_loss.item()
                    stats['orth_loss'] += orth_loss.item()
                    stats['l2_loss'] += l2_loss.item()
            stats['dynamics_loss'] /= c
            stats['orth_loss'] /= c
            stats['l2_loss'] /= c

            self.dynamics.eval()
            # compute intrinsic reward
            with torch.no_grad():
                batch = self._lazy_tensor_dict(train_batch)
                obs = _unpack_obs(batch['obs'],
                                  self.model.options['orig_obs_space'], torch)
                next_obs = _unpack_obs(batch['new_obs'],
                                       self.model.options['orig_obs_space'],
                                       torch)
                z = obs['z']
                dynamics_obs = obs['dynamics_obs']
                next_dynamics_obs = next_obs['dynamics_obs'] - obs[
                    'dynamics_obs']

                dads_reward, info = self.dynamics.compute_reward(
                    dynamics_obs, z, next_dynamics_obs)
                dads_reward = self.config[
                    'dads_reward_scale'] * dads_reward.numpy()
                # # replace the reward column in train_batch
                # print(train_batch['rewards'].shape)
                train_batch['rewards'] = dads_reward
                stats['avg_dads_reward'] = dads_reward.mean()
                stats['num_skills_higher_prob'] = info['num_higher_prob']

        # calculate GAE for dads reward here?
        trajs = train_batch.split_by_episode()
        processed_trajs = []
        for traj in trajs:
            processed_trajs.append(compute_gae_for_sample_batch(self, traj))
        batch = SampleBatch.concat_samples(processed_trajs)

        # train_batch = compute_gae_for_sample_batch(self, self._lazy_numpy_dict(train_batch))
        # train_batch = self._lazy_tensor_dict(train_batch)
        # update agent using RL algo
        # split to minibatches
        c = 0
        for ep in range(self.ppo_epochs):
            # batch.shuffle()
            for mb in minibatches(batch, self.minibatch_size):
                c += 1
                mb["is_training"] = True
                # minibatch = mb.copy()
                mb['advantages'] = standardize(mb['advantages'])
                minibatch = self._lazy_tensor_dict(mb)
                # compute the loss
                loss_out = ppo_surrogate_loss(self, self.model,
                                              self.dist_class, minibatch)
                # compute gradient
                self.ppo_opt.zero_grad()
                # the learning_rate is already used in ppo_surrogate_loss
                loss_out.backward()
                # grad norm
                if self.config['grad_clip']:
                    grad_norm = nn.utils.clip_grad_norm_(
                        self.model.parameters(), self.config['grad_clip'])
                self.ppo_opt.step()
                # log stats
                stats['ppo_loss'] += loss_out.item()
        stats['ppo_loss'] /= c
        # add more info about the loss
        stats.update(kl_and_loss_stats(self, train_batch))

        #  {
        #     "loss": loss_out.item(),
        #     'test': 1
        #     # "grad_norm": grad_norm
        #     # if isinstance(grad_norm, float) else grad_norm.item(),
        # }
        return {LEARNER_STATS_KEY: stats}