Example #1
def _load_all_demos(self):
    num_demos_by_round = []
    for round_num in range(self._last_loaded_round + 1,
                           self.round_num + 1):
        round_dir = self._demo_dir_path_for_round(round_num)
        demo_paths = self._get_demo_paths(round_dir)
        self._all_demos.extend(_load_trajectory(p) for p in demo_paths)
        num_demos_by_round.append(len(demo_paths))
    tf.logging.info(f"Loaded {len(self._all_demos)} total")
    demo_transitions = rollout.flatten_trajectories(self._all_demos)
    return demo_transitions, num_demos_by_round
Example #2
def test_train_disc_improve_D(use_gail,
                              env='CartPole-v1',
                              n_timesteps=200,
                              n_steps=1000):
    trainer = init_trainer(env, use_gail=use_gail)
    obs_old, act, obs_new, _ = rollout.flatten_trajectories(
        rollout.generate(trainer.gen_policy, env, n_timesteps=n_timesteps))
    kwargs = dict(gen_old_obs=obs_old, gen_act=act, gen_new_obs=obs_new)
    loss1 = trainer.eval_disc_loss(**kwargs)
    trainer.train_disc(n_steps=n_steps, **kwargs)
    loss2 = trainer.eval_disc_loss(**kwargs)
    # Training the discriminator on this batch should reduce its loss on it.
    assert loss2 < loss1
Example #3
    def _populate_gen_replay_buffer(self) -> None:
        """Generate and store generator samples in the buffer.

    More specifically, rolls out generator-policy trajectories in the
    environment until `self._n_disc_samples_per_buffer` obs-act-obs samples are
    produced, and then stores these samples.
    """
        gen_rollouts = rollout.flatten_trajectories(
            rollout.generate(self.gen_policy,
                             self.env,
                             n_timesteps=self._n_disc_samples_per_buffer))[:3]
        self._gen_replay_buffer.store(*gen_rollouts)
Example #4
  def pop_transitions(self) -> rollout.Transitions:
    """Pops recorded transitions, returning them as an instance of Transitions.

    Raises a RuntimeError if called when `self.n_transitions == 0`.
    """
    if self.n_transitions == 0:
      # It would be better to return an empty `Transitions`, but we would need
      # to get the non-zero dimensions of every np.ndarray attribute correct to
      # avoid downstream errors. This is easier and sufficient for now.
      raise RuntimeError("Called pop_transitions on an empty BufferingWrapper")
    partial_trajs = self._finish_partial_trajectories()
    self._trajectories.extend(partial_trajs)
    transitions = rollout.flatten_trajectories(self._trajectories)
    assert len(transitions.obs) == self.n_transitions
    self._trajectories = []
    self.n_transitions = 0
    return transitions
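
For orientation, a minimal sketch of driving `pop_transitions` from outside the wrapper, assuming an existing VecEnv `venv` and policy `policy` (hypothetical names), mirroring the wrapping pattern used in Example #5:

# Minimal sketch; `venv` and `policy` are assumed to already exist.
buffered_venv = BufferingWrapper(venv)
policy.set_env(buffered_venv)                    # policy now steps through the wrapper
policy.learn(total_timesteps=200)                # transitions accumulate during rollout
transitions = buffered_venv.pop_transitions()    # flattened obs-act-obs samples
print(transitions.obs.shape)                     # one row per recorded transition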
Example #5
def adversarial_learning(
    venv,
    expert=None,
    expert_venv=None,
    expert_trajectories=None,
    state_only=False,
    policy_fn=get_ppo,
    total_timesteps=20000,
    gen_batch_size=200,
    disc_batch_size=100,
    updates_per_batch=2,
    policy_lr=1e-3,
    reward_lr=1e-3,
    is_airl=True,
    **kwargs,
):
    # Set up generator
    gen_policy = policy_fn(venv, learning_rate=policy_lr)
    policy = gen_policy

    # Set up discriminator
    if is_airl:
        rn = BasicShapedRewardNet(
            venv.observation_space,
            venv.action_space,
            theta_units=[32, 32],
            phi_units=[32, 32],
            scale=True,
            state_only=state_only,
        )
        discrim = DiscrimNetAIRL(rn, entropy_weight=1.0)
    else:
        rn = None
        discrim = DiscrimNetGAIL(venv.observation_space, venv.action_space)

    # Set up optimizer
    train_op = tf.train.AdamOptimizer(learning_rate=reward_lr).minimize(
        tf.reduce_mean(discrim.disc_loss))

    # Set up environment reward
    reward_train = functools.partial(
        discrim.reward_train, gen_log_prob_fn=gen_policy.action_probability)
    venv_train = reward_wrapper.RewardVecEnvWrapper(venv, reward_train)
    venv_train_buffering = BufferingWrapper(venv_train)
    gen_policy.set_env(venv_train_buffering)  # possibly redundant

    # Set up replay buffers
    gen_replay_buffer_capacity = 20 * gen_batch_size
    gen_replay_buffer = buffer.ReplayBuffer(gen_replay_buffer_capacity, venv)

    if expert_trajectories is not None:
        expert_transitions = flatten_trajectories(expert_trajectories)
        exp_replay_buffer = buffer.ReplayBuffer.from_data(expert_transitions)
    else:
        exp_replay_buffer = buffer.ReplayBuffer(gen_replay_buffer_capacity,
                                                venv)

    # Start training
    sess = tf.get_default_session()
    sess.run(tf.global_variables_initializer())

    num_epochs = int(np.ceil(total_timesteps / gen_batch_size))

    for epoch in range(num_epochs):
        # Train gen
        gen_policy.learn(total_timesteps=gen_batch_size,
                         reset_num_timesteps=True)
        gen_replay_buffer.store(venv_train_buffering.pop_transitions())

        if expert_trajectories is None:
            exp_replay_buffer.store(
                flatten_trajectories(
                    sample_trajectories(expert_venv,
                                        expert,
                                        n_timesteps=gen_batch_size)))

        # Train disc
        for _ in range(updates_per_batch):
            disc_minibatch_size = disc_batch_size // updates_per_batch
            half_minibatch = disc_minibatch_size // 2

            gen_samples = gen_replay_buffer.sample(half_minibatch)
            expert_samples = exp_replay_buffer.sample(half_minibatch)

            obs = np.concatenate([gen_samples.obs, expert_samples.obs])
            acts = np.concatenate([gen_samples.acts, expert_samples.acts])
            next_obs = np.concatenate(
                [gen_samples.next_obs, expert_samples.next_obs])
            # Label generator samples with 1 and expert samples with 0,
            # matching the `labels_gen_is_one_ph` placeholder below.
            labels = np.concatenate(
                [np.ones(half_minibatch),
                 np.zeros(half_minibatch)])

            log_act_prob = gen_policy.action_probability(obs,
                                                         actions=acts,
                                                         logp=True)
            log_act_prob = log_act_prob.reshape((disc_minibatch_size, ))

            _, logits_v, loss_v = sess.run(
                [
                    train_op,
                    discrim._disc_logits_gen_is_high,
                    discrim._disc_loss,
                ],
                feed_dict={
                    discrim.obs_ph: obs,
                    discrim.act_ph: acts,
                    discrim.next_obs_ph: next_obs,
                    discrim.labels_gen_is_one_ph: labels,
                    discrim.log_policy_act_prob_ph: log_act_prob,
                },
            )

    results = {}
    results["reward_model"] = rn
    results["discrim"] = discrim
    results["policy"] = gen_policy

    return results
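
For orientation, a hypothetical driver for `adversarial_learning`, assuming a VecEnv `venv` and a list of expert trajectories `expert_trajs` have already been constructed (names invented for illustration):

# Hypothetical call; `venv` and `expert_trajs` are assumed to already exist.
results = adversarial_learning(
    venv,
    expert_trajectories=expert_trajs,
    total_timesteps=20000,
    is_airl=True,
)
learned_policy = results["policy"]       # trained generator policy
reward_model = results["reward_model"]   # BasicShapedRewardNet, or None when is_airl=False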
Example #6
def test_train_disc_no_crash(use_gail, env='CartPole-v1', n_timesteps=200):
    trainer = init_trainer(env, use_gail=use_gail)
    trainer.train_disc()
    obs_old, act, obs_new, _ = rollout.flatten_trajectories(
        rollout.generate(trainer.gen_policy, env, n_timesteps=n_timesteps))
    trainer.train_disc(gen_old_obs=obs_old, gen_act=act, gen_new_obs=obs_new)