Example #1
def test_trainer_makes_progress(tmpdir, session):
  venv = util.make_vec_env(ENV_NAME, 10)
  trainer = make_trainer(tmpdir)
  with pytest.raises(dagger.NeedsDemosException):
    trainer.extend_and_update()
  assert trainer.round_num == 0
  pre_train_rew_mean = rollout.mean_return(
      trainer.bc_trainer.policy, venv, sample_until=rollout.min_episodes(20),
      deterministic_policy=True)
  # checking that the initial policy is poor can be flaky; sometimes the
  # randomly initialised policy performs very well, and it's not clear why
  # assert pre_train_rew_mean < 100
  with serialize.load_policy('ppo2', EXPERT_POLICY_PATH, venv) as expert_policy:
    for i in range(5):
      # roll out a few trajectories for dataset, then train for a few steps
      collector = trainer.get_trajectory_collector()
      for _ in range(10):
        obs = collector.reset()
        done = False
        while not done:
          (expert_action, ), _, _, _ = expert_policy.step(
              obs[None], deterministic=True)
          obs, _, done, _ = collector.step(expert_action)
      trainer.extend_and_update(n_epochs=10)
  # make sure we're doing better than a random policy would
  post_train_rew_mean = rollout.mean_return(
      trainer.bc_trainer.policy, venv, sample_until=rollout.min_episodes(20),
      deterministic_policy=True)
  assert post_train_rew_mean > 150, \
      f'pre-train mean {pre_train_rew_mean}, post-train mean ' \
      f'{post_train_rew_mean}'
Example #2
def test_density_reward(density_type, is_stationary):
    # test on Pendulum rather than Cartpole because I don't handle episodes that
    # terminate early yet (see issue #40)
    env_name = "Pendulum-v0"
    env = util.make_vec_env(env_name, 2)

    # construct density-based reward from expert rollouts
    rollout_path = "tests/data/expert_models/pendulum_0/rollouts/final.pkl"
    expert_trajectories_all = types.load(rollout_path)
    n_experts = len(expert_trajectories_all)
    expert_trajectories_train = expert_trajectories_all[: n_experts // 2]
    reward_fn = DensityReward(
        trajectories=expert_trajectories_train,
        density_type=density_type,
        kernel="gaussian",
        obs_space=env.observation_space,
        act_space=env.action_space,
        is_stationary=is_stationary,
        kernel_bandwidth=0.2,
        standardise_inputs=True,
    )

    # check that expert policy does better than a random policy under our reward
    # function
    random_policy = RandomPolicy(env.observation_space, env.action_space)
    sample_until = rollout.min_episodes(n_experts // 2)
    random_trajectories = rollout.generate_trajectories(
        random_policy, env, sample_until=sample_until
    )
    expert_trajectories_test = expert_trajectories_all[n_experts // 2 :]
    random_score = score_trajectories(random_trajectories, reward_fn)
    expert_score = score_trajectories(expert_trajectories_test, reward_fn)
    assert expert_score > random_score
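The `score_trajectories` helper is defined elsewhere in the test module. A minimal sketch consistent with the calls above (and with the 4-argument reward-function convention used in the other examples on this page) might look like this; the real helper may differ:

import numpy as np

def score_trajectories(trajectories, reward_fn):
    # Average undiscounted return of each trajectory under the learned reward.
    # Assumes imitation-style Trajectory objects: obs has length T + 1, acts length T.
    returns = []
    for traj in trajectories:
        steps = np.arange(len(traj.acts))
        rewards = reward_fn(traj.obs[:-1], traj.acts, traj.obs[1:], steps)
        returns.append(np.sum(rewards))
    return float(np.mean(returns))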
Example #3
def trainer(_algorithm_cls, _parallel: bool, tmpdir: str,
            _convert_dataset: bool):
    logger.configure(tmpdir, ["tensorboard", "stdout"])
    trajs = types.load(
        "tests/data/expert_models/cartpole_0/rollouts/final.pkl")
    if _convert_dataset:
        trans = rollout.flatten_trajectories(trajs)
        expert_data = datasets.TransitionsDictDatasetAdaptor(trans)
    else:
        expert_data = rollout.flatten_trajectories(trajs)

    venv = util.make_vec_env(
        "CartPole-v1",
        n_envs=2,
        parallel=_parallel,
        log_dir=tmpdir,
    )

    gen_policy = util.init_rl(venv, verbose=1)

    return _algorithm_cls(
        venv=venv,
        expert_data=expert_data,
        gen_policy=gen_policy,
        log_dir=tmpdir,
    )
Example #4
def test_train_disc_small_expert_data_warning(tmpdir, _parallel, _algorithm_cls):
    logger.configure(tmpdir, ["tensorboard", "stdout"])
    venv = util.make_vec_env(
        "CartPole-v1",
        n_envs=2,
        parallel=_parallel,
    )

    gen_algo = util.init_rl(venv, verbose=1)
    small_data = rollout.generate_transitions(gen_algo, venv, n_timesteps=20)

    with pytest.raises(ValueError, match="Transitions.*expert_batch_size"):
        _algorithm_cls(
            venv=venv,
            expert_data=small_data,
            expert_batch_size=21,
            gen_algo=gen_algo,
            log_dir=tmpdir,
        )

    with pytest.raises(ValueError, match="expert_batch_size.*positive"):
        _algorithm_cls(
            venv=venv,
            expert_data=small_data,
            expert_batch_size=-1,
            gen_algo=gen_algo,
            log_dir=tmpdir,
        )
Example #5
def test_reward_overwrite():
    """Test that reward wrapper actually overwrites base rewards."""
    env_name = "Pendulum-v0"
    num_envs = 3
    env = util.make_vec_env(env_name, num_envs)
    reward_fn = FunkyReward()
    wrapped_env = reward_wrapper.RewardVecEnvWrapper(env, reward_fn)
    policy = RandomPolicy(env.observation_space, env.action_space)
    sample_until = rollout.min_episodes(10)
    default_stats = rollout.rollout_stats(
        rollout.generate_trajectories(policy, env, sample_until))
    wrapped_stats = rollout.rollout_stats(
        rollout.generate_trajectories(policy, wrapped_env, sample_until))
    # Pendulum-v0 always has negative rewards
    assert default_stats["return_max"] < 0
    # ours gives between 1 * traj_len and num_envs * traj_len reward
    # (trajectories are all constant length of 200 in Pendulum)
    steps = wrapped_stats["len_mean"]
    assert wrapped_stats["return_min"] == 1 * steps
    assert wrapped_stats["return_max"] == num_envs * steps

    # check that the wrapped (original) env reward is negative (all Pendulum
    # rewards are negative) while the overwritten rewards are non-negative
    rand_act, _, _, _ = policy.step(wrapped_env.reset())
    _, rew, _, infos = wrapped_env.step(rand_act)
    assert np.all(rew >= 0)
    assert np.all([info_dict["wrapped_env_rew"] < 0 for info_dict in infos])
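`FunkyReward` is defined elsewhere in the test module. Judging from the assertions above (per-step rewards between 1 and `num_envs`, constant for each sub-environment), a compatible sketch would be:

import numpy as np

class FunkyReward:
    """Toy reward: the i-th vectorized environment earns reward i + 1 every step."""

    def __call__(self, obs, acts, next_obs, steps):
        # One reward per environment in the batch, independent of the transition.
        return np.arange(1, len(obs) + 1)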
Example #6
def test_actions_valid(env_name, policy_type):
    """Test output actions of our custom policies always lie in action space."""
    venv = util.make_vec_env(env_name, n_envs=1, parallel=False)
    policy = serialize.load_policy(policy_type, "foobar", venv)
    transitions = rollout.generate_transitions(policy, venv, n_timesteps=100)

    for a in transitions.acts:
        assert venv.action_space.contains(a)
Example #7
def _factory_via_serialized(
    factory_from_policy: Callable[[vec_env.VecEnv, policies.BasePolicy], T],
    env_name: str,
    policy_type: str,
    policy_path: str,
    **kwargs,
) -> Iterator[T]:
    venv = util.make_vec_env(env_name, **kwargs)
    with serialize.load_policy(policy_type, policy_path, venv) as policy:
        with factory_from_policy(venv, policy) as generator:
            yield generator
Example #8
def _factory_via_serialized(
    factory_from_policy: Callable[[vec_env.VecEnv, policies.BasePolicy], T],
    env_name: str,
    policy_type: str,
    policy_path: str,
    parallel: bool = True,
    **kwargs,
) -> Iterator[T]:
    venv = util.make_vec_env(env_name, parallel=parallel, **kwargs)
    with tf.device("/cpu:0"):
        # It's normally faster to do policy inference on CPU, since batch sizes are small.
        with serialize.load_policy(policy_type, policy_path, venv) as policy:
            with factory_from_policy(venv, policy) as generator:
                yield generator
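A hypothetical usage of the factory above, assuming it is wrapped in `@contextlib.contextmanager` (the decorator is not shown in this snippet) and that a serialized PPO2 expert exists at the placeholder path:

with _factory_via_serialized(
    datasets.transitions_factory_from_policy,
    "CartPole-v1",
    "ppo2",
    "path/to/expert_policy",  # placeholder path
) as transitions_factory:
    transitions = transitions_factory(100)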
Example #9
def test_reward_valid(env_name, reward_type):
    """Test output of reward function is appropriate shape and type."""
    venv = util.make_vec_env(env_name, n_envs=1, parallel=False)
    TRAJECTORY_LEN = 10
    obs = _sample(venv.observation_space, TRAJECTORY_LEN)
    actions = _sample(venv.action_space, TRAJECTORY_LEN)
    next_obs = _sample(venv.observation_space, TRAJECTORY_LEN)
    steps = np.arange(0, TRAJECTORY_LEN)

    reward_fn = serialize.load_reward(reward_type, "foobar", venv)
    pred_reward = reward_fn(obs, actions, next_obs, steps)

    assert pred_reward.shape == (TRAJECTORY_LEN, )
    assert isinstance(pred_reward[0], numbers.Number)
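`_sample` is a small helper from the same test file; something along these lines (an assumption, not the verbatim helper) would satisfy the calls above:

import numpy as np

def _sample(space, n):
    # Draw n independent samples from a gym.Space and stack them into a batch.
    return np.array([space.sample() for _ in range(n)])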
Example #10
    def __init__(
        self,
        venv: vec_env.VecEnv,
        expert_data: Union[Iterable[Mapping], types.Transitions],
        expert_batch_size: int,
        gen_algo: on_policy_algorithm.OnPolicyAlgorithm,
        *,
        # FIXME(sam) pass in discrim net directly; don't ask for kwargs indirectly
        discrim_kwargs: Optional[Mapping] = None,
        policy,
        _init_setup_model,
        **kwargs,
    ):
        """Generative Adversarial Imitation Learning.

        Most parameters are described in and passed to `AdversarialTrainer.__init__`.
        Additional parameters that `GAIL` adds on top of its superclass initializer are
        as follows:

        Args:
            discrim_kwargs: Optional keyword arguments to use while constructing the
                DiscrimNetGAIL.

        """
        env_name = 'ReachObjectUR5Sim-v0'
        num_vec = 1
        max_episode_steps = 1000
        venv = util.make_vec_env(
            env_name,
            num_vec,
            #seed=_seed,
            #parallel=parallel,
            #log_dir=log_dir,
            max_episode_steps=max_episode_steps,
        )
        discrim_kwargs = discrim_kwargs or {}
        discrim = discrim_nets.DiscrimNetGAIL(venv.observation_space,
                                              venv.action_space,
                                              **discrim_kwargs)
        super().__init__(
            venv=venv,
            gen_algo=gen_algo,
            discrim=discrim,
            expert_data=expert_data,
            expert_batch_size=expert_batch_size,  #**kwargs
        )
        self.verbose = False
        self.tensorboard_log = None
        self.use_sde = False
Example #11
File: train.py Project: gkswamy98/pillbox
def train_gail(env, n=0):
    venv = util.make_vec_env(env, n_envs=8)
    if isinstance(venv.action_space, Discrete):
        w = 64
    else:
        w = 256
    expert_data = make_sads_dataloader(env, max_trajs=5)
    logger.configure(os.path.join("learners", "GAIL"))

    for i in range(n):
        discrim_net = discrim_nets.ActObsMLP(
            action_space=venv.action_space,
            observation_space=venv.observation_space,
            hid_sizes=(w, w),
        )
        gail_trainer = adversarial.GAIL(
            venv,
            expert_data=expert_data,
            expert_batch_size=32,
            gen_algo=PPO("MlpPolicy",
                         venv,
                         verbose=1,
                         n_steps=1024,
                         policy_kwargs=dict(net_arch=[w, w])),
            discrim_kwargs={'discrim_net': discrim_net})
        mean_rewards = []
        std_rewards = []
        for train_steps in range(20):
            if train_steps > 0:
                if 'Bullet' in env:
                    gail_trainer.train(total_timesteps=25000)
                else:
                    gail_trainer.train(total_timesteps=16384)

            def get_policy(*args, **kwargs):
                return gail_trainer.gen_algo.policy

            model = PPO(get_policy, env, verbose=1)
            mean_reward, std_reward = evaluate_policy(model,
                                                      model.env,
                                                      n_eval_episodes=10)
            mean_rewards.append(mean_reward)
            std_rewards.append(std_reward)
            print("{0} Steps: {1}".format(train_steps, mean_reward))
            np.savez(os.path.join("learners", env,
                                  "gail_rewards_{0}".format(i)),
                     means=mean_rewards,
                     stds=std_rewards)
Example #12
def test_serialize_identity(env_name, model_cfg, normalize, tmpdir):
    """Test output actions of deserialized policy are same as original."""
    orig_venv = venv = util.make_vec_env(env_name, n_envs=1, parallel=False)
    vec_normalize = None
    if normalize:
        venv = vec_normalize = VecNormalize(venv)

    model_name, model_cls_name = model_cfg
    try:
        model_cls = registry.load_attr(model_cls_name)
    except (AttributeError, ImportError):  # pragma: no cover
        pytest.skip(
            "Couldn't load stable baselines class. "
            "(Probably because mpi4py not installed.)"
        )

    model = model_cls("MlpPolicy", venv)
    model.learn(1000)

    venv.env_method("seed", 0)
    venv.reset()
    if normalize:
        # don't want statistics to change as we collect rollouts
        vec_normalize.training = False
    orig_rollout = rollout.generate_transitions(
        model,
        venv,
        n_timesteps=1000,
        deterministic_policy=True,
        rng=np.random.RandomState(0),
    )

    serialize.save_stable_model(tmpdir, model, vec_normalize)
    # We use `orig_venv` since `load_policy` automatically wraps `loaded`
    # with a VecNormalize, when appropriate.
    with serialize.load_policy(model_name, tmpdir, orig_venv) as loaded:
        orig_venv.env_method("seed", 0)
        orig_venv.reset()
        new_rollout = rollout.generate_transitions(
            loaded,
            orig_venv,
            n_timesteps=1000,
            deterministic_policy=True,
            rng=np.random.RandomState(0),
        )

    assert np.allclose(orig_rollout.acts, new_rollout.acts)
Example #13
def test_serialize_identity(env_name, net_cls, tmpdir):
    """Does output of deserialized reward network match that of original?"""
    logging.info(f"Testing {net_cls}")

    venv = util.make_vec_env(env_name, n_envs=1, parallel=False)
    original = net_cls(venv.observation_space, venv.action_space)
    random = base.RandomPolicy(venv.observation_space, venv.action_space)

    tmppath = os.path.join(tmpdir, "reward.pt")
    th.save(original, tmppath)
    loaded = th.load(tmppath)

    assert original.observation_space == loaded.observation_space
    assert original.action_space == loaded.action_space

    transitions = rollout.generate_transitions(random, venv, n_timesteps=100)

    unshaped_fn = serialize.load_reward("RewardNet_unshaped", tmppath, venv)
    shaped_fn = serialize.load_reward("RewardNet_shaped", tmppath, venv)
    rewards = {
        "train": [],
        "test": [],
    }
    for net in [original, loaded]:
        trans_args = (
            transitions.obs,
            transitions.acts,
            transitions.next_obs,
            transitions.dones,
        )
        rewards["train"].append(net.predict_reward_train(*trans_args))
        rewards["test"].append(net.predict_reward_test(*trans_args))

    args = (
        transitions.obs,
        transitions.acts,
        transitions.next_obs,
        transitions.dones,
    )
    rewards["train"].append(shaped_fn(*args))
    rewards["test"].append(unshaped_fn(*args))

    for key, predictions in rewards.items():
        assert len(predictions) == 3
        assert np.allclose(predictions[0], predictions[1])
        assert np.allclose(predictions[0], predictions[2])
Example #14
def test_serialize_identity(session, env_name, reward_net):
    """Does output of deserialized reward network match that of original?"""
    net_name, net_cls = reward_net
    print(f"Testing {net_name}")

    venv = util.make_vec_env(env_name, n_envs=1, parallel=False)
    with tf.variable_scope("original"):
        original = net_cls(venv.observation_space, venv.action_space)
    random = base.RandomPolicy(venv.observation_space, venv.action_space)
    session.run(tf.global_variables_initializer())

    with tempfile.TemporaryDirectory(
            prefix='imitation-serialize-rew') as tmpdir:
        original.save(tmpdir)
        with tf.variable_scope("loaded"):
            loaded = net_cls.load(tmpdir)

        assert original.observation_space == loaded.observation_space
        assert original.action_space == loaded.action_space

        rollouts = rollout.generate_transitions(random, venv, n_timesteps=100)
        feed_dict = {}
        outputs = {'train': [], 'test': []}
        for net in [original, loaded]:
            feed_dict.update(_make_feed_dict(net, rollouts))
            outputs['train'].append(net.reward_output_train)
            outputs['test'].append(net.reward_output_test)

        unshaped_name = f"{net_name}_unshaped"
        shaped_name = f"{net_name}_shaped"
        with serialize.load_reward(unshaped_name, tmpdir, venv) as unshaped_fn:
            with serialize.load_reward(shaped_name, tmpdir, venv) as shaped_fn:
                rewards = session.run(outputs, feed_dict=feed_dict)

                old_obs, actions, new_obs, _ = rollouts
                steps = np.zeros((old_obs.shape[0], ))
                rewards['train'].append(
                    shaped_fn(old_obs, actions, new_obs, steps))
                rewards['test'].append(
                    unshaped_fn(old_obs, actions, new_obs, steps))

    for key, predictions in rewards.items():
        assert len(predictions) == 3
        assert np.allclose(predictions[0], predictions[1])
        assert np.allclose(predictions[0], predictions[2])
Example #15
def test_density_trainer_smoke():
    # tests whether density trainer runs, not whether it's good
    # (it's actually really poor)
    env_name = "Pendulum-v0"
    rollout_path = "tests/data/expert_models/pendulum_0/rollouts/final.pkl"
    rollouts = types.load(rollout_path)[:2]
    env = util.make_vec_env(env_name, 2)
    imitation_trainer = util.init_rl(env)
    density_trainer = DensityTrainer(
        env,
        rollouts=rollouts,
        imitation_trainer=imitation_trainer,
        density_type=STATE_ACTION_DENSITY,
        is_stationary=False,
        kernel="gaussian",
    )
    density_trainer.train_policy(n_timesteps=2)
    density_trainer.test_policy(n_trajectories=2)
Example #16
def test_train_disc_small_expert_data_warning(tmpdir, _parallel, _algorithm_cls):
    logger.configure(tmpdir, ["tensorboard", "stdout"])
    venv = util.make_vec_env(
        "CartPole-v1",
        n_envs=2,
        parallel=_parallel,
        log_dir=tmpdir,
    )

    gen_algo = util.init_rl(venv, verbose=1)
    small_data = rollout.generate_transitions(gen_algo, venv, n_timesteps=20)

    with pytest.warns(RuntimeWarning, match="discriminator batch size"):
        _algorithm_cls(
            venv=venv,
            expert_data=small_data,
            gen_algo=gen_algo,
            log_dir=tmpdir,
        )
Example #17
File: train.py Project: gkswamy98/pillbox
def train_bc(env, n=0):
    venv = util.make_vec_env(env, n_envs=8)
    if isinstance(venv.action_space, Discrete):
        w = 64
    else:
        w = 256
    for i in range(n):
        mean_rewards = []
        std_rewards = []
        for num_trajs in range(0, 26, 5):
            if num_trajs == 0:
                expert_data = make_sa_dataloader(env, normalize=False)
            else:
                expert_data = make_sa_dataloader(env,
                                                 max_trajs=num_trajs,
                                                 normalize=False)
            bc_trainer = bc.BC(venv.observation_space,
                               venv.action_space,
                               expert_data=expert_data,
                               policy_class=policies.ActorCriticPolicy,
                               ent_weight=0.,
                               l2_weight=0.,
                               policy_kwargs=dict(net_arch=[w, w]))
            if num_trajs > 0:
                bc_trainer.train(n_batches=int(5e5))

            def get_policy(*args, **kwargs):
                return bc_trainer.policy

            model = PPO(get_policy, env, verbose=1)
            model.save(
                os.path.join("learners", env,
                             "bc_{0}_{1}".format(i, num_trajs)))
            mean_reward, std_reward = evaluate_policy(model,
                                                      model.get_env(),
                                                      n_eval_episodes=10)
            mean_rewards.append(mean_reward)
            std_rewards.append(std_reward)
            print("{0} Trajs: {1}".format(num_trajs, mean_reward))
            np.savez(os.path.join("learners", env, "bc_rewards_{0}".format(i)),
                     means=mean_rewards,
                     stds=std_rewards)
Example #18
def test_serialize_identity(session, env_name, net_cls, tmpdir):
    """Does output of deserialized reward network match that of original?"""
    logging.info(f"Testing {net_cls}")

    venv = util.make_vec_env(env_name, n_envs=1, parallel=False)
    with tf.variable_scope("original"):
        original = net_cls(venv.observation_space, venv.action_space)
    random = base.RandomPolicy(venv.observation_space, venv.action_space)
    session.run(tf.global_variables_initializer())

    original.save(tmpdir)
    with tf.variable_scope("loaded"):
        loaded = reward_net.RewardNet.load(tmpdir)

    assert original.observation_space == loaded.observation_space
    assert original.action_space == loaded.action_space

    transitions = rollout.generate_transitions(random, venv, n_timesteps=100)
    feed_dict = {}
    outputs = {"train": [], "test": []}
    for net in [original, loaded]:
        feed_dict.update(_make_feed_dict(net, transitions))
        outputs["train"].append(net.reward_output_train)
        outputs["test"].append(net.reward_output_test)

    with serialize.load_reward("RewardNet_unshaped", tmpdir, venv) as unshaped_fn:
        with serialize.load_reward("RewardNet_shaped", tmpdir, venv) as shaped_fn:
            rewards = session.run(outputs, feed_dict=feed_dict)

            args = (
                transitions.obs,
                transitions.acts,
                transitions.next_obs,
                transitions.dones,
            )
            rewards["train"].append(shaped_fn(*args))
            rewards["test"].append(unshaped_fn(*args))

    for key, predictions in rewards.items():
        assert len(predictions) == 3
        assert np.allclose(predictions[0], predictions[1])
        assert np.allclose(predictions[0], predictions[2])
Example #19
def test_serialize_identity(env_name, model_cfg, normalize, tmpdir):
    """Test output actions of deserialized policy are same as original."""
    orig_venv = venv = util.make_vec_env(env_name, n_envs=1, parallel=False)
    vec_normalize = None
    if normalize:
        venv = vec_normalize = VecNormalize(venv)

    model_name, model_cls_name = model_cfg
    model_cls = registry.load_attr(model_cls_name)

    # FIXME(sam): verbose=1 is a hack to stop it from setting up SB logger
    model = model_cls("MlpPolicy", venv, verbose=1)
    model.learn(1000)

    venv.env_method("seed", 0)
    venv.reset()
    if normalize:
        # don't want statistics to change as we collect rollouts
        vec_normalize.training = False
    orig_rollout = rollout.generate_transitions(
        model,
        venv,
        n_timesteps=1000,
        deterministic_policy=True,
        rng=np.random.RandomState(0),
    )

    serialize.save_stable_model(tmpdir, model, vec_normalize)
    # We use `orig_venv` since `load_policy` automatically wraps `loaded`
    # with a VecNormalize, when appropriate.
    loaded = serialize.load_policy(model_name, tmpdir, orig_venv)
    orig_venv.env_method("seed", 0)
    orig_venv.reset()
    new_rollout = rollout.generate_transitions(
        loaded,
        orig_venv,
        n_timesteps=1000,
        deterministic_policy=True,
        rng=np.random.RandomState(0),
    )

    assert np.allclose(orig_rollout.acts, new_rollout.acts)
Example #20
def rollouts_from_policy(
    _run,
    _seed: int,
    *,
    num_vec: int,
    rollout_save_n_timesteps: int,
    rollout_save_n_episodes: int,
    log_dir: str,
    policy_path: str,
    policy_type: str,
    env_name: str,
    parallel: bool,
    rollout_save_path: str,
    max_episode_steps: Optional[int],
) -> None:
    """Loads a saved policy and generates rollouts.

    Unlisted arguments are the same as in `rollouts_and_policy()`.

    Args:
        policy_type: Argument to `imitation.policies.serialize.load_policy`.
        policy_path: Argument to `imitation.policies.serialize.load_policy`.
        rollout_save_path: Rollout pickle is saved to this path.
    """
    os.makedirs(log_dir, exist_ok=True)
    sacred_util.build_sacred_symlink(log_dir, _run)

    sample_until = rollout.make_sample_until(
        rollout_save_n_timesteps, rollout_save_n_episodes
    )

    venv = util.make_vec_env(
        env_name,
        num_vec,
        seed=_seed,
        parallel=parallel,
        log_dir=log_dir,
        max_episode_steps=max_episode_steps,
    )

    with serialize.load_policy(policy_type, policy_path, venv) as policy:
        rollout.rollout_and_save(rollout_save_path, policy, venv, sample_until)
Example #21
def trainer(
    _algorithm_cls,
    _parallel: bool,
    tmpdir: str,
    _convert_dataset: bool,
    expert_batch_size: int,
    expert_transitions: types.Transitions,
):
    logger.configure(tmpdir, ["tensorboard", "stdout"])
    if _convert_dataset:
        expert_data = th_data.DataLoader(
            expert_transitions,
            batch_size=expert_batch_size,
            collate_fn=types.transitions_collate_fn,
            shuffle=True,
            drop_last=True,
        )
    else:
        expert_data = expert_transitions

    venv = util.make_vec_env(
        "CartPole-v1",
        n_envs=2,
        parallel=_parallel,
        log_dir=tmpdir,
    )

    gen_algo = util.init_rl(venv, verbose=1)

    trainer = _algorithm_cls(
        venv=venv,
        expert_data=expert_data,
        expert_batch_size=expert_batch_size,
        gen_algo=gen_algo,
        log_dir=tmpdir,
    )

    try:
        yield trainer
    finally:
        venv.close()
Example #22
def make_pm(env_name="evaluating_rewards/PointMassLine-v0", extra_dones: Optional[int] = None):
    """Make transitions factory for Point Mass environment.

    Args:
        env_name: The name of the environment in the Gym registry.
        extra_dones: If specified, the frequency at which to artificially insert dones.
            At episode termination, the next potential is fixed to zero, making the
            constant bias of the potential important. At all other points the constant
            bias has no effect (undiscounted) or minimal effect (discounted) to the
            reward output. Increasing the frequency of dones is a form of dataset
            augmentation, that lets us learn the constant bias more quickly. This is
            definitely "cheating", but it seems worth it to keep the unit tests quick.

    Returns:
        A dict of observation space, action space and dataset generator.
    """
    venv = util.make_vec_env(env_name)
    obs_space = venv.observation_space
    act_space = venv.action_space

    pm = point_mass.PointMassPolicy(obs_space, act_space)
    with datasets.transitions_factory_from_policy(venv, pm) as transitions_factory:

        def f(total_timesteps: int):
            trans = transitions_factory(total_timesteps)
            if extra_dones is not None:
                dones = np.array(trans.dones)
                dones[::extra_dones] = True
                trans = dataclasses.replace(trans, dones=dones)
            return trans

        # It's OK to return dataset_generator outside the with context:
        # transitions_factory_from_policy doesn't actually hold any internal
        # resources (some other datasets do).
        return {
            "observation_space": obs_space,
            "action_space": act_space,
            "dataset_generator": f,
        }
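A brief usage sketch of `make_pm` (assuming the evaluating_rewards Point Mass environment is registered with Gym):

pm = make_pm(extra_dones=10)
batch = pm["dataset_generator"](total_timesteps=64)
# Every 10th transition should now be marked done (see `f` above).
assert batch.dones[::10].all()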
Example #23
def test_density_trainer(density_type, is_stationary):
    env_name = "Pendulum-v0"
    rollout_path = "tests/data/expert_models/pendulum_0/rollouts/final.pkl"
    rollouts = types.load(rollout_path)
    env = util.make_vec_env(env_name, 2)
    imitation_trainer = util.init_rl(env)
    density_trainer = DensityTrainer(
        env,
        rollouts=rollouts,
        imitation_trainer=imitation_trainer,
        density_type=density_type,
        is_stationary=is_stationary,
        kernel="gaussian",
    )
    novice_stats = density_trainer.test_policy()
    density_trainer.train_policy(2000)
    good_stats = density_trainer.test_policy()
    # Novice is bad
    assert novice_stats["return_mean"] < -500
    # Density is also pretty bad, but shouldn't make things more than 50% worse.
    # It would be nice to have a less flaky/more meaningful test here.
    assert good_stats["return_mean"] > 1.5 * novice_stats["return_mean"]
Example #24
def policy_eval(_seed: int, env_name: str, timesteps: int, num_vec: int,
                parallel: bool, render: bool, policy_type: str,
                policy_path: str, log_dir: str):
    """Rolls a policy out in an environment, collecting statistics.

  Args:
    _seed: generated by Sacred.
    env_name: Gym environment identifier.
    timesteps: Minimum number of timesteps to evaluate for.
    num_vec: Number of environments to run simultaneously.
    parallel: If True, use `SubprocVecEnv` for true parallelism; otherwise,
        uses `DummyVecEnv`.
    render: If True, renders interactively to the screen.
    policy_type: A unique identifier for the saved policy,
        defined in POLICY_CLASSES.
    policy_path: A path to the serialized policy.
    log_dir: The directory to log intermediate output to. (As of 2019-07-19
        this is just episode-by-episode reward from bench.Monitor.)

  Returns:
    Statistics returned by `imitation.util.rollout.rollout_stats`.
  """
    tf.logging.set_verbosity(tf.logging.INFO)
    tf.logging.info('Logging to %s', log_dir)

    venv = util.make_vec_env(env_name,
                             num_vec,
                             seed=_seed,
                             parallel=parallel,
                             log_dir=log_dir)
    if render:
        venv = InteractiveRender(venv)
    # TODO(adam): add support for videos using VideoRecorder?

    policy = serialize.load_policy(policy_type, policy_path, venv)
    stats = rollout.rollout_stats(policy, venv, n_timesteps=timesteps)

    return stats
Example #25
def make_venv(
    _seed,
    env_name: str,
    num_vec: int,
    parallel: bool,
    log_dir: str,
    max_episode_steps: int,
    env_make_kwargs: Mapping[str, Any],
    **kwargs,
) -> vec_env.VecEnv:
    """Builds the vector environment.

     Args:
        env_name: The environment to train in.
        num_vec: Number of `gym.Env` instances to combine into a vector environment.
        parallel: Whether to use "true" parallelism. If True, then use `SubprocVecEnv`.
            Otherwise, use `DummyVecEnv` which steps through environments serially.
        max_episode_steps: If not None, then a TimeLimit wrapper is applied to each
            environment to artificially limit the maximum number of timesteps in an
            episode.
        log_dir: Logs episode return statistics to a `monitor` subdirectory.
        env_make_kwargs: The kwargs passed to `spec.make` of a gym environment.
        kwargs: Passed through to `util.make_vec_env`.

    Returns:
        The constructed vector environment.
    """
    return util.make_vec_env(
        env_name,
        num_vec,
        seed=_seed,
        parallel=parallel,
        max_episode_steps=max_episode_steps,
        log_dir=log_dir,
        env_make_kwargs=env_make_kwargs,
        **kwargs,
    )
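An illustrative direct call (a sketch: in the original script `_seed` is injected by Sacred, and the log directory here is a placeholder):

venv = make_venv(
    _seed=0,
    env_name="CartPole-v1",
    num_vec=2,
    parallel=False,
    log_dir="/tmp/make_venv_demo",  # placeholder
    max_episode_steps=500,
    env_make_kwargs={},
)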
Example #26
def test_serialize_identity(session, env_name, discrim_net_cls, tmpdir):
    """Does output of deserialized discriminator match that of original?"""
    venv = util.make_vec_env(env_name, parallel=False)
    original = DISCRIM_NET_SETUPS[discrim_net_cls](venv)
    random = base.RandomPolicy(venv.observation_space, venv.action_space)
    session.run(tf.global_variables_initializer())

    original.save(tmpdir)
    with tf.variable_scope("loaded"):
        loaded = discrim_net.DiscrimNet.load(tmpdir)

    transitions = rollout.generate_transitions(random, venv, n_timesteps=100)
    length = len(transitions.obs)  # n_timesteps is only a lower bound
    labels = np.random.randint(2, size=length).astype(np.float32)
    log_prob = np.random.randn(length)

    feed_dict = {}
    outputs = {"train": [], "test": []}
    for net in [original, loaded]:
        feed_dict.update(
            {
                net.obs_ph: transitions.obs,
                net.act_ph: transitions.acts,
                net.next_obs_ph: transitions.next_obs,
                net.labels_gen_is_one_ph: labels,
                net.log_policy_act_prob_ph: log_prob,
            }
        )
        outputs["train"].append(net.policy_train_reward)
        outputs["test"].append(net.policy_test_reward)

    rewards = session.run(outputs, feed_dict=feed_dict)

    for key, predictions in rewards.items():
        assert len(predictions) == 2
        assert np.allclose(predictions[0], predictions[1])
Example #27
def venv():
    return util.make_vec_env("MountainCar-v0")
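This is presumably a pytest fixture (the decorator is not shown). A minimal, hypothetical test consuming it:

def test_venv_reset_batches_observations(venv):
    obs = venv.reset()
    # A VecEnv returns one observation per sub-environment.
    assert obs.shape[0] == venv.num_envs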
Example #28
def init_trainer(
    env_name: str,
    expert_trajectories: Sequence[types.Trajectory],
    *,
    log_dir: str,
    seed: int = 0,
    use_gail: bool = False,
    num_vec: int = 8,
    parallel: bool = False,
    max_episode_steps: Optional[int] = None,
    scale: bool = True,
    airl_entropy_weight: float = 1.0,
    discrim_kwargs: dict = {},
    reward_kwargs: dict = {},
    trainer_kwargs: dict = {},
    init_rl_kwargs: dict = {},
):
    """Builds an AdversarialTrainer, ready to be trained on expert demonstrations.

    Args:
      env_name: The string id of a gym environment.
      expert_trajectories: Demonstrations from expert.
      seed: Random seed.
      log_dir: Directory for logging output. Will generate a unique sub-directory
          within this directory for all output.
      use_gail: If True, then train using GAIL. If False, then train
          using AIRL.
      num_vec: The number of vectorized environments.
      parallel: If True, then use SubprocVecEnv; otherwise, DummyVecEnv.
      max_episode_steps: If specified, wraps VecEnv in TimeLimit wrapper with
          this episode length before returning.
      policy_dir: The directory containing the pickled experts for
          generating rollouts.
      scale: If True, then scale input Tensors to the interval [0, 1].
      airl_entropy_weight: Only applicable for AIRL. The `entropy_weight`
          argument of `DiscrimNetAIRL.__init__`.
      trainer_kwargs: Arguments for the Trainer constructor.
      reward_kwargs: Arguments for the `*RewardNet` constructor.
      discrim_kwargs: Arguments for the `DiscrimNet*` constructor.
      init_rl_kwargs: Keyword arguments passed to `init_rl`,
          used to initialize the RL algorithm.
    """
    logger.configure(folder=log_dir, format_strs=["tensorboard", "stdout"])
    env = util.make_vec_env(
        env_name,
        num_vec,
        seed=seed,
        parallel=parallel,
        log_dir=log_dir,
        max_episode_steps=max_episode_steps,
    )
    gen_policy = util.init_rl(env, verbose=1, **init_rl_kwargs)

    if use_gail:
        discrim = discrim_net.DiscrimNetGAIL(env.observation_space,
                                             env.action_space,
                                             scale=scale,
                                             **discrim_kwargs)
    else:
        rn = BasicShapedRewardNet(env.observation_space,
                                  env.action_space,
                                  scale=scale,
                                  **reward_kwargs)
        discrim = discrim_net.DiscrimNetAIRL(
            rn, entropy_weight=airl_entropy_weight, **discrim_kwargs)

    expert_demos = rollout.flatten_trajectories(expert_trajectories)
    trainer = AdversarialTrainer(env,
                                 gen_policy,
                                 discrim,
                                 expert_demos,
                                 log_dir=log_dir,
                                 **trainer_kwargs)
    return trainer
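An illustrative call (a sketch; the rollout path mirrors Example #3 and the log directory is a placeholder):

expert_trajs = types.load("tests/data/expert_models/cartpole_0/rollouts/final.pkl")
trainer = init_trainer(
    "CartPole-v1",
    expert_trajectories=expert_trajs,
    log_dir="/tmp/adversarial_demo",  # placeholder
    use_gail=True,
    num_vec=2,
)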
Example #29
def rollouts_and_policy(
    _run,
    _seed: int,
    env_name: str,
    total_timesteps: int,
    *,
    log_dir: str,
    num_vec: int,
    parallel: bool,
    max_episode_steps: Optional[int],
    normalize: bool,
    normalize_kwargs: dict,
    init_rl_kwargs: dict,
    n_episodes_eval: int,
    reward_type: Optional[str],
    reward_path: Optional[str],
    rollout_save_interval: int,
    rollout_save_final: bool,
    rollout_save_n_timesteps: Optional[int],
    rollout_save_n_episodes: Optional[int],
    policy_save_interval: int,
    policy_save_final: bool,
    init_tensorboard: bool,
) -> dict:
    """Trains an expert policy from scratch and saves the rollouts and policy.

    Checkpoints:
      At applicable training steps `step` (where step is either an integer or
      "final"):

        - Policies are saved to `{log_dir}/policies/{step}/`.
        - Rollouts are saved to `{log_dir}/rollouts/{step}.pkl`.

    Args:
        env_name: The gym.Env name. Loaded as VecEnv.
        total_timesteps: Number of training timesteps in `model.learn()`.
        log_dir: The root directory to save metrics and checkpoints to.
        num_vec: Number of environments in VecEnv.
        parallel: If True, then use SubprocVecEnv. Otherwise use DummyVecEnv.
        max_episode_steps: If not None, then environments are wrapped by
            TimeLimit so that they have at most `max_episode_steps` steps per
            episode.
        normalize: If True, then rescale observations and reward.
        normalize_kwargs: kwargs for `VecNormalize`.
        init_rl_kwargs: kwargs for `init_rl`.

        n_episodes_eval: The number of episodes to average over when calculating
            the average ground truth reward return of the final policy.

        reward_type: If provided, then load the serialized reward of this type,
            wrapping the environment in this reward. This is useful to test
            whether a reward model transfers. For more information, see
            `imitation.rewards.serialize.load_reward`.
        reward_path: A specifier, such as a path to a file on disk, used by
            reward_type to load the reward model. For more information, see
            `imitation.rewards.serialize.load_reward`.

        rollout_save_interval: The number of training updates in between
            intermediate rollout saves. If the argument is nonpositive, then
            don't save intermediate updates.
        rollout_save_final: If True, then save rollouts right after training is
            finished.
        rollout_save_n_timesteps: The minimum number of timesteps saved in every
            file. Could be more than `rollout_save_n_timesteps` because
            trajectories are saved by episode rather than by transition.
            Must set exactly one of `rollout_save_n_timesteps`
            and `rollout_save_n_episodes`.
        rollout_save_n_episodes: The number of episodes saved in every
            file. Must set exactly one of `rollout_save_n_timesteps` and
            `rollout_save_n_episodes`.

        policy_save_interval: The number of training updates between saves. Has
            the same semantics as `rollout_save_interval`.
        policy_save_final: If True, then save the policy right after training is
            finished.

        init_tensorboard: If True, then write tensorboard logs to {log_dir}/sb_tb
            and "output/summary/...".

    Returns:
      The return value of `rollout_stats()` using the final policy.
    """
    os.makedirs(log_dir, exist_ok=True)
    sacred_util.build_sacred_symlink(log_dir, _run)

    sample_until = rollout.make_sample_until(rollout_save_n_timesteps,
                                             rollout_save_n_episodes)
    eval_sample_until = rollout.min_episodes(n_episodes_eval)

    with networks.make_session():
        tf.logging.set_verbosity(tf.logging.INFO)
        logger.configure(folder=osp.join(log_dir, "rl"),
                         format_strs=["tensorboard", "stdout"])

        rollout_dir = osp.join(log_dir, "rollouts")
        policy_dir = osp.join(log_dir, "policies")
        os.makedirs(rollout_dir, exist_ok=True)
        os.makedirs(policy_dir, exist_ok=True)

        if init_tensorboard:
            sb_tensorboard_dir = osp.join(log_dir, "sb_tb")
            # Convert sacred's ReadOnlyDict to dict so we can modify on next line.
            init_rl_kwargs = dict(init_rl_kwargs)
            init_rl_kwargs["tensorboard_log"] = sb_tensorboard_dir

        venv = util.make_vec_env(
            env_name,
            num_vec,
            seed=_seed,
            parallel=parallel,
            log_dir=log_dir,
            max_episode_steps=max_episode_steps,
        )

        log_callbacks = []
        with contextlib.ExitStack() as stack:
            if reward_type is not None:
                reward_fn_ctx = load_reward(reward_type, reward_path, venv)
                reward_fn = stack.enter_context(reward_fn_ctx)
                venv = RewardVecEnvWrapper(venv, reward_fn)
                log_callbacks.append(venv.log_callback)
                tf.logging.info(
                    f"Wrapped env in reward {reward_type} from {reward_path}.")

            vec_normalize = None
            if normalize:
                venv = vec_normalize = VecNormalize(venv, **normalize_kwargs)

            policy = util.init_rl(venv, verbose=1, **init_rl_kwargs)

            # Make callback to save intermediate artifacts during training.
            step = 0

            def callback(locals_: dict, _) -> bool:
                nonlocal step
                step += 1
                policy = locals_["self"]

                # TODO(adam): make logging frequency configurable
                for callback in log_callbacks:
                    callback(sb_logger)

                if rollout_save_interval > 0 and step % rollout_save_interval == 0:
                    save_path = osp.join(rollout_dir, f"{step}.pkl")
                    rollout.rollout_and_save(save_path, policy, venv,
                                             sample_until)
                if policy_save_interval > 0 and step % policy_save_interval == 0:
                    output_dir = os.path.join(policy_dir, f"{step:05d}")
                    serialize.save_stable_model(output_dir, policy,
                                                vec_normalize)

            policy.learn(total_timesteps, callback=callback)

            # Save final artifacts after training is complete.
            if rollout_save_final:
                save_path = osp.join(rollout_dir, "final.pkl")
                rollout.rollout_and_save(save_path, policy, venv, sample_until)
            if policy_save_final:
                output_dir = os.path.join(policy_dir, "final")
                serialize.save_stable_model(output_dir, policy, vec_normalize)

            # Final evaluation of expert policy.
            trajs = rollout.generate_trajectories(policy, venv,
                                                  eval_sample_until)
            stats = rollout.rollout_stats(trajs)

    return stats
Example #30
def eval_policy(
    _run,
    _seed: int,
    env_name: str,
    eval_n_timesteps: Optional[int],
    eval_n_episodes: Optional[int],
    num_vec: int,
    parallel: bool,
    render: bool,
    render_fps: int,
    log_dir: str,
    policy_type: str,
    policy_path: str,
    reward_type: Optional[str] = None,
    reward_path: Optional[str] = None,
    max_episode_steps: Optional[int] = None,
):
    """Rolls a policy out in an environment, collecting statistics.

  Args:
    _seed: generated by Sacred.
    env_name: Gym environment identifier.
    eval_n_timesteps: Minimum number of timesteps to evaluate for. Set exactly
        one of `eval_n_episodes` and `eval_n_timesteps`.
    eval_n_episodes: Minimum number of episodes to evaluate for. Set exactly
        one of `eval_n_episodes` and `eval_n_timesteps`.
    num_vec: Number of environments to run simultaneously.
    parallel: If True, use `SubprocVecEnv` for true parallelism; otherwise,
        uses `DummyVecEnv`.
    max_episode_steps: If not None, then environments are wrapped by
        TimeLimit so that they have at most `max_episode_steps` steps per
        episode.
    render: If True, renders interactively to the screen.
    log_dir: The directory to log intermediate output to. (As of 2019-07-19
        this is just episode-by-episode reward from bench.Monitor.)
    policy_type: A unique identifier for the saved policy,
        defined in POLICY_CLASSES.
    policy_path: A path to the serialized policy.
    reward_type: If specified, overrides the environment reward with
        a reward of this.
    reward_path: If reward_type is specified, the path to a serialized reward
        of `reward_type` to override the environment reward with.

  Returns:
    Return value of `imitation.util.rollout.rollout_stats()`.
  """

    os.makedirs(log_dir, exist_ok=True)
    sacred_util.build_sacred_symlink(log_dir, _run)

    tf.logging.set_verbosity(tf.logging.INFO)
    tf.logging.info('Logging to %s', log_dir)
    sample_until = rollout.make_sample_until(eval_n_timesteps, eval_n_episodes)
    venv = util.make_vec_env(env_name,
                             num_vec,
                             seed=_seed,
                             parallel=parallel,
                             log_dir=log_dir,
                             max_episode_steps=max_episode_steps)
    venv = VecNormalize(venv, training=False, norm_reward=False)
    venv = venv.load(policy_path + "/vec_normalize.pkl", venv)

    if render:
        venv = InteractiveRender(venv, render_fps)
    # TODO(adam): add support for videos using VideoRecorder?

    with contextlib.ExitStack() as stack:
        if reward_type is not None:
            reward_fn_ctx = load_reward(reward_type, reward_path, venv)
            reward_fn = stack.enter_context(reward_fn_ctx)
            venv = reward_wrapper.RewardVecEnvWrapper(venv, reward_fn)
            tf.logging.info(
                f"Wrapped env in reward {reward_type} from {reward_path}.")

        with serialize.load_policy(policy_type, policy_path, venv) as policy:
            trajs = rollout.generate_trajectories(policy, venv, sample_until)
    return rollout.rollout_stats(trajs)