Example #1
def test_evaluate_actions_sizes() -> None:
    """ Test the sizes of returned tensors from ppo.evaluate_actions(). """

    settings = dict(DEFAULT_SETTINGS)
    env = get_env(settings["env_name"], settings["num_processes"])
    policy = get_policy(env, settings)
    minibatch_size = (settings["rollout_length"] * settings["num_processes"] //
                      settings["num_minibatch"])
    obs_list = [
        torch.Tensor(env.observation_space.sample())
        for _ in range(minibatch_size)
    ]
    obs_batch = torch.stack(obs_list)
    actions_list = [
        torch.Tensor([float(env.action_space.sample())])
        for _ in range(minibatch_size)
    ]
    actions_batch = torch.stack(actions_list)

    value_pred, action_log_prob, action_dist_entropy, _ = policy.evaluate_actions(
        obs_batch, None, actions_batch, None)

    assert isinstance(value_pred, torch.Tensor)
    assert value_pred.shape == torch.Size([minibatch_size])
    assert isinstance(action_log_prob, torch.Tensor)
    assert action_log_prob.shape == torch.Size([minibatch_size])
    assert isinstance(action_dist_entropy, torch.Tensor)
    assert action_dist_entropy.shape == torch.Size([minibatch_size])
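
Every example on this page starts from a copy of DEFAULT_SETTINGS. For reference, here is a minimal sketch of what such a settings dictionary might look like; the keys are the ones consumed by these tests, but every value shown is a hypothetical placeholder rather than the suite's actual default.

import torch

# Sketch of a DEFAULT_SETTINGS-style dictionary. The keys are those referenced by
# the examples on this page; all values are hypothetical placeholders.
DEFAULT_SETTINGS = {
    "env_name": "CartPole-v1",        # hypothetical default environment
    "num_processes": 4,
    "rollout_length": 32,
    "num_minibatch": 4,
    "num_updates": 10,
    "num_episodes": 1,
    "episode_len": 32,
    "num_tasks": 1,
    "initial_lr": 3e-4,
    "final_lr": 3e-5,
    "lr_schedule_type": None,
    "normalize_transition": True,
    "normalize_advantages": True,
    "normalize_first_n": None,
    "same_np_seed": False,
    "add_observability": False,
    "save_memory": False,
    "time_limit": None,
    "seed": 1,
    "device": torch.device("cpu"),
}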
Example #2
def test_get_value_sizes() -> None:
    """ Test the sizes of returned tensors from ppo.get_value(). """

    settings = dict(DEFAULT_SETTINGS)
    env = get_env(settings["env_name"], settings["num_processes"])
    policy = get_policy(env, settings)
    obs = torch.Tensor(env.observation_space.sample())

    value_pred = policy.get_value(obs, None, None)

    assert isinstance(value_pred, torch.Tensor)
    assert value_pred.shape == torch.Size([1])
Example #3
def get_metaworld_rollout(settings: Dict[str, Any]) -> RolloutStorage:
    """
    Execute and return a single rollout over a MetaWorld environment using configuration
    in `settings`.
    """

    # Construct environment and policy.
    env = get_env(
        settings["env_name"],
        num_processes=settings["num_processes"],
        seed=settings["seed"],
        time_limit=settings["time_limit"],
        normalize_transition=settings["normalize_transition"],
        normalize_first_n=settings["normalize_first_n"],
        allow_early_resets=True,
        same_np_seed=settings["same_np_seed"],
        add_observability=settings["add_observability"],
        save_memory=settings["save_memory"],
    )
    policy = get_policy(env, settings)
    rollout = RolloutStorage(
        rollout_length=settings["rollout_length"],
        observation_space=env.observation_space,
        action_space=env.action_space,
        num_processes=settings["num_processes"],
        hidden_state_size=1,
        device=settings["device"],
    )
    rollout.set_initial_obs(env.reset())

    # Collect rollout.
    for rollout_step in range(rollout.rollout_length):

        # Sample actions.
        with torch.no_grad():
            values, actions, action_log_probs, hidden_states = policy.act(
                rollout.obs[rollout_step],
                rollout.hidden_states[rollout_step],
                rollout.dones[rollout_step],
            )

        # Perform step and record in ``rollout``.
        obs, rewards, dones, infos = env.step(actions)
        rollout.add_step(obs, actions, dones, action_log_probs, values,
                         rewards, hidden_states)

    env.close()
    return rollout
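
A minimal usage sketch for the helper above. The concrete values are hypothetical; the keys are the ones get_metaworld_rollout() reads, and "MT10" is the benchmark name that appears in Example #8.

settings = dict(DEFAULT_SETTINGS)
settings["env_name"] = "MT10"          # benchmark name taken from Example #8
settings["num_processes"] = 10         # hypothetical value

rollout = get_metaworld_rollout(settings)

# Each stored step holds one transition per process, matching the shape checks
# in Example #4.
assert rollout.obs[0].shape[0] == settings["num_processes"]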
Example #4
def test_collect_rollout_values() -> None:
    """
    Test the values of the RolloutStorage object returned from train.collect_rollout().
    """

    settings = dict(DEFAULT_SETTINGS)
    settings["env_name"] = "unique-env"

    env = get_env(
        settings["env_name"],
        normalize_transition=settings["normalize_transition"],
        allow_early_resets=True,
    )
    policy = get_policy(env, settings)
    rollout = RolloutStorage(
        rollout_length=settings["rollout_length"],
        observation_space=env.observation_space,
        action_space=env.action_space,
        num_processes=settings["num_processes"],
        hidden_state_size=1,
        device=settings["device"],
    )
    rollout.set_initial_obs(env.reset())
    rollout, _, _ = collect_rollout(rollout, env, policy)

    # Check if rollout info came from UniqueEnv.
    for step in range(rollout.rollout_step):

        obs = rollout.obs[step]
        value_pred = rollout.value_preds[step]
        action = rollout.actions[step]
        action_log_prob = rollout.action_log_probs[step]
        reward = rollout.rewards[step]

        # Check shapes.
        assert obs.shape == torch.Size([settings["num_processes"], 1])
        assert value_pred.shape == torch.Size([settings["num_processes"], 1])
        assert action.shape == torch.Size([settings["num_processes"], 1])
        assert action_log_prob.shape == torch.Size(
            [settings["num_processes"], 1])
        assert reward.shape == torch.Size([settings["num_processes"], 1])

        # Check consistency of values.
        assert float(obs) == float(step + 1)
        assert float(action) == int(action) and int(action) in env.action_space
        assert float(obs) == float(reward)

    env.close()
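
The checks above pin down the behavior of the "unique-env" environment: the observation at step t equals t + 1 and the reward mirrors the observation. Below is a minimal sketch of an environment with those properties, written against the classic Gym API; the spaces and class name are assumptions, not the real UniqueEnv.

import gym
import numpy as np


class UniqueEnvSketch(gym.Env):
    """Sketch of a UniqueEnv-like environment inferred from the assertions in
    test_collect_rollout_values(). The spaces below are assumptions."""

    def __init__(self) -> None:
        self.observation_space = gym.spaces.Box(low=0.0, high=np.inf, shape=(1,))
        self.action_space = gym.spaces.Discrete(2)
        self.timestep = 0

    def reset(self) -> np.ndarray:
        self.timestep = 1
        return np.array([float(self.timestep)], dtype=np.float32)

    def step(self, action):
        # The reward mirrors the current observation, as the test asserts.
        reward = float(self.timestep)
        self.timestep += 1
        obs = np.array([float(self.timestep)], dtype=np.float32)
        return obs, reward, False, {}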
Example #5
def test_lr_schedule_cosine() -> None:
    """
    Tests learning rate schedule in the case where the schedule type is cosine.
    """

    # Initialize environment and policy.
    settings = dict(DEFAULT_SETTINGS)
    settings["lr_schedule_type"] = "cosine"
    env = get_env(settings["env_name"],
                  settings["num_processes"],
                  allow_early_resets=True)
    policy = get_policy(env, settings)

    # Define helper function to check learning rate.
    def check_lr(optimizer: Optimizer, lr: float) -> None:
        for param_group in optimizer.param_groups:
            assert abs(param_group["lr"] - lr) < TOL

    # Run training and test values of learning rate along the way.
    check_lr(policy.optimizer, settings["initial_lr"])
    for i in range(settings["num_updates"]):

        # Perform update.
        rollout = get_rollout(
            env,
            policy,
            settings["num_episodes"],
            settings["episode_len"],
            settings["num_processes"],
            settings["device"],
        )
        for step_loss in policy.get_loss(rollout):
            step_loss.backward()
            policy.optimizer.step()
        policy.after_step()

        # Check learning rate.
        interval_pos = math.pi * float(i + 1) / settings["num_updates"]
        offset = (0.5 * (settings["initial_lr"] - settings["final_lr"]) *
                  (1.0 + math.cos(interval_pos)))
        expected_lr = settings["final_lr"] + offset
        check_lr(policy.optimizer, expected_lr)
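
The expected learning rate above is a half-cosine interpolation from initial_lr down to final_lr over num_updates updates. The same formula, pulled out into a standalone helper (the function name is ours, not part of the library):

import math


def cosine_lr(update: int, num_updates: int, initial_lr: float, final_lr: float) -> float:
    """Learning rate after `update` completed updates, following the cosine
    schedule checked in test_lr_schedule_cosine()."""
    interval_pos = math.pi * float(update) / num_updates
    offset = 0.5 * (initial_lr - final_lr) * (1.0 + math.cos(interval_pos))
    return final_lr + offset


# The schedule starts at initial_lr and ends at final_lr.
assert abs(cosine_lr(0, 100, 3e-4, 3e-5) - 3e-4) < 1e-12
assert abs(cosine_lr(100, 100, 3e-4, 3e-5) - 3e-5) < 1e-12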
Example #6
def test_update_values() -> None:
    """
    Tests whether PPOPolicy.get_loss() calculates correct updates in the case of
    a linear actor/critic network and a dummy environment.
    """

    # Initialize environment and policy.
    settings = dict(DEFAULT_SETTINGS)
    env = get_env(settings["env_name"],
                  settings["num_processes"],
                  allow_early_resets=True)
    policy = get_policy(env, settings)

    # Initialize rollout storage.
    rollout = get_rollout(
        env,
        policy,
        settings["num_episodes"],
        settings["episode_len"],
        settings["num_processes"],
        settings["device"],
    )

    # Compute expected losses.
    expected_loss_items = get_losses(rollout, policy, settings)

    # Compute actual losses.
    actual_loss = 0
    for step_loss in policy.get_loss(rollout):
        actual_loss += step_loss.item()
        step_loss.backward()
        policy.optimizer.step()
    policy.after_step()

    # Compare expected vs. actual.
    diff = abs(float(actual_loss - expected_loss_items["total"]))
    print("loss diff: %.8f" % diff)
    assert diff < BIG_TOL
Example #7
def test_lr_schedule_null() -> None:
    """
    Tests learning rate schedule in the case where no schedule type is given (learning
    rate should be constant).
    """

    # Initialize environment and policy.
    settings = dict(DEFAULT_SETTINGS)
    env = get_env(settings["env_name"],
                  settings["num_processes"],
                  allow_early_resets=True)
    policy = get_policy(env, settings)

    # Define helper function to check learning rate.
    def check_lr(optimizer: Optimizer, lr: float) -> None:
        for param_group in optimizer.param_groups:
            assert param_group["lr"] == lr

    # Run training and test values of learning rate along the way.
    check_lr(policy.optimizer, settings["initial_lr"])
    for _ in range(settings["num_updates"]):

        # Perform update.
        rollout = get_rollout(
            env,
            policy,
            settings["num_episodes"],
            settings["episode_len"],
            settings["num_processes"],
            settings["device"],
        )
        for step_loss in policy.get_loss(rollout):
            step_loss.backward()
            policy.optimizer.step()
        policy.after_step()

        # Check learning rate.
        check_lr(policy.optimizer, settings["initial_lr"])
Example #8
def test_multitask_losses() -> None:
    """
    Tests that PPOPolicy.get_loss() correctly computes task specific losses when
    multi-task training.
    """

    # Initialize environment and policy. Note that we set `normalize_first_n` to 39,
    # since it is the size of the total observation minus the number of tasks. We also
    # set `normalize_advantages` to False, as this makes it possible to compute the task
    # specific losses while only considering each task's own transitions. Changing this
    # setting to True will cause this test to fail.
    settings = dict(DEFAULT_SETTINGS)
    settings["env_name"] = "MT10"
    settings["num_tasks"] = get_num_tasks(settings["env_name"])
    settings["num_processes"] = 10
    settings["num_episodes"] = 1
    settings["episode_len"] = 100
    settings["normalize_advantages"] = False
    env = get_env(
        settings["env_name"],
        settings["num_processes"],
        allow_early_resets=True,
        normalize_first_n=39,
    )
    policy = get_policy(env, settings)

    # Initialize rollout and task specific rollouts.
    rollout, task_rollouts = get_task_rollouts(
        env,
        policy,
        settings["num_tasks"],
        settings["num_episodes"],
        settings["episode_len"],
        settings["num_processes"],
        settings["device"],
    )

    # Compute expected task losses.
    expected_task_losses = []
    for task, task_rollout in enumerate(task_rollouts):
        if task_rollout is None:
            expected_task_losses.append(0)
        else:
            task_settings = dict(settings)
            task_settings["num_processes"] = task_rollout.num_processes
            expected_loss_items = get_losses(task_rollout, policy,
                                             task_settings)
            expected_task_losses.append(expected_loss_items["total"])

    # Compute actual losses.
    actual_task_losses = [0] * settings["num_tasks"]
    for step_loss in policy.get_loss(rollout):
        actual_task_losses = [
            actual_task_losses[i] + step_loss[i].item()
            for i in range(settings["num_tasks"])
        ]

        # Aggregate task losses to execute backward pass.
        step_loss = sum(step_loss)
        step_loss.backward()
        policy.optimizer.step()

    policy.after_step()

    # Compare expected vs. actual.
    for task in range(settings["num_tasks"]):
        diff = abs(actual_task_losses[task] - expected_task_losses[task])
        print("loss diff: %.8f, %.8f, %.8f" %
              (actual_task_losses[task], expected_task_losses[task], diff))
        assert diff < BIG_TOL
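
The per-task bookkeeping above leans on the observation layout described in the comment at the top of this test: for MT10, the first 39 dimensions are the environment observation and the remaining dimensions are a one-hot task indicator. Here is a sketch of recovering the task index from a batch of such observations; the constants and function name are assumptions based on that comment, and get_task_rollouts() presumably does something similar internally.

import torch

# Assumed MT10 observation layout: 39 environment dimensions followed by a
# one-hot task vector (values inferred from the comment in the test above).
OBS_DIM = 39
NUM_TASKS = 10


def task_indices(obs_batch: torch.Tensor) -> torch.Tensor:
    """Return the task index for each row of a [batch_size, OBS_DIM + NUM_TASKS]
    observation batch by reading its one-hot suffix."""
    one_hot = obs_batch[:, OBS_DIM:OBS_DIM + NUM_TASKS]
    return one_hot.argmax(dim=-1)


# Tiny usage example with a fabricated batch of two observations for tasks 3 and 7.
batch = torch.zeros(2, OBS_DIM + NUM_TASKS)
batch[0, OBS_DIM + 3] = 1.0
batch[1, OBS_DIM + 7] = 1.0
assert task_indices(batch).tolist() == [3, 7]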