Example #1
def run_test_online_episode(
    env: Env__Union,
    model: ModelManager__Union,
    num_train_episodes: int,
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
):
    """
    Run an online learning test. At the end of each episode, training is run on the trajectory.
    """
    env = env.value
    pl.seed_everything(SEED)
    env.seed(SEED)
    env.action_space.seed(SEED)

    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    trainer = manager.build_trainer(
        use_gpu=use_gpu,
        normalization_data_map=normalization,
    )
    policy = manager.create_policy(trainer, serving=False)

    device = torch.device("cuda") if use_gpu else torch.device("cpu")

    agent = Agent.create_for_env(env, policy, device=device)

    pl_trainer = pl.Trainer(
        max_epochs=1,
        gpus=int(use_gpu),
        deterministic=True,
        default_root_dir=f"lightning_log_{str(uuid.uuid4())}",
    )
    dataset = EpisodicDataset(env=env,
                              agent=agent,
                              num_episodes=num_train_episodes,
                              seed=SEED)
    data_loader = torch.utils.data.DataLoader(dataset,
                                              collate_fn=identity_collate)
    pl_trainer.fit(trainer, data_loader)

    eval_rewards = evaluate_for_n_episodes(
        n=num_eval_episodes,
        env=env,
        agent=agent,
        max_steps=env.max_steps,
        num_processes=1,
    ).squeeze(1)
    assert (
        eval_rewards.mean() >= passing_score_bar
    ), f"Eval reward is {eval_rewards.mean()}, less than < {passing_score_bar}.\n"
Example #2
def run_test(
    env: Env__Union,
    model: ModelManager__Union,
    replay_memory_size: int,
    train_every_ts: int,
    train_after_ts: int,
    num_train_episodes: int,
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
    minibatch_size: Optional[int] = None,
):
    env = env.value

    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )
    training_policy = manager.create_policy(serving=False)

    # pyre-fixme[16]: Module `pl` has no attribute `LightningModule`.
    if not isinstance(trainer, pl.LightningModule):
        if minibatch_size is None:
            minibatch_size = trainer.minibatch_size
        assert minibatch_size == trainer.minibatch_size

    assert minibatch_size is not None

    replay_buffer = ReplayBuffer(replay_capacity=replay_memory_size,
                                 batch_size=minibatch_size)

    device = torch.device("cuda") if use_gpu else torch.device("cpu")
    # first fill the replay buffer up to the burn-in size
    train_after_ts = max(train_after_ts, minibatch_size)
    fill_replay_buffer(env=env,
                       replay_buffer=replay_buffer,
                       desired_size=train_after_ts)

    # pyre-fixme[16]: Module `pl` has no attribute `LightningModule`.
    if isinstance(trainer, pl.LightningModule):
        agent = Agent.create_for_env(env, policy=training_policy)
        # TODO: Simplify this setup by creating LightningDataModule
        dataset = ReplayBufferDataset.create_for_trainer(
            trainer,
            env,
            agent,
            replay_buffer,
            batch_size=minibatch_size,
            training_frequency=train_every_ts,
            num_episodes=num_train_episodes,
            max_steps=200,
        )
        data_loader = torch.utils.data.DataLoader(dataset,
                                                  collate_fn=identity_collate)
        # pyre-fixme[16]: Module `pl` has no attribute `Trainer`.
        pl_trainer = pl.Trainer(max_epochs=1, gpus=int(use_gpu))
        pl_trainer.fit(trainer, data_loader)

        # TODO: Also check train_reward
    else:
        post_step = train_with_replay_buffer_post_step(
            replay_buffer=replay_buffer,
            env=env,
            trainer=trainer,
            training_freq=train_every_ts,
            batch_size=trainer.minibatch_size,
            device=device,
        )

        env.seed(SEED)
        env.action_space.seed(SEED)

        train_rewards = train_policy(
            env,
            training_policy,
            num_train_episodes,
            post_step=post_step,
            post_episode=None,
            use_gpu=use_gpu,
        )

        # Check whether the max score passed the score bar; since we explore during
        # training, the return could be bad (leading to flakiness in C51 and QRDQN).
        assert np.max(train_rewards) >= passing_score_bar, (
            f"max reward ({np.max(train_rewards)}) after training for "
            f"{len(train_rewards)} episodes is less than {passing_score_bar}.\n"
        )

    serving_policy = manager.create_policy(serving=True)

    eval_rewards = eval_policy(env,
                               serving_policy,
                               num_eval_episodes,
                               serving=True)
    assert (
        eval_rewards.mean() >= passing_score_bar
    ), f"Eval reward is {eval_rewards.mean()}, less than < {passing_score_bar}.\n"
Example #3
def run_test_online_episode(
    env: Env__Union,
    model: ModelManager__Union,
    num_train_episodes: int,
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
):
    """
    Run an online learning test. At the end of each episode, training is run on the trajectory.
    """
    env = env.value
    # pyre-fixme[16]: Module `pl` has no attribute `seed_everything`.
    pl.seed_everything(SEED)
    env.seed(SEED)
    env.action_space.seed(SEED)

    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )
    policy = manager.create_policy(serving=False)

    device = torch.device("cuda") if use_gpu else torch.device("cpu")

    agent = Agent.create_for_env(env, policy, device=device)

    # pyre-fixme[16]: Module `pl` has no attribute `LightningModule`.
    if isinstance(trainer, pl.LightningModule):
        # pyre-fixme[16]: Module `pl` has no attribute `Trainer`.
        pl_trainer = pl.Trainer(max_epochs=1,
                                gpus=int(use_gpu),
                                deterministic=True)
        dataset = EpisodicDataset(env=env,
                                  agent=agent,
                                  num_episodes=num_train_episodes,
                                  seed=SEED)
        pl_trainer.fit(trainer, dataset)
    else:
        post_episode_callback = train_post_episode(env, trainer, use_gpu)
        _ = train_policy(
            env,
            policy,
            num_train_episodes,
            post_step=None,
            post_episode=post_episode_callback,
            use_gpu=use_gpu,
        )

    eval_rewards = evaluate_for_n_episodes(
        n=num_eval_episodes,
        env=env,
        agent=agent,
        max_steps=env.max_steps,
        num_processes=1,
    ).squeeze(1)
    assert (
        eval_rewards.mean() >= passing_score_bar
    ), f"Eval reward is {eval_rewards.mean()}, less than < {passing_score_bar}.\n"
Example #4
def run_test_replay_buffer(
    env: Env__Union,
    model: ModelManager__Union,
    replay_memory_size: int,
    train_every_ts: int,
    train_after_ts: int,
    num_train_episodes: int,
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
    minibatch_size: Optional[int] = None,
):
    """
    Run an online learning test with a replay buffer. The replay buffer is pre-filled, then training starts.
    Each transition is added to the replay buffer immediately after it takes place.
    """
    env = env.value
    # pyre-fixme[16]: Module `pl` has no attribute `seed_everything`.
    pl.seed_everything(SEED)
    env.seed(SEED)
    env.action_space.seed(SEED)

    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )
    training_policy = manager.create_policy(serving=False)

    # pyre-fixme[16]: Module `pl` has no attribute `LightningModule`.
    if not isinstance(trainer, pl.LightningModule):
        if minibatch_size is None:
            minibatch_size = trainer.minibatch_size
        assert minibatch_size == trainer.minibatch_size

    assert minibatch_size is not None

    replay_buffer = ReplayBuffer(replay_capacity=replay_memory_size,
                                 batch_size=minibatch_size)

    device = torch.device("cuda") if use_gpu else torch.device("cpu")
    # first fill the replay buffer using a random policy
    train_after_ts = max(train_after_ts, minibatch_size)
    fill_replay_buffer(env=env,
                       replay_buffer=replay_buffer,
                       desired_size=train_after_ts)

    agent = Agent.create_for_env(env, policy=training_policy, device=device)
    # TODO: Simplify this setup by creating LightningDataModule
    dataset = ReplayBufferDataset.create_for_trainer(
        trainer,
        env,
        agent,
        replay_buffer,
        batch_size=minibatch_size,
        training_frequency=train_every_ts,
        num_episodes=num_train_episodes,
        max_steps=200,
        device=device,
    )
    data_loader = torch.utils.data.DataLoader(dataset,
                                              collate_fn=identity_collate)
    # pyre-fixme[16]: Module `pl` has no attribute `Trainer`.
    pl_trainer = pl.Trainer(max_epochs=1, gpus=int(use_gpu))
    # Note: the fit() function below also evaluates the agent along the way
    # and adds the new transitions to the replay buffer, so it is training
    # on incrementally larger and larger buffers.
    pl_trainer.fit(trainer, data_loader)

    # TODO: Also check train_reward

    serving_policy = manager.create_policy(serving=True)

    eval_rewards = eval_policy(env,
                               serving_policy,
                               num_eval_episodes,
                               serving=True)
    assert (
        eval_rewards.mean() >= passing_score_bar
    ), f"Eval reward is {eval_rewards.mean()}, less than < {passing_score_bar}.\n"
Example #5
def run_test(
    env: Env__Union,
    model: ModelManager__Union,
    replay_memory_size: int,
    train_every_ts: int,
    train_after_ts: int,
    num_train_episodes: int,
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
):
    env = env.value
    env.seed(SEED)
    env.action_space.seed(SEED)
    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )
    training_policy = manager.create_policy(serving=False)

    replay_buffer = ReplayBuffer(replay_capacity=replay_memory_size,
                                 batch_size=trainer.minibatch_size)

    device = torch.device("cuda") if use_gpu else torch.device("cpu")
    # first fill the replay buffer up to the burn-in size
    train_after_ts = max(train_after_ts, trainer.minibatch_size)
    fill_replay_buffer(env=env,
                       replay_buffer=replay_buffer,
                       desired_size=train_after_ts)

    post_step = train_with_replay_buffer_post_step(
        replay_buffer=replay_buffer,
        env=env,
        trainer=trainer,
        training_freq=train_every_ts,
        batch_size=trainer.minibatch_size,
        device=device,
    )

    agent = Agent.create_for_env(env,
                                 policy=training_policy,
                                 post_transition_callback=post_step,
                                 device=device)

    writer = SummaryWriter()
    with summary_writer_context(writer):
        train_rewards = []
        for i in range(num_train_episodes):
            trajectory = run_episode(env=env,
                                     agent=agent,
                                     mdp_id=i,
                                     max_steps=env.max_steps)
            ep_reward = trajectory.calculate_cumulative_reward()
            train_rewards.append(ep_reward)
            logger.info(
                f"Finished training episode {i} (len {len(trajectory)})"
                f" with reward {ep_reward}.")

    logger.info("============Train rewards=============")
    logger.info(train_rewards)
    logger.info(
        f"average: {np.mean(train_rewards)};\tmax: {np.max(train_rewards)}")

    # Check whether the max score passed the score bar; since we explore during
    # training, the return could be bad (leading to flakiness in C51 and QRDQN).
    assert np.max(train_rewards) >= passing_score_bar, (
        f"max reward ({np.max(train_rewards)}) after training for "
        f"{len(train_rewards)} episodes is less than {passing_score_bar}.\n")

    serving_policy = manager.create_policy(serving=True)
    agent = Agent.create_for_env_with_serving_policy(env, serving_policy)

    eval_rewards = evaluate_for_n_episodes(n=num_eval_episodes,
                                           env=env,
                                           agent=agent,
                                           max_steps=env.max_steps).squeeze(1)

    logger.info("============Eval rewards==============")
    logger.info(eval_rewards)
    logger.info(
        f"average: {np.mean(eval_rewards)};\tmax: {np.max(eval_rewards)}")
    assert np.mean(eval_rewards) >= passing_score_bar, (
        f"Predictor reward is {np.mean(eval_rewards)}, "
        f"less than {passing_score_bar}.\n")
Example #6
File: test_gym.py  Project: saonam/ReAgent
def run_test(
    env: Env__Union,
    model: ModelManager__Union,
    replay_memory_size: int,
    train_every_ts: int,
    train_after_ts: int,
    num_train_episodes: int,
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
):
    env = env.value

    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )
    training_policy = manager.create_policy(serving=False)

    replay_buffer = ReplayBuffer(replay_capacity=replay_memory_size,
                                 batch_size=trainer.minibatch_size)

    device = torch.device("cuda") if use_gpu else torch.device("cpu")
    # first fill the replay buffer up to the burn-in size
    train_after_ts = max(train_after_ts, trainer.minibatch_size)
    fill_replay_buffer(env=env,
                       replay_buffer=replay_buffer,
                       desired_size=train_after_ts)

    post_step = train_with_replay_buffer_post_step(
        replay_buffer=replay_buffer,
        env=env,
        trainer=trainer,
        training_freq=train_every_ts,
        batch_size=trainer.minibatch_size,
        device=device,
    )

    env.seed(SEED)
    env.action_space.seed(SEED)

    train_rewards = train_policy(
        env,
        training_policy,
        num_train_episodes,
        post_step=post_step,
        post_episode=None,
        use_gpu=use_gpu,
    )

    # Check whether the max score passed the score bar; since we explore during
    # training, the return could be bad (leading to flakiness in C51 and QRDQN).
    assert np.max(train_rewards) >= passing_score_bar, (
        f"max reward ({np.max(train_rewards)}) after training for "
        f"{len(train_rewards)} episodes is less than {passing_score_bar}.\n")

    serving_policy = manager.create_policy(serving=True)

    eval_rewards = eval_policy(env,
                               serving_policy,
                               num_eval_episodes,
                               serving=True)
    assert (
        eval_rewards.mean() >= passing_score_bar
    ), f"Eval reward is {eval_rewards.mean()}, less than < {passing_score_bar}.\n"