Exemplo n.º 1
0
def train_seq2reward_and_compute_reward_mse(
    env_name: str,
    model: ModelManager__Union,
    num_train_transitions: int,
    num_test_transitions: int,
    seq_len: int,
    batch_size: int,
    num_train_epochs: int,
    use_gpu: bool,
    saved_seq2reward_path: Optional[str] = None,
):
    """ Train Seq2Reward Network and compute reward mse. """
    env = Gym(env_name=env_name)
    env.seed(SEED)

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=build_normalizer(env),
    )

    device = "cuda" if use_gpu else "cpu"
    # pyre-fixme[6]: Expected `device` for 2nd param but got `str`.
    trainer_preprocessor = make_replay_buffer_trainer_preprocessor(trainer, device, env)
    test_replay_buffer = ReplayBuffer(
        replay_capacity=num_test_transitions,
        batch_size=batch_size,
        stack_size=seq_len,
        return_everything_as_stack=True,
    )
    fill_replay_buffer(env, test_replay_buffer, num_test_transitions)

    if saved_seq2reward_path is None:
        # train from scratch
        trainer = train_seq2reward(
            env=env,
            trainer=trainer,
            trainer_preprocessor=trainer_preprocessor,
            num_train_transitions=num_train_transitions,
            seq_len=seq_len,
            batch_size=batch_size,
            num_train_epochs=num_train_epochs,
            test_replay_buffer=test_replay_buffer,
        )
    else:
        # load a pretrained model, and just evaluate it
        trainer.seq2reward_network.load_state_dict(torch.load(saved_seq2reward_path))
    state_dim = env.observation_space.shape[0]
    with torch.no_grad():
        trainer.seq2reward_network.eval()
        test_batch = test_replay_buffer.sample_transition_batch(
            batch_size=test_replay_buffer.size
        )
        preprocessed_test_batch = trainer_preprocessor(test_batch)
        adhoc_action_padding(preprocessed_test_batch, state_dim=state_dim)
        losses = trainer.get_loss(preprocessed_test_batch)
        detached_losses = losses.cpu().detach().item()
        trainer.seq2reward_network.train()
    return detached_losses
Exemplo n.º 2
0
    def create_for_trainer(
        cls,
        trainer,
        env: EnvWrapper,
        agent: Agent,
        replay_buffer: ReplayBuffer,
        batch_size: int,
        training_frequency: int = 1,
        num_episodes: Optional[int] = None,
        max_steps: Optional[int] = None,
        trainer_preprocessor=None,
        replay_buffer_inserter=None,
    ):
        device = torch.device("cpu")
        if trainer_preprocessor is None:
            trainer_preprocessor = make_replay_buffer_trainer_preprocessor(
                trainer, device, env
            )

        if replay_buffer_inserter is None:
            replay_buffer_inserter = make_replay_buffer_inserter(env)

        return cls(
            env=env,
            agent=agent,
            replay_buffer=replay_buffer,
            batch_size=batch_size,
            training_frequency=training_frequency,
            num_episodes=num_episodes,
            max_steps=max_steps,
            trainer_preprocessor=trainer_preprocessor,
            replay_buffer_inserter=replay_buffer_inserter,
        )
Exemplo n.º 3
0
def run_test_offline(
    env_name: str,
    model: ModelManager__Union,
    replay_memory_size: int,
    num_batches_per_epoch: int,
    num_train_epochs: int,
    passing_score_bar: float,
    num_eval_episodes: int,
    minibatch_size: int,
    use_gpu: bool,
):
    env = Gym(env_name=env_name)
    env.seed(SEED)
    env.action_space.seed(SEED)
    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )

    # first fill the replay buffer to burn_in
    replay_buffer = ReplayBuffer(replay_capacity=replay_memory_size,
                                 batch_size=minibatch_size)
    # always fill full RB
    random_policy = make_random_policy_for_env(env)
    agent = Agent.create_for_env(env, policy=random_policy)
    fill_replay_buffer(
        env=env,
        replay_buffer=replay_buffer,
        desired_size=replay_memory_size,
        agent=agent,
    )

    device = torch.device("cuda") if use_gpu else None
    # pyre-fixme[6]: Expected `device` for 2nd param but got `Optional[torch.device]`.
    trainer_preprocessor = make_replay_buffer_trainer_preprocessor(
        trainer, device, env)

    writer = SummaryWriter()
    with summary_writer_context(writer):
        for epoch in range(num_train_epochs):
            logger.info(f"Evaluating before epoch {epoch}: ")
            eval_rewards = evaluate_cem(env, manager, 1)
            for _ in tqdm(range(num_batches_per_epoch)):
                train_batch = replay_buffer.sample_transition_batch()
                preprocessed_batch = trainer_preprocessor(train_batch)
                trainer.train(preprocessed_batch)

    logger.info(f"Evaluating after training for {num_train_epochs} epochs: ")
    eval_rewards = evaluate_cem(env, manager, num_eval_episodes)
    mean_rewards = np.mean(eval_rewards)
    assert (mean_rewards >= passing_score_bar
            ), f"{mean_rewards} doesn't pass the bar {passing_score_bar}."
Exemplo n.º 4
0
def train_with_replay_buffer_post_step(
    replay_buffer: ReplayBuffer,
    env: gym.Env,
    trainer: Trainer,
    training_freq: int,
    batch_size: int,
    trainer_preprocessor=None,
    device: Union[str, torch.device] = "cpu",
    replay_buffer_inserter=None,
) -> PostStep:
    """ Called in post_step of agent to train based on replay buffer (RB).
        Args:
            trainer: responsible for having a .train method to train the model
            trainer_preprocessor: format RB output for trainer.train
            training_freq: how many steps in between trains
            batch_size: how big of a batch to sample
    """
    if isinstance(device, str):
        device = torch.device(device)

    if trainer_preprocessor is None:
        trainer_preprocessor = make_replay_buffer_trainer_preprocessor(
            trainer, device, env)

    if replay_buffer_inserter is None:
        replay_buffer_inserter = make_replay_buffer_inserter(env)

    _num_steps = 0

    def post_step(obs: Any, action: Any, reward: float, terminal: bool,
                  log_prob: float) -> None:
        nonlocal _num_steps

        replay_buffer_inserter(replay_buffer, obs, action, reward, terminal,
                               log_prob)

        if _num_steps % training_freq == 0:
            assert replay_buffer.size >= batch_size
            train_batch = replay_buffer.sample_transition_batch_tensor(
                batch_size=batch_size)
            preprocessed_batch = trainer_preprocessor(train_batch)
            trainer.train(preprocessed_batch)
        _num_steps += 1
        return

    return post_step
Exemplo n.º 5
0
    def create_for_trainer(
        cls,
        trainer,
        env: EnvWrapper,
        replay_buffer: ReplayBuffer,
        batch_size: int,
        num_batches: int,
        trainer_preprocessor=None,
        device=None,
    ):
        device = device or torch.device("cpu")
        if trainer_preprocessor is None:
            trainer_preprocessor = make_replay_buffer_trainer_preprocessor(
                trainer, device, env)

        return cls(
            env=env,
            replay_buffer=replay_buffer,
            batch_size=batch_size,
            num_batches=num_batches,
            trainer_preprocessor=trainer_preprocessor,
        )
Exemplo n.º 6
0
def train_mdnrnn_and_train_on_embedded_env(
    env_name: str,
    embedding_model: ModelManager__Union,
    num_embedding_train_transitions: int,
    seq_len: int,
    batch_size: int,
    num_embedding_train_epochs: int,
    train_model: ModelManager__Union,
    num_state_embed_transitions: int,
    num_agent_train_epochs: int,
    num_agent_eval_epochs: int,
    use_gpu: bool,
    passing_score_bar: float,
    # pyre-fixme[9]: saved_mdnrnn_path has type `str`; used as `None`.
    saved_mdnrnn_path: str = None,
):
    """ Train an agent on embedded states by the MDNRNN. """
    env = EnvFactory.make(env_name)
    env.seed(SEED)

    embedding_manager = embedding_model.value
    embedding_trainer = embedding_manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=build_normalizer(env),
    )

    device = "cuda" if use_gpu else "cpu"
    embedding_trainer_preprocessor = make_replay_buffer_trainer_preprocessor(
        embedding_trainer, device, env
    )
    if saved_mdnrnn_path is None:
        # train from scratch
        embedding_trainer = train_mdnrnn(
            env=env,
            trainer=embedding_trainer,
            trainer_preprocessor=embedding_trainer_preprocessor,
            num_train_transitions=num_embedding_train_transitions,
            seq_len=seq_len,
            batch_size=batch_size,
            num_train_epochs=num_embedding_train_epochs,
        )
    else:
        # load a pretrained model, and just evaluate it
        embedding_trainer.memory_network.mdnrnn.load_state_dict(
            torch.load(saved_mdnrnn_path)
        )

    # create embedding dataset
    embed_rb, state_min, state_max = create_embed_rl_dataset(
        env=env,
        memory_network=embedding_trainer.memory_network,
        num_state_embed_transitions=num_state_embed_transitions,
        batch_size=batch_size,
        seq_len=seq_len,
        hidden_dim=embedding_trainer.params.hidden_size,
        use_gpu=use_gpu,
    )
    embed_env = StateEmbedEnvironment(
        gym_env=env,
        mdnrnn=embedding_trainer.memory_network,
        max_embed_seq_len=seq_len,
        state_min_value=state_min,
        state_max_value=state_max,
    )
    agent_manager = train_model.value
    agent_trainer = agent_manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=build_normalizer(embed_env),
    )
    device = "cuda" if use_gpu else "cpu"
    agent_trainer_preprocessor = make_replay_buffer_trainer_preprocessor(
        agent_trainer, device, env
    )
    num_batch_per_epoch = embed_rb.size // batch_size
    for epoch in range(num_agent_train_epochs):
        for _ in tqdm(range(num_batch_per_epoch), desc=f"epoch {epoch}"):
            batch = embed_rb.sample_transition_batch_tensor(batch_size=batch_size)
            preprocessed_batch = agent_trainer_preprocessor(batch)
            agent_trainer.train(preprocessed_batch)

    # evaluate model
    rewards = []
    policy = agent_manager.create_policy(serving=False)
    agent = Agent.create_for_env(embed_env, policy=policy, device=device)
    for i in range(num_agent_eval_epochs):
        ep_reward = run_episode(env=embed_env, agent=agent)
        rewards.append(ep_reward)
        logger.info(f"Finished eval episode {i} with reward {ep_reward}.")
    logger.info(f"Average eval reward is {np.mean(rewards)}.")
    assert (
        np.mean(rewards) >= passing_score_bar
    ), f"average reward doesn't pass our bar {passing_score_bar}"
    return rewards
Exemplo n.º 7
0
def train_mdnrnn_and_compute_feature_stats(
    env_name: str,
    model: ModelManager__Union,
    num_train_transitions: int,
    num_test_transitions: int,
    seq_len: int,
    batch_size: int,
    num_train_epochs: int,
    use_gpu: bool,
    saved_mdnrnn_path: Optional[str] = None,
):
    """ Train MDNRNN Memory Network and compute feature importance/sensitivity. """
    env: gym.Env = EnvFactory.make(env_name)
    env.seed(SEED)

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=build_normalizer(env),
    )

    device = "cuda" if use_gpu else "cpu"
    trainer_preprocessor = make_replay_buffer_trainer_preprocessor(trainer, device, env)
    test_replay_buffer = ReplayBuffer.create_from_env(
        env=env,
        replay_memory_size=num_test_transitions,
        batch_size=batch_size,
        stack_size=seq_len,
        return_everything_as_stack=True,
    )
    fill_replay_buffer(env, test_replay_buffer, num_test_transitions)

    if saved_mdnrnn_path is None:
        # train from scratch
        trainer = train_mdnrnn(
            env=env,
            trainer=trainer,
            trainer_preprocessor=trainer_preprocessor,
            num_train_transitions=num_train_transitions,
            seq_len=seq_len,
            batch_size=batch_size,
            num_train_epochs=num_train_epochs,
            test_replay_buffer=test_replay_buffer,
        )
    else:
        # load a pretrained model, and just evaluate it
        trainer.memory_network.mdnrnn.load_state_dict(torch.load(saved_mdnrnn_path))

    with torch.no_grad():
        trainer.memory_network.mdnrnn.eval()
        test_batch = test_replay_buffer.sample_transition_batch_tensor(
            batch_size=test_replay_buffer.size
        )
        preprocessed_test_batch = trainer_preprocessor(test_batch)
        feature_importance = calculate_feature_importance(
            env=env,
            trainer=trainer,
            use_gpu=use_gpu,
            test_batch=preprocessed_test_batch,
        )

        feature_sensitivity = calculate_feature_sensitivity(
            env=env,
            trainer=trainer,
            use_gpu=use_gpu,
            test_batch=preprocessed_test_batch,
        )

        trainer.memory_network.mdnrnn.train()
    return feature_importance, feature_sensitivity
Exemplo n.º 8
0
def train_mdnrnn_and_train_on_embedded_env(
    env_name: str,
    embedding_model: ModelManager__Union,
    num_embedding_train_transitions: int,
    seq_len: int,
    batch_size: int,
    num_embedding_train_epochs: int,
    train_model: ModelManager__Union,
    num_state_embed_transitions: int,
    num_agent_train_epochs: int,
    num_agent_eval_epochs: int,
    use_gpu: bool,
    passing_score_bar: float,
    # pyre-fixme[9]: saved_mdnrnn_path has type `str`; used as `None`.
    saved_mdnrnn_path: str = None,
):
    """ Train an agent on embedded states by the MDNRNN. """
    env = Gym(env_name=env_name)
    env.seed(SEED)

    embedding_manager = embedding_model.value
    embedding_trainer = embedding_manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=build_normalizer(env),
    )

    device = "cuda" if use_gpu else "cpu"
    embedding_trainer_preprocessor = make_replay_buffer_trainer_preprocessor(
        embedding_trainer,
        # pyre-fixme[6]: Expected `device` for 2nd param but got `str`.
        device,
        env,
    )
    if saved_mdnrnn_path is None:
        # train from scratch
        embedding_trainer = train_mdnrnn(
            env=env,
            trainer=embedding_trainer,
            trainer_preprocessor=embedding_trainer_preprocessor,
            num_train_transitions=num_embedding_train_transitions,
            seq_len=seq_len,
            batch_size=batch_size,
            num_train_epochs=num_embedding_train_epochs,
        )
    else:
        # load a pretrained model, and just evaluate it
        embedding_trainer.memory_network.mdnrnn.load_state_dict(
            torch.load(saved_mdnrnn_path))

    # create embedding dataset
    embed_rb, state_min, state_max = create_embed_rl_dataset(
        env=env,
        memory_network=embedding_trainer.memory_network,
        num_state_embed_transitions=num_state_embed_transitions,
        batch_size=batch_size,
        seq_len=seq_len,
        hidden_dim=embedding_trainer.params.hidden_size,
        use_gpu=use_gpu,
    )
    embed_env = StateEmbedEnvironment(
        gym_env=env,
        mdnrnn=embedding_trainer.memory_network,
        max_embed_seq_len=seq_len,
        state_min_value=state_min,
        state_max_value=state_max,
    )
    agent_manager = train_model.value
    agent_trainer = agent_manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        # pyre-fixme[6]: Expected `EnvWrapper` for 1st param but got
        #  `StateEmbedEnvironment`.
        normalization_data_map=build_normalizer(embed_env),
    )
    device = "cuda" if use_gpu else "cpu"
    agent_trainer_preprocessor = make_replay_buffer_trainer_preprocessor(
        agent_trainer,
        # pyre-fixme[6]: Expected `device` for 2nd param but got `str`.
        device,
        env,
    )
    num_batch_per_epoch = embed_rb.size // batch_size
    # FIXME: This has to be wrapped in dataloader
    for epoch in range(num_agent_train_epochs):
        for _ in tqdm(range(num_batch_per_epoch), desc=f"epoch {epoch}"):
            batch = embed_rb.sample_transition_batch(batch_size=batch_size)
            preprocessed_batch = agent_trainer_preprocessor(batch)
            # FIXME: This should be fitted with Lightning's trainer
            agent_trainer.train(preprocessed_batch)

    # evaluate model
    rewards = []
    policy = agent_manager.create_policy(serving=False)
    # pyre-fixme[6]: Expected `EnvWrapper` for 1st param but got
    #  `StateEmbedEnvironment`.
    agent = Agent.create_for_env(embed_env, policy=policy, device=device)
    # num_processes=1 needed to avoid workers from dying on CircleCI tests
    rewards = evaluate_for_n_episodes(
        n=num_agent_eval_epochs,
        # pyre-fixme[6]: Expected `EnvWrapper` for 2nd param but got
        #  `StateEmbedEnvironment`.
        env=embed_env,
        agent=agent,
        num_processes=1,
    )
    assert (np.mean(rewards) >= passing_score_bar
            ), f"average reward doesn't pass our bar {passing_score_bar}"
    return rewards