Example #1
def run_gym(
    params: OpenAiGymParameters,
    score_bar,
    embed_rl_dataset: RLDataset,
    gym_env: Env,
    mdnrnn: MemoryNetwork,
    max_embed_seq_len: int,
):
    assert params.rl is not None
    rl_parameters = params.rl

    env_type = params.env
    model_type = params.model_type
    epsilon, epsilon_decay, minimum_epsilon = create_epsilon(
        offline_train=True, rl_parameters=rl_parameters, params=params
    )

    # Fill the replay buffer with the pre-embedded offline transitions
    replay_buffer = OpenAIGymMemoryPool(params.max_replay_memory_size)
    for row in embed_rl_dataset.rows:
        replay_buffer.insert_into_memory(**row)

    # Compute the observed state-value range, then wrap the raw gym env so that
    # observations are embedded by the memory network before the agent sees them
    assert replay_buffer.memory_buffer is not None
    state_mem = replay_buffer.memory_buffer.state
    state_min_value = torch.min(state_mem).item()
    state_max_value = torch.max(state_mem).item()
    state_embed_env = StateEmbedGymEnvironment(
        gym_env, mdnrnn, max_embed_seq_len, state_min_value, state_max_value
    )
    open_ai_env = OpenAIGymEnvironment(
        state_embed_env,
        epsilon,
        rl_parameters.softmax_policy,
        rl_parameters.gamma,
        epsilon_decay,
        minimum_epsilon,
    )
    rl_trainer = create_trainer(params, open_ai_env)
    rl_predictor = create_predictor(
        rl_trainer, model_type, params.use_gpu, open_ai_env.action_dim
    )

    assert (
        params.run_details.max_steps is not None
        and params.run_details.offline_train_epochs is not None
    ), "Missing data required for offline training: {}".format(str(params.run_details))
    return train_gym_offline_rl(
        gym_env=open_ai_env,
        replay_buffer=replay_buffer,
        model_type=model_type,
        trainer=rl_trainer,
        predictor=rl_predictor,
        test_run_name="{} offline rl state embed".format(env_type),
        score_bar=score_bar,
        max_steps=params.run_details.max_steps,
        avg_over_num_episodes=params.run_details.avg_over_num_episodes,
        offline_train_epochs=params.run_details.offline_train_epochs,
        num_batch_per_epoch=None,
    )
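
For orientation, here is a minimal invocation sketch of this variant. It assumes an OpenAiGymParameters instance, an embedded RLDataset, the raw gym Env, and a trained MemoryNetwork have already been built elsewhere; the sequence length is an illustrative value, and the return-value unpacking mirrors what train_gym_offline_rl returns in Example #7 below.

# Illustrative call only; every input is assumed to be constructed elsewhere
# (params via OpenAiGymParameters, embed_rl_dataset via whatever produced the
# embedded RLDataset).
avg_reward_history, epoch_history, trainer, predictor, env = run_gym(
    params=params,
    score_bar=None,                  # no early-stopping reward threshold
    embed_rl_dataset=embed_rl_dataset,
    gym_env=gym_env,
    mdnrnn=mdnrnn,
    max_embed_seq_len=10,            # illustrative sequence length
)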
Example #2
def run_gym(
    params,
    use_gpu,
    score_bar,
    embed_rl_dataset: RLDataset,
    gym_env: Env,
    mdnrnn: MemoryNetwork,
    max_embed_seq_len: int,
):
    rl_parameters = RLParameters(**params["rl"])
    env_type = params["env"]
    model_type = params["model_type"]
    epsilon, epsilon_decay, minimum_epsilon = create_epsilon(
        offline_train=True, rl_parameters=rl_parameters, params=params
    )

    replay_buffer = OpenAIGymMemoryPool(params["max_replay_memory_size"])
    for row in embed_rl_dataset.rows:
        replay_buffer.insert_into_memory(**row)

    state_mem = torch.cat([m[0] for m in replay_buffer.replay_memory])
    state_min_value = torch.min(state_mem).item()
    state_max_value = torch.max(state_mem).item()
    state_embed_env = StateEmbedGymEnvironment(
        gym_env, mdnrnn, max_embed_seq_len, state_min_value, state_max_value
    )
    open_ai_env = OpenAIGymEnvironment(
        state_embed_env,
        epsilon,
        rl_parameters.softmax_policy,
        rl_parameters.gamma,
        epsilon_decay,
        minimum_epsilon,
    )
    rl_trainer = create_trainer(
        params["model_type"], params, rl_parameters, use_gpu, open_ai_env
    )
    rl_predictor = create_predictor(
        rl_trainer, model_type, use_gpu, open_ai_env.action_dim
    )

    return train_gym_offline_rl(
        open_ai_env,
        replay_buffer,
        model_type,
        rl_trainer,
        rl_predictor,
        "{} offline rl state embed".format(env_type),
        score_bar,
        max_steps=params["run_details"]["max_steps"],
        avg_over_num_episodes=params["run_details"]["avg_over_num_episodes"],
        offline_train_epochs=params["run_details"]["offline_train_epochs"],
        bcq_imitator_hyper_params=None,
    )
Example #3
    def test_slate_q_trainer(self):
        recsim = RecSim(num_users=10)

        # Build memory pool with random policy
        memory_pool = OpenAIGymMemoryPool(10000000)
        random_reward = recsim.rollout_policy(random_policy, memory_pool)

        # Train a model
        q_network = FullyConnectedParametricDQN(
            state_dim=memory_pool.state_dim,
            action_dim=memory_pool.action_dim,
            sizes=[64, 32],
            activations=["relu", "relu"],
        )

        q_network = q_network.eval()
        recsim.reset()
        untrained_policy_reward = recsim.rollout_policy(
            partial(top_k_policy, q_network))
        q_network = q_network.train()

        q_network_target = q_network.get_target_network()
        parameters = SlateQTrainerParameters()
        trainer = SlateQTrainer(q_network, q_network_target, parameters)

        for _i in range(1000):
            tdp = memory_pool.sample_memories(
                128, model_type=ModelType.PYTORCH_PARAMETRIC_DQN.value)
            training_batch = tdp.as_slate_q_training_batch()
            trainer.train(training_batch)

        q_network = q_network.eval()
        recsim.reset()
        trained_policy_reward = recsim.rollout_policy(
            partial(top_k_policy, q_network))

        print(
            f"Reward; random: {random_reward}; untrained: {untrained_policy_reward}; "
            f"trained: {trained_policy_reward}")

        self.assertGreater(trained_policy_reward, untrained_policy_reward)
        self.assertGreater(trained_policy_reward, random_reward)
        self.assertEqual(random_reward, 1384.0)
        self.assertEqual(untrained_policy_reward, 1200.0)
        self.assertEqual(trained_policy_reward, 1432.0)
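
Distilled from the test above, the minimal fill-and-sample pattern for OpenAIGymMemoryPool; everything here reuses names and calls that already appear in the test (RecSim rollout, parametric-DQN model type), and the batch size of 128 is simply the one the test uses.

# Condensed restatement of the pattern used in test_slate_q_trainer above.
memory_pool = OpenAIGymMemoryPool(10000000)
recsim.rollout_policy(random_policy, memory_pool)  # fill the pool with random-policy transitions

tdp = memory_pool.sample_memories(
    128, model_type=ModelType.PYTORCH_PARAMETRIC_DQN.value
)
training_batch = tdp.as_slate_q_training_batch()   # convert to a SlateQ training batch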
Example #4
def run_gym(
    params,
    score_bar,
    gpu_id,
    save_timesteps_to_dataset=None,
    start_saving_from_episode=0,
):

    # Caffe2 core uses the min of caffe2_log_level and minloglevel
    # to determine loglevel. See caffe2/caffe2/core/logging.cc for more info.
    core.GlobalInit(["caffe2", "--caffe2_log_level=2", "--minloglevel=2"])

    logger.info("Running gym with params")
    logger.info(params)
    rl_parameters = RLParameters(**params["rl"])

    env_type = params["env"]
    env = OpenAIGymEnvironment(
        env_type,
        rl_parameters.epsilon,
        rl_parameters.softmax_policy,
        rl_parameters.gamma,
    )
    replay_buffer = OpenAIGymMemoryPool(params["max_replay_memory_size"])
    model_type = params["model_type"]

    use_gpu = gpu_id != USE_CPU
    trainer = create_trainer(params["model_type"], params, rl_parameters, use_gpu, env)
    predictor = create_predictor(trainer, model_type, use_gpu)

    c2_device = core.DeviceOption(
        caffe2_pb2.CUDA if use_gpu else caffe2_pb2.CPU, gpu_id
    )
    return train_sgd(
        c2_device,
        env,
        replay_buffer,
        model_type,
        trainer,
        predictor,
        "{} test run".format(env_type),
        score_bar,
        **params["run_details"],
        save_timesteps_to_dataset=save_timesteps_to_dataset,
        start_saving_from_episode=start_saving_from_episode,
    )
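
As a rough guide, the dictionary below lists only the keys this run_gym variant actually reads ("env", "rl", "model_type", "max_replay_memory_size", "run_details"). The concrete values, the environment name, and the run_details keys are illustrative assumptions; run_details in particular is forwarded verbatim to train_sgd, so its keys must match that function's signature.

# Sketch of a params dict; keys mirror the lookups in run_gym above,
# values are placeholders rather than settings from the Horizon configs.
params = {
    "env": "CartPole-v0",                       # illustrative gym id
    "rl": {"gamma": 0.99, "epsilon": 0.2, "softmax_policy": 0},
    "model_type": "pytorch_discrete_dqn",       # placeholder ModelType value
    "max_replay_memory_size": 100000,
    "run_details": {"num_episodes": 300, "max_steps": 200},  # hypothetical train_sgd kwargs
}
run_gym(params, score_bar=None, gpu_id=USE_CPU)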
Example #5
def run_gym(
    params,
    offline_train,
    score_bar,
    gpu_id,
    save_timesteps_to_dataset=None,
    start_saving_from_episode=0,
):
    logger.info("Running gym with params")
    logger.info(params)
    rl_parameters = RLParameters(**params["rl"])

    env_type = params["env"]
    if offline_train:
        # take random actions during data collection
        epsilon = 1.0
    else:
        epsilon = rl_parameters.epsilon
    env = OpenAIGymEnvironment(
        env_type, epsilon, rl_parameters.softmax_policy, rl_parameters.gamma
    )
    replay_buffer = OpenAIGymMemoryPool(params["max_replay_memory_size"])
    model_type = params["model_type"]

    use_gpu = gpu_id != USE_CPU
    trainer = create_trainer(params["model_type"], params, rl_parameters, use_gpu, env)
    predictor = create_predictor(trainer, model_type, use_gpu)

    c2_device = core.DeviceOption(
        caffe2_pb2.CUDA if use_gpu else caffe2_pb2.CPU, int(gpu_id)
    )
    return train_sgd(
        c2_device,
        env,
        offline_train,
        replay_buffer,
        model_type,
        trainer,
        predictor,
        "{} test run".format(env_type),
        score_bar,
        **params["run_details"],
        save_timesteps_to_dataset=save_timesteps_to_dataset,
        start_saving_from_episode=start_saving_from_episode,
    )
Example #6
def create_replay_buffer(env, params, model_type, offline_train,
                         path_to_pickled_transitions) -> OpenAIGymMemoryPool:
    """
    Train on transitions generated from a random policy live or
    read transitions from a pickle file and load into replay buffer.
    """
    replay_buffer = OpenAIGymMemoryPool(params["max_replay_memory_size"])
    if path_to_pickled_transitions:
        create_stored_policy_offline_dataset(replay_buffer,
                                             path_to_pickled_transitions)
        replay_state_dim = replay_buffer.replay_memory[0][0].shape[0]
        replay_action_dim = replay_buffer.replay_memory[0][1].shape[0]
        assert replay_state_dim == env.state_dim
        assert replay_action_dim == env.action_dim
    elif offline_train:
        create_random_policy_offline_dataset(
            env, replay_buffer, params["run_details"]["max_steps"], model_type)
    return replay_buffer
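
A short usage sketch of the two paths the docstring describes; env and model_type are assumed to exist already, the pickle path is a placeholder, and the dictionary keys mirror the lookups inside create_replay_buffer.

# Illustrative only; "transitions.pkl" is a placeholder path.
params = {
    "max_replay_memory_size": 100000,
    "run_details": {"max_steps": 200},
}

# Path 1: pre-fill from a pickled transition dump; state/action dimensions
# are checked against the environment.
buffer_from_file = create_replay_buffer(
    env, params, model_type, offline_train=False,
    path_to_pickled_transitions="transitions.pkl",
)

# Path 2: fill by rolling out a random policy in the live environment.
buffer_from_env = create_replay_buffer(
    env, params, model_type, offline_train=True,
    path_to_pickled_transitions=None,
)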
Example #7
def train_gym_offline_rl(
    gym_env: OpenAIGymEnvironment,
    replay_buffer: OpenAIGymMemoryPool,
    model_type: str,
    trainer: RLTrainer,
    predictor: OnPolicyPredictor,
    test_run_name: str,
    score_bar: Optional[float],
    max_steps: int,
    avg_over_num_episodes: int,
    offline_train_epochs: int,
    num_batch_per_epoch: Optional[int],
    bcq_imitator_hyper_params: Optional[Dict[str, Any]] = None,
):
    if num_batch_per_epoch is None:
        num_batch_per_epoch = replay_buffer.size // trainer.minibatch_size
    assert num_batch_per_epoch > 0, "The replay buffer is too small to form a single mini-batch"

    logger.info(
        "{} offline transitions in replay buffer.\n"
        "Training will take {} epochs, with each epoch having {} mini-batches"
        " and each mini-batch having {} samples".format(
            replay_buffer.size,
            offline_train_epochs,
            num_batch_per_epoch,
            trainer.minibatch_size,
        )
    )

    avg_reward_history, epoch_history = [], []

    # Pre-train a GBDT imitator if doing batch constrained q-learning in Gym
    if getattr(trainer, "bcq", None):
        assert bcq_imitator_hyper_params is not None
        gbdt = GradientBoostingClassifier(
            n_estimators=bcq_imitator_hyper_params["gbdt_trees"],
            max_depth=bcq_imitator_hyper_params["max_depth"],
        )
        samples = replay_buffer.sample_memories(replay_buffer.size, model_type)
        X, y = samples.states.numpy(), torch.max(samples.actions, dim=1)[1].numpy()
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
        logger.info("Fitting GBDT...")
        gbdt.fit(X_train, y_train)
        train_score = round(gbdt.score(X_train, y_train) * 100, 1)
        test_score = round(gbdt.score(X_test, y_test) * 100, 1)
        logger.info(
            "GBDT train accuracy {}% || test accuracy {}%".format(
                train_score, test_score
            )
        )
        trainer.bcq_imitator = gbdt.predict_proba  # type: ignore

    # Offline training
    for i_epoch in range(offline_train_epochs):
        for _ in range(num_batch_per_epoch):
            samples = replay_buffer.sample_memories(trainer.minibatch_size, model_type)
            samples.set_device(trainer.device)
            trainer.train(samples)

        batch_td_loss = float(
            torch.mean(
                torch.tensor(
                    [stat.td_loss for stat in trainer.loss_reporter.incoming_stats]
                )
            )
        )
        trainer.loss_reporter.flush()
        logger.info(
            "Average TD loss: {} in epoch {}".format(batch_td_loss, i_epoch + 1)
        )

        # test model performance for this epoch
        avg_rewards, avg_discounted_rewards = gym_env.run_ep_n_times(
            avg_over_num_episodes, predictor, test=True, max_steps=max_steps
        )
        avg_reward_history.append(avg_rewards)

        # For offline training, use epoch number as timestep history since
        # we have a fixed batch of data to count epochs over.
        epoch_history.append(i_epoch)
        logger.info(
            "Achieved an average reward score of {} over {} evaluations"
            " after epoch {}.".format(avg_rewards, avg_over_num_episodes, i_epoch)
        )
        if score_bar is not None and avg_rewards > score_bar:
            logger.info(
                "Avg. reward history for {}: {}".format(
                    test_run_name, avg_reward_history
                )
            )
            return avg_reward_history, epoch_history, trainer, predictor, gym_env

    logger.info(
        "Avg. reward history for {}: {}".format(test_run_name, avg_reward_history)
    )
    return avg_reward_history, epoch_history, trainer, predictor, gym_env
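
Finally, a hedged sketch of driving this offline training loop directly. The keyword names match the signature above; the numeric values are illustrative, the surrounding objects (open_ai_env, replay_buffer, rl_trainer, rl_predictor, model_type) are assumed to come from one of the run_gym setups earlier, and the BCQ hyper-parameter keys are exactly the ones read in the imitator block.

# Only needed when trainer.bcq is set; keys mirror the dict accesses above.
bcq_params = {"gbdt_trees": 100, "max_depth": 3}

avg_reward_history, epoch_history, trainer, predictor, env = train_gym_offline_rl(
    gym_env=open_ai_env,
    replay_buffer=replay_buffer,
    model_type=model_type,
    trainer=rl_trainer,
    predictor=rl_predictor,
    test_run_name="offline rl sketch",
    score_bar=None,                      # train for all epochs, no early stop
    max_steps=200,                       # illustrative episode cap
    avg_over_num_episodes=100,
    offline_train_epochs=30,
    num_batch_per_epoch=None,            # derived from buffer size / minibatch size
    bcq_imitator_hyper_params=bcq_params,
)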