def run_gym(
    params: OpenAiGymParameters,
    score_bar,
    embed_rl_dataset: RLDataset,
    gym_env: Env,
    mdnrnn: MemoryNetwork,
    max_embed_seq_len: int,
):
    assert params.rl is not None
    rl_parameters = params.rl

    env_type = params.env
    model_type = params.model_type
    epsilon, epsilon_decay, minimum_epsilon = create_epsilon(
        offline_train=True, rl_parameters=rl_parameters, params=params
    )

    replay_buffer = OpenAIGymMemoryPool(params.max_replay_memory_size)
    for row in embed_rl_dataset.rows:
        replay_buffer.insert_into_memory(**row)

    assert replay_buffer.memory_buffer is not None
    state_mem = replay_buffer.memory_buffer.state
    state_min_value = torch.min(state_mem).item()
    state_max_value = torch.max(state_mem).item()
    state_embed_env = StateEmbedGymEnvironment(
        gym_env, mdnrnn, max_embed_seq_len, state_min_value, state_max_value
    )
    open_ai_env = OpenAIGymEnvironment(
        state_embed_env,
        epsilon,
        rl_parameters.softmax_policy,
        rl_parameters.gamma,
        epsilon_decay,
        minimum_epsilon,
    )
    rl_trainer = create_trainer(params, open_ai_env)
    rl_predictor = create_predictor(
        rl_trainer, model_type, params.use_gpu, open_ai_env.action_dim
    )

    assert (
        params.run_details.max_steps is not None
        and params.run_details.offline_train_epochs is not None
    ), "Missing data required for offline training: {}".format(str(params.run_details))
    return train_gym_offline_rl(
        gym_env=open_ai_env,
        replay_buffer=replay_buffer,
        model_type=model_type,
        trainer=rl_trainer,
        predictor=rl_predictor,
        test_run_name="{} offline rl state embed".format(env_type),
        score_bar=score_bar,
        max_steps=params.run_details.max_steps,
        avg_over_num_episodes=params.run_details.avg_over_num_episodes,
        offline_train_epochs=params.run_details.offline_train_epochs,
        num_batch_per_epoch=None,
    )

def run_gym(
    params,
    use_gpu,
    score_bar,
    embed_rl_dataset: RLDataset,
    gym_env: Env,
    mdnrnn: MemoryNetwork,
    max_embed_seq_len: int,
):
    rl_parameters = RLParameters(**params["rl"])
    env_type = params["env"]
    model_type = params["model_type"]
    epsilon, epsilon_decay, minimum_epsilon = create_epsilon(
        offline_train=True, rl_parameters=rl_parameters, params=params
    )

    replay_buffer = OpenAIGymMemoryPool(params["max_replay_memory_size"])
    for row in embed_rl_dataset.rows:
        replay_buffer.insert_into_memory(**row)

    state_mem = torch.cat([m[0] for m in replay_buffer.replay_memory])
    state_min_value = torch.min(state_mem).item()
    state_max_value = torch.max(state_mem).item()
    state_embed_env = StateEmbedGymEnvironment(
        gym_env, mdnrnn, max_embed_seq_len, state_min_value, state_max_value
    )
    open_ai_env = OpenAIGymEnvironment(
        state_embed_env,
        epsilon,
        rl_parameters.softmax_policy,
        rl_parameters.gamma,
        epsilon_decay,
        minimum_epsilon,
    )
    rl_trainer = create_trainer(
        params["model_type"], params, rl_parameters, use_gpu, open_ai_env
    )
    rl_predictor = create_predictor(
        rl_trainer, model_type, use_gpu, open_ai_env.action_dim
    )
    return train_gym_offline_rl(
        open_ai_env,
        replay_buffer,
        model_type,
        rl_trainer,
        rl_predictor,
        "{} offline rl state embed".format(env_type),
        score_bar,
        max_steps=params["run_details"]["max_steps"],
        avg_over_num_episodes=params["run_details"]["avg_over_num_episodes"],
        offline_train_epochs=params["run_details"]["offline_train_epochs"],
        bcq_imitator_hyper_params=None,
    )

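
# Minimal sketch (not taken from the original source) of the epsilon handling implied by
# create_epsilon above: offline data collection acts fully at random (epsilon = 1.0),
# while an online schedule would start from the configured epsilon and decay it
# multiplicatively toward a floor. The helper name `decay_epsilon` and the numbers below
# are hypothetical; create_epsilon's actual implementation may differ.
def decay_epsilon(epsilon: float, epsilon_decay: float, minimum_epsilon: float) -> float:
    """One decay step of an epsilon-greedy exploration schedule."""
    return max(epsilon * epsilon_decay, minimum_epsilon)


# Usage: starting from full exploration, epsilon approaches but never drops below 0.05.
eps = 1.0
for _episode in range(200):
    eps = decay_epsilon(eps, epsilon_decay=0.98, minimum_epsilon=0.05)
assert abs(eps - 0.05) < 1e-9
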
def test_slate_q_trainer(self):
    recsim = RecSim(num_users=10)

    # Build memory pool with random policy
    memory_pool = OpenAIGymMemoryPool(10000000)
    random_reward = recsim.rollout_policy(random_policy, memory_pool)

    # Train a model
    q_network = FullyConnectedParametricDQN(
        state_dim=memory_pool.state_dim,
        action_dim=memory_pool.action_dim,
        sizes=[64, 32],
        activations=["relu", "relu"],
    )

    q_network = q_network.eval()
    recsim.reset()
    untrained_policy_reward = recsim.rollout_policy(partial(top_k_policy, q_network))
    q_network = q_network.train()

    q_network_target = q_network.get_target_network()
    parameters = SlateQTrainerParameters()
    trainer = SlateQTrainer(q_network, q_network_target, parameters)

    for _i in range(1000):
        tdp = memory_pool.sample_memories(
            128, model_type=ModelType.PYTORCH_PARAMETRIC_DQN.value
        )
        training_batch = tdp.as_slate_q_training_batch()
        trainer.train(training_batch)

    q_network = q_network.eval()
    recsim.reset()
    trained_policy_reward = recsim.rollout_policy(partial(top_k_policy, q_network))

    print(
        f"Reward; random: {random_reward}; untrained: {untrained_policy_reward}; "
        f"trained: {trained_policy_reward}"
    )

    self.assertGreater(trained_policy_reward, untrained_policy_reward)
    self.assertGreater(trained_policy_reward, random_reward)
    self.assertEqual(random_reward, 1384.0)
    self.assertEqual(untrained_policy_reward, 1200.0)
    self.assertEqual(trained_policy_reward, 1432.0)

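
# Minimal sketch (an assumption, not the repo's top_k_policy or FullyConnectedParametricDQN)
# of how a parametric Q-network can rank candidate documents for a slate: score every
# (state, candidate) pair and keep the k highest-scoring candidates via torch.topk.
# TinyParametricQ and top_k_slate are hypothetical stand-ins.
import torch
import torch.nn as nn


class TinyParametricQ(nn.Module):
    """Stand-in for a parametric DQN: Q(state, action) -> scalar score."""

    def __init__(self, state_dim: int, action_dim: int):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim + action_dim, 32), nn.ReLU(), nn.Linear(32, 1)
        )

    def forward(self, state: torch.Tensor, action: torch.Tensor) -> torch.Tensor:
        return self.net(torch.cat([state, action], dim=1))


def top_k_slate(q_network: nn.Module, state: torch.Tensor, candidates: torch.Tensor, k: int):
    """Return indices of the k candidates with the highest Q(state, candidate)."""
    num_candidates = candidates.shape[0]
    repeated_state = state.expand(num_candidates, -1)
    with torch.no_grad():
        scores = q_network(repeated_state, candidates).squeeze(1)
    return torch.topk(scores, k).indices


# Usage: pick a 3-item slate out of 20 candidate documents for one user state.
q = TinyParametricQ(state_dim=4, action_dim=6).eval()
slate = top_k_slate(q, state=torch.randn(1, 4), candidates=torch.randn(20, 6), k=3)
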
def run_gym(
    params,
    score_bar,
    gpu_id,
    save_timesteps_to_dataset=None,
    start_saving_from_episode=0,
):
    # Caffe2 core uses the min of caffe2_log_level and minloglevel
    # to determine loglevel. See caffe2/caffe2/core/logging.cc for more info.
    core.GlobalInit(["caffe2", "--caffe2_log_level=2", "--minloglevel=2"])

    logger.info("Running gym with params")
    logger.info(params)
    rl_parameters = RLParameters(**params["rl"])

    env_type = params["env"]
    env = OpenAIGymEnvironment(
        env_type,
        rl_parameters.epsilon,
        rl_parameters.softmax_policy,
        rl_parameters.gamma,
    )
    replay_buffer = OpenAIGymMemoryPool(params["max_replay_memory_size"])
    model_type = params["model_type"]

    use_gpu = gpu_id != USE_CPU
    trainer = create_trainer(params["model_type"], params, rl_parameters, use_gpu, env)
    predictor = create_predictor(trainer, model_type, use_gpu)

    c2_device = core.DeviceOption(
        caffe2_pb2.CUDA if use_gpu else caffe2_pb2.CPU, gpu_id
    )
    return train_sgd(
        c2_device,
        env,
        replay_buffer,
        model_type,
        trainer,
        predictor,
        "{} test run".format(env_type),
        score_bar,
        **params["run_details"],
        save_timesteps_to_dataset=save_timesteps_to_dataset,
        start_saving_from_episode=start_saving_from_episode,
    )

def run_gym(
    params,
    offline_train,
    score_bar,
    gpu_id,
    save_timesteps_to_dataset=None,
    start_saving_from_episode=0,
):
    logger.info("Running gym with params")
    logger.info(params)
    rl_parameters = RLParameters(**params["rl"])

    env_type = params["env"]
    if offline_train:
        # take random actions during data collection
        epsilon = 1.0
    else:
        epsilon = rl_parameters.epsilon
    env = OpenAIGymEnvironment(
        env_type, epsilon, rl_parameters.softmax_policy, rl_parameters.gamma
    )
    replay_buffer = OpenAIGymMemoryPool(params["max_replay_memory_size"])
    model_type = params["model_type"]

    use_gpu = gpu_id != USE_CPU
    trainer = create_trainer(params["model_type"], params, rl_parameters, use_gpu, env)
    predictor = create_predictor(trainer, model_type, use_gpu)

    c2_device = core.DeviceOption(
        caffe2_pb2.CUDA if use_gpu else caffe2_pb2.CPU, int(gpu_id)
    )
    return train_sgd(
        c2_device,
        env,
        offline_train,
        replay_buffer,
        model_type,
        trainer,
        predictor,
        "{} test run".format(env_type),
        score_bar,
        **params["run_details"],
        save_timesteps_to_dataset=save_timesteps_to_dataset,
        start_saving_from_episode=start_saving_from_episode,
    )

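
# Minimal sketch (hypothetical function and key names) of the **params["run_details"]
# pattern used when calling train_sgd above: the run_details dict is unpacked directly
# into keyword arguments, so its keys must match the training-loop parameter names
# exactly, and any key left out falls back to the parameter's default.
def train_loop(num_episodes: int, max_steps: int, train_every_ts: int = 1):
    return num_episodes, max_steps, train_every_ts


run_details = {"num_episodes": 300, "max_steps": 200}
assert train_loop(**run_details) == (300, 200, 1)
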
def create_replay_buffer(
    env, params, model_type, offline_train, path_to_pickled_transitions
) -> OpenAIGymMemoryPool:
    """
    Train on transitions generated from a random policy live or
    read transitions from a pickle file and load into replay buffer.
    """
    replay_buffer = OpenAIGymMemoryPool(params["max_replay_memory_size"])
    if path_to_pickled_transitions:
        create_stored_policy_offline_dataset(replay_buffer, path_to_pickled_transitions)
        replay_state_dim = replay_buffer.replay_memory[0][0].shape[0]
        replay_action_dim = replay_buffer.replay_memory[0][1].shape[0]
        assert replay_state_dim == env.state_dim
        assert replay_action_dim == env.action_dim
    elif offline_train:
        create_random_policy_offline_dataset(
            env, replay_buffer, params["run_details"]["max_steps"], model_type
        )
    return replay_buffer

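
# Minimal sketch (an assumption, not create_random_policy_offline_dataset itself) of
# filling a buffer by acting randomly in a Gym environment, using the classic pre-0.26
# gym step API that this codebase targets. `buffer` here is a plain list of
# (state, action, reward, next_state, terminal) tuples rather than an OpenAIGymMemoryPool,
# whose insert_into_memory signature is not shown above.
import gym


def collect_random_transitions(env_name: str, max_steps: int):
    env = gym.make(env_name)
    buffer = []
    state = env.reset()
    for _ in range(max_steps):
        action = env.action_space.sample()  # random policy
        next_state, reward, terminal, _info = env.step(action)
        buffer.append((state, action, reward, next_state, terminal))
        state = env.reset() if terminal else next_state
    return buffer


# Usage: gather 1,000 random CartPole transitions for offline training.
transitions = collect_random_transitions("CartPole-v0", max_steps=1000)
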
def train_gym_offline_rl(
    gym_env: OpenAIGymEnvironment,
    replay_buffer: OpenAIGymMemoryPool,
    model_type: str,
    trainer: RLTrainer,
    predictor: OnPolicyPredictor,
    test_run_name: str,
    score_bar: Optional[float],
    max_steps: int,
    avg_over_num_episodes: int,
    offline_train_epochs: int,
    num_batch_per_epoch: Optional[int],
    bcq_imitator_hyper_params: Optional[Dict[str, Any]] = None,
):
    if num_batch_per_epoch is None:
        num_batch_per_epoch = replay_buffer.size // trainer.minibatch_size
    assert num_batch_per_epoch > 0, "The size of replay buffer is not sufficient"

    logger.info(
        "{} offline transitions in replay buffer.\n"
        "Training will take {} epochs, with each epoch having {} mini-batches"
        " and each mini-batch having {} samples".format(
            replay_buffer.size,
            offline_train_epochs,
            num_batch_per_epoch,
            trainer.minibatch_size,
        )
    )

    avg_reward_history, epoch_history = [], []

    # Pre-train a GBDT imitator if doing batch constrained q-learning in Gym
    if getattr(trainer, "bcq", None):
        assert bcq_imitator_hyper_params is not None
        gbdt = GradientBoostingClassifier(
            n_estimators=bcq_imitator_hyper_params["gbdt_trees"],
            max_depth=bcq_imitator_hyper_params["max_depth"],
        )
        samples = replay_buffer.sample_memories(replay_buffer.size, model_type)
        X, y = samples.states.numpy(), torch.max(samples.actions, dim=1)[1].numpy()
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
        logger.info("Fitting GBDT...")
        gbdt.fit(X_train, y_train)
        train_score = round(gbdt.score(X_train, y_train) * 100, 1)
        test_score = round(gbdt.score(X_test, y_test) * 100, 1)
        logger.info(
            "GBDT train accuracy {}% || test accuracy {}%".format(
                train_score, test_score
            )
        )
        trainer.bcq_imitator = gbdt.predict_proba  # type: ignore

    # Offline training
    for i_epoch in range(offline_train_epochs):
        for _ in range(num_batch_per_epoch):
            samples = replay_buffer.sample_memories(trainer.minibatch_size, model_type)
            samples.set_device(trainer.device)
            trainer.train(samples)

        batch_td_loss = float(
            torch.mean(
                torch.tensor(
                    [stat.td_loss for stat in trainer.loss_reporter.incoming_stats]
                )
            )
        )
        trainer.loss_reporter.flush()
        logger.info(
            "Average TD loss: {} in epoch {}".format(batch_td_loss, i_epoch + 1)
        )

        # test model performance for this epoch
        avg_rewards, avg_discounted_rewards = gym_env.run_ep_n_times(
            avg_over_num_episodes, predictor, test=True, max_steps=max_steps
        )
        avg_reward_history.append(avg_rewards)

        # For offline training, use epoch number as timestep history since
        # we have a fixed batch of data to count epochs over.
        epoch_history.append(i_epoch)

        logger.info(
            "Achieved an average reward score of {} over {} evaluations"
            " after epoch {}.".format(avg_rewards, avg_over_num_episodes, i_epoch)
        )
        if score_bar is not None and avg_rewards > score_bar:
            logger.info(
                "Avg. reward history for {}: {}".format(
                    test_run_name, avg_reward_history
                )
            )
            return avg_reward_history, epoch_history, trainer, predictor, gym_env

    logger.info(
        "Avg. reward history for {}: {}".format(test_run_name, avg_reward_history)
    )
    return avg_reward_history, epoch_history, trainer, predictor, gym_env

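
# Minimal standalone sketch (synthetic data, hypothetical thresholds) of the BCQ imitator
# pre-training step above: fit a GradientBoostingClassifier to predict the logged action
# from the state, then use predict_proba as the behavior-policy estimate that
# batch-constrained Q-learning can use to down-weight or mask unlikely actions.
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

rng = np.random.RandomState(0)
states = rng.randn(2000, 4)                    # stand-in for samples.states
actions = (states[:, 0] > 0).astype(np.int64)  # stand-in for argmax of one-hot actions

X_train, X_test, y_train, y_test = train_test_split(states, actions, test_size=0.1)
gbdt = GradientBoostingClassifier(n_estimators=100, max_depth=3)
gbdt.fit(X_train, y_train)
print("holdout accuracy:", gbdt.score(X_test, y_test))

# predict_proba returns one column per action; a BCQ-style mask could keep only actions
# whose estimated logging probability exceeds a threshold (0.1 here is arbitrary).
action_probs = gbdt.predict_proba(X_test)
mask = action_probs >= 0.1
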