def train_seq2reward_and_compute_reward_mse(
    env_name: str,
    model: ModelManager__Union,
    num_train_transitions: int,
    num_test_transitions: int,
    seq_len: int,
    batch_size: int,
    num_train_epochs: int,
    use_gpu: bool,
    saved_seq2reward_path: Optional[str] = None,
):
    """ Train Seq2Reward Network and compute reward mse. """
    env = Gym(env_name=env_name)
    env.seed(SEED)

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=build_normalizer(env),
    )

    device = "cuda" if use_gpu else "cpu"
    # pyre-fixme[6]: Expected `device` for 2nd param but got `str`.
    trainer_preprocessor = make_replay_buffer_trainer_preprocessor(trainer, device, env)
    test_replay_buffer = ReplayBuffer(
        replay_capacity=num_test_transitions,
        batch_size=batch_size,
        stack_size=seq_len,
        return_everything_as_stack=True,
    )
    fill_replay_buffer(env, test_replay_buffer, num_test_transitions)

    if saved_seq2reward_path is None:
        # train from scratch
        trainer = train_seq2reward(
            env=env,
            trainer=trainer,
            trainer_preprocessor=trainer_preprocessor,
            num_train_transitions=num_train_transitions,
            seq_len=seq_len,
            batch_size=batch_size,
            num_train_epochs=num_train_epochs,
            test_replay_buffer=test_replay_buffer,
        )
    else:
        # load a pretrained model, and just evaluate it
        trainer.seq2reward_network.load_state_dict(torch.load(saved_seq2reward_path))

    state_dim = env.observation_space.shape[0]
    with torch.no_grad():
        trainer.seq2reward_network.eval()
        test_batch = test_replay_buffer.sample_transition_batch(
            batch_size=test_replay_buffer.size
        )
        preprocessed_test_batch = trainer_preprocessor(test_batch)
        adhoc_action_padding(preprocessed_test_batch, state_dim=state_dim)
        losses = trainer.get_loss(preprocessed_test_batch)
        detached_losses = losses.cpu().detach().item()
        trainer.seq2reward_network.train()
    return detached_losses
def setup_buffer(buffer_size, trajectory_lengths, stack_size=None, multi_steps=None):
    """ Insert one trajectory per entry of `trajectory_lengths` into the RB. """
    # Capture whether stacking / multi-step returns were requested before
    # filling in the defaults.
    return_everything_as_stack = stack_size is not None
    return_as_timeline_format = multi_steps is not None
    stack_size = stack_size if stack_size is not None else 1
    update_horizon = multi_steps if multi_steps is not None else 1
    memory = ReplayBuffer(
        stack_size=stack_size,
        replay_capacity=buffer_size,
        batch_size=1,
        update_horizon=update_horizon,
        return_everything_as_stack=return_everything_as_stack,
        return_as_timeline_format=return_as_timeline_format,
    )

    i = 0
    for traj_len in trajectory_lengths:
        for j in range(traj_len):
            trans = get_add_transition(i)
            terminal = bool(j == traj_len - 1)
            memory.add(
                observation=trans["state"],
                action=trans["action"],
                reward=trans["reward"],
                terminal=terminal,
                extra1=trans["extra1"],
            )
            i += 1
    return memory.sample_all_valid_transitions()
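# A hedged, self-contained sketch of the same insert-then-sample pattern that
# setup_buffer exercises above, using plain numpy transitions instead of the
# test's get_add_transition helper. The import path is the usual ReAgent
# location and is an assumption here, as are the observation shape and dtype.
import numpy as np
from reagent.replay_memory.circular_replay_buffer import ReplayBuffer

rb = ReplayBuffer(
    stack_size=1,
    replay_capacity=10,
    batch_size=1,
    update_horizon=2,                 # 2-step returns
    return_as_timeline_format=True,   # next_* fields come back as per-index lists
)
for traj_len in (4, 3):  # two short episodes
    for j in range(traj_len):
        rb.add(
            observation=np.zeros((4,), dtype=np.float32),
            action=0,
            reward=1.0,
            terminal=bool(j == traj_len - 1),
        )
batch = rb.sample_all_valid_transitions()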
def test_create_from_recsim_interest_evolution(self):
    env = RecSim(num_candidates=20, slate_size=3, resample_documents=False)
    replay_buffer = ReplayBuffer(replay_capacity=100, batch_size=10)
    obs = env.reset()
    observation = obs["user"]
    action = env.action_space.sample()
    log_prob = -1.0
    doc_features = np.stack(list(obs["doc"].values()), axis=0)

    next_obs, reward, terminal, _env = env.step(action)

    response = next_obs["response"]
    click = np.array([r["click"] for r in response])
    response_quality = np.stack([r["quality"] for r in response], axis=0)
    response_cluster_id = np.array([r["cluster_id"] for r in response])
    response_watch_time = np.stack([r["watch_time"] for r in response], axis=0)
    response_liked = np.array([r["liked"] for r in response])

    replay_buffer.add(
        observation=observation,
        action=action,
        reward=reward,
        terminal=terminal,
        mdp_id=0,
        sequence_number=0,
        doc=doc_features,
        response_click=click,
        response_cluster_id=response_cluster_id,
        response_quality=response_quality,
        response_liked=response_liked,
        response_watch_time=response_watch_time,
        log_prob=log_prob,
    )
def run_test_offline(
    env_name: str,
    model: ModelManager__Union,
    replay_memory_size: int,
    num_batches_per_epoch: int,
    num_train_epochs: int,
    passing_score_bar: float,
    num_eval_episodes: int,
    minibatch_size: int,
    use_gpu: bool,
):
    env = Gym(env_name=env_name)
    env.seed(SEED)
    env.action_space.seed(SEED)
    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )

    # first fill the replay buffer to burn_in
    replay_buffer = ReplayBuffer(
        replay_capacity=replay_memory_size, batch_size=minibatch_size
    )
    # always fill full RB
    random_policy = make_random_policy_for_env(env)
    agent = Agent.create_for_env(env, policy=random_policy)
    fill_replay_buffer(
        env=env,
        replay_buffer=replay_buffer,
        desired_size=replay_memory_size,
        agent=agent,
    )

    device = torch.device("cuda") if use_gpu else None
    # pyre-fixme[6]: Expected `device` for 2nd param but got `Optional[torch.device]`.
    trainer_preprocessor = make_replay_buffer_trainer_preprocessor(trainer, device, env)

    writer = SummaryWriter()
    with summary_writer_context(writer):
        for epoch in range(num_train_epochs):
            logger.info(f"Evaluating before epoch {epoch}: ")
            eval_rewards = evaluate_cem(env, manager, 1)
            for _ in tqdm(range(num_batches_per_epoch)):
                train_batch = replay_buffer.sample_transition_batch()
                preprocessed_batch = trainer_preprocessor(train_batch)
                trainer.train(preprocessed_batch)

    logger.info(f"Evaluating after training for {num_train_epochs} epochs: ")
    eval_rewards = evaluate_cem(env, manager, num_eval_episodes)
    mean_rewards = np.mean(eval_rewards)
    assert (
        mean_rewards >= passing_score_bar
    ), f"{mean_rewards} doesn't pass the bar {passing_score_bar}."
def __call__(
    self,
    replay_buffer: ReplayBuffer,
    obs: Any,
    action: Any,
    reward: float,
    terminal: bool,
    log_prob: float,
):
    replay_buffer.add(obs, action, reward, terminal, log_prob=log_prob)
def replay_buffer_to_pre_timeline_df(
    is_discrete_action: bool,
    replay_buffer: ReplayBuffer
    # pyre-fixme[11]: Annotation `DataFrame` is not defined as a type.
) -> pd.DataFrame:
    """ Format needed for uploading the dataset to Hive and then running the timeline operator. """
    n = replay_buffer.size
    batch = replay_buffer.sample_transition_batch(batch_size=n)

    # Possible actions are inconsistent between models, so let's infer them.
    possible_actions_mask = getattr(batch, "possible_actions_mask", None)
    possible_actions = getattr(batch, "possible_actions", None)

    terminal = batch.terminal.squeeze(1).tolist()
    assert len(batch.action.shape) == 2
    if is_discrete_action:
        assert (
            batch.action.shape[1] == 1
        ), f"discrete action batch with shape {batch.action.shape}"
        # Discrete action space, should be str
        action = [str(a.item()) for a in batch.action]
        # assuming we've explored the whole action space
        unique_actions = np.unique(batch.action)
        possible_actions_mask = [
            [1 for _ in range(len(unique_actions))] if not elem_terminal else []
            for elem_terminal in terminal
        ]
        possible_actions = [
            [str(a) for a in unique_actions] if not elem_terminal else []
            for elem_terminal in terminal
        ]
    else:
        # Box (parametric) action space, should be map<str, double>
        action = _dense_to_sparse(batch.action)
        # TODO: handle possible actions/mask here

    sequence_number = batch.sequence_number.squeeze(1).tolist()
    action_probability = np.exp(batch.log_prob.squeeze(1)).tolist()
    reward = batch.reward.squeeze(1).tolist()

    rows = {
        "ds": [DEFAULT_DS for _ in range(n)],
        "state_features": _dense_to_sparse(batch.state),
        "action": action,
        "mdp_id": batch.mdp_id.tolist(),
        "sequence_number": sequence_number,
        "action_probability": action_probability,
        "reward": reward,
        "metrics": [{"reward": r} for r in reward],
    }
    if possible_actions_mask is not None:
        rows["possible_actions_mask"] = possible_actions_mask
    if possible_actions is not None:
        rows["possible_actions"] = possible_actions

    # pyre-fixme[16]: Module `pd` has no attribute `DataFrame`.
    return pd.DataFrame.from_dict(rows)
def offline_gym(
    env_name: str,
    pkl_path: str,
    num_train_transitions: int,
    max_steps: Optional[int],
    seed: Optional[int] = None,
):
    """
    Generate samples from a random policy on the Gym environment and save the
    results as a pickled pandas DataFrame.
    """
    initialize_seed(seed)
    env = Gym(env_name=env_name)
    replay_buffer = ReplayBuffer(replay_capacity=num_train_transitions, batch_size=1)
    fill_replay_buffer(env, replay_buffer, num_train_transitions)
    if isinstance(env.action_space, gym.spaces.Discrete):
        is_discrete_action = True
    else:
        assert isinstance(env.action_space, gym.spaces.Box)
        is_discrete_action = False
    df = replay_buffer_to_pre_timeline_df(is_discrete_action, replay_buffer)
    logger.info(f"Saving dataset with {len(df)} samples to {pkl_path}")
    df.to_pickle(pkl_path)
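# Hypothetical invocation of offline_gym above; the environment name, path,
# and sizes are placeholders for illustration only.
offline_gym(
    env_name="CartPole-v0",
    pkl_path="/tmp/cartpole_offline.pkl",
    num_train_transitions=10000,
    max_steps=None,
    seed=0,
)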
def __call__(self, replay_buffer: ReplayBuffer, transition: Transition):
    transition_dict = transition.asdict()
    obs = transition_dict.pop("observation")
    user = obs["user"]
    kwargs = {}
    if self.box_keys or self.discrete_keys:
        doc_obs = obs["doc"]
        for k in self.box_keys:
            kwargs[f"doc_{k}"] = np.stack([v[k] for v in doc_obs.values()])
        for k in self.discrete_keys:
            kwargs[f"doc_{k}"] = np.array([v[k] for v in doc_obs.values()])
    else:
        kwargs["doc"] = np.stack(list(obs["doc"].values()))

    # Augmentation
    if self.augmentation_box_keys or self.augmentation_discrete_keys:
        aug_obs = obs["augmentation"]
        for k in self.augmentation_box_keys:
            kwargs[f"augmentation_{k}"] = np.stack(
                [v[k] for v in aug_obs.values()]
            )
        for k in self.augmentation_discrete_keys:
            kwargs[f"augmentation_{k}"] = np.array(
                [v[k] for v in aug_obs.values()]
            )

    # Responses
    response = obs["response"]
    # We need to handle None below because the first state won't have response
    for k, d in self.response_box_keys:
        if response is not None:
            kwargs[f"response_{k}"] = np.stack([v[k] for v in response])
        else:
            kwargs[f"response_{k}"] = np.zeros(
                (self.num_responses, *d), dtype=np.float32
            )
    for k, _n in self.response_discrete_keys:
        if response is not None:
            kwargs[f"response_{k}"] = np.array([v[k] for v in response])
        else:
            kwargs[f"response_{k}"] = np.zeros(
                (self.num_responses,), dtype=np.int64
            )
    transition_dict.update(kwargs)
    replay_buffer.add(observation=user, **transition_dict)
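# A toy, self-contained illustration of the doc-feature stacking performed by
# the inserter above. The observation layout is an assumption modeled on
# RecSim-style dict observations; the key names are made up for the example.
import numpy as np

obs = {
    "user": np.zeros(4, dtype=np.float32),
    "doc": {
        "doc_0": {"embedding": np.array([0.1, 0.2], dtype=np.float32), "category": 3},
        "doc_1": {"embedding": np.array([0.3, 0.4], dtype=np.float32), "category": 7},
    },
}
box_keys = ["embedding"]      # continuous (Box) per-doc features
discrete_keys = ["category"]  # integer per-doc features

kwargs = {}
for k in box_keys:
    kwargs[f"doc_{k}"] = np.stack([v[k] for v in obs["doc"].values()])  # (num_docs, 2)
for k in discrete_keys:
    kwargs[f"doc_{k}"] = np.array([v[k] for v in obs["doc"].values()])  # (num_docs,)

print(kwargs["doc_embedding"].shape, kwargs["doc_category"])  # (2, 2) [3 7]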
def __call__(
    self,
    replay_buffer: ReplayBuffer,
    obs: Any,
    action: Any,
    reward: float,
    terminal: bool,
    log_prob: float,
):
    user = obs["user"]
    kwargs = {}
    if self.box_keys or self.discrete_keys:
        doc_obs = obs["doc"]
        for k in self.box_keys:
            kwargs[f"doc_{k}"] = np.stack([v[k] for v in doc_obs.values()])
        for k in self.discrete_keys:
            kwargs[f"doc_{k}"] = np.array([v[k] for v in doc_obs.values()])
    else:
        kwargs["doc"] = np.stack(list(obs["doc"].values()))

    # Responses
    response = obs["response"]
    # We need to handle None below because the first state won't have response
    for k, d in self.response_box_keys:
        if response is not None:
            kwargs[f"response_{k}"] = np.stack([v[k] for v in response])
        else:
            kwargs[f"response_{k}"] = np.zeros((self.num_responses, *d))
    for k, _n in self.response_discrete_keys:
        if response is not None:
            kwargs[f"response_{k}"] = np.array([v[k] for v in response])
        else:
            kwargs[f"response_{k}"] = np.zeros((self.num_responses,))

    replay_buffer.add(
        observation=user,
        action=action,
        reward=reward,
        terminal=terminal,
        log_prob=log_prob,
        **kwargs,
    )
def train_mdnrnn(
    env: EnvWrapper,
    trainer: MDNRNNTrainer,
    trainer_preprocessor,
    num_train_transitions: int,
    seq_len: int,
    batch_size: int,
    num_train_epochs: int,
    # for optional validation
    test_replay_buffer=None,
):
    train_replay_buffer = ReplayBuffer(
        replay_capacity=num_train_transitions,
        batch_size=batch_size,
        stack_size=seq_len,
        return_everything_as_stack=True,
    )
    random_policy = make_random_policy_for_env(env)
    agent = Agent.create_for_env(env, policy=random_policy)
    fill_replay_buffer(env, train_replay_buffer, num_train_transitions, agent)
    num_batch_per_epoch = train_replay_buffer.size // batch_size
    logger.info("Made RBs, starting to train now!")
    optimizer = trainer.configure_optimizers()[0]
    for _ in range(num_train_epochs):
        for i in range(num_batch_per_epoch):
            batch = train_replay_buffer.sample_transition_batch(batch_size=batch_size)
            preprocessed_batch = trainer_preprocessor(batch)
            loss = next(trainer.train_step_gen(preprocessed_batch, i))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # validation
        if test_replay_buffer is not None:
            with torch.no_grad():
                trainer.memory_network.mdnrnn.eval()
                test_batch = test_replay_buffer.sample_transition_batch(
                    batch_size=batch_size
                )
                preprocessed_test_batch = trainer_preprocessor(test_batch)
                valid_losses = trainer.get_loss(preprocessed_test_batch)
                trainer.memory_network.mdnrnn.train()
    return trainer
def train_seq2reward(
    env: EnvWrapper,
    trainer: Seq2RewardTrainer,
    trainer_preprocessor,
    num_train_transitions: int,
    seq_len: int,
    batch_size: int,
    num_train_epochs: int,
    # for optional validation
    test_replay_buffer=None,
):
    train_replay_buffer = ReplayBuffer(
        replay_capacity=num_train_transitions,
        batch_size=batch_size,
        stack_size=seq_len,
        return_everything_as_stack=True,
    )
    fill_replay_buffer(env, train_replay_buffer, num_train_transitions)
    num_batch_per_epoch = train_replay_buffer.size // batch_size
    logger.info("Made RBs, starting to train now!")
    # pyre-fixme[16]: `EnvWrapper` has no attribute `observation_space`.
    state_dim = env.observation_space.shape[0]
    for epoch in range(num_train_epochs):
        for i in range(num_batch_per_epoch):
            batch = train_replay_buffer.sample_transition_batch(batch_size=batch_size)
            preprocessed_batch = trainer_preprocessor(batch)
            adhoc_padding(preprocessed_batch, state_dim=state_dim)
            losses = trainer.train(preprocessed_batch)
            print_seq2reward_losses(epoch, i, losses)

        # validation
        if test_replay_buffer is not None:
            with torch.no_grad():
                trainer.seq2reward_network.eval()
                test_batch = test_replay_buffer.sample_transition_batch(
                    batch_size=batch_size
                )
                preprocessed_test_batch = trainer_preprocessor(test_batch)
                adhoc_padding(preprocessed_test_batch, state_dim=state_dim)
                valid_losses = trainer.get_loss(preprocessed_test_batch)
                print_seq2reward_losses(epoch, "validation", valid_losses)
                trainer.seq2reward_network.train()
    return trainer
def create_embed_rl_dataset(
    env: EnvWrapper,
    memory_network: MemoryNetwork,
    num_state_embed_transitions: int,
    batch_size: int,
    seq_len: int,
    hidden_dim: int,
    use_gpu: bool,
):
    assert isinstance(env.action_space, gym.spaces.Discrete)
    assert isinstance(env.observation_space, gym.spaces.Box)
    assert len(env.observation_space.shape) == 1
    logger.info("Starting to create embedded RL Dataset!")

    # seq_len + 1 because the MDN-RNN embeds the first seq_len steps and then
    # the embedded state is concatenated with the last step.
    # I.e. (o_1, o_2, ..., o_n) -> RNN -> h_1, h_2, ..., h_n
    # and we set s_{n+1} = [o_{n+1}, h_n]
    embed_env = StateEmbedEnvironment(
        gym_env=env, mdnrnn=memory_network, max_embed_seq_len=seq_len + 1
    )
    # now create a filled replay buffer of embeddings
    # new obs shape dim = state_dim + hidden_dim
    embed_rb = ReplayBuffer(
        replay_capacity=num_state_embed_transitions, batch_size=batch_size, stack_size=1
    )
    random_policy = make_random_policy_for_env(env)
    agent = Agent.create_for_env(env, policy=random_policy)
    fill_replay_buffer(
        env=embed_env,
        replay_buffer=embed_rb,
        desired_size=num_state_embed_transitions,
        agent=agent,
    )
    batch = embed_rb.sample_transition_batch(batch_size=num_state_embed_transitions)
    state_min = min(batch.state.min(), batch.next_state.min()).item()
    state_max = max(batch.state.max(), batch.next_state.max()).item()
    logger.info(
        f"Finished making embed dataset with size {embed_rb.size}, "
        f"min {state_min}, max {state_max}"
    )
    return embed_rb, state_min, state_max
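# To make the s_{n+1} = [o_{n+1}, h_n] comment above concrete, a minimal numpy
# sketch of the embedding concatenation. Shapes are illustrative assumptions,
# not the actual StateEmbedEnvironment internals.
import numpy as np

state_dim, hidden_dim, seq_len = 4, 8, 3
obs = np.random.randn(seq_len + 1, state_dim).astype(np.float32)   # o_1 .. o_{n+1}
hidden = np.random.randn(seq_len, hidden_dim).astype(np.float32)   # stand-in for MDN-RNN h_1 .. h_n

# Embedded state seen by the downstream RL agent: the raw next observation
# concatenated with the last RNN hidden state.
embedded_state = np.concatenate([obs[-1], hidden[-1]])
assert embedded_state.shape == (state_dim + hidden_dim,)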
def run_test(
    env: str,
    model: ModelManager__Union,
    replay_memory_size: int,
    train_every_ts: int,
    train_after_ts: int,
    num_episodes: int,
    max_steps: Optional[int],
    last_score_bar: float,
):
    env = EnvFactory.make(env)
    env.seed(SEED)
    env.action_space.seed(SEED)

    normalization = build_normalizer(env)
    logger.info(f"Normalization is {normalization}")

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=False,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )
    policy = manager.create_policy()
    replay_buffer = ReplayBuffer.create_from_env(
        env=env,
        replay_memory_size=replay_memory_size,
        batch_size=trainer.minibatch_size,
    )

    post_step = train_with_replay_buffer_post_step(
        replay_buffer=replay_buffer,
        trainer=trainer,
        training_freq=train_every_ts,
        batch_size=trainer.minibatch_size,
        replay_burnin=train_after_ts,
    )

    agent = Agent.create_for_env(env, policy=policy, post_transition_callback=post_step)

    reward_history = []
    for i in range(num_episodes):
        logger.info(f"running episode {i}")
        ep_reward = run_episode(env=env, agent=agent, max_steps=max_steps)
        reward_history.append(ep_reward)

    assert reward_history[-1] >= last_score_bar, (
        f"reward after {len(reward_history)} episodes is {reward_history[-1]}, "
        f"less than {last_score_bar}...\n"
        f"Full reward history: {reward_history}"
    )

    return reward_history
def train_mdnrnn(
    env: EnvWrapper,
    trainer: MDNRNNTrainer,
    trainer_preprocessor,
    num_train_transitions: int,
    seq_len: int,
    batch_size: int,
    num_train_epochs: int,
    # for optional validation
    test_replay_buffer=None,
):
    train_replay_buffer = ReplayBuffer(
        replay_capacity=num_train_transitions,
        batch_size=batch_size,
        stack_size=seq_len,
        return_everything_as_stack=True,
    )
    fill_replay_buffer(env, train_replay_buffer, num_train_transitions)
    num_batch_per_epoch = train_replay_buffer.size // batch_size
    logger.info("Made RBs, starting to train now!")
    for epoch in range(num_train_epochs):
        for i in range(num_batch_per_epoch):
            batch = train_replay_buffer.sample_transition_batch(batch_size=batch_size)
            preprocessed_batch = trainer_preprocessor(batch)
            losses = trainer.train(preprocessed_batch)
            print_mdnrnn_losses(epoch, i, losses)

        # validation
        if test_replay_buffer is not None:
            with torch.no_grad():
                trainer.memory_network.mdnrnn.eval()
                test_batch = test_replay_buffer.sample_transition_batch(
                    batch_size=batch_size
                )
                preprocessed_test_batch = trainer_preprocessor(test_batch)
                valid_losses = trainer.get_loss(preprocessed_test_batch)
                print_mdnrnn_losses(epoch, "validation", valid_losses)
                trainer.memory_network.mdnrnn.train()
    return trainer
def __call__(
    self,
    replay_buffer: ReplayBuffer,
    obs: Any,
    action: Any,
    reward: float,
    terminal: bool,
    log_prob: float,
):
    user = obs["user"]
    kwargs = {}
    if self.box_keys or self.discrete_keys:
        doc_obs = obs["doc"]
        for k in self.box_keys:
            kwargs[f"doc_{k}"] = np.vstack([v[k] for v in doc_obs.values()])
        for k in self.discrete_keys:
            kwargs[f"doc_{k}"] = np.array([v[k] for v in doc_obs.values()])
    else:
        kwargs["doc"] = obs["doc"]

    # Responses
    for k in self.response_box_keys:
        kwargs[f"response_{k}"] = np.vstack([v[k] for v in obs["response"]])
    for k in self.response_discrete_keys:
        kwargs[f"response_{k}"] = np.array([v[k] for v in obs["response"]])

    replay_buffer.add(
        observation=user,
        action=action,
        reward=reward,
        terminal=terminal,
        log_prob=log_prob,
        **kwargs,
    )
def test_create_from_recsim_interest_evolution(self):
    env_config = {
        "num_candidates": 20,
        "slate_size": 3,
        "resample_documents": False,
        "seed": 1,
    }
    env = interest_evolution.create_environment(env_config)
    replay_buffer = ReplayBuffer.create_from_env(
        env, replay_memory_size=100, batch_size=10, store_log_prob=True
    )
    obs = env.reset()
    observation = obs["user"]
    action = env.action_space.sample()
    log_prob = -1.0
    doc_features = np.stack(list(obs["doc"].values()), axis=0)

    next_obs, reward, terminal, _env = env.step(action)

    response = next_obs["response"]
    click = np.array([r["click"] for r in response])
    response_quality = np.stack([r["quality"] for r in response], axis=0)
    response_cluster_id = np.array([r["cluster_id"] for r in response])
    response_watch_time = np.stack([r["watch_time"] for r in response], axis=0)
    response_liked = np.array([r["liked"] for r in response])

    replay_buffer.add(
        observation,
        action,
        reward,
        terminal,
        mdp_id=0,
        sequence_number=0,
        doc=doc_features,
        response_click=click,
        response_cluster_id=response_cluster_id,
        response_quality=response_quality,
        response_liked=response_liked,
        response_watch_time=response_watch_time,
        log_prob=log_prob,
    )
def _offline_gym(
    env: Gym,
    agent: Agent,
    pkl_path: str,
    num_train_transitions: int,
    max_steps: Optional[int],
    seed: int = 1,
):
    initialize_seed(seed)
    replay_buffer = ReplayBuffer(replay_capacity=num_train_transitions, batch_size=1)
    fill_replay_buffer(env, replay_buffer, num_train_transitions, agent)
    if isinstance(env.action_space, gym.spaces.Discrete):
        is_discrete_action = True
    else:
        assert isinstance(env.action_space, gym.spaces.Box)
        is_discrete_action = False
    df = replay_buffer_to_pre_timeline_df(is_discrete_action, replay_buffer)
    logger.info(f"Saving dataset with {len(df)} samples to {pkl_path}")
    df.to_pickle(pkl_path)
def run_test(
    env_name: str,
    model: ModelManager__Union,
    replay_memory_size: int,
    train_every_ts: int,
    train_after_ts: int,
    num_train_episodes: int,
    max_steps: Optional[int],
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
):
    env = EnvFactory.make(env_name)
    env.seed(SEED)
    env.action_space.seed(SEED)
    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )
    training_policy = manager.create_policy(serving=False)

    replay_buffer = ReplayBuffer.create_from_env(
        env=env,
        replay_memory_size=replay_memory_size,
        batch_size=trainer.minibatch_size,
    )

    device = torch.device("cuda") if use_gpu else None
    # first fill the replay buffer to burn_in
    train_after_ts = max(train_after_ts, trainer.minibatch_size)
    fill_replay_buffer(env=env, replay_buffer=replay_buffer, desired_size=train_after_ts)

    post_step = train_with_replay_buffer_post_step(
        replay_buffer=replay_buffer,
        env=env,
        trainer=trainer,
        training_freq=train_every_ts,
        batch_size=trainer.minibatch_size,
        device=device,
    )

    agent = Agent.create_for_env(
        env,
        policy=training_policy,
        post_transition_callback=post_step,
        # pyre-fixme[6]: Expected `Union[str, torch.device]` for 4th param but got
        #  `Optional[torch.device]`.
        device=device,
    )

    writer = SummaryWriter()
    with summary_writer_context(writer):
        train_rewards = []
        for i in range(num_train_episodes):
            trajectory = run_episode(env=env, agent=agent, mdp_id=i, max_steps=max_steps)
            ep_reward = trajectory.calculate_cumulative_reward()
            train_rewards.append(ep_reward)
            logger.info(f"Finished training episode {i} with reward {ep_reward}.")

    assert train_rewards[-1] >= passing_score_bar, (
        f"reward after {len(train_rewards)} episodes is {train_rewards[-1]}, "
        f"less than {passing_score_bar}...\n"
        f"Full reward history: {train_rewards}"
    )

    logger.info("============Train rewards=============")
    logger.info(train_rewards)

    serving_policy = manager.create_policy(serving=True)
    agent = Agent.create_for_env_with_serving_policy(env, serving_policy)

    eval_rewards = evaluate_for_n_episodes(
        n=num_eval_episodes, env=env, agent=agent, max_steps=max_steps
    ).squeeze(1)
    assert np.mean(eval_rewards) >= passing_score_bar, (
        f"Predictor reward is {np.mean(eval_rewards)}, "
        f"less than {passing_score_bar}...\n"
        f"Full eval rewards: {eval_rewards}."
    )

    logger.info("============Eval rewards==============")
    logger.info(eval_rewards)
def train_mdnrnn_and_compute_feature_stats(
    env_name: str,
    model: ModelManager__Union,
    num_train_transitions: int,
    num_test_transitions: int,
    seq_len: int,
    batch_size: int,
    num_train_epochs: int,
    use_gpu: bool,
    saved_mdnrnn_path: Optional[str] = None,
):
    """ Train MDNRNN Memory Network and compute feature importance/sensitivity. """
    env: gym.Env = EnvFactory.make(env_name)
    env.seed(SEED)

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=build_normalizer(env),
    )

    device = "cuda" if use_gpu else "cpu"
    trainer_preprocessor = make_replay_buffer_trainer_preprocessor(trainer, device, env)
    test_replay_buffer = ReplayBuffer.create_from_env(
        env=env,
        replay_memory_size=num_test_transitions,
        batch_size=batch_size,
        stack_size=seq_len,
        return_everything_as_stack=True,
    )
    fill_replay_buffer(env, test_replay_buffer, num_test_transitions)

    if saved_mdnrnn_path is None:
        # train from scratch
        trainer = train_mdnrnn(
            env=env,
            trainer=trainer,
            trainer_preprocessor=trainer_preprocessor,
            num_train_transitions=num_train_transitions,
            seq_len=seq_len,
            batch_size=batch_size,
            num_train_epochs=num_train_epochs,
            test_replay_buffer=test_replay_buffer,
        )
    else:
        # load a pretrained model, and just evaluate it
        trainer.memory_network.mdnrnn.load_state_dict(torch.load(saved_mdnrnn_path))

    with torch.no_grad():
        trainer.memory_network.mdnrnn.eval()
        test_batch = test_replay_buffer.sample_transition_batch_tensor(
            batch_size=test_replay_buffer.size
        )
        preprocessed_test_batch = trainer_preprocessor(test_batch)
        feature_importance = calculate_feature_importance(
            env=env,
            trainer=trainer,
            use_gpu=use_gpu,
            test_batch=preprocessed_test_batch,
        )
        feature_sensitivity = calculate_feature_sensitivity(
            env=env,
            trainer=trainer,
            use_gpu=use_gpu,
            test_batch=preprocessed_test_batch,
        )
        trainer.memory_network.mdnrnn.train()
    return feature_importance, feature_sensitivity
def run_test(
    env: str,
    model: ModelManager__Union,
    replay_memory_size: int,
    train_every_ts: int,
    train_after_ts: int,
    num_train_episodes: int,
    max_steps: Optional[int],
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
):
    env = EnvFactory.make(env)
    env.seed(SEED)
    env.action_space.seed(SEED)
    normalization = build_normalizer(env)
    logger.info(f"Normalization is {normalization}")

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )

    replay_buffer = ReplayBuffer.create_from_env(
        env=env,
        replay_memory_size=replay_memory_size,
        batch_size=trainer.minibatch_size,
    )

    device = torch.device("cuda") if use_gpu else None
    post_step = train_with_replay_buffer_post_step(
        replay_buffer=replay_buffer,
        trainer=trainer,
        training_freq=train_every_ts,
        batch_size=trainer.minibatch_size,
        replay_burnin=train_after_ts,
        device=device,
    )

    training_policy = manager.create_policy(serving=False)
    agent = Agent.create_for_env(
        env, policy=training_policy, post_transition_callback=post_step, device=device
    )

    train_rewards = []
    for i in range(num_train_episodes):
        ep_reward = run_episode(env=env, agent=agent, max_steps=max_steps)
        train_rewards.append(ep_reward)
        logger.info(f"Finished training episode {i} with reward {ep_reward}.")

    assert train_rewards[-1] >= passing_score_bar, (
        f"reward after {len(train_rewards)} episodes is {train_rewards[-1]}, "
        f"less than {passing_score_bar}...\n"
        f"Full reward history: {train_rewards}"
    )

    logger.info("============Train rewards=============")
    logger.info(train_rewards)

    def gym_to_reagent_serving(obs: np.ndarray) -> Tuple[torch.Tensor, torch.Tensor]:
        obs_tensor = torch.tensor(obs).float().unsqueeze(0)
        presence_tensor = torch.ones_like(obs_tensor)
        return (obs_tensor, presence_tensor)

    serving_policy = manager.create_policy(serving=True)
    agent = Agent.create_for_env(
        env, policy=serving_policy, obs_preprocessor=gym_to_reagent_serving
    )

    eval_rewards = []
    for i in range(num_eval_episodes):
        ep_reward = run_episode(env=env, agent=agent, max_steps=max_steps)
        eval_rewards.append(ep_reward)
        logger.info(f"Finished eval episode {i} with reward {ep_reward}.")

    assert np.mean(eval_rewards) >= passing_score_bar, (
        f"Predictor reward is {np.mean(eval_rewards)}, "
        f"less than {passing_score_bar}...\n"
        f"Full eval rewards: {eval_rewards}."
    )

    logger.info("============Eval rewards==============")
    logger.info(eval_rewards)
def run_test(
    env: Env__Union,
    model: ModelManager__Union,
    replay_memory_size: int,
    train_every_ts: int,
    train_after_ts: int,
    num_train_episodes: int,
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
    minibatch_size: Optional[int] = None,
):
    env = env.value
    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )
    training_policy = manager.create_policy(serving=False)

    # pyre-fixme[16]: Module `pl` has no attribute `LightningModule`.
    if not isinstance(trainer, pl.LightningModule):
        if minibatch_size is None:
            minibatch_size = trainer.minibatch_size
        assert minibatch_size == trainer.minibatch_size
    assert minibatch_size is not None

    replay_buffer = ReplayBuffer(
        replay_capacity=replay_memory_size, batch_size=minibatch_size
    )

    device = torch.device("cuda") if use_gpu else torch.device("cpu")
    # first fill the replay buffer to burn_in
    train_after_ts = max(train_after_ts, minibatch_size)
    fill_replay_buffer(env=env, replay_buffer=replay_buffer, desired_size=train_after_ts)

    # pyre-fixme[16]: Module `pl` has no attribute `LightningModule`.
    if isinstance(trainer, pl.LightningModule):
        agent = Agent.create_for_env(env, policy=training_policy)
        # TODO: Simplify this setup by creating LightningDataModule
        dataset = ReplayBufferDataset.create_for_trainer(
            trainer,
            env,
            agent,
            replay_buffer,
            batch_size=minibatch_size,
            training_frequency=train_every_ts,
            num_episodes=num_train_episodes,
            max_steps=200,
        )
        data_loader = torch.utils.data.DataLoader(dataset, collate_fn=identity_collate)
        # pyre-fixme[16]: Module `pl` has no attribute `Trainer`.
        pl_trainer = pl.Trainer(max_epochs=1, gpus=int(use_gpu))
        pl_trainer.fit(trainer, data_loader)

        # TODO: Also check train_reward
    else:
        post_step = train_with_replay_buffer_post_step(
            replay_buffer=replay_buffer,
            env=env,
            trainer=trainer,
            training_freq=train_every_ts,
            batch_size=trainer.minibatch_size,
            device=device,
        )
        env.seed(SEED)
        env.action_space.seed(SEED)

        train_rewards = train_policy(
            env,
            training_policy,
            num_train_episodes,
            post_step=post_step,
            post_episode=None,
            use_gpu=use_gpu,
        )

        # Check whether the max score passed the score bar; we explore during
        # training, so the return could be bad (leading to flakiness in C51 and QRDQN).
        assert np.max(train_rewards) >= passing_score_bar, (
            f"max reward ({np.max(train_rewards)}) after training for "
            f"{len(train_rewards)} episodes is less than {passing_score_bar}.\n"
        )

    serving_policy = manager.create_policy(serving=True)

    eval_rewards = eval_policy(env, serving_policy, num_eval_episodes, serving=True)
    assert (
        eval_rewards.mean() >= passing_score_bar
    ), f"Eval reward is {eval_rewards.mean()}, less than {passing_score_bar}.\n"
def run_test(
    env: Env__Union,
    model: ModelManager__Union,
    replay_memory_size: int,
    train_every_ts: int,
    train_after_ts: int,
    num_train_episodes: int,
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
):
    env = env.value
    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )
    training_policy = manager.create_policy(serving=False)

    replay_buffer = ReplayBuffer(
        replay_capacity=replay_memory_size, batch_size=trainer.minibatch_size
    )

    device = torch.device("cuda") if use_gpu else torch.device("cpu")
    # first fill the replay buffer to burn_in
    train_after_ts = max(train_after_ts, trainer.minibatch_size)
    fill_replay_buffer(env=env, replay_buffer=replay_buffer, desired_size=train_after_ts)

    post_step = train_with_replay_buffer_post_step(
        replay_buffer=replay_buffer,
        env=env,
        trainer=trainer,
        training_freq=train_every_ts,
        batch_size=trainer.minibatch_size,
        device=device,
    )

    env.seed(SEED)
    env.action_space.seed(SEED)

    train_rewards = train_policy(
        env,
        training_policy,
        num_train_episodes,
        post_step=post_step,
        post_episode=None,
        use_gpu=use_gpu,
    )

    # Check whether the max score passed the score bar; we explore during
    # training, so the return could be bad (leading to flakiness in C51 and QRDQN).
    assert np.max(train_rewards) >= passing_score_bar, (
        f"max reward ({np.max(train_rewards)}) after training for "
        f"{len(train_rewards)} episodes is less than {passing_score_bar}.\n"
    )

    serving_policy = manager.create_policy(serving=True)

    eval_rewards = eval_policy(env, serving_policy, num_eval_episodes, serving=True)
    assert (
        eval_rewards.mean() >= passing_score_bar
    ), f"Eval reward is {eval_rewards.mean()}, less than {passing_score_bar}.\n"
def test_sparse_input(self):
    replay_capacity = 100
    num_transitions = replay_capacity // 2
    memory = ReplayBuffer(
        stack_size=1, replay_capacity=replay_capacity, update_horizon=1
    )

    def trans(i):
        sparse_feat1 = list(range(0, i % 4))
        sparse_feat2 = list(range(i % 4, 4))
        id_list = {"sparse_feat1": sparse_feat1, "sparse_feat2": sparse_feat2}
        sparse_feat3 = (list(range(0, i % 7)), [k + 0.5 for k in range(0, i % 7)])
        sparse_feat4 = (list(range(i % 7, 7)), [k + 0.5 for k in range(i % 7, 7)])
        id_score_list = {"sparse_feat3": sparse_feat3, "sparse_feat4": sparse_feat4}
        return {
            "observation": np.ones(OBS_SHAPE, dtype=OBS_TYPE),
            "action": int(2 * i),
            "reward": float(3 * i),
            "terminal": i % 4,
            "id_list": id_list,
            "id_score_list": id_score_list,
        }

    for i in range(num_transitions):
        memory.add(**trans(i))

    indices = list(range(num_transitions - 1))
    batch = memory.sample_transition_batch(len(indices), torch.tensor(indices))

    # calculate expected
    res = {
        "id_list": {"sparse_feat1": ([], []), "sparse_feat2": ([], [])},
        "id_score_list": {
            "sparse_feat3": ([], [], []),
            "sparse_feat4": ([], [], []),
        },
        "next_id_list": {"sparse_feat1": ([], []), "sparse_feat2": ([], [])},
        "next_id_score_list": {
            "sparse_feat3": ([], [], []),
            "sparse_feat4": ([], [], []),
        },
    }
    for i in range(num_transitions - 1):
        feats_i = trans(i)
        feats_next = trans(i + 1)
        for k in ["id_list", "id_score_list"]:
            for feat_id in res[k]:
                res[k][feat_id][0].append(len(res[k][feat_id][1]))
                if k == "id_list":
                    res[k][feat_id][1].extend(feats_i[k][feat_id])
                else:
                    res[k][feat_id][1].extend(feats_i[k][feat_id][0])
                    res[k][feat_id][2].extend(feats_i[k][feat_id][1])
        for k in ["next_id_list", "next_id_score_list"]:
            for feat_id in res[k]:
                res[k][feat_id][0].append(len(res[k][feat_id][1]))
                orig_k = k[len("next_") :]
                if k == "next_id_list":
                    res[k][feat_id][1].extend(feats_next[orig_k][feat_id])
                else:
                    res[k][feat_id][1].extend(feats_next[orig_k][feat_id][0])
                    res[k][feat_id][2].extend(feats_next[orig_k][feat_id][1])

    for k in ["id_list", "id_score_list", "next_id_list", "next_id_score_list"]:
        for feat_id in res[k]:
            if k in ["id_list", "next_id_list"]:
                npt.assert_array_equal(
                    res[k][feat_id][0], getattr(batch, k)[feat_id][0]
                )
                npt.assert_array_equal(
                    res[k][feat_id][1], getattr(batch, k)[feat_id][1]
                )
            else:
                npt.assert_array_equal(
                    res[k][feat_id][0], getattr(batch, k)[feat_id][0]
                )
                npt.assert_array_equal(
                    res[k][feat_id][1], getattr(batch, k)[feat_id][1]
                )
                npt.assert_array_equal(
                    res[k][feat_id][2], getattr(batch, k)[feat_id][2]
                )

    # sample random
    _ = memory.sample_transition_batch(10)
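# For readers unfamiliar with the sparse batch layout asserted above, a small
# standalone sketch of how per-transition id lists flatten into (offsets, values)
# and id-score lists into (offsets, keys, scores). This mirrors how the expected
# `res` dict is built in the test; it is not the ReplayBuffer's internal implementation.
per_transition_ids = [[0, 1], [], [0, 1, 2]]  # one id_list feature over 3 transitions

offsets, values = [], []
for ids in per_transition_ids:
    offsets.append(len(values))  # where this transition's ids start
    values.extend(ids)
print(offsets, values)  # [0, 2, 2] [0, 1, 0, 1, 2]

# id_score_list features carry a parallel score array.
per_transition_id_scores = [([0, 1], [0.5, 1.5]), ([], []), ([2], [2.5])]
offsets2, keys, scores = [], [], []
for ids, s in per_transition_id_scores:
    offsets2.append(len(keys))
    keys.extend(ids)
    scores.extend(s)
print(offsets2, keys, scores)  # [0, 2, 2] [0, 1, 2] [0.5, 1.5, 2.5]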
def test_replay_overflow(self):
    """
    It is hard to make a stress test for this, since tracking which indices get
    replaced would effectively be building a second RB, so instead we opt for a
    simple test. stack_size = 2, so there is 1 padding frame.
    """
    multi_steps = 2
    stack_size = 2
    memory = ReplayBuffer(
        stack_size=stack_size,
        replay_capacity=6,
        batch_size=1,
        update_horizon=multi_steps,
        return_everything_as_stack=None,
        return_as_timeline_format=True,
    )

    def trans(i):
        return {
            "observation": np.ones(OBS_SHAPE, dtype=OBS_TYPE),
            "action": int(2 * i),
            "reward": float(3 * i),
        }

    # Contents of RB
    # start: [X, X, X, X, X, X]
    npt.assert_array_equal(
        memory._is_index_valid, [False, False, False, False, False, False]
    )

    # t0: [X, s0, X, X, X, X]
    memory.add(**trans(0), terminal=False)
    npt.assert_array_equal(
        memory._is_index_valid, [False, False, False, False, False, False]
    )

    # t1: [X, s0, s1, X, X, X]
    memory.add(**trans(1), terminal=False)
    npt.assert_array_equal(
        memory._is_index_valid, [False, False, False, False, False, False]
    )

    # t2: [X, s0, s1, s2, X, X]
    # s0 finally becomes valid as its next state was added
    memory.add(**trans(2), terminal=False)
    npt.assert_array_equal(
        memory._is_index_valid, [False, True, False, False, False, False]
    )
    batch = memory.sample_all_valid_transitions()
    npt.assert_array_equal(batch.action, [[0, 0]])
    npt.assert_array_equal(batch.next_action[0], [[0, 2], [2, 4]])

    # t3: [X, s0, s1, s2, s3, X]
    # episode termination validates whole episode
    memory.add(**trans(3), terminal=True)
    npt.assert_array_equal(
        memory._is_index_valid, [False, True, True, True, True, False]
    )
    batch = memory.sample_all_valid_transitions()
    npt.assert_array_equal(batch.action, [[0, 0], [0, 2], [2, 4], [4, 6]])
    npt.assert_array_equal(batch.next_action[0], [[0, 2], [2, 4]])
    npt.assert_array_equal(batch.next_action[1], [[2, 4], [4, 6]])
    # batch.next_action[2][1] is garbage
    npt.assert_array_equal(batch.next_action[2][0], [4, 6])
    # batch.next_action[3] is [garbage]

    # t4: [s4, s0, s1, s2, s3, X]
    # s0 invalidated as its previous frame is corrupted
    memory.add(**trans(4), terminal=False)
    npt.assert_array_equal(
        memory._is_index_valid, [False, False, True, True, True, False]
    )
    batch = memory.sample_all_valid_transitions()
    npt.assert_array_equal(batch.action, [[0, 2], [2, 4], [4, 6]])
    npt.assert_array_equal(batch.next_action[0], [[2, 4], [4, 6]])
    npt.assert_array_equal(batch.next_action[1][0], [4, 6])

    # t5: [s4, s5, s1, s2, s3, X]
    memory.add(**trans(5), terminal=False)
    npt.assert_array_equal(
        memory._is_index_valid, [False, False, False, True, True, False]
    )
    batch = memory.sample_all_valid_transitions()
    npt.assert_array_equal(batch.action, [[2, 4], [4, 6]])
    npt.assert_array_equal(batch.next_action[0][0], [4, 6])

    # t6: [s4, s5, s6, s2, s3, X]
    memory.add(**trans(6), terminal=True)
    npt.assert_array_equal(
        memory._is_index_valid, [True, True, True, False, True, False]
    )
    batch = memory.sample_all_valid_transitions()
    npt.assert_array_equal(batch.action, [[0, 8], [8, 10], [10, 12], [4, 6]])
    npt.assert_array_equal(batch.next_action[0], [[8, 10], [10, 12]])
    npt.assert_array_equal(batch.next_action[1][0], [10, 12])
    # batch.next_action[2] is [garbage]
    # batch.next_action[3] is [garbage]

    logger.info("Overflow test passes!")
def run_test_offline(
    env_name: str,
    model: ModelManager__Union,
    replay_memory_size: int,
    num_batches_per_epoch: int,
    num_train_epochs: int,
    passing_score_bar: float,
    num_eval_episodes: int,
    minibatch_size: int,
    use_gpu: bool,
):
    env = Gym(env_name=env_name)
    env.seed(SEED)
    env.action_space.seed(SEED)
    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    trainer = manager.build_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )

    # first fill the replay buffer to burn_in
    replay_buffer = ReplayBuffer(
        replay_capacity=replay_memory_size, batch_size=minibatch_size
    )
    # always fill full RB
    random_policy = make_random_policy_for_env(env)
    agent = Agent.create_for_env(env, policy=random_policy)
    fill_replay_buffer(
        env=env,
        replay_buffer=replay_buffer,
        desired_size=replay_memory_size,
        agent=agent,
    )

    device = torch.device("cuda") if use_gpu else None
    dataset = OfflineReplayBufferDataset.create_for_trainer(
        trainer,
        env,
        replay_buffer,
        batch_size=minibatch_size,
        num_batches=num_batches_per_epoch,
        device=device,
    )
    data_loader = torch.utils.data.DataLoader(dataset, collate_fn=identity_collate)
    pl_trainer = pl.Trainer(
        max_epochs=num_train_epochs,
        gpus=int(use_gpu),
        deterministic=True,
        default_root_dir=f"lightning_log_{str(uuid.uuid4())}",
    )
    pl_trainer.fit(trainer, data_loader)

    logger.info(f"Evaluating after training for {num_train_epochs} epochs: ")
    eval_rewards = evaluate_cem(env, manager, trainer, num_eval_episodes)
    mean_rewards = np.mean(eval_rewards)
    assert (
        mean_rewards >= passing_score_bar
    ), f"{mean_rewards} doesn't pass the bar {passing_score_bar}."
def __call__(self, replay_buffer: ReplayBuffer, transition: Transition):
    replay_buffer.add(**transition.asdict())
def run_test(
    env_name: str,
    model: ModelManager__Union,
    replay_memory_size: int,
    train_every_ts: int,
    train_after_ts: int,
    num_train_episodes: int,
    max_steps: Optional[int],
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
):
    env = EnvFactory.make(env_name)
    env.seed(SEED)
    env.action_space.seed(SEED)
    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    try:
        # pyre-fixme[16]: `Env` has no attribute `state_feature_config_provider`.
        manager.state_feature_config_provider = env.state_feature_config_provider
        logger.info(
            f"Using environment's state_feature_config_provider.\n"
            f"{manager.state_feature_config_provider}"
        )
    except AttributeError:
        logger.info("state_feature_config_provider override not applicable")

    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )
    training_policy = manager.create_policy(serving=False)

    replay_buffer = ReplayBuffer.create_from_env(
        env=env,
        replay_memory_size=replay_memory_size,
        batch_size=trainer.minibatch_size,
    )

    device = torch.device("cuda") if use_gpu else torch.device("cpu")
    # first fill the replay buffer to burn_in
    train_after_ts = max(train_after_ts, trainer.minibatch_size)
    fill_replay_buffer(
        env=env, replay_buffer=replay_buffer, desired_size=train_after_ts
    )

    post_step = train_with_replay_buffer_post_step(
        replay_buffer=replay_buffer,
        env=env,
        trainer=trainer,
        training_freq=train_every_ts,
        batch_size=trainer.minibatch_size,
        device=device,
    )

    agent = Agent.create_for_env(
        env, policy=training_policy, post_transition_callback=post_step, device=device
    )

    writer = SummaryWriter()
    with summary_writer_context(writer):
        train_rewards = []
        for i in range(num_train_episodes):
            trajectory = run_episode(
                env=env, agent=agent, mdp_id=i, max_steps=max_steps
            )
            ep_reward = trajectory.calculate_cumulative_reward()
            train_rewards.append(ep_reward)
            logger.info(
                f"Finished training episode {i} (len {len(trajectory)})"
                f" with reward {ep_reward}."
            )

    logger.info("============Train rewards=============")
    logger.info(train_rewards)
    logger.info(f"average: {np.mean(train_rewards)};\tmax: {np.max(train_rewards)}")

    # Check whether the max score passed the score bar; we explore during
    # training, so the return could be bad (leading to flakiness in C51 and QRDQN).
    assert np.max(train_rewards) >= passing_score_bar, (
        f"max reward ({np.max(train_rewards)}) after training for "
        f"{len(train_rewards)} episodes is less than {passing_score_bar}.\n"
    )

    serving_policy = manager.create_policy(serving=True)
    agent = Agent.create_for_env_with_serving_policy(env, serving_policy)

    eval_rewards = evaluate_for_n_episodes(
        n=num_eval_episodes, env=env, agent=agent, max_steps=max_steps
    ).squeeze(1)

    logger.info("============Eval rewards==============")
    logger.info(eval_rewards)
    logger.info(f"average: {np.mean(eval_rewards)};\tmax: {np.max(eval_rewards)}")
    assert np.mean(eval_rewards) >= passing_score_bar, (
        f"Predictor reward is {np.mean(eval_rewards)}, "
        f"less than {passing_score_bar}.\n"
    )
def run_test_replay_buffer(
    env: Env__Union,
    model: ModelManager__Union,
    replay_memory_size: int,
    train_every_ts: int,
    train_after_ts: int,
    num_train_episodes: int,
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
    minibatch_size: Optional[int] = None,
):
    """
    Run an online learning test with a replay buffer. The replay buffer is pre-filled,
    then the training starts. Each transition is added to the replay buffer immediately
    after it takes place.
    """
    env = env.value
    # pyre-fixme[16]: Module `pl` has no attribute `seed_everything`.
    pl.seed_everything(SEED)
    env.seed(SEED)
    env.action_space.seed(SEED)

    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )
    training_policy = manager.create_policy(serving=False)

    # pyre-fixme[16]: Module `pl` has no attribute `LightningModule`.
    if not isinstance(trainer, pl.LightningModule):
        if minibatch_size is None:
            minibatch_size = trainer.minibatch_size
        assert minibatch_size == trainer.minibatch_size
    assert minibatch_size is not None

    replay_buffer = ReplayBuffer(
        replay_capacity=replay_memory_size, batch_size=minibatch_size
    )

    device = torch.device("cuda") if use_gpu else torch.device("cpu")
    # first fill the replay buffer using a random policy
    train_after_ts = max(train_after_ts, minibatch_size)
    fill_replay_buffer(env=env, replay_buffer=replay_buffer, desired_size=train_after_ts)

    agent = Agent.create_for_env(env, policy=training_policy, device=device)
    # TODO: Simplify this setup by creating LightningDataModule
    dataset = ReplayBufferDataset.create_for_trainer(
        trainer,
        env,
        agent,
        replay_buffer,
        batch_size=minibatch_size,
        training_frequency=train_every_ts,
        num_episodes=num_train_episodes,
        max_steps=200,
        device=device,
    )
    data_loader = torch.utils.data.DataLoader(dataset, collate_fn=identity_collate)
    # pyre-fixme[16]: Module `pl` has no attribute `Trainer`.
    pl_trainer = pl.Trainer(max_epochs=1, gpus=int(use_gpu))
    # Note: the fit() function below also evaluates the agent along the way
    # and adds the new transitions to the replay buffer, so it is training
    # on incrementally larger and larger buffers.
    pl_trainer.fit(trainer, data_loader)

    # TODO: Also check train_reward

    serving_policy = manager.create_policy(serving=True)

    eval_rewards = eval_policy(env, serving_policy, num_eval_episodes, serving=True)
    assert (
        eval_rewards.mean() >= passing_score_bar
    ), f"Eval reward is {eval_rewards.mean()}, less than {passing_score_bar}.\n"