def evaluate_cem(env, manager, num_eval_episodes: int):
    # NOTE: for CEM, serving isn't implemented
    policy = manager.create_policy(serving=False)
    agent = Agent.create_for_env(env, policy)
    return evaluate_for_n_episodes(
        n=num_eval_episodes, env=env, agent=agent, max_steps=env.max_steps
    )
def eval_policy(
    env: EnvWrapper,
    serving_policy: Policy,
    num_eval_episodes: int,
    serving: bool = True,
) -> np.ndarray:
    agent = (
        Agent.create_for_env_with_serving_policy(env, serving_policy)
        if serving
        else Agent.create_for_env(env, serving_policy)
    )

    eval_rewards = evaluate_for_n_episodes(
        n=num_eval_episodes,
        env=env,
        agent=agent,
        max_steps=env.max_steps,
        num_processes=1,
    ).squeeze(1)

    logger.info("============Eval rewards==============")
    logger.info(eval_rewards)
    mean_eval = np.mean(eval_rewards)
    logger.info(f"average: {mean_eval};\tmax: {np.max(eval_rewards)}")
    return np.array(eval_rewards)
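# Hedged usage sketch (not part of the original module): how eval_policy is
# typically wired up, using the Gym wrapper and model manager seen elsewhere
# in this file. The environment name and episode count are illustrative.
def _example_eval_policy_usage(manager) -> np.ndarray:
    env = Gym(env_name="CartPole-v0")  # illustrative environment
    serving_policy = manager.create_policy(serving=True)
    # Returns one scalar reward per evaluated episode, i.e. shape (20,).
    return eval_policy(env, serving_policy, num_eval_episodes=20, serving=True)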
def offline_gym(
    env: str,
    pkl_path: str,
    num_episodes_for_data_batch: int,
    max_steps: Optional[int],
    seed: Optional[int] = None,
):
    """
    Generate samples from a DiscreteRandomPolicy on the Gym environment and
    save the results as a pickled pandas DataFrame.
    """
    initialize_seed(seed)
    env = EnvFactory.make(env)

    policy = DiscreteRandomPolicy.create_for_env(env)
    dataset = RLDataset()
    for i in range(num_episodes_for_data_batch):
        logger.info(f"Starting episode {i}")
        post_step = log_data_post_step(dataset=dataset, mdp_id=str(i), env=env)
        agent = Agent.create_for_env(env, policy, post_transition_callback=post_step)
        run_episode(env=env, agent=agent, max_steps=max_steps)

    logger.info(f"Saving dataset with {len(dataset)} samples to {pkl_path}")
    df = dataset.to_pandas_df()
    df.to_pickle(pkl_path)
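# Hedged usage sketch (not part of the original module): generating a small
# offline dataset with offline_gym. The environment name, output path, and
# sizes below are illustrative assumptions.
def _example_offline_gym_usage() -> None:
    offline_gym(
        env="CartPole-v0",                     # illustrative environment name
        pkl_path="/tmp/cartpole_offline.pkl",  # illustrative output path
        num_episodes_for_data_batch=100,
        max_steps=200,
        seed=0,
    )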
def train_policy(
    env: EnvWrapper,
    training_policy: Policy,
    num_train_episodes: int,
    post_step: Optional[PostStep] = None,
    post_episode: Optional[PostEpisode] = None,
    use_gpu: bool = False,
) -> np.ndarray:
    device = torch.device("cuda") if use_gpu else torch.device("cpu")
    agent = Agent.create_for_env(
        env,
        policy=training_policy,
        post_transition_callback=post_step,
        post_episode_callback=post_episode,
        device=device,
    )
    running_reward = 0
    writer = SummaryWriter()
    with summary_writer_context(writer):
        train_rewards = []
        with trange(num_train_episodes, unit=" episode") as t:
            for i in t:
                # Note: run_episode also performs a training step for the
                # agent, if specified in post_step
                trajectory = run_episode(env=env, agent=agent, mdp_id=i, max_steps=200)
                ep_reward = trajectory.calculate_cumulative_reward()
                train_rewards.append(ep_reward)
                running_reward *= REWARD_DECAY
                running_reward += (1 - REWARD_DECAY) * ep_reward
                t.set_postfix(reward=running_reward)

    logger.info("============Train rewards=============")
    logger.info(train_rewards)
    logger.info(f"average: {np.mean(train_rewards)};\tmax: {np.max(train_rewards)}")
    return np.array(train_rewards)
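# Self-contained sketch of the running-reward bookkeeping in train_policy: an
# exponential moving average in which the old value decays and each new
# episode reward contributes with weight (1 - decay). REWARD_DECAY is a
# module-level constant; 0.99 below is an assumed illustrative value.
def _example_running_reward(episode_rewards: List[float], decay: float = 0.99) -> float:
    running = 0.0
    for ep_reward in episode_rewards:
        # Identical update to the loop body of train_policy above.
        running = decay * running + (1 - decay) * ep_reward
    return running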
def evaluate_gym(
    env_name: str,
    model: torch.nn.Module,
    eval_temperature: float,
    num_eval_episodes: int,
    passing_score_bar: float,
    max_steps: Optional[int] = None,
):
    env: gym.Env = EnvFactory.make(env_name)
    policy = create_predictor_policy_from_model(
        env, model, eval_temperature=eval_temperature
    )

    # since we already return the softmax action, override the action_extractor
    agent = Agent.create_for_env(
        env, policy=policy, action_extractor=policy.get_action_extractor()
    )

    rewards = []
    for _ in range(num_eval_episodes):
        ep_reward = run_episode(env=env, agent=agent, max_steps=max_steps)
        rewards.append(ep_reward)

    avg_reward = np.mean(rewards)
    logger.info(
        f"Average reward over {num_eval_episodes} episodes is {avg_reward}.\n"
        f"List of rewards: {rewards}"
    )
    assert (
        avg_reward >= passing_score_bar
    ), f"{avg_reward} fails to pass the bar of {passing_score_bar}!"
def run_test_offline(
    env_name: str,
    model: ModelManager__Union,
    replay_memory_size: int,
    num_batches_per_epoch: int,
    num_train_epochs: int,
    passing_score_bar: float,
    num_eval_episodes: int,
    minibatch_size: int,
    use_gpu: bool,
):
    env = Gym(env_name=env_name)
    env.seed(SEED)
    env.action_space.seed(SEED)
    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )

    # first fill the replay buffer to burn_in
    replay_buffer = ReplayBuffer(
        replay_capacity=replay_memory_size, batch_size=minibatch_size
    )
    # always fill full RB
    random_policy = make_random_policy_for_env(env)
    agent = Agent.create_for_env(env, policy=random_policy)
    fill_replay_buffer(
        env=env,
        replay_buffer=replay_buffer,
        desired_size=replay_memory_size,
        agent=agent,
    )

    device = torch.device("cuda") if use_gpu else None
    # pyre-fixme[6]: Expected `device` for 2nd param but got `Optional[torch.device]`.
    trainer_preprocessor = make_replay_buffer_trainer_preprocessor(trainer, device, env)

    writer = SummaryWriter()
    with summary_writer_context(writer):
        for epoch in range(num_train_epochs):
            logger.info(f"Evaluating before epoch {epoch}: ")
            eval_rewards = evaluate_cem(env, manager, 1)
            for _ in tqdm(range(num_batches_per_epoch)):
                train_batch = replay_buffer.sample_transition_batch()
                preprocessed_batch = trainer_preprocessor(train_batch)
                trainer.train(preprocessed_batch)

    logger.info(f"Evaluating after training for {num_train_epochs} epochs: ")
    eval_rewards = evaluate_cem(env, manager, num_eval_episodes)
    mean_rewards = np.mean(eval_rewards)
    assert (
        mean_rewards >= passing_score_bar
    ), f"{mean_rewards} doesn't pass the bar {passing_score_bar}."
def run_test_online_episode(
    env: Env__Union,
    model: ModelManager__Union,
    num_train_episodes: int,
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
):
    """
    Run an online learning test. At the end of each episode, training is run
    on the trajectory.
    """
    env = env.value
    pl.seed_everything(SEED)
    env.seed(SEED)
    env.action_space.seed(SEED)
    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    trainer = manager.build_trainer(
        use_gpu=use_gpu,
        normalization_data_map=normalization,
    )
    policy = manager.create_policy(trainer, serving=False)

    device = torch.device("cuda") if use_gpu else torch.device("cpu")

    agent = Agent.create_for_env(env, policy, device=device)

    pl_trainer = pl.Trainer(
        max_epochs=1,
        gpus=int(use_gpu),
        deterministic=True,
        default_root_dir=f"lightning_log_{str(uuid.uuid4())}",
    )
    dataset = EpisodicDataset(
        env=env, agent=agent, num_episodes=num_train_episodes, seed=SEED
    )
    data_loader = torch.utils.data.DataLoader(dataset, collate_fn=identity_collate)
    pl_trainer.fit(trainer, data_loader)

    eval_rewards = evaluate_for_n_episodes(
        n=num_eval_episodes,
        env=env,
        agent=agent,
        max_steps=env.max_steps,
        num_processes=1,
    ).squeeze(1)
    assert (
        eval_rewards.mean() >= passing_score_bar
    ), f"Eval reward is {eval_rewards.mean()}, less than {passing_score_bar}.\n"
def fill_replay_buffer(env, replay_buffer: ReplayBuffer, desired_size: int):
    """
    Fill replay buffer with random transitions until size reaches desired_size.
    """
    assert (
        0 < desired_size and desired_size <= replay_buffer._replay_capacity
    ), f"It's not true that 0 < {desired_size} <= {replay_buffer._replay_capacity}."
    assert replay_buffer.size < desired_size, (
        f"Replay buffer already has {replay_buffer.size} elements "
        f"(more than desired_size = {desired_size})."
    )
    logger.info(
        f"Starting to fill replay buffer using random policy to size: {desired_size}."
    )
    random_policy = make_random_policy_for_env(env)
    post_step = add_replay_buffer_post_step(replay_buffer, env=env)
    agent = Agent.create_for_env(
        env, policy=random_policy, post_transition_callback=post_step
    )
    max_episode_steps = env.max_steps
    with tqdm(
        total=desired_size - replay_buffer.size,
        desc=f"Filling replay buffer from {replay_buffer.size} to size {desired_size} using random policy",
    ) as pbar:
        mdp_id = 0
        while replay_buffer.size < desired_size:
            last_size = replay_buffer.size
            max_steps = desired_size - replay_buffer.size - 1
            if max_episode_steps is not None:
                max_steps = min(max_episode_steps, max_steps)
            run_episode(env=env, agent=agent, mdp_id=mdp_id, max_steps=max_steps)
            size_delta = replay_buffer.size - last_size
            # The assertion below is commented out because it can't handle
            # input samples that have seq_len > 1. This should be treated as
            # a bug and fixed in the future.
            # assert (
            #     size_delta >= 0
            # ), f"size delta is {size_delta} which should be non-negative."
            pbar.update(n=size_delta)
            mdp_id += 1
            if size_delta <= 0:
                # replay buffer size isn't increasing... so stop early
                break

    if replay_buffer.size >= desired_size:
        logger.info(f"Successfully filled replay buffer to size: {replay_buffer.size}!")
    else:
        logger.info(
            f"Stopped early and filled replay buffer to size: {replay_buffer.size}."
        )
def run_test(
    env: str,
    model: ModelManager__Union,
    replay_memory_size: int,
    train_every_ts: int,
    train_after_ts: int,
    num_episodes: int,
    max_steps: Optional[int],
    last_score_bar: float,
):
    env = EnvFactory.make(env)
    env.seed(SEED)
    env.action_space.seed(SEED)

    normalization = build_normalizer(env)
    logger.info(f"Normalization is {normalization}")

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=False,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )
    policy = manager.create_policy()

    replay_buffer = ReplayBuffer.create_from_env(
        env=env,
        replay_memory_size=replay_memory_size,
        batch_size=trainer.minibatch_size,
    )

    post_step = train_with_replay_buffer_post_step(
        replay_buffer=replay_buffer,
        trainer=trainer,
        training_freq=train_every_ts,
        batch_size=trainer.minibatch_size,
        replay_burnin=train_after_ts,
    )

    agent = Agent.create_for_env(env, policy=policy, post_transition_callback=post_step)

    reward_history = []
    for i in range(num_episodes):
        logger.info(f"running episode {i}")
        ep_reward = run_episode(env=env, agent=agent, max_steps=max_steps)
        reward_history.append(ep_reward)

    assert reward_history[-1] >= last_score_bar, (
        f"reward after {len(reward_history)} episodes is {reward_history[-1]}, "
        f"less than {last_score_bar}...\n"
        f"Full reward history: {reward_history}"
    )
    return reward_history
def fill_replay_buffer(env: Env, replay_buffer: ReplayBuffer, desired_size: int):
    """
    Fill replay buffer with random transitions until size reaches desired_size.
    """
    assert (
        0 < desired_size and desired_size <= replay_buffer._replay_capacity
    ), f"It's not true that 0 < {desired_size} <= {replay_buffer._replay_capacity}."
    assert replay_buffer.size < desired_size, (
        f"Replay buffer already has {replay_buffer.size} elements "
        f"(more than desired_size = {desired_size})."
    )
    logger.info(f"Starting to fill replay buffer to size: {desired_size}.")
    random_policy = make_random_policy_for_env(env)
    post_step = add_replay_buffer_post_step(replay_buffer, env=env)
    agent = Agent.create_for_env(
        env, policy=random_policy, post_transition_callback=post_step
    )
    max_episode_steps = get_max_steps(env)
    with tqdm(
        total=desired_size - replay_buffer.size,
        desc=f"Filling replay buffer from {replay_buffer.size} to size {desired_size}",
    ) as pbar:
        mdp_id = 0
        while replay_buffer.size < desired_size:
            last_size = replay_buffer.size
            max_steps = desired_size - replay_buffer.size - 1
            if max_episode_steps is not None:
                max_steps = min(max_episode_steps, max_steps)
            run_episode(env=env, agent=agent, mdp_id=mdp_id, max_steps=max_steps)
            size_delta = replay_buffer.size - last_size
            assert (
                size_delta >= 0
            ), f"size delta is {size_delta} which should be non-negative."
            pbar.update(n=size_delta)
            mdp_id += 1
            if size_delta == 0:
                # replay buffer size isn't increasing... so stop early
                break

    if replay_buffer.size >= desired_size:
        logger.info(f"Successfully filled replay buffer to size: {replay_buffer.size}!")
    else:
        logger.info(
            f"Stopped early and filled replay buffer to size: {replay_buffer.size}."
        )
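# Self-contained sketch of the per-episode step budget used by both
# fill_replay_buffer variants above: each episode may run for at most the
# remaining capacity minus one, further capped by the environment's episode
# limit when one exists. The arithmetic mirrors the loop bodies above; the
# values passed in are illustrative.
def _example_fill_step_budget(
    buffer_size: int, desired_size: int, max_episode_steps: Optional[int] = None
) -> int:
    max_steps = desired_size - buffer_size - 1  # same arithmetic as above
    if max_episode_steps is not None:
        max_steps = min(max_episode_steps, max_steps)
    return max_steps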
def offline_gym_random(
    env_name: str,
    pkl_path: str,
    num_train_transitions: int,
    max_steps: Optional[int],
    seed: int = 1,
):
    """
    Generate samples from a random Policy on the Gym environment and
    save the results as a pickled pandas DataFrame.
    """
    env = Gym(env_name=env_name)
    random_policy = make_random_policy_for_env(env)
    agent = Agent.create_for_env(env, policy=random_policy)
    return _offline_gym(env, agent, pkl_path, num_train_transitions, max_steps, seed)
def train_mdnrnn(
    env: EnvWrapper,
    trainer: MDNRNNTrainer,
    trainer_preprocessor,
    num_train_transitions: int,
    seq_len: int,
    batch_size: int,
    num_train_epochs: int,
    # for optional validation
    test_replay_buffer=None,
):
    train_replay_buffer = ReplayBuffer(
        replay_capacity=num_train_transitions,
        batch_size=batch_size,
        stack_size=seq_len,
        return_everything_as_stack=True,
    )
    random_policy = make_random_policy_for_env(env)
    agent = Agent.create_for_env(env, policy=random_policy)
    fill_replay_buffer(env, train_replay_buffer, num_train_transitions, agent)
    num_batch_per_epoch = train_replay_buffer.size // batch_size
    logger.info("Made RBs, starting to train now!")

    optimizer = trainer.configure_optimizers()[0]
    for _ in range(num_train_epochs):
        for i in range(num_batch_per_epoch):
            batch = train_replay_buffer.sample_transition_batch(batch_size=batch_size)
            preprocessed_batch = trainer_preprocessor(batch)
            loss = next(trainer.train_step_gen(preprocessed_batch, i))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # validation
        if test_replay_buffer is not None:
            with torch.no_grad():
                trainer.memory_network.mdnrnn.eval()
                test_batch = test_replay_buffer.sample_transition_batch(
                    batch_size=batch_size
                )
                preprocessed_test_batch = trainer_preprocessor(test_batch)
                valid_losses = trainer.get_loss(preprocessed_test_batch)
                trainer.memory_network.mdnrnn.train()
    return trainer
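# Self-contained sketch of the manual optimization pattern used in
# train_mdnrnn above (compute loss, zero_grad, backward, step), shown with a
# plain torch module; the model, data, and learning rate are illustrative
# stand-ins, not the MDNRNN itself.
def _example_manual_train_step() -> float:
    model = torch.nn.Linear(4, 1)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    x, y = torch.randn(8, 4), torch.randn(8, 1)
    loss = torch.nn.functional.mse_loss(model(x), y)
    optimizer.zero_grad()  # clear stale gradients before backprop
    loss.backward()        # accumulate fresh gradients
    optimizer.step()       # apply the update
    return loss.item()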
def create_embed_rl_dataset(
    env: EnvWrapper,
    memory_network: MemoryNetwork,
    num_state_embed_transitions: int,
    batch_size: int,
    seq_len: int,
    hidden_dim: int,
    use_gpu: bool,
):
    assert isinstance(env.action_space, gym.spaces.Discrete)
    assert isinstance(env.observation_space, gym.spaces.Box)
    assert len(env.observation_space.shape) == 1
    logger.info("Starting to create embedded RL Dataset!")

    # seq_len + 1 because the MDNRNN embeds the first seq_len steps, and then
    # the embedded state is concatenated with the last step.
    # I.e., (o_1, o_2, ..., o_n) -> RNN -> h_1, h_2, ..., h_n
    # and we set s_{n+1} = [o_{n+1}, h_n]
    embed_env = StateEmbedEnvironment(
        gym_env=env, mdnrnn=memory_network, max_embed_seq_len=seq_len + 1
    )
    # now create a filled replay buffer of embeddings
    # new obs shape dim = state_dim + hidden_dim
    embed_rb = ReplayBuffer(
        replay_capacity=num_state_embed_transitions,
        batch_size=batch_size,
        stack_size=1,
    )
    random_policy = make_random_policy_for_env(env)
    agent = Agent.create_for_env(env, policy=random_policy)
    fill_replay_buffer(
        env=embed_env,
        replay_buffer=embed_rb,
        desired_size=num_state_embed_transitions,
        agent=agent,
    )
    batch = embed_rb.sample_transition_batch(batch_size=num_state_embed_transitions)
    state_min = min(batch.state.min(), batch.next_state.min()).item()
    state_max = max(batch.state.max(), batch.next_state.max()).item()
    logger.info(
        f"Finished making embed dataset with size {embed_rb.size}, "
        f"min {state_min}, max {state_max}"
    )
    return embed_rb, state_min, state_max
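# Self-contained sketch of the shape arithmetic behind the comment in
# create_embed_rl_dataset: the embedded state concatenates the raw observation
# with the RNN hidden state, so its dimension is state_dim + hidden_dim. The
# dimensions below are illustrative.
def _example_embed_state_shape() -> torch.Size:
    state_dim, hidden_dim = 4, 64      # e.g. a CartPole obs + an assumed hidden size
    o_next = torch.zeros(state_dim)    # o_{n+1}
    h_n = torch.zeros(hidden_dim)      # h_n from the RNN
    s_next = torch.cat([o_next, h_n])  # s_{n+1} = [o_{n+1}, h_n]
    return s_next.shape                # torch.Size([68])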
def setUp(self):
    logging.getLogger().setLevel(logging.DEBUG)
    env = Gym("CartPole-v0")
    norm = build_normalizer(env)
    net_builder = FullyConnected(sizes=[8], activations=["linear"])
    cartpole_scorer = net_builder.build_q_network(
        state_feature_config=None,
        state_normalization_data=norm["state"],
        output_dim=len(norm["action"].dense_normalization_parameters),
    )
    policy = Policy(scorer=cartpole_scorer, sampler=SoftmaxActionSampler())
    agent = Agent.create_for_env(env, policy)
    self.max_steps = 3
    self.num_episodes = 6
    self.dataset = EpisodicDataset(
        env=env,
        agent=agent,
        num_episodes=self.num_episodes,
        seed=0,
        max_steps=self.max_steps,
    )
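# Hedged sketch (not part of the original tests): the EpisodicDataset built in
# setUp is consumed the same way as in the run_test_* functions in this file,
# i.e. wrapped in a DataLoader with identity_collate so each batch is a single
# trajectory that a Lightning trainer can fit on.
def _example_episodic_dataset_loader(dataset) -> torch.utils.data.DataLoader:
    return torch.utils.data.DataLoader(dataset, collate_fn=identity_collate)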
def train_mdnrnn_and_train_on_embedded_env(
    env_name: str,
    embedding_model: ModelManager__Union,
    num_embedding_train_transitions: int,
    seq_len: int,
    batch_size: int,
    num_embedding_train_epochs: int,
    train_model: ModelManager__Union,
    num_state_embed_transitions: int,
    num_agent_train_epochs: int,
    num_agent_eval_epochs: int,
    use_gpu: bool,
    passing_score_bar: float,
    # pyre-fixme[9]: saved_mdnrnn_path has type `str`; used as `None`.
    saved_mdnrnn_path: str = None,
):
    """
    Train an agent on states embedded by the MDNRNN.
    """
    env = EnvFactory.make(env_name)
    env.seed(SEED)

    embedding_manager = embedding_model.value
    embedding_trainer = embedding_manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=build_normalizer(env),
    )

    device = "cuda" if use_gpu else "cpu"
    embedding_trainer_preprocessor = make_replay_buffer_trainer_preprocessor(
        embedding_trainer, device, env
    )
    if saved_mdnrnn_path is None:
        # train from scratch
        embedding_trainer = train_mdnrnn(
            env=env,
            trainer=embedding_trainer,
            trainer_preprocessor=embedding_trainer_preprocessor,
            num_train_transitions=num_embedding_train_transitions,
            seq_len=seq_len,
            batch_size=batch_size,
            num_train_epochs=num_embedding_train_epochs,
        )
    else:
        # load a pretrained model, and just evaluate it
        embedding_trainer.memory_network.mdnrnn.load_state_dict(
            torch.load(saved_mdnrnn_path)
        )

    # create embedding dataset
    embed_rb, state_min, state_max = create_embed_rl_dataset(
        env=env,
        memory_network=embedding_trainer.memory_network,
        num_state_embed_transitions=num_state_embed_transitions,
        batch_size=batch_size,
        seq_len=seq_len,
        hidden_dim=embedding_trainer.params.hidden_size,
        use_gpu=use_gpu,
    )
    embed_env = StateEmbedEnvironment(
        gym_env=env,
        mdnrnn=embedding_trainer.memory_network,
        max_embed_seq_len=seq_len,
        state_min_value=state_min,
        state_max_value=state_max,
    )
    agent_manager = train_model.value
    agent_trainer = agent_manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=build_normalizer(embed_env),
    )
    device = "cuda" if use_gpu else "cpu"
    agent_trainer_preprocessor = make_replay_buffer_trainer_preprocessor(
        agent_trainer, device, env
    )
    num_batch_per_epoch = embed_rb.size // batch_size
    for epoch in range(num_agent_train_epochs):
        for _ in tqdm(range(num_batch_per_epoch), desc=f"epoch {epoch}"):
            batch = embed_rb.sample_transition_batch_tensor(batch_size=batch_size)
            preprocessed_batch = agent_trainer_preprocessor(batch)
            agent_trainer.train(preprocessed_batch)

    # evaluate model
    rewards = []
    policy = agent_manager.create_policy(serving=False)
    agent = Agent.create_for_env(embed_env, policy=policy, device=device)
    for i in range(num_agent_eval_epochs):
        ep_reward = run_episode(env=embed_env, agent=agent)
        rewards.append(ep_reward)
        logger.info(f"Finished eval episode {i} with reward {ep_reward}.")
    logger.info(f"Average eval reward is {np.mean(rewards)}.")
    assert (
        np.mean(rewards) >= passing_score_bar
    ), f"average reward doesn't pass our bar {passing_score_bar}"
    return rewards
def run_test(
    env_name: str,
    model: ModelManager__Union,
    replay_memory_size: int,
    train_every_ts: int,
    train_after_ts: int,
    num_train_episodes: int,
    max_steps: Optional[int],
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
):
    env = EnvFactory.make(env_name)
    env.seed(SEED)
    env.action_space.seed(SEED)
    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )
    training_policy = manager.create_policy(serving=False)

    replay_buffer = ReplayBuffer.create_from_env(
        env=env,
        replay_memory_size=replay_memory_size,
        batch_size=trainer.minibatch_size,
    )

    device = torch.device("cuda") if use_gpu else None
    # first fill the replay buffer to burn_in
    train_after_ts = max(train_after_ts, trainer.minibatch_size)
    fill_replay_buffer(
        env=env, replay_buffer=replay_buffer, desired_size=train_after_ts
    )

    post_step = train_with_replay_buffer_post_step(
        replay_buffer=replay_buffer,
        env=env,
        trainer=trainer,
        training_freq=train_every_ts,
        batch_size=trainer.minibatch_size,
        device=device,
    )

    agent = Agent.create_for_env(
        env,
        policy=training_policy,
        post_transition_callback=post_step,
        # pyre-fixme[6]: Expected `Union[str, torch.device]` for 4th param but got
        #  `Optional[torch.device]`.
        device=device,
    )

    writer = SummaryWriter()
    with summary_writer_context(writer):
        train_rewards = []
        for i in range(num_train_episodes):
            trajectory = run_episode(
                env=env, agent=agent, mdp_id=i, max_steps=max_steps
            )
            ep_reward = trajectory.calculate_cumulative_reward()
            train_rewards.append(ep_reward)
            logger.info(f"Finished training episode {i} with reward {ep_reward}.")

    assert train_rewards[-1] >= passing_score_bar, (
        f"reward after {len(train_rewards)} episodes is {train_rewards[-1]}, "
        f"less than {passing_score_bar}...\n"
        f"Full reward history: {train_rewards}"
    )

    logger.info("============Train rewards=============")
    logger.info(train_rewards)

    serving_policy = manager.create_policy(serving=True)
    agent = Agent.create_for_env_with_serving_policy(env, serving_policy)

    eval_rewards = evaluate_for_n_episodes(
        n=num_eval_episodes, env=env, agent=agent, max_steps=max_steps
    ).squeeze(1)
    assert np.mean(eval_rewards) >= passing_score_bar, (
        f"Predictor reward is {np.mean(eval_rewards)}, "
        f"less than {passing_score_bar}...\n"
        f"Full eval rewards: {eval_rewards}."
    )

    logger.info("============Eval rewards==============")
    logger.info(eval_rewards)
def train_mdnrnn_and_train_on_embedded_env(
    env_name: str,
    embedding_model: ModelManager__Union,
    num_embedding_train_transitions: int,
    seq_len: int,
    batch_size: int,
    num_embedding_train_epochs: int,
    train_model: ModelManager__Union,
    num_state_embed_transitions: int,
    num_agent_train_epochs: int,
    num_agent_eval_epochs: int,
    use_gpu: bool,
    passing_score_bar: float,
    # pyre-fixme[9]: saved_mdnrnn_path has type `str`; used as `None`.
    saved_mdnrnn_path: str = None,
):
    """
    Train an agent on states embedded by the MDNRNN.
    """
    env = Gym(env_name=env_name)
    env.seed(SEED)

    embedding_manager = embedding_model.value
    embedding_trainer = embedding_manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=build_normalizer(env),
    )

    device = "cuda" if use_gpu else "cpu"
    embedding_trainer_preprocessor = make_replay_buffer_trainer_preprocessor(
        embedding_trainer,
        # pyre-fixme[6]: Expected `device` for 2nd param but got `str`.
        device,
        env,
    )
    if saved_mdnrnn_path is None:
        # train from scratch
        embedding_trainer = train_mdnrnn(
            env=env,
            trainer=embedding_trainer,
            trainer_preprocessor=embedding_trainer_preprocessor,
            num_train_transitions=num_embedding_train_transitions,
            seq_len=seq_len,
            batch_size=batch_size,
            num_train_epochs=num_embedding_train_epochs,
        )
    else:
        # load a pretrained model, and just evaluate it
        embedding_trainer.memory_network.mdnrnn.load_state_dict(
            torch.load(saved_mdnrnn_path)
        )

    # create embedding dataset
    embed_rb, state_min, state_max = create_embed_rl_dataset(
        env=env,
        memory_network=embedding_trainer.memory_network,
        num_state_embed_transitions=num_state_embed_transitions,
        batch_size=batch_size,
        seq_len=seq_len,
        hidden_dim=embedding_trainer.params.hidden_size,
        use_gpu=use_gpu,
    )
    embed_env = StateEmbedEnvironment(
        gym_env=env,
        mdnrnn=embedding_trainer.memory_network,
        max_embed_seq_len=seq_len,
        state_min_value=state_min,
        state_max_value=state_max,
    )
    agent_manager = train_model.value
    agent_trainer = agent_manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        # pyre-fixme[6]: Expected `EnvWrapper` for 1st param but got
        #  `StateEmbedEnvironment`.
        normalization_data_map=build_normalizer(embed_env),
    )
    device = "cuda" if use_gpu else "cpu"
    agent_trainer_preprocessor = make_replay_buffer_trainer_preprocessor(
        agent_trainer,
        # pyre-fixme[6]: Expected `device` for 2nd param but got `str`.
        device,
        env,
    )
    num_batch_per_epoch = embed_rb.size // batch_size
    # FIXME: This has to be wrapped in a dataloader
    for epoch in range(num_agent_train_epochs):
        for _ in tqdm(range(num_batch_per_epoch), desc=f"epoch {epoch}"):
            batch = embed_rb.sample_transition_batch(batch_size=batch_size)
            preprocessed_batch = agent_trainer_preprocessor(batch)
            # FIXME: This should be fitted with Lightning's trainer
            agent_trainer.train(preprocessed_batch)

    # evaluate model
    rewards = []
    policy = agent_manager.create_policy(serving=False)
    # pyre-fixme[6]: Expected `EnvWrapper` for 1st param but got
    #  `StateEmbedEnvironment`.
    agent = Agent.create_for_env(embed_env, policy=policy, device=device)
    # num_processes=1 is needed to avoid workers dying on CircleCI tests
    rewards = evaluate_for_n_episodes(
        n=num_agent_eval_epochs,
        # pyre-fixme[6]: Expected `EnvWrapper` for 2nd param but got
        #  `StateEmbedEnvironment`.
        env=embed_env,
        agent=agent,
        num_processes=1,
    )
    assert (
        np.mean(rewards) >= passing_score_bar
    ), f"average reward doesn't pass our bar {passing_score_bar}"
    return rewards
def run_test(
    env: Env__Union,
    model: ModelManager__Union,
    replay_memory_size: int,
    train_every_ts: int,
    train_after_ts: int,
    num_train_episodes: int,
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
    minibatch_size: Optional[int] = None,
):
    env = env.value
    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )
    training_policy = manager.create_policy(serving=False)

    # pyre-fixme[16]: Module `pl` has no attribute `LightningModule`.
    if not isinstance(trainer, pl.LightningModule):
        if minibatch_size is None:
            minibatch_size = trainer.minibatch_size
        assert minibatch_size == trainer.minibatch_size

    assert minibatch_size is not None
    replay_buffer = ReplayBuffer(
        replay_capacity=replay_memory_size, batch_size=minibatch_size
    )

    device = torch.device("cuda") if use_gpu else torch.device("cpu")
    # first fill the replay buffer to burn_in
    train_after_ts = max(train_after_ts, minibatch_size)
    fill_replay_buffer(
        env=env, replay_buffer=replay_buffer, desired_size=train_after_ts
    )

    # pyre-fixme[16]: Module `pl` has no attribute `LightningModule`.
    if isinstance(trainer, pl.LightningModule):
        agent = Agent.create_for_env(env, policy=training_policy)
        # TODO: Simplify this setup by creating LightningDataModule
        dataset = ReplayBufferDataset.create_for_trainer(
            trainer,
            env,
            agent,
            replay_buffer,
            batch_size=minibatch_size,
            training_frequency=train_every_ts,
            num_episodes=num_train_episodes,
            max_steps=200,
        )
        data_loader = torch.utils.data.DataLoader(dataset, collate_fn=identity_collate)
        # pyre-fixme[16]: Module `pl` has no attribute `Trainer`.
        pl_trainer = pl.Trainer(max_epochs=1, gpus=int(use_gpu))
        pl_trainer.fit(trainer, data_loader)

        # TODO: Also check train_reward
    else:
        post_step = train_with_replay_buffer_post_step(
            replay_buffer=replay_buffer,
            env=env,
            trainer=trainer,
            training_freq=train_every_ts,
            batch_size=trainer.minibatch_size,
            device=device,
        )
        env.seed(SEED)
        env.action_space.seed(SEED)

        train_rewards = train_policy(
            env,
            training_policy,
            num_train_episodes,
            post_step=post_step,
            post_episode=None,
            use_gpu=use_gpu,
        )

        # Check whether the max score passed the score bar; since we explore
        # during training, the return could be bad (leading to flakiness in
        # C51 and QRDQN).
        assert np.max(train_rewards) >= passing_score_bar, (
            f"max reward ({np.max(train_rewards)}) after training for "
            f"{len(train_rewards)} episodes is less than {passing_score_bar}.\n"
        )

    serving_policy = manager.create_policy(serving=True)

    eval_rewards = eval_policy(env, serving_policy, num_eval_episodes, serving=True)
    assert (
        eval_rewards.mean() >= passing_score_bar
    ), f"Eval reward is {eval_rewards.mean()}, less than {passing_score_bar}.\n"
def create_df_from_replay_buffer(
    env,
    problem_domain: ProblemDomain,
    desired_size: int,
    multi_steps: Optional[int],
    ds: str,
) -> pd.DataFrame:
    # fill the replay buffer
    set_seed(env, SEED)
    if multi_steps is None:
        update_horizon = 1
        return_as_timeline_format = False
    else:
        update_horizon = multi_steps
        return_as_timeline_format = True
    is_multi_steps = multi_steps is not None

    replay_buffer = ReplayBuffer(
        replay_capacity=desired_size,
        batch_size=1,
        update_horizon=update_horizon,
        return_as_timeline_format=return_as_timeline_format,
    )
    random_policy = make_random_policy_for_env(env)
    agent = Agent.create_for_env(env, policy=random_policy)
    fill_replay_buffer(env, replay_buffer, desired_size, agent)

    batch = replay_buffer.sample_all_valid_transitions()
    n = batch.state.shape[0]
    logger.info(f"Creating df of size {n}.")

    def discrete_feat_transform(elem) -> str:
        """query data expects str format"""
        return str(elem.item())

    def continuous_feat_transform(elem: List[float]) -> Dict[int, float]:
        """query data expects sparse format"""
        assert isinstance(elem, torch.Tensor), f"{type(elem)} isn't a tensor"
        assert len(elem.shape) == 1, f"{elem.shape} isn't 1-dimensional"
        return {i: s.item() for i, s in enumerate(elem)}

    def make_parametric_feat_transform(one_hot_dim: int):
        """one-hot and then continuous_feat_transform"""

        def transform(elem) -> Dict[int, float]:
            elem_tensor = torch.tensor(elem.item())
            one_hot_feat = F.one_hot(elem_tensor, one_hot_dim).float()
            return continuous_feat_transform(one_hot_feat)

        return transform

    state_features = feature_transform(batch.state, continuous_feat_transform)
    next_state_features = feature_transform(
        batch.next_state,
        continuous_feat_transform,
        is_next_with_multi_steps=is_multi_steps,
    )

    if problem_domain == ProblemDomain.DISCRETE_ACTION:
        # discrete action is str
        action = feature_transform(batch.action, discrete_feat_transform)
        next_action = feature_transform(
            batch.next_action,
            discrete_feat_transform,
            is_next_with_multi_steps=is_multi_steps,
            replace_when_terminal="",
            terminal=batch.terminal,
        )
    elif problem_domain == ProblemDomain.PARAMETRIC_ACTION:
        # continuous action is Dict[int, double]
        assert isinstance(env.action_space, gym.spaces.Discrete)
        parametric_feat_transform = make_parametric_feat_transform(env.action_space.n)
        action = feature_transform(batch.action, parametric_feat_transform)
        next_action = feature_transform(
            batch.next_action,
            parametric_feat_transform,
            is_next_with_multi_steps=is_multi_steps,
            replace_when_terminal={},
            terminal=batch.terminal,
        )
    elif problem_domain == ProblemDomain.CONTINUOUS_ACTION:
        action = feature_transform(batch.action, continuous_feat_transform)
        next_action = feature_transform(
            batch.next_action,
            continuous_feat_transform,
            is_next_with_multi_steps=is_multi_steps,
            replace_when_terminal={},
            terminal=batch.terminal,
        )
    elif problem_domain == ProblemDomain.MDN_RNN:
        action = feature_transform(batch.action, discrete_feat_transform)
        assert multi_steps is not None
        next_action = feature_transform(
            batch.next_action,
            discrete_feat_transform,
            is_next_with_multi_steps=True,
            replace_when_terminal="",
            terminal=batch.terminal,
        )
    else:
        raise NotImplementedError(f"model type: {problem_domain}.")

    if multi_steps is None:
        time_diff = [1] * n
        reward = batch.reward.squeeze(1).tolist()
        metrics = [{"reward": r} for r in reward]
    else:
        time_diff = [[1] * len(ns) for ns in next_state_features]
        reward = [reward_list.tolist() for reward_list in batch.reward]
        metrics = [
            [{"reward": r.item()} for r in reward_list] for reward_list in batch.reward
        ]

    # TODO(T67265031): change this to int
    mdp_id = [str(i.item()) for i in batch.mdp_id]
    sequence_number = batch.sequence_number.squeeze(1).tolist()
    # in the product data, all sequence_number_ordinal start from 1,
    # so to be consistent with the product data:
    sequence_number_ordinal = (batch.sequence_number.squeeze(1) + 1).tolist()
    action_probability = batch.log_prob.exp().squeeze(1).tolist()
    df_dict = {
        "state_features": state_features,
        "next_state_features": next_state_features,
        "action": action,
        "next_action": next_action,
        "reward": reward,
        "action_probability": action_probability,
        "metrics": metrics,
        "time_diff": time_diff,
        "mdp_id": mdp_id,
        "sequence_number": sequence_number,
        "sequence_number_ordinal": sequence_number_ordinal,
        "ds": [ds] * n,
    }

    if problem_domain == ProblemDomain.PARAMETRIC_ACTION:
        # Possible actions are List[Dict[int, float]]
        assert isinstance(env.action_space, gym.spaces.Discrete)
        possible_actions = [{i: 1.0} for i in range(env.action_space.n)]
    elif problem_domain == ProblemDomain.DISCRETE_ACTION:
        # Possible actions are List[str]
        assert isinstance(env.action_space, gym.spaces.Discrete)
        possible_actions = [str(i) for i in range(env.action_space.n)]
    elif problem_domain == ProblemDomain.MDN_RNN:
        # Possible actions are List[str]
        assert isinstance(env.action_space, gym.spaces.Discrete)
        possible_actions = [str(i) for i in range(env.action_space.n)]

    # these are fillers, which should have the correct shape
    pa_features = range(n)
    pna_features = time_diff
    if problem_domain in (
        ProblemDomain.DISCRETE_ACTION,
        ProblemDomain.PARAMETRIC_ACTION,
        ProblemDomain.MDN_RNN,
    ):

        def pa_transform(x):
            return possible_actions

        df_dict["possible_actions"] = feature_transform(pa_features, pa_transform)
        df_dict["possible_next_actions"] = feature_transform(
            pna_features,
            pa_transform,
            is_next_with_multi_steps=is_multi_steps,
            replace_when_terminal=[],
            terminal=batch.terminal,
        )

    df = pd.DataFrame(df_dict)
    # validate df
    validate_mdp_ids_seq_nums(df)
    # shuffle the rows (sample the whole batch)
    df = df.reindex(np.random.permutation(df.index))
    return df
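# Self-contained sketch of the sparse feature format produced by
# continuous_feat_transform inside create_df_from_replay_buffer: a 1-d tensor
# becomes a {feature_id: value} dict. The tensor values are illustrative.
def _example_sparse_feature_format() -> Dict[int, float]:
    elem = torch.tensor([0.5, -1.0, 2.0])
    sparse = {i: s.item() for i, s in enumerate(elem)}
    assert sparse == {0: 0.5, 1: -1.0, 2: 2.0}
    return sparse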
def run_test_online_episode(
    env: Env__Union,
    model: ModelManager__Union,
    num_train_episodes: int,
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
):
    """
    Run an online learning test. At the end of each episode, training is run
    on the trajectory.
    """
    env = env.value
    # pyre-fixme[16]: Module `pl` has no attribute `seed_everything`.
    pl.seed_everything(SEED)
    env.seed(SEED)
    env.action_space.seed(SEED)
    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )
    policy = manager.create_policy(serving=False)

    device = torch.device("cuda") if use_gpu else torch.device("cpu")

    agent = Agent.create_for_env(env, policy, device=device)

    # pyre-fixme[16]: Module `pl` has no attribute `LightningModule`.
    if isinstance(trainer, pl.LightningModule):
        # pyre-fixme[16]: Module `pl` has no attribute `Trainer`.
        pl_trainer = pl.Trainer(max_epochs=1, gpus=int(use_gpu), deterministic=True)
        dataset = EpisodicDataset(
            env=env, agent=agent, num_episodes=num_train_episodes, seed=SEED
        )
        pl_trainer.fit(trainer, dataset)
    else:
        post_episode_callback = train_post_episode(env, trainer, use_gpu)
        _ = train_policy(
            env,
            policy,
            num_train_episodes,
            post_step=None,
            post_episode=post_episode_callback,
            use_gpu=use_gpu,
        )

    eval_rewards = evaluate_for_n_episodes(
        n=num_eval_episodes,
        env=env,
        agent=agent,
        max_steps=env.max_steps,
        num_processes=1,
    ).squeeze(1)
    assert (
        eval_rewards.mean() >= passing_score_bar
    ), f"Eval reward is {eval_rewards.mean()}, less than {passing_score_bar}.\n"
def run_test_replay_buffer(
    env: Env__Union,
    model: ModelManager__Union,
    replay_memory_size: int,
    train_every_ts: int,
    train_after_ts: int,
    num_train_episodes: int,
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
    minibatch_size: Optional[int] = None,
):
    """
    Run an online learning test with a replay buffer. The replay buffer is
    pre-filled, then the training starts. Each transition is added to the
    replay buffer immediately after it takes place.
    """
    env = env.value
    # pyre-fixme[16]: Module `pl` has no attribute `seed_everything`.
    pl.seed_everything(SEED)
    env.seed(SEED)
    env.action_space.seed(SEED)
    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )
    training_policy = manager.create_policy(serving=False)

    # pyre-fixme[16]: Module `pl` has no attribute `LightningModule`.
    if not isinstance(trainer, pl.LightningModule):
        if minibatch_size is None:
            minibatch_size = trainer.minibatch_size
        assert minibatch_size == trainer.minibatch_size

    assert minibatch_size is not None
    replay_buffer = ReplayBuffer(
        replay_capacity=replay_memory_size, batch_size=minibatch_size
    )

    device = torch.device("cuda") if use_gpu else torch.device("cpu")
    # first fill the replay buffer using the random policy
    train_after_ts = max(train_after_ts, minibatch_size)
    fill_replay_buffer(
        env=env, replay_buffer=replay_buffer, desired_size=train_after_ts
    )

    agent = Agent.create_for_env(env, policy=training_policy, device=device)
    # TODO: Simplify this setup by creating LightningDataModule
    dataset = ReplayBufferDataset.create_for_trainer(
        trainer,
        env,
        agent,
        replay_buffer,
        batch_size=minibatch_size,
        training_frequency=train_every_ts,
        num_episodes=num_train_episodes,
        max_steps=200,
        device=device,
    )
    data_loader = torch.utils.data.DataLoader(dataset, collate_fn=identity_collate)
    # pyre-fixme[16]: Module `pl` has no attribute `Trainer`.
    pl_trainer = pl.Trainer(max_epochs=1, gpus=int(use_gpu))
    # Note: the fit() function below also evaluates the agent along the way
    # and adds the new transitions to the replay buffer, so it is training
    # on an incrementally larger and larger buffer.
    pl_trainer.fit(trainer, data_loader)

    # TODO: Also check train_reward

    serving_policy = manager.create_policy(serving=True)

    eval_rewards = eval_policy(env, serving_policy, num_eval_episodes, serving=True)
    assert (
        eval_rewards.mean() >= passing_score_bar
    ), f"Eval reward is {eval_rewards.mean()}, less than {passing_score_bar}.\n"
def run_test(
    env: str,
    model: ModelManager__Union,
    replay_memory_size: int,
    train_every_ts: int,
    train_after_ts: int,
    num_train_episodes: int,
    max_steps: Optional[int],
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
):
    env = EnvFactory.make(env)
    env.seed(SEED)
    env.action_space.seed(SEED)
    normalization = build_normalizer(env)
    logger.info(f"Normalization is {normalization}")

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )

    replay_buffer = ReplayBuffer.create_from_env(
        env=env,
        replay_memory_size=replay_memory_size,
        batch_size=trainer.minibatch_size,
    )

    device = torch.device("cuda") if use_gpu else None
    post_step = train_with_replay_buffer_post_step(
        replay_buffer=replay_buffer,
        trainer=trainer,
        training_freq=train_every_ts,
        batch_size=trainer.minibatch_size,
        replay_burnin=train_after_ts,
        device=device,
    )

    training_policy = manager.create_policy(serving=False)
    agent = Agent.create_for_env(
        env, policy=training_policy, post_transition_callback=post_step, device=device
    )

    train_rewards = []
    for i in range(num_train_episodes):
        ep_reward = run_episode(env=env, agent=agent, max_steps=max_steps)
        train_rewards.append(ep_reward)
        logger.info(f"Finished training episode {i} with reward {ep_reward}.")

    assert train_rewards[-1] >= passing_score_bar, (
        f"reward after {len(train_rewards)} episodes is {train_rewards[-1]}, "
        f"less than {passing_score_bar}...\n"
        f"Full reward history: {train_rewards}"
    )

    logger.info("============Train rewards=============")
    logger.info(train_rewards)

    def gym_to_reagent_serving(obs: np.ndarray) -> Tuple[torch.Tensor, torch.Tensor]:
        obs_tensor = torch.tensor(obs).float().unsqueeze(0)
        presence_tensor = torch.ones_like(obs_tensor)
        return (obs_tensor, presence_tensor)

    serving_policy = manager.create_policy(serving=True)
    agent = Agent.create_for_env(
        env, policy=serving_policy, obs_preprocessor=gym_to_reagent_serving
    )

    eval_rewards = []
    for i in range(num_eval_episodes):
        ep_reward = run_episode(env=env, agent=agent, max_steps=max_steps)
        eval_rewards.append(ep_reward)
        logger.info(f"Finished eval episode {i} with reward {ep_reward}.")

    assert np.mean(eval_rewards) >= passing_score_bar, (
        f"Predictor reward is {np.mean(eval_rewards)}, "
        f"less than {passing_score_bar}...\n"
        f"Full eval rewards: {eval_rewards}."
    )

    logger.info("============Eval rewards==============")
    logger.info(eval_rewards)
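# Self-contained sketch of what gym_to_reagent_serving in run_test produces:
# the serving-side input is a (value, presence) tensor pair with a leading
# batch dimension. The observation below is illustrative.
def _example_serving_obs_format() -> Tuple[torch.Tensor, torch.Tensor]:
    obs = np.array([0.1, -0.2, 0.3, 0.0])                # e.g. a CartPole observation
    obs_tensor = torch.tensor(obs).float().unsqueeze(0)  # shape (1, 4)
    presence_tensor = torch.ones_like(obs_tensor)        # all features present
    return obs_tensor, presence_tensor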
def train_mdnrnn_and_compute_feature_stats(
    env_name: str,
    model: ModelManager__Union,
    num_train_transitions: int,
    num_test_transitions: int,
    seq_len: int,
    batch_size: int,
    num_train_epochs: int,
    use_gpu: bool,
    saved_mdnrnn_path: Optional[str] = None,
):
    """Train MDNRNN Memory Network and compute feature importance/sensitivity."""
    env: gym.Env = Gym(env_name=env_name)
    env.seed(SEED)

    manager = model.value
    trainer = manager.build_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=build_normalizer(env),
    )

    device = "cuda" if use_gpu else "cpu"
    # pyre-fixme[6]: Expected `device` for 2nd param but got `str`.
    trainer_preprocessor = make_replay_buffer_trainer_preprocessor(trainer, device, env)
    test_replay_buffer = ReplayBuffer(
        replay_capacity=num_test_transitions,
        batch_size=batch_size,
        stack_size=seq_len,
        return_everything_as_stack=True,
    )
    random_policy = make_random_policy_for_env(env)
    agent = Agent.create_for_env(env, policy=random_policy)
    fill_replay_buffer(env, test_replay_buffer, num_test_transitions, agent)

    if saved_mdnrnn_path is None:
        # train from scratch
        trainer = train_mdnrnn(
            env=env,
            trainer=trainer,
            trainer_preprocessor=trainer_preprocessor,
            num_train_transitions=num_train_transitions,
            seq_len=seq_len,
            batch_size=batch_size,
            num_train_epochs=num_train_epochs,
            test_replay_buffer=test_replay_buffer,
        )
    else:
        # load a pretrained model, and just evaluate it
        trainer.memory_network.mdnrnn.load_state_dict(torch.load(saved_mdnrnn_path))

    with torch.no_grad():
        trainer.memory_network.mdnrnn.eval()
        test_batch = test_replay_buffer.sample_transition_batch(
            batch_size=test_replay_buffer.size
        )
        preprocessed_test_batch = trainer_preprocessor(test_batch)
        feature_importance = calculate_feature_importance(
            env=env,
            trainer=trainer,
            use_gpu=use_gpu,
            test_batch=preprocessed_test_batch,
        )
        feature_sensitivity = calculate_feature_sensitivity(
            env=env,
            trainer=trainer,
            use_gpu=use_gpu,
            test_batch=preprocessed_test_batch,
        )
        trainer.memory_network.mdnrnn.train()
    return feature_importance, feature_sensitivity
def run_test(
    env_name: str,
    model: ModelManager__Union,
    replay_memory_size: int,
    train_every_ts: int,
    train_after_ts: int,
    num_train_episodes: int,
    max_steps: Optional[int],
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
):
    env = EnvFactory.make(env_name)
    env.seed(SEED)
    env.action_space.seed(SEED)
    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    try:
        # pyre-fixme[16]: `Env` has no attribute `state_feature_config_provider`.
        manager.state_feature_config_provider = env.state_feature_config_provider
        logger.info(
            f"Using environment's state_feature_config_provider.\n"
            f"{manager.state_feature_config_provider}"
        )
    except AttributeError:
        logger.info("state_feature_config_provider override not applicable")

    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )
    training_policy = manager.create_policy(serving=False)

    replay_buffer = ReplayBuffer.create_from_env(
        env=env,
        replay_memory_size=replay_memory_size,
        batch_size=trainer.minibatch_size,
    )

    device = torch.device("cuda") if use_gpu else torch.device("cpu")
    # first fill the replay buffer to burn_in
    train_after_ts = max(train_after_ts, trainer.minibatch_size)
    fill_replay_buffer(
        env=env, replay_buffer=replay_buffer, desired_size=train_after_ts
    )

    post_step = train_with_replay_buffer_post_step(
        replay_buffer=replay_buffer,
        env=env,
        trainer=trainer,
        training_freq=train_every_ts,
        batch_size=trainer.minibatch_size,
        device=device,
    )

    agent = Agent.create_for_env(
        env, policy=training_policy, post_transition_callback=post_step, device=device
    )

    writer = SummaryWriter()
    with summary_writer_context(writer):
        train_rewards = []
        for i in range(num_train_episodes):
            trajectory = run_episode(
                env=env, agent=agent, mdp_id=i, max_steps=max_steps
            )
            ep_reward = trajectory.calculate_cumulative_reward()
            train_rewards.append(ep_reward)
            logger.info(
                f"Finished training episode {i} (len {len(trajectory)})"
                f" with reward {ep_reward}."
            )

    logger.info("============Train rewards=============")
    logger.info(train_rewards)
    logger.info(f"average: {np.mean(train_rewards)};\tmax: {np.max(train_rewards)}")

    # Check whether the max score passed the score bar; since we explore during
    # training, the return could be bad (leading to flakiness in C51 and QRDQN).
    assert np.max(train_rewards) >= passing_score_bar, (
        f"max reward ({np.max(train_rewards)}) after training for "
        f"{len(train_rewards)} episodes is less than {passing_score_bar}.\n"
    )

    serving_policy = manager.create_policy(serving=True)
    agent = Agent.create_for_env_with_serving_policy(env, serving_policy)

    eval_rewards = evaluate_for_n_episodes(
        n=num_eval_episodes, env=env, agent=agent, max_steps=max_steps
    ).squeeze(1)
    logger.info("============Eval rewards==============")
    logger.info(eval_rewards)
    logger.info(f"average: {np.mean(eval_rewards)};\tmax: {np.max(eval_rewards)}")
    assert np.mean(eval_rewards) >= passing_score_bar, (
        f"Predictor reward is {np.mean(eval_rewards)}, "
        f"less than {passing_score_bar}.\n"
    )
def run_test_offline(
    env_name: str,
    model: ModelManager__Union,
    replay_memory_size: int,
    num_batches_per_epoch: int,
    num_train_epochs: int,
    passing_score_bar: float,
    num_eval_episodes: int,
    minibatch_size: int,
    use_gpu: bool,
):
    env = Gym(env_name=env_name)
    env.seed(SEED)
    env.action_space.seed(SEED)
    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    trainer = manager.build_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )

    # first fill the replay buffer to burn_in
    replay_buffer = ReplayBuffer(
        replay_capacity=replay_memory_size, batch_size=minibatch_size
    )
    # always fill full RB
    random_policy = make_random_policy_for_env(env)
    agent = Agent.create_for_env(env, policy=random_policy)
    fill_replay_buffer(
        env=env,
        replay_buffer=replay_buffer,
        desired_size=replay_memory_size,
        agent=agent,
    )

    device = torch.device("cuda") if use_gpu else None
    dataset = OfflineReplayBufferDataset.create_for_trainer(
        trainer,
        env,
        replay_buffer,
        batch_size=minibatch_size,
        num_batches=num_batches_per_epoch,
        device=device,
    )
    data_loader = torch.utils.data.DataLoader(dataset, collate_fn=identity_collate)
    pl_trainer = pl.Trainer(
        max_epochs=num_train_epochs,
        gpus=int(use_gpu),
        deterministic=True,
        default_root_dir=f"lightning_log_{str(uuid.uuid4())}",
    )
    pl_trainer.fit(trainer, data_loader)

    logger.info(f"Evaluating after training for {num_train_epochs} epochs: ")
    # evaluate_cem builds its own (non-serving) policy from the manager;
    # see its definition above.
    eval_rewards = evaluate_cem(env, manager, num_eval_episodes)
    mean_rewards = np.mean(eval_rewards)
    assert (
        mean_rewards >= passing_score_bar
    ), f"{mean_rewards} doesn't pass the bar {passing_score_bar}."