def evaluate_cem(env, manager, num_eval_episodes: int):
    # NOTE: for CEM, serving isn't implemented
    policy = manager.create_policy(serving=False)
    agent = Agent.create_for_env(env, policy)
    return evaluate_for_n_episodes(
        n=num_eval_episodes, env=env, agent=agent, max_steps=env.max_steps
    )
def eval_policy(
    env: EnvWrapper,
    serving_policy: Policy,
    num_eval_episodes: int,
    serving: bool = True,
) -> np.ndarray:
    agent = (
        Agent.create_for_env_with_serving_policy(env, serving_policy)
        if serving
        else Agent.create_for_env(env, serving_policy)
    )

    eval_rewards = evaluate_for_n_episodes(
        n=num_eval_episodes,
        env=env,
        agent=agent,
        max_steps=env.max_steps,
        num_processes=1,
    ).squeeze(1)

    logger.info("============Eval rewards==============")
    logger.info(eval_rewards)
    mean_eval = np.mean(eval_rewards)
    logger.info(f"average: {mean_eval};\tmax: {np.max(eval_rewards)}")
    # eval_rewards is already an ndarray after squeeze(1)
    return eval_rewards
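# A minimal usage sketch for eval_policy, assuming `manager` is an already-trained
# ModelManager exposing create_policy(serving=...) as in run_test below; the env id
# and episode count are illustrative.
def _example_eval_policy_usage(manager) -> None:
    env = Gym(env_name="CartPole-v0")
    policy = manager.create_policy(serving=False)
    # evaluate_for_n_episodes returns shape (n, 1), so eval_policy yields shape (n,)
    rewards = eval_policy(env, policy, num_eval_episodes=20, serving=False)
    assert rewards.shape == (20,)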
def evaluate_gym(
    env_name: str,
    model: ModelManager__Union,
    publisher: ModelPublisher__Union,
    num_eval_episodes: int,
    passing_score_bar: float,
    max_steps: Optional[int] = None,
):
    publisher_manager = publisher.value
    assert isinstance(
        publisher_manager, FileSystemPublisher
    ), f"publishing manager is type {type(publisher_manager)}, not FileSystemPublisher"

    env = Gym(env_name=env_name)
    # load the most recently published TorchScript model and wrap it in a policy
    torchscript_path = publisher_manager.get_latest_published_model(model.value)
    jit_model = torch.jit.load(torchscript_path)
    policy = create_predictor_policy_from_model(jit_model)
    agent = Agent.create_for_env_with_serving_policy(env, policy)

    rewards = evaluate_for_n_episodes(
        n=num_eval_episodes, env=env, agent=agent, max_steps=max_steps
    )
    avg_reward = np.mean(rewards)
    logger.info(
        f"Average reward over {num_eval_episodes} episodes is {avg_reward}.\n"
        f"List of rewards: {rewards}"
    )
    assert (
        avg_reward >= passing_score_bar
    ), f"{avg_reward} fails to pass the bar of {passing_score_bar}!"
def run_test_online_episode(
    env: Env__Union,
    model: ModelManager__Union,
    num_train_episodes: int,
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
):
    """
    Run an online learning test. At the end of each episode, training is run on
    the trajectory.
    """
    env = env.value
    pl.seed_everything(SEED)
    env.seed(SEED)
    env.action_space.seed(SEED)

    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    trainer = manager.build_trainer(
        use_gpu=use_gpu,
        normalization_data_map=normalization,
    )
    policy = manager.create_policy(trainer, serving=False)

    device = torch.device("cuda") if use_gpu else torch.device("cpu")

    agent = Agent.create_for_env(env, policy, device=device)

    pl_trainer = pl.Trainer(
        max_epochs=1,
        gpus=int(use_gpu),
        deterministic=True,
        default_root_dir=f"lightning_log_{str(uuid.uuid4())}",
    )
    dataset = EpisodicDataset(
        env=env, agent=agent, num_episodes=num_train_episodes, seed=SEED
    )
    data_loader = torch.utils.data.DataLoader(dataset, collate_fn=identity_collate)
    pl_trainer.fit(trainer, data_loader)

    eval_rewards = evaluate_for_n_episodes(
        n=num_eval_episodes,
        env=env,
        agent=agent,
        max_steps=env.max_steps,
        num_processes=1,
    ).squeeze(1)
    assert (
        eval_rewards.mean() >= passing_score_bar
    ), f"Eval reward is {eval_rewards.mean()}, less than {passing_score_bar}.\n"
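# A minimal usage sketch for run_test_online_episode, assuming `env_union`
# (Env__Union) and `model` (ModelManager__Union) were parsed from a YAML config
# elsewhere; the score bar and episode counts are illustrative.
def _example_online_episode_usage(env_union, model) -> None:
    run_test_online_episode(
        env=env_union,
        model=model,
        num_train_episodes=100,
        passing_score_bar=100.0,
        num_eval_episodes=20,
        use_gpu=False,
    )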
def evaluate_gym(
    env_name: str,
    model: ModelManager__Union,
    publisher: ModelPublisher__Union,
    num_eval_episodes: int,
    passing_score_bar: float,
    module_name: str = "default_model",
    max_steps: Optional[int] = None,
):
    initialize_seed(1)
    env = Gym(env_name=env_name)
    agent = make_agent_from_model(env, model, publisher, module_name)

    rewards = evaluate_for_n_episodes(
        n=num_eval_episodes, env=env, agent=agent, max_steps=max_steps
    )
    avg_reward = np.mean(rewards)
    logger.info(
        f"Average reward over {num_eval_episodes} episodes is {avg_reward}.\n"
        f"List of rewards: {rewards}\n"
        f"Passing score bar: {passing_score_bar}"
    )
    assert (
        avg_reward >= passing_score_bar
    ), f"{avg_reward} fails to pass the bar of {passing_score_bar}!"
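# A minimal usage sketch for evaluate_gym, assuming `model` and `publisher` union
# instances were parsed from a YAML config elsewhere and that a TorchScript model
# has already been published for "default_model"; env id and bar are illustrative.
def _example_evaluate_gym_usage(model, publisher) -> None:
    evaluate_gym(
        env_name="CartPole-v0",
        model=model,
        publisher=publisher,
        num_eval_episodes=20,
        passing_score_bar=100.0,
        module_name="default_model",
    )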
def run_test(
    env_name: str,
    model: ModelManager__Union,
    replay_memory_size: int,
    train_every_ts: int,
    train_after_ts: int,
    num_train_episodes: int,
    max_steps: Optional[int],
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
):
    env = EnvFactory.make(env_name)
    env.seed(SEED)
    env.action_space.seed(SEED)

    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    try:
        # pyre-fixme[16]: `Env` has no attribute `state_feature_config_provider`.
        manager.state_feature_config_provider = env.state_feature_config_provider
        logger.info(
            f"Using environment's state_feature_config_provider.\n"
            f"{manager.state_feature_config_provider}"
        )
    except AttributeError:
        logger.info("state_feature_config_provider override not applicable")

    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )
    training_policy = manager.create_policy(serving=False)

    replay_buffer = ReplayBuffer.create_from_env(
        env=env,
        replay_memory_size=replay_memory_size,
        batch_size=trainer.minibatch_size,
    )

    device = torch.device("cuda") if use_gpu else torch.device("cpu")
    # first fill the replay buffer to burn_in
    train_after_ts = max(train_after_ts, trainer.minibatch_size)
    fill_replay_buffer(
        env=env, replay_buffer=replay_buffer, desired_size=train_after_ts
    )

    post_step = train_with_replay_buffer_post_step(
        replay_buffer=replay_buffer,
        env=env,
        trainer=trainer,
        training_freq=train_every_ts,
        batch_size=trainer.minibatch_size,
        device=device,
    )

    agent = Agent.create_for_env(
        env, policy=training_policy, post_transition_callback=post_step, device=device
    )

    writer = SummaryWriter()
    with summary_writer_context(writer):
        train_rewards = []
        for i in range(num_train_episodes):
            trajectory = run_episode(
                env=env, agent=agent, mdp_id=i, max_steps=max_steps
            )
            ep_reward = trajectory.calculate_cumulative_reward()
            train_rewards.append(ep_reward)
            logger.info(
                f"Finished training episode {i} (len {len(trajectory)})"
                f" with reward {ep_reward}."
            )

    logger.info("============Train rewards=============")
    logger.info(train_rewards)
    logger.info(f"average: {np.mean(train_rewards)};\tmax: {np.max(train_rewards)}")

    # Check whether the max score passed the score bar; because we explore during
    # training, the return could be bad (leading to flakiness in C51 and QRDQN).
    assert np.max(train_rewards) >= passing_score_bar, (
        f"max reward ({np.max(train_rewards)}) after training for "
        f"{len(train_rewards)} episodes is less than {passing_score_bar}.\n"
    )

    serving_policy = manager.create_policy(serving=True)
    agent = Agent.create_for_env_with_serving_policy(env, serving_policy)

    eval_rewards = evaluate_for_n_episodes(
        n=num_eval_episodes, env=env, agent=agent, max_steps=max_steps
    ).squeeze(1)
    logger.info("============Eval rewards==============")
    logger.info(eval_rewards)
    logger.info(f"average: {np.mean(eval_rewards)};\tmax: {np.max(eval_rewards)}")
    assert np.mean(eval_rewards) >= passing_score_bar, (
        f"Predictor reward is {np.mean(eval_rewards)}, "
        f"less than {passing_score_bar}.\n"
    )
def run_test(
    env_name: str,
    model: ModelManager__Union,
    replay_memory_size: int,
    train_every_ts: int,
    train_after_ts: int,
    num_train_episodes: int,
    max_steps: Optional[int],
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
):
    env = EnvFactory.make(env_name)
    env.seed(SEED)
    env.action_space.seed(SEED)

    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )
    training_policy = manager.create_policy(serving=False)

    replay_buffer = ReplayBuffer.create_from_env(
        env=env,
        replay_memory_size=replay_memory_size,
        batch_size=trainer.minibatch_size,
    )

    device = torch.device("cuda") if use_gpu else None
    # first fill the replay buffer to burn_in
    train_after_ts = max(train_after_ts, trainer.minibatch_size)
    fill_replay_buffer(
        env=env, replay_buffer=replay_buffer, desired_size=train_after_ts
    )

    post_step = train_with_replay_buffer_post_step(
        replay_buffer=replay_buffer,
        env=env,
        trainer=trainer,
        training_freq=train_every_ts,
        batch_size=trainer.minibatch_size,
        device=device,
    )

    agent = Agent.create_for_env(
        env,
        policy=training_policy,
        post_transition_callback=post_step,
        # pyre-fixme[6]: Expected `Union[str, torch.device]` for 4th param but got
        #  `Optional[torch.device]`.
        device=device,
    )

    writer = SummaryWriter()
    with summary_writer_context(writer):
        train_rewards = []
        for i in range(num_train_episodes):
            trajectory = run_episode(
                env=env, agent=agent, mdp_id=i, max_steps=max_steps
            )
            ep_reward = trajectory.calculate_cumulative_reward()
            train_rewards.append(ep_reward)
            logger.info(f"Finished training episode {i} with reward {ep_reward}.")

    assert train_rewards[-1] >= passing_score_bar, (
        f"reward after {len(train_rewards)} episodes is {train_rewards[-1]}, "
        f"less than {passing_score_bar}...\n"
        f"Full reward history: {train_rewards}"
    )

    logger.info("============Train rewards=============")
    logger.info(train_rewards)

    serving_policy = manager.create_policy(serving=True)
    agent = Agent.create_for_env_with_serving_policy(env, serving_policy)

    eval_rewards = evaluate_for_n_episodes(
        n=num_eval_episodes, env=env, agent=agent, max_steps=max_steps
    ).squeeze(1)
    assert np.mean(eval_rewards) >= passing_score_bar, (
        f"Predictor reward is {np.mean(eval_rewards)}, "
        f"less than {passing_score_bar}...\n"
        f"Full eval rewards: {eval_rewards}."
    )

    logger.info("============Eval rewards==============")
    logger.info(eval_rewards)
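# A minimal usage sketch for run_test, assuming `model` (ModelManager__Union) was
# parsed from a YAML config elsewhere; the hyperparameters are illustrative,
# roughly in the range of a CartPole DQN test.
def _example_run_test_usage(model) -> None:
    run_test(
        env_name="CartPole-v0",
        model=model,
        replay_memory_size=10000,
        train_every_ts=1,
        train_after_ts=1000,
        num_train_episodes=100,
        max_steps=200,
        passing_score_bar=100.0,
        num_eval_episodes=20,
        use_gpu=False,
    )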
def train_mdnrnn_and_train_on_embedded_env(
    env_name: str,
    embedding_model: ModelManager__Union,
    num_embedding_train_transitions: int,
    seq_len: int,
    batch_size: int,
    num_embedding_train_epochs: int,
    train_model: ModelManager__Union,
    num_state_embed_transitions: int,
    num_agent_train_epochs: int,
    num_agent_eval_epochs: int,
    use_gpu: bool,
    passing_score_bar: float,
    saved_mdnrnn_path: Optional[str] = None,
):
    """Train an agent on states embedded by the MDNRNN."""
    env = Gym(env_name=env_name)
    env.seed(SEED)

    embedding_manager = embedding_model.value
    embedding_trainer = embedding_manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=build_normalizer(env),
    )

    device = "cuda" if use_gpu else "cpu"
    embedding_trainer_preprocessor = make_replay_buffer_trainer_preprocessor(
        embedding_trainer,
        # pyre-fixme[6]: Expected `device` for 2nd param but got `str`.
        device,
        env,
    )
    if saved_mdnrnn_path is None:
        # train from scratch
        embedding_trainer = train_mdnrnn(
            env=env,
            trainer=embedding_trainer,
            trainer_preprocessor=embedding_trainer_preprocessor,
            num_train_transitions=num_embedding_train_transitions,
            seq_len=seq_len,
            batch_size=batch_size,
            num_train_epochs=num_embedding_train_epochs,
        )
    else:
        # load a pretrained model, and just evaluate it
        embedding_trainer.memory_network.mdnrnn.load_state_dict(
            torch.load(saved_mdnrnn_path)
        )

    # create embedding dataset
    embed_rb, state_min, state_max = create_embed_rl_dataset(
        env=env,
        memory_network=embedding_trainer.memory_network,
        num_state_embed_transitions=num_state_embed_transitions,
        batch_size=batch_size,
        seq_len=seq_len,
        hidden_dim=embedding_trainer.params.hidden_size,
        use_gpu=use_gpu,
    )
    embed_env = StateEmbedEnvironment(
        gym_env=env,
        mdnrnn=embedding_trainer.memory_network,
        max_embed_seq_len=seq_len,
        state_min_value=state_min,
        state_max_value=state_max,
    )

    agent_manager = train_model.value
    agent_trainer = agent_manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        # pyre-fixme[6]: Expected `EnvWrapper` for 1st param but got
        #  `StateEmbedEnvironment`.
        normalization_data_map=build_normalizer(embed_env),
    )
    agent_trainer_preprocessor = make_replay_buffer_trainer_preprocessor(
        agent_trainer,
        # pyre-fixme[6]: Expected `device` for 2nd param but got `str`.
        device,
        env,
    )
    num_batch_per_epoch = embed_rb.size // batch_size
    # FIXME: This has to be wrapped in a dataloader
    for epoch in range(num_agent_train_epochs):
        for _ in tqdm(range(num_batch_per_epoch), desc=f"epoch {epoch}"):
            batch = embed_rb.sample_transition_batch(batch_size=batch_size)
            preprocessed_batch = agent_trainer_preprocessor(batch)
            # FIXME: This should be fitted with Lightning's trainer
            agent_trainer.train(preprocessed_batch)

    # evaluate model
    policy = agent_manager.create_policy(serving=False)
    # pyre-fixme[6]: Expected `EnvWrapper` for 1st param but got
    #  `StateEmbedEnvironment`.
    agent = Agent.create_for_env(embed_env, policy=policy, device=device)
    # num_processes=1 needed to avoid workers from dying on CircleCI tests
    rewards = evaluate_for_n_episodes(
        n=num_agent_eval_epochs,
        # pyre-fixme[6]: Expected `EnvWrapper` for 2nd param but got
        #  `StateEmbedEnvironment`.
        env=embed_env,
        agent=agent,
        num_processes=1,
    )
    assert (
        np.mean(rewards) >= passing_score_bar
    ), f"Average reward {np.mean(rewards)} doesn't pass our bar {passing_score_bar}"
    return rewards
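# A minimal usage sketch for train_mdnrnn_and_train_on_embedded_env, assuming
# `embedding_model` and `train_model` (ModelManager__Union) were parsed from YAML
# configs elsewhere; the env id and every hyperparameter value are illustrative.
def _example_world_model_usage(embedding_model, train_model) -> None:
    rewards = train_mdnrnn_and_train_on_embedded_env(
        env_name="CartPole-v0",
        embedding_model=embedding_model,
        num_embedding_train_transitions=10000,
        seq_len=5,
        batch_size=1024,
        num_embedding_train_epochs=10,
        train_model=train_model,
        num_state_embed_transitions=10000,
        num_agent_train_epochs=10,
        num_agent_eval_epochs=5,
        use_gpu=False,
        passing_score_bar=100.0,
    )
    logger.info(f"Embedded-env eval rewards: {rewards}")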
def run_test_online_episode(
    env: Env__Union,
    model: ModelManager__Union,
    num_train_episodes: int,
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
):
    """
    Run an online learning test. At the end of each episode, training is run on
    the trajectory.
    """
    env = env.value
    # pyre-fixme[16]: Module `pl` has no attribute `seed_everything`.
    pl.seed_everything(SEED)
    env.seed(SEED)
    env.action_space.seed(SEED)

    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )
    policy = manager.create_policy(serving=False)

    device = torch.device("cuda") if use_gpu else torch.device("cpu")

    agent = Agent.create_for_env(env, policy, device=device)

    # pyre-fixme[16]: Module `pl` has no attribute `LightningModule`.
    if isinstance(trainer, pl.LightningModule):
        # pyre-fixme[16]: Module `pl` has no attribute `Trainer`.
        pl_trainer = pl.Trainer(max_epochs=1, gpus=int(use_gpu), deterministic=True)
        dataset = EpisodicDataset(
            env=env, agent=agent, num_episodes=num_train_episodes, seed=SEED
        )
        pl_trainer.fit(trainer, dataset)
    else:
        post_episode_callback = train_post_episode(env, trainer, use_gpu)
        _ = train_policy(
            env,
            policy,
            num_train_episodes,
            post_step=None,
            post_episode=post_episode_callback,
            use_gpu=use_gpu,
        )

    eval_rewards = evaluate_for_n_episodes(
        n=num_eval_episodes,
        env=env,
        agent=agent,
        max_steps=env.max_steps,
        num_processes=1,
    ).squeeze(1)
    assert (
        eval_rewards.mean() >= passing_score_bar
    ), f"Eval reward is {eval_rewards.mean()}, less than {passing_score_bar}.\n"