def evaluate_gym(
    env_name: str,
    model: ModelManager__Union,
    publisher: ModelPublisher__Union,
    num_eval_episodes: int,
    passing_score_bar: float,
    max_steps: Optional[int] = None,
):
    publisher_manager = publisher.value
    assert isinstance(
        publisher_manager, FileSystemPublisher
    ), f"publishing manager is type {type(publisher_manager)}, not FileSystemPublisher"

    env = Gym(env_name=env_name)
    torchscript_path = publisher_manager.get_latest_published_model(model.value)
    jit_model = torch.jit.load(torchscript_path)
    policy = create_predictor_policy_from_model(jit_model)
    agent = Agent.create_for_env_with_serving_policy(env, policy)
    rewards = evaluate_for_n_episodes(
        n=num_eval_episodes, env=env, agent=agent, max_steps=max_steps
    )
    avg_reward = np.mean(rewards)
    logger.info(
        f"Average reward over {num_eval_episodes} episodes is {avg_reward}.\n"
        f"List of rewards: {rewards}"
    )
    assert (
        avg_reward >= passing_score_bar
    ), f"{avg_reward} fails to pass the bar of {passing_score_bar}!"
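# A minimal usage sketch for evaluate_gym. The environment name, episode
# count, and score bar below are illustrative assumptions, not values from
# this repo; `model` and `publisher` are presumed to be union instances built
# elsewhere (e.g. parsed from a YAML test config), with the publisher having
# already published a TorchScript model for `model.value`.
def _example_evaluate_gym(
    model: ModelManager__Union, publisher: ModelPublisher__Union
) -> None:
    evaluate_gym(
        env_name="CartPole-v0",
        model=model,
        publisher=publisher,
        num_eval_episodes=20,
        passing_score_bar=100.0,
        max_steps=200,
    )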
def eval_policy(
    env: EnvWrapper,
    serving_policy: Policy,
    num_eval_episodes: int,
    serving: bool = True,
) -> np.ndarray:
    agent = (
        Agent.create_for_env_with_serving_policy(env, serving_policy)
        if serving
        else Agent.create_for_env(env, serving_policy)
    )

    eval_rewards = evaluate_for_n_episodes(
        n=num_eval_episodes,
        env=env,
        agent=agent,
        max_steps=env.max_steps,
        num_processes=1,
    ).squeeze(1)

    logger.info("============Eval rewards==============")
    logger.info(eval_rewards)
    mean_eval = np.mean(eval_rewards)
    logger.info(f"average: {mean_eval};\tmax: {np.max(eval_rewards)}")
    return np.array(eval_rewards)
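# A minimal usage sketch for eval_policy, assuming `env` and `policy` were
# constructed upstream (e.g. env = Gym(env_name=...) and
# policy = manager.create_policy(serving=True)). The episode count is an
# illustrative assumption.
def _example_eval_policy(env: EnvWrapper, policy: Policy) -> None:
    # serving=True wraps a TorchScript predictor policy; serving=False
    # evaluates a raw training policy directly.
    rewards = eval_policy(env, policy, num_eval_episodes=10, serving=True)
    # evaluate_for_n_episodes returns shape (n, 1); eval_policy squeezes
    # that to a flat (n,) array of per-episode rewards.
    assert rewards.shape == (10,)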
def make_agent_from_model(
    env: Gym,
    model: ModelManager__Union,
    publisher: ModelPublisher__Union,
    module_name: str,
):
    publisher_manager = publisher.value
    assert isinstance(
        publisher_manager, FileSystemPublisher
    ), f"publishing manager is type {type(publisher_manager)}, not FileSystemPublisher"
    module_names = model.value.serving_module_names()
    assert module_name in module_names, f"{module_name} not in {module_names}"

    torchscript_path = publisher_manager.get_latest_published_model(
        model.value, module_name
    )
    jit_model = torch.jit.load(torchscript_path)
    policy = create_predictor_policy_from_model(jit_model)
    agent = Agent.create_for_env_with_serving_policy(env, policy)
    return agent
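# A minimal usage sketch for make_agent_from_model. "default_model" is a
# hypothetical module name used for illustration only; valid names come from
# model.value.serving_module_names(), which the assert above checks against.
def _example_make_agent_from_model(
    env: Gym, model: ModelManager__Union, publisher: ModelPublisher__Union
) -> float:
    agent = make_agent_from_model(env, model, publisher, module_name="default_model")
    # Roll out one episode with the published TorchScript policy.
    trajectory = run_episode(env=env, agent=agent, mdp_id=0, max_steps=200)
    return trajectory.calculate_cumulative_reward()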
def run_test(
    env_name: str,
    model: ModelManager__Union,
    replay_memory_size: int,
    train_every_ts: int,
    train_after_ts: int,
    num_train_episodes: int,
    max_steps: Optional[int],
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
):
    env = EnvFactory.make(env_name)
    env.seed(SEED)
    env.action_space.seed(SEED)
    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    try:
        # pyre-fixme[16]: `Env` has no attribute `state_feature_config_provider`.
        manager.state_feature_config_provider = env.state_feature_config_provider
        logger.info(
            f"Using environment's state_feature_config_provider.\n"
            f"{manager.state_feature_config_provider}"
        )
    except AttributeError:
        logger.info("state_feature_config_provider override not applicable")

    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )
    training_policy = manager.create_policy(serving=False)

    replay_buffer = ReplayBuffer.create_from_env(
        env=env,
        replay_memory_size=replay_memory_size,
        batch_size=trainer.minibatch_size,
    )

    device = torch.device("cuda") if use_gpu else torch.device("cpu")
    # First fill the replay buffer to burn_in.
    train_after_ts = max(train_after_ts, trainer.minibatch_size)
    fill_replay_buffer(
        env=env, replay_buffer=replay_buffer, desired_size=train_after_ts
    )

    post_step = train_with_replay_buffer_post_step(
        replay_buffer=replay_buffer,
        env=env,
        trainer=trainer,
        training_freq=train_every_ts,
        batch_size=trainer.minibatch_size,
        device=device,
    )

    agent = Agent.create_for_env(
        env, policy=training_policy, post_transition_callback=post_step, device=device
    )

    writer = SummaryWriter()
    with summary_writer_context(writer):
        train_rewards = []
        for i in range(num_train_episodes):
            trajectory = run_episode(
                env=env, agent=agent, mdp_id=i, max_steps=max_steps
            )
            ep_reward = trajectory.calculate_cumulative_reward()
            train_rewards.append(ep_reward)
            logger.info(
                f"Finished training episode {i} (len {len(trajectory)})"
                f" with reward {ep_reward}."
            )

    logger.info("============Train rewards=============")
    logger.info(train_rewards)
    logger.info(f"average: {np.mean(train_rewards)};\tmax: {np.max(train_rewards)}")

    # Check whether the max score passed the score bar; since we explore during
    # training, the return can be poor (leading to flakiness in C51 and QRDQN).
    assert np.max(train_rewards) >= passing_score_bar, (
        f"max reward ({np.max(train_rewards)}) after training for "
        f"{len(train_rewards)} episodes is less than {passing_score_bar}.\n"
    )

    serving_policy = manager.create_policy(serving=True)
    agent = Agent.create_for_env_with_serving_policy(env, serving_policy)

    eval_rewards = evaluate_for_n_episodes(
        n=num_eval_episodes, env=env, agent=agent, max_steps=max_steps
    ).squeeze(1)
    logger.info("============Eval rewards==============")
    logger.info(eval_rewards)
    logger.info(f"average: {np.mean(eval_rewards)};\tmax: {np.max(eval_rewards)}")
    assert np.mean(eval_rewards) >= passing_score_bar, (
        f"Predictor reward is {np.mean(eval_rewards)}, "
        f"less than {passing_score_bar}.\n"
    )
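# A minimal usage sketch for run_test. Every hyperparameter below is an
# illustrative smoke-test assumption, not a tuned value from this repo;
# `model` is presumed to be a ModelManager__Union built elsewhere (e.g. from
# a YAML config for a discrete-action model such as DQN).
def _example_run_test(model: ModelManager__Union) -> None:
    run_test(
        env_name="CartPole-v0",
        model=model,
        replay_memory_size=50_000,
        train_every_ts=1,
        train_after_ts=500,
        num_train_episodes=60,
        max_steps=200,
        passing_score_bar=100.0,
        num_eval_episodes=20,
        use_gpu=torch.cuda.is_available(),
    )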
def run_test(
    env_name: str,
    model: ModelManager__Union,
    replay_memory_size: int,
    train_every_ts: int,
    train_after_ts: int,
    num_train_episodes: int,
    max_steps: Optional[int],
    passing_score_bar: float,
    num_eval_episodes: int,
    use_gpu: bool,
):
    env = EnvFactory.make(env_name)
    env.seed(SEED)
    env.action_space.seed(SEED)
    normalization = build_normalizer(env)
    logger.info(f"Normalization is: \n{pprint.pformat(normalization)}")

    manager = model.value
    trainer = manager.initialize_trainer(
        use_gpu=use_gpu,
        reward_options=RewardOptions(),
        normalization_data_map=normalization,
    )
    training_policy = manager.create_policy(serving=False)

    replay_buffer = ReplayBuffer.create_from_env(
        env=env,
        replay_memory_size=replay_memory_size,
        batch_size=trainer.minibatch_size,
    )

    device = torch.device("cuda") if use_gpu else torch.device("cpu")
    # First fill the replay buffer to burn_in.
    train_after_ts = max(train_after_ts, trainer.minibatch_size)
    fill_replay_buffer(
        env=env, replay_buffer=replay_buffer, desired_size=train_after_ts
    )

    post_step = train_with_replay_buffer_post_step(
        replay_buffer=replay_buffer,
        env=env,
        trainer=trainer,
        training_freq=train_every_ts,
        batch_size=trainer.minibatch_size,
        device=device,
    )

    agent = Agent.create_for_env(
        env,
        policy=training_policy,
        post_transition_callback=post_step,
        device=device,
    )

    writer = SummaryWriter()
    with summary_writer_context(writer):
        train_rewards = []
        for i in range(num_train_episodes):
            trajectory = run_episode(
                env=env, agent=agent, mdp_id=i, max_steps=max_steps
            )
            ep_reward = trajectory.calculate_cumulative_reward()
            train_rewards.append(ep_reward)
            logger.info(f"Finished training episode {i} with reward {ep_reward}.")

    assert train_rewards[-1] >= passing_score_bar, (
        f"reward after {len(train_rewards)} episodes is {train_rewards[-1]}, "
        f"less than {passing_score_bar}...\n"
        f"Full reward history: {train_rewards}"
    )

    logger.info("============Train rewards=============")
    logger.info(train_rewards)

    serving_policy = manager.create_policy(serving=True)
    agent = Agent.create_for_env_with_serving_policy(env, serving_policy)

    eval_rewards = evaluate_for_n_episodes(
        n=num_eval_episodes, env=env, agent=agent, max_steps=max_steps
    ).squeeze(1)
    assert np.mean(eval_rewards) >= passing_score_bar, (
        f"Predictor reward is {np.mean(eval_rewards)}, "
        f"less than {passing_score_bar}...\n"
        f"Full eval rewards: {eval_rewards}."
    )

    logger.info("============Eval rewards==============")
    logger.info(eval_rewards)
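# A minimal usage sketch for this run_test variant. Unlike the variant above,
# which gates on the best training episode, this one asserts that the *final*
# training episode clears the bar, so it is stricter about end-of-training
# performance. All hyperparameters below are illustrative assumptions.
def _example_run_test_final_episode(model: ModelManager__Union) -> None:
    run_test(
        env_name="CartPole-v0",
        model=model,
        replay_memory_size=100_000,
        train_every_ts=3,
        train_after_ts=300,
        num_train_episodes=80,
        max_steps=200,
        passing_score_bar=100.0,
        num_eval_episodes=20,
        use_gpu=False,
    )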