# Imports reconstructed for these snippets. The typing/asyncio/gym imports
# are certain; the ReAgent-style module paths below are an assumption --
# adjust to wherever Agent, EnvWrapper, Trajectory, Transition, and
# SummaryWriterContext live in this repo.
import asyncio
from typing import Optional

from gym import Env

from reagent.gym.agents.agent import Agent  # assumed path
from reagent.gym.envs import EnvWrapper  # assumed path
from reagent.gym.types import Trajectory, Transition  # assumed path
from reagent.tensorboardX import SummaryWriterContext  # assumed path


def run_episode(
    env: Env, agent: Agent, mdp_id: int = 0, max_steps: Optional[int] = None
) -> Trajectory:
    """
    Run a single episode and return its Trajectory. After max_steps (if
    specified), the environment is assumed to be terminal. The mdp_id of
    the episode can also be specified.
    """
    trajectory = Trajectory()
    obs = env.reset()
    terminal = False
    num_steps = 0
    while not terminal:
        action = agent.act(obs)
        next_obs, reward, terminal, _ = env.step(action)
        if max_steps is not None and num_steps >= max_steps:
            terminal = True

        # Only partially filled. Agent can fill in more fields.
        transition = Transition(
            mdp_id=mdp_id,
            sequence_number=num_steps,
            observation=obs,
            action=action,
            reward=reward,
            terminal=terminal,
        )
        agent.post_step(transition)
        trajectory.add_transition(transition)
        SummaryWriterContext.increase_global_step()
        obs = next_obs
        num_steps += 1
    return trajectory

async def async_run_episode(
    env: EnvWrapper,
    agent: Agent,
    mdp_id: int = 0,
    max_steps: Optional[int] = None,
    fill_info: bool = False,
) -> Trajectory:
    """
    NOTE: this function is an async coroutine in order to support async
    env.step(). If you are using it with a regular env.step() method, use
    the non-async run_episode(), which wraps this function.

    Run a single episode and return its Trajectory. After max_steps (if
    specified), the environment is assumed to be terminal. The mdp_id of
    the episode can also be specified.
    """
    trajectory = Trajectory()
    obs = env.reset()
    possible_actions_mask = env.possible_actions_mask
    terminal = False
    num_steps = 0
    step_is_coroutine = asyncio.iscoroutinefunction(env.step)
    while not terminal:
        action, log_prob = agent.act(obs, possible_actions_mask)
        if step_is_coroutine:
            next_obs, reward, terminal, info = await env.step(action)
        else:
            next_obs, reward, terminal, info = env.step(action)
        if not fill_info:
            info = None
        next_possible_actions_mask = env.possible_actions_mask
        if max_steps is not None and num_steps >= max_steps:
            terminal = True

        # Only partially filled. Agent can fill in more fields.
        transition = Transition(
            mdp_id=mdp_id,
            sequence_number=num_steps,
            observation=obs,
            action=action,
            reward=float(reward),
            terminal=bool(terminal),
            log_prob=log_prob,
            possible_actions_mask=possible_actions_mask,
            info=info,
        )
        agent.post_step(transition)
        trajectory.add_transition(transition)
        SummaryWriterContext.increase_global_step()
        obs = next_obs
        possible_actions_mask = next_possible_actions_mask
        num_steps += 1
    agent.post_episode(trajectory)
    return trajectory

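# A minimal usage sketch (not from the repo) of driving async_run_episode
# from synchronous code, as the docstring's mention of a sync wrapper
# suggests. The function name and max_steps value here are hypothetical;
# `env` and `agent` are assumed to be already constructed.
def sync_episode_example(env: EnvWrapper, agent: Agent) -> Trajectory:
    # asyncio.run creates an event loop, runs the coroutine to
    # completion, and closes the loop.
    return asyncio.run(async_run_episode(env, agent, max_steps=200))
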
def run_episode(env: Env, agent: Agent, max_steps: Optional[int] = None) -> float:
    """
    Return the sum of rewards from one episode. After max_steps (if
    specified), the environment is assumed to be terminal.
    """
    ep_reward = 0.0
    obs = env.reset()
    terminal = False
    num_steps = 0
    while not terminal:
        action = agent.act(obs)
        next_obs, reward, terminal, _ = env.step(action)
        obs = next_obs
        ep_reward += reward
        num_steps += 1
        if max_steps is not None and num_steps > max_steps:
            terminal = True
        agent.post_step(reward, terminal)
    return ep_reward

def run_episode(
    env: EnvWrapper, agent: Agent, mdp_id: int = 0, max_steps: Optional[int] = None
) -> Trajectory:
    """
    Run a single episode and return its Trajectory. After max_steps (if
    specified), the environment is assumed to be terminal. The mdp_id of
    the episode can also be specified.
    """
    trajectory = Trajectory()
    # pyre-fixme[16]: `EnvWrapper` has no attribute `reset`.
    obs = env.reset()
    possible_actions_mask = env.possible_actions_mask
    terminal = False
    num_steps = 0
    while not terminal:
        action, log_prob = agent.act(obs, possible_actions_mask)
        # pyre-fixme[16]: `EnvWrapper` has no attribute `step`.
        next_obs, reward, terminal, _ = env.step(action)
        next_possible_actions_mask = env.possible_actions_mask
        if max_steps is not None and num_steps >= max_steps:
            terminal = True

        # Only partially filled. Agent can fill in more fields.
        transition = Transition(
            mdp_id=mdp_id,
            sequence_number=num_steps,
            observation=obs,
            action=action,
            reward=float(reward),
            terminal=bool(terminal),
            log_prob=log_prob,
            possible_actions_mask=possible_actions_mask,
        )
        agent.post_step(transition)
        trajectory.add_transition(transition)
        SummaryWriterContext.increase_global_step()
        obs = next_obs
        possible_actions_mask = next_possible_actions_mask
        num_steps += 1
    agent.post_episode(trajectory)
    return trajectory

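# Hypothetical consumer sketch (not from the repo): recovering the episode
# reward sum from the returned Trajectory. This assumes Trajectory keeps
# its Transition objects in a `transitions` list -- adjust to the actual
# Trajectory API.
def episode_reward(trajectory: Trajectory) -> float:
    return sum(t.reward for t in trajectory.transitions)
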
def run_episode(env: Env, agent: Agent, max_steps: Optional[int] = None) -> float:
    """
    Return sum of rewards from episode. After max_steps (if specified),
    the environment is assumed to be terminal.
    """
    ep_reward = 0.0
    obs = env.reset()
    terminal = False
    num_steps = 0
    while not terminal:
        action = agent.act(obs)
        next_obs, reward, terminal, _ = env.step(action)
        obs = next_obs
        ep_reward += reward
        num_steps += 1
        if max_steps is not None and num_steps > max_steps:
            terminal = True
        agent.post_step(reward, terminal)
        SummaryWriterContext.increase_global_step()
    return ep_reward

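# Hypothetical usage sketch (not from the repo): since this variant of
# run_episode returns the episode reward as a float, evaluation reduces to
# averaging over several episodes. The function name, episode count, and
# max_steps value are assumptions; `env` and `agent` are assumed built.
def evaluate(env: Env, agent: Agent, num_episodes: int = 10) -> float:
    rewards = [
        run_episode(env, agent, max_steps=200) for _ in range(num_episodes)
    ]
    return sum(rewards) / len(rewards)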