def run_episode(
    env: Env, agent: Agent, mdp_id: int = 0, max_steps: Optional[int] = None
) -> Trajectory:
    """
    Run a single episode and return the collected Trajectory.
    After max_steps (if specified), the environment is treated as terminal.
    The given mdp_id is recorded on every transition of the episode.
    """
    trajectory = Trajectory()
    obs = env.reset()
    terminal = False
    num_steps = 0
    while not terminal:
        action = agent.act(obs)
        next_obs, reward, terminal, _ = env.step(action)
        if max_steps is not None and num_steps >= max_steps:
            terminal = True

        # Only partially filled. Agent can fill in more fields.
        transition = Transition(
            mdp_id=mdp_id,
            sequence_number=num_steps,
            observation=obs,
            action=action,
            reward=reward,
            terminal=terminal,
        )
        agent.post_step(transition)
        trajectory.add_transition(transition)
        SummaryWriterContext.increase_global_step()
        obs = next_obs
        num_steps += 1
    return trajectory
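# Hedged usage sketch for run_episode above; illustrative only, not library
# code. It assumes it lives in the same module (so run_episode and its
# dependencies are in scope) and that any Gym-style env with reset()/step()
# is acceptable. _RandomAgent and _demo_run_episode are hypothetical names
# introduced here purely for illustration.
class _RandomAgent:
    """Minimal stand-in agent that samples uniformly from a Gym action space."""

    def __init__(self, action_space):
        self._action_space = action_space

    def act(self, obs):
        # Ignore the observation and pick a random action.
        return self._action_space.sample()

    def post_step(self, transition):
        # A real agent could log or learn from the transition here.
        pass


def _demo_run_episode(env):
    # Roll out one bounded episode with the random agent and return the
    # collected Trajectory.
    agent = _RandomAgent(env.action_space)
    return run_episode(env, agent, mdp_id=0, max_steps=200)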
def _create_replay_buffer_and_insert(env: EnvWrapper):
    """
    Test helper: roll out up to five random-action steps, insert each
    transition into a small ReplayBuffer, and return both the buffer and
    the raw records that were inserted (for later comparison).
    """
    env.seed(1)
    replay_buffer = ReplayBuffer(replay_capacity=6, batch_size=1)
    replay_buffer_inserter = make_replay_buffer_inserter(env)
    obs = env.reset()
    inserted = []
    terminal = False
    i = 0
    while not terminal and i < 5:
        logger.info(f"Iteration: {i}")
        action = env.action_space.sample()
        next_obs, reward, terminal, _ = env.step(action)
        inserted.append(
            {
                "observation": obs,
                "action": action,
                "reward": reward,
                "terminal": terminal,
            }
        )
        transition = Transition(
            mdp_id=0,
            sequence_number=i,
            observation=obs,
            action=action,
            reward=reward,
            terminal=terminal,
            log_prob=0.0,
        )
        replay_buffer_inserter(replay_buffer, transition)
        obs = next_obs
        i += 1
    return replay_buffer, inserted
def __iter__(self):
    """
    Run episodes with the agent, insert every transition into the replay
    buffer, and yield preprocessed training batches at the configured
    training frequency. Also tracks and prints per-episode rewards.
    """
    mdp_id = 0
    global_num_steps = 0
    rewards = []
    # TODO: We probably should put member vars into local vars to
    # reduce indirection, improving perf
    while self._num_episodes is None or mdp_id < self._num_episodes:
        obs = self._env.reset()
        possible_actions_mask = self._env.possible_actions_mask
        terminal = False
        num_steps = 0
        episode_reward_sum = 0
        trajectory = Trajectory()
        while not terminal:
            action, log_prob = self._agent.act(obs, possible_actions_mask)
            next_obs, reward, terminal, info = self._env.step(action)
            next_possible_actions_mask = self._env.possible_actions_mask
            if self._max_steps is not None and num_steps >= self._max_steps:
                terminal = True

            # Only partially filled. Agent can fill in more fields.
            transition = Transition(
                mdp_id=mdp_id,
                sequence_number=num_steps,
                observation=obs,
                action=action,
                reward=float(reward),
                terminal=bool(terminal),
                log_prob=log_prob,
                possible_actions_mask=possible_actions_mask,
            )
            trajectory.add_transition(transition)
            self._replay_buffer_inserter(self._replay_buffer, transition)
            episode_reward_sum += reward
            if (
                global_num_steps % self._training_frequency == 0
                and self._replay_buffer.size >= self._batch_size
            ):
                train_batch = self._replay_buffer.sample_transition_batch(
                    batch_size=self._batch_size
                )
                if self._trainer_preprocessor:
                    train_batch = self._trainer_preprocessor(train_batch)
                yield train_batch
            obs = next_obs
            possible_actions_mask = next_possible_actions_mask
            num_steps += 1
            global_num_steps += 1
        if self._post_episode_callback:
            self._post_episode_callback(trajectory, info)

        rewards.append(episode_reward_sum)
        mdp_id += 1
        print()
        print(
            f"Training episode: {mdp_id}, total episode reward = {episode_reward_sum}"
        )
        print("Episode rewards during training:")
        print(rewards)
def post_step(self, transition: Transition):
    """To be called after step(action)."""
    if self.post_transition_callback is not None:
        transition.log_prob = self._log_prob
        # pyre-fixme[29]: `Optional[typing.Callable[[Transition], None]]` is not
        #  a function.
        self.post_transition_callback(transition)
    self._reset_internal_states()
async def async_run_episode(
    env: EnvWrapper,
    agent: Agent,
    mdp_id: int = 0,
    max_steps: Optional[int] = None,
    fill_info: bool = False,
) -> Trajectory:
    """
    NOTE: this function is an async coroutine so that it can support an async
    env.step(). If your env.step() is a regular (synchronous) method, use the
    non-async run_episode(), which wraps this function.

    Run a single episode and return the collected Trajectory.
    After max_steps (if specified), the environment is treated as terminal.
    The given mdp_id is recorded on every transition of the episode.
    """
    trajectory = Trajectory()
    obs = env.reset()
    possible_actions_mask = env.possible_actions_mask
    terminal = False
    num_steps = 0
    step_is_coroutine = asyncio.iscoroutinefunction(env.step)
    while not terminal:
        action, log_prob = agent.act(obs, possible_actions_mask)
        if step_is_coroutine:
            next_obs, reward, terminal, info = await env.step(action)
        else:
            next_obs, reward, terminal, info = env.step(action)
        if not fill_info:
            info = None
        next_possible_actions_mask = env.possible_actions_mask
        if max_steps is not None and num_steps >= max_steps:
            terminal = True

        # Only partially filled. Agent can fill in more fields.
        transition = Transition(
            mdp_id=mdp_id,
            sequence_number=num_steps,
            observation=obs,
            action=action,
            reward=float(reward),
            terminal=bool(terminal),
            log_prob=log_prob,
            possible_actions_mask=possible_actions_mask,
            info=info,
        )
        agent.post_step(transition)
        trajectory.add_transition(transition)
        SummaryWriterContext.increase_global_step()
        obs = next_obs
        possible_actions_mask = next_possible_actions_mask
        num_steps += 1
    agent.post_episode(trajectory)
    return trajectory
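# Hedged usage sketch for async_run_episode above; illustrative only. It
# assumes the same module scope (EnvWrapper, async_run_episode), an env that
# exposes possible_actions_mask, and an agent whose act() returns
# (action, log_prob) as used above. _demo_async_run_episode is a hypothetical
# helper name; the event-loop wiring is plain asyncio.
import asyncio


def _demo_async_run_episode(env, agent):
    # async_run_episode detects whether env.step is a coroutine, so this
    # works for both async and regular environments.
    return asyncio.run(
        async_run_episode(env, agent, mdp_id=0, max_steps=200, fill_info=True)
    )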
def __iter__(self):
    """
    Generate training batches by running the agent in the environment,
    inserting every transition into the replay buffer, and sampling a batch
    at the configured training frequency once the buffer is large enough.
    """
    mdp_id = 0
    global_num_steps = 0
    # TODO: We probably should put member vars into local vars to
    # reduce indirection, improving perf
    while self._num_episodes is None or mdp_id < self._num_episodes:
        obs = self._env.reset()
        possible_actions_mask = self._env.possible_actions_mask
        terminal = False
        num_steps = 0
        while not terminal:
            action, log_prob = self._agent.act(obs, possible_actions_mask)
            next_obs, reward, terminal, _ = self._env.step(action)
            next_possible_actions_mask = self._env.possible_actions_mask
            if self._max_steps is not None and num_steps >= self._max_steps:
                terminal = True

            # Only partially filled. Agent can fill in more fields.
            transition = Transition(
                mdp_id=mdp_id,
                sequence_number=num_steps,
                observation=obs,
                action=action,
                reward=float(reward),
                terminal=bool(terminal),
                log_prob=log_prob,
                possible_actions_mask=possible_actions_mask,
            )
            self._replay_buffer_inserter(self._replay_buffer, transition)
            if (
                global_num_steps % self._training_frequency == 0
                and self._replay_buffer.size >= self._batch_size
            ):
                train_batch = self._replay_buffer.sample_transition_batch(
                    batch_size=self._batch_size
                )
                if self._trainer_preprocessor:
                    train_batch = self._trainer_preprocessor(train_batch)
                yield train_batch
            obs = next_obs
            possible_actions_mask = next_possible_actions_mask
            num_steps += 1
            global_num_steps += 1
        mdp_id += 1
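# Hedged consumption sketch for the __iter__ generator above; illustrative
# only. `datagen` stands for an instance of the surrounding dataset class and
# `trainer` for any object with a train(batch) method; both are hypothetical
# placeholders, not confirmed library API.
def _demo_consume_batches(datagen, trainer, max_batches=1000):
    # Pull preprocessed training batches as they are yielded and feed them to
    # the (assumed) trainer, stopping after max_batches.
    for i, train_batch in enumerate(datagen):
        if i >= max_batches:
            break
        trainer.train(train_batch)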
def __call__(self, replay_buffer: ReplayBuffer, transition: Transition):
    """
    Flatten a RecSim-style observation (user, doc, augmentation, response)
    into per-key numpy arrays and add the transition to the replay buffer.
    """
    transition_dict = transition.asdict()
    obs = transition_dict.pop("observation")
    user = obs["user"]
    kwargs = {}

    # Document features: stacked box features and discrete features per key,
    # or the raw stacked doc tensor when no keys were configured.
    if self.box_keys or self.discrete_keys:
        doc_obs = obs["doc"]
        for k in self.box_keys:
            kwargs[f"doc_{k}"] = np.stack([v[k] for v in doc_obs.values()])
        for k in self.discrete_keys:
            kwargs[f"doc_{k}"] = np.array([v[k] for v in doc_obs.values()])
    else:
        kwargs["doc"] = np.stack(list(obs["doc"].values()))

    # Augmentation features
    if self.augmentation_box_keys or self.augmentation_discrete_keys:
        aug_obs = obs["augmentation"]
        for k in self.augmentation_box_keys:
            kwargs[f"augmentation_{k}"] = np.stack(
                [v[k] for v in aug_obs.values()]
            )
        for k in self.augmentation_discrete_keys:
            kwargs[f"augmentation_{k}"] = np.array(
                [v[k] for v in aug_obs.values()]
            )

    # Responses. `response` is None on the first state of an episode, in
    # which case we fill zero arrays of the expected shape and dtype.
    response = obs["response"]
    for k, d in self.response_box_keys:
        if response is not None:
            kwargs[f"response_{k}"] = np.stack([v[k] for v in response])
        else:
            kwargs[f"response_{k}"] = np.zeros(
                (self.num_responses, *d), dtype=np.float32
            )
    for k, _n in self.response_discrete_keys:
        if response is not None:
            kwargs[f"response_{k}"] = np.array([v[k] for v in response])
        else:
            kwargs[f"response_{k}"] = np.zeros(
                (self.num_responses,), dtype=np.int64
            )

    transition_dict.update(kwargs)
    replay_buffer.add(observation=user, **transition_dict)
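# Hedged sketch of the RecSim-style observation layout the inserter above
# expects: a dict with "user", per-document "doc" features, optional
# "augmentation" features, and "response" (None on the first step, which
# triggers the zero-filled fallback branches). The keys and shapes below are
# made-up placeholders chosen purely to illustrate the unpacking; they are not
# taken from any particular environment.
import numpy as np

_example_recsim_obs = {
    "user": np.zeros(4, dtype=np.float32),
    "doc": {
        "0": {"quality": np.array([0.3], dtype=np.float32), "topic": 2},
        "1": {"quality": np.array([0.7], dtype=np.float32), "topic": 5},
    },
    "augmentation": {
        "0": {"bias": np.array([0.1], dtype=np.float32)},
        "1": {"bias": np.array([0.2], dtype=np.float32)},
    },
    # First step of an episode: no response yet, so the inserter writes zero
    # arrays of shape (num_responses, *box_shape) / (num_responses,) instead.
    "response": None,
}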
def run_episode(
    env: EnvWrapper, agent: Agent, mdp_id: int = 0, max_steps: Optional[int] = None
) -> Trajectory:
    """
    Run a single episode and return the collected Trajectory.
    After max_steps (if specified), the environment is treated as terminal.
    The given mdp_id is recorded on every transition of the episode.
    """
    trajectory = Trajectory()
    # pyre-fixme[16]: `EnvWrapper` has no attribute `reset`.
    obs = env.reset()
    possible_actions_mask = env.possible_actions_mask
    terminal = False
    num_steps = 0
    while not terminal:
        action, log_prob = agent.act(obs, possible_actions_mask)
        # pyre-fixme[16]: `EnvWrapper` has no attribute `step`.
        next_obs, reward, terminal, _ = env.step(action)
        next_possible_actions_mask = env.possible_actions_mask
        if max_steps is not None and num_steps >= max_steps:
            terminal = True

        # Only partially filled. Agent can fill in more fields.
        transition = Transition(
            mdp_id=mdp_id,
            sequence_number=num_steps,
            observation=obs,
            action=action,
            reward=float(reward),
            terminal=bool(terminal),
            log_prob=log_prob,
            possible_actions_mask=possible_actions_mask,
        )
        agent.post_step(transition)
        trajectory.add_transition(transition)
        SummaryWriterContext.increase_global_step()
        obs = next_obs
        possible_actions_mask = next_possible_actions_mask
        num_steps += 1
    agent.post_episode(trajectory)
    return trajectory
def __call__(self, replay_buffer: ReplayBuffer, transition: Transition):
    # Basic inserter: forward every Transition field straight to the buffer.
    replay_buffer.add(**transition.asdict())
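# Hedged usage sketch for the basic inserter above; illustrative only. It
# assumes `inserter` is an instance of the surrounding class, that Transition
# is in scope as in the rest of this module, and that ReplayBuffer.add()
# accepts the flattened Transition fields as keyword arguments (as the call
# above implies). _demo_basic_insert is a hypothetical helper name.
def _demo_basic_insert(inserter, replay_buffer, obs, action, reward, terminal):
    transition = Transition(
        mdp_id=0,
        sequence_number=0,
        observation=obs,
        action=action,
        reward=float(reward),
        terminal=bool(terminal),
        log_prob=0.0,
    )
    inserter(replay_buffer, transition)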