def step(self, action_dict: Dict[int, RailEnvActions]) -> StepOutput:
    obs, rewards, dones, infos = self.rail_env.step(action_dict)
    o, r, d = self._step_out(obs, dones)
    assert len(obs) > 0
    assert all(x is not None for x in (dones, rewards, obs))
    # An agent only counts as done once it has left the set of active agents.
    return StepOutput(obs=o, reward=r, done=d, info={
        agent: {
            'max_episode_steps': self.rail_env._max_episode_steps,
            'num_agents': self.rail_env.get_num_agents(),
            'agent_done': dones[agent] and agent not in self.rail_env.active_agents,
            'agent_score': self._agent_scores[agent],
            'agent_step': self._agent_steps[agent],
        } for agent in o.keys()
    })
def _scheduling_step(self, action):
    # Normalize episode scores by the maximum achievable number of agent-steps.
    norm_factor = self._env.rail_env._max_episode_steps * self._env.rail_env.get_num_agents()
    # The continuous actions are priority scores: higher score -> earlier in the schedule.
    sorted_actions = {
        k: v for k, v in sorted(action.items(), key=lambda item: item[1], reverse=True)
    }
    self._env.sorted_handles = list(sorted_actions.keys())
    # Roll out the whole episode with a shortest-path policy under this schedule.
    done = defaultdict(lambda: False)
    while not done['__all__']:
        actions = ShortestPathAgent().compute_actions(self.last_obs, self._env.rail_env)
        _, _, done, _ = self._env.step(actions)
    pc = sum(1 for a in self._env.rail_env.agents if is_done(a)) / self._env.rail_env.get_num_agents()
    malf = np.sum([a.malfunction_data['nr_malfunctions'] for a in self._env.rail_env.agents])
    print("EPISODE PC:", pc, "NR MALFUNCTIONS:", malf)  # debug output
    d = {
        a.handle: a.status in (RailAgentStatus.DONE, RailAgentStatus.DONE_REMOVED)
        for a in self._env.rail_env.agents
    }
    d['__all__'] = True
    r = {a.handle: self._env._agent_scores[a.handle] / norm_factor for a in self._env.rail_env.agents}
    o = self.last_obs
    return StepOutput(obs=o, reward=r, done=d, info={
        a.handle: {
            'max_episode_steps': self._env.rail_env._max_episode_steps,
            'num_agents': self._env.rail_env.get_num_agents(),
            'agent_done': d[a.handle],
            'agent_score': self._env._agent_scores[a.handle],
            'agent_step': self._env._agent_steps[a.handle],
        } for a in self._env.rail_env.agents
    })
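# Illustrative sketch (not part of the wrapper above): how the per-handle priority
# scores emitted by the policy become the agent ordering in `sorted_handles`.
# The handles and scores below are made up for demonstration.
def _priority_order_example():
    action = {0: 0.2, 1: 0.9, 2: 0.5}  # hypothetical priority score per handle
    sorted_actions = {
        k: v for k, v in sorted(action.items(), key=lambda item: item[1], reverse=True)
    }
    sorted_handles = list(sorted_actions.keys())
    assert sorted_handles == [1, 2, 0]  # highest-scoring handle is scheduled first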
def step(self, action_dict: Dict[int, RailEnvActions]) -> StepOutput:
    obs, reward, done, info = self.env.step(action_dict)
    o, r, d, i = {}, {}, {}, {}
    # Shared-reward wrapper: every agent receives the mean of all agents' rewards.
    mean_reward = np.mean(list(reward.values()))
    for agent_id, agent_obs in obs.items():
        o[agent_id] = agent_obs
        d[agent_id] = done[agent_id]
        i[agent_id] = info[agent_id]
        r[agent_id] = mean_reward
    d['__all__'] = done['__all__'] or all(d.values())
    return StepOutput(o, r, d, i)
def step(self, action_dict: Dict[int, RailEnvActions]) -> StepOutput:
    obs, rewards, dones, infos = self.rail_env.step(action_dict)
    # Flatten the multi-agent output into a single-agent view: the episode is
    # done when all agents are done, and the rewards are summed.
    done = dones["__all__"]
    reward = np.sum(list(rewards.values()))
    observation = obs
    self._agent_score += reward
    self._agent_steps += 1
    return StepOutput(obs=observation, reward=reward, done=done, info={
        'max_episode_steps': self.rail_env._max_episode_steps,
        'num_agents': self.rail_env.get_num_agents(),
        'agent_done': done,
        'agent_score': self._agent_score,
        'agent_step': self._agent_steps,
    })
def step(self, action_dict: Dict[int, float]) -> StepOutput:
    rail_env: RailEnv = self.unwrapped.rail_env
    # The continuous actions are priority scores: higher score -> earlier handle.
    sorted_actions = {
        k: v for k, v in sorted(action_dict.items(), key=lambda item: item[1], reverse=True)
    }
    self.env.sorted_handles = list(sorted_actions.keys())
    # The actual rail actions come from a shortest-path policy run under this priority order.
    rail_actions = self.sp_agent.compute_actions({h: None for h in action_dict.keys()}, env=rail_env)
    o, r, d, i = self.env.step(rail_actions)
    r = {h: rew / self.norm_factor for h, rew in r.items()}
    return StepOutput(o, r, d, i)
def step(self, action_list):
    # Actions arrive as a list indexed by handle; convert to the dict the env expects.
    action_dict = dict(enumerate(action_list))
    step_r = self._env.step(action_dict)
    if not self._global_obs:
        return StepOutput(obs=list(step_r.obs.values()),
                          reward=np.sum(list(step_r.reward.values())),
                          done=all(step_r.done.values()),
                          info=step_r.info[0])
    else:
        return step_r
def step(self, action: Dict[int, RailEnvActions]) -> StepOutput:
    # Only the currently selected agent acts; every other active agent is stopped.
    action_dict = {
        h: RailEnvActions.STOP_MOVING.value
        for h in range(self._num_agents) if h not in self._agents_done
    }
    if self._allow_noop:
        action_dict[self._current_handle] = action[self._current_handle]
    else:
        # Without the no-op action, the policy's action space is shifted by one.
        action_dict[self._current_handle] = action[self._current_handle] + 1
    obs, rewards, dones, infos = self.rail_env.step(action_dict)
    new_dones = []
    for agent, done in dones.items():
        if agent not in self._agents_done and agent != '__all__' and done:
            new_dones.append(agent)
    if not dones['__all__']:
        # Advance to the next handle round-robin, skipping agents that are done.
        self._current_handle = (self._current_handle + 1) % self._num_agents
        while self._current_handle in (self._agents_done + new_dones):
            self._current_handle = (self._current_handle + 1) % self._num_agents
    # Emit output only for agents that just finished and for the next agent to act.
    d, r, o = dict(), dict(), dict()
    for agent in new_dones + [self._current_handle]:
        o[agent] = obs[agent]
        r[agent] = rewards[agent]
        d[agent] = dones[agent]
        self._agent_scores[agent] += rewards[agent]
        if not d[agent]:
            self._agent_steps[agent] += 1
    d['__all__'] = dones['__all__']
    self._agents_done.extend(new_dones)
    i = {h: self.get_agent_info(h, d) for h in d.keys() if h != '__all__'}
    return StepOutput(obs=o, reward=r, done=d, info=i)
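# Illustrative sketch (not part of the wrapper above): the round-robin advance
# used to pick the next acting handle while skipping finished agents. All values
# below are made up for demonstration.
def _next_handle_example():
    num_agents = 4
    agents_done, new_dones = [1], [2]
    current_handle = 1
    current_handle = (current_handle + 1) % num_agents
    while current_handle in (agents_done + new_dones):
        current_handle = (current_handle + 1) % num_agents
    assert current_handle == 3  # handles 1 and 2 are skipped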
def step(self, action_dict: Dict[int, RailEnvActions]) -> StepOutput:
    obs, reward, done, info = self.env.step(action_dict)
    if self._deadlock_reward != 0:
        new_deadlocked_agents = self.check_deadlock()
    else:
        new_deadlocked_agents = []
    o, r, d, i = {}, {}, {}, {}
    for agent_id, agent_obs in obs.items():
        # Agents that were already deadlocked in an earlier step are dropped from the output.
        if agent_id not in self._deadlocked_agents or agent_id in new_deadlocked_agents:
            o[agent_id] = agent_obs
            d[agent_id] = done[agent_id]
            i[agent_id] = info[agent_id]
            r[agent_id] = reward[agent_id]
            if agent_id in new_deadlocked_agents:
                # Agent is newly deadlocked -> give the deadlock reward and mark it done.
                r[agent_id] += self._deadlock_reward
                d[agent_id] = True
    d['__all__'] = done['__all__'] or all(d.values())
    return StepOutput(o, r, d, i)
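# Illustrative sketch only: `check_deadlock` is defined elsewhere in the wrapper,
# so this is merely one simple notion of deadlock, assuming agents on rails can
# never pass each other: two agents are deadlocked when each one's next cell is
# the other's current cell. Positions below are hypothetical (row, column) tuples.
def _head_on_deadlock_example(pos_a, next_a, pos_b, next_b):
    # A wants to move into B's cell and vice versa -> neither can ever move again.
    return next_a == pos_b and next_b == pos_a

assert _head_on_deadlock_example((0, 1), (0, 2), (0, 2), (0, 1))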
def step(self, action_dict: Dict[int, float]) -> StepOutput:
    rail_env: RailEnv = self.unwrapped.rail_env
    # The continuous actions are priority scores: higher score -> earlier handle.
    sorted_actions = {
        k: v for k, v in sorted(action_dict.items(), key=lambda item: item[1], reverse=True)
    }
    self.env.sorted_handles = list(sorted_actions.keys())
    cum_done = defaultdict(lambda: False)
    cum_rew = defaultdict(lambda: 0)
    rail_actions = self.sp_agent.compute_actions({h: None for h in action_dict.keys()}, env=rail_env)
    o, r, done, i = self.env.step(rail_actions)
    r = {h: rew / self.norm_factor for h, rew in r.items()}
    for h, rew in r.items():
        cum_rew[h] += rew
    for h, curr_d in done.items():
        cum_done[h] = curr_d or cum_done[h]
    # Keep stepping with the shortest-path policy until the elapsed step count is a
    # multiple of 10, i.e. the schedule is re-prioritized only every 10 env steps.
    while rail_env._elapsed_steps % 10 != 0 and not cum_done.get('__all__', False):
        rail_actions = self.sp_agent.compute_actions({h: None for h in action_dict.keys()}, env=rail_env)
        o, r, done, i = self.env.step(rail_actions)
        r = {h: rew / self.norm_factor for h, rew in r.items()}
        for h, curr_d in done.items():
            cum_done[h] = curr_d or cum_done[h]
        for h, rew in r.items():
            cum_rew[h] += rew
    return StepOutput(o, cum_rew, cum_done, i)
def step(self, action_dict: Dict[int, RailEnvActions]) -> StepOutput:
    o, r, d, i = {}, {}, {}, {}
    # Step the environment until at least one agent reaches a decision cell (or is
    # done); rewards collected on skipped cells are optionally accumulated.
    while len(o) == 0:
        obs, reward, done, info = self.env.step(action_dict)
        for agent_id, agent_obs in obs.items():
            if done[agent_id] or self._on_decision_cell(self.unwrapped.rail_env.agents[agent_id]):
                o[agent_id] = agent_obs
                r[agent_id] = reward[agent_id]
                d[agent_id] = done[agent_id]
                i[agent_id] = info[agent_id]
                if self._accumulate_skipped_rewards:
                    # Fold the skipped rewards into this reward so the agent receives
                    # the discounted return accumulated since its last decision cell.
                    discounted_skipped_reward = r[agent_id]
                    for skipped_reward in reversed(self._skipped_rewards[agent_id]):
                        discounted_skipped_reward = self._discounting * discounted_skipped_reward + skipped_reward
                    r[agent_id] = discounted_skipped_reward
                    self._skipped_rewards[agent_id] = []
            elif self._accumulate_skipped_rewards:
                self._skipped_rewards[agent_id].append(reward[agent_id])
        d['__all__'] = done['__all__']
        action_dict = {}  # only the first env step uses the policy's actions
    return StepOutput(o, r, d, i)
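# Illustrative sketch (not part of the wrapper above): with discount factor gamma,
# skipped rewards [r1, r2] and emitted reward r3, the agent receives
# r1 + gamma * (r2 + gamma * r3). The numbers below are made up for demonstration.
def _skipped_reward_example():
    discounting, skipped, emitted = 0.5, [-1.0, -1.0], -1.0
    discounted = emitted
    for skipped_reward in reversed(skipped):
        discounted = discounting * discounted + skipped_reward
    assert discounted == -1.75  # -1 + 0.5 * (-1 + 0.5 * -1)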
def step(self, action_dict: Dict[int, RailEnvActions]) -> StepOutput:
    rail_env: RailEnv = self.unwrapped.rail_env
    obs, reward, done, info = self.env.step(action_dict)
    o, r, d, i = {}, {}, {}, {}
    # Sparse reward: zero at every intermediate step, a terminal reward that
    # depends on whether the agent actually reached its target.
    for agent_id, agent_obs in obs.items():
        o[agent_id] = agent_obs
        d[agent_id] = done[agent_id]
        i[agent_id] = info[agent_id]
        if done[agent_id]:
            if rail_env.agents[agent_id].status in [RailAgentStatus.DONE, RailAgentStatus.DONE_REMOVED]:
                # Agent is done and actually reached its target -> finished reward.
                r[agent_id] = self._finished_reward
            else:
                # Agent is done only because the episode ended -> not_finished reward.
                r[agent_id] = self._not_finished_reward
        else:
            r[agent_id] = 0
    d['__all__'] = done['__all__'] or all(d.values())
    return StepOutput(o, r, d, i)
def step(self, action_dict: Dict[int, RailEnvActions]) -> StepOutput:
    obs, reward, done, info = self.env.step(action_dict)
    return StepOutput(self._transform_obs(obs), reward, done, info)
def step(self, action_dict: Dict[int, RailEnvActions]) -> StepOutput:
    d, r, o = None, None, None
    obs_or_done = False
    # Drop actions for agents that are already done.
    action_dict = {k: v for k, v in action_dict.items() if k not in self._agents_done}
    while not obs_or_done:
        # Perform env steps as long as there is no observation (for all agents)
        # or all agents are done. The observation is `None` if an agent is done
        # or malfunctioning.
        obs, rewards, dones, infos = self.rail_env.step(action_dict)
        d, r, o = dict(), dict(), dict()
        for agent in self.agent_keys + ["__all__"]:
            if agent != '__all__':
                if dones.get(agent, False) and agent not in self._agents_done:
                    self._agents_done.append(agent)
                # In independent mode only agents that are not yet done emit output;
                # otherwise every agent does (the two original branches were identical).
                if not self.agent_done_independent or agent not in self._agents_done:
                    o[agent] = obs.get(agent, self.prev_obs[agent])
                    r[agent] = rewards.get(agent, 0 if agent in self._agents_done else -1)
                    self._agent_scores[agent] += rewards.get(agent, 0)
                    self._agent_steps[agent] += 1
            if self.agent_done_independent:
                d[agent] = dones[agent]
            else:
                d[agent] = dones["__all__"]
        action_dict = {}  # reset action dict for cases where we do multiple env steps
        # Step through the env as long as there are no obs / all agents are done.
        obs_or_done = len(o) > 0 or d['__all__']
    assert all(x is not None for x in (d, r, o))
    self.prev_obs = o
    return StepOutput(obs=o, reward=r, done=d, info={
        agent: {
            'max_episode_steps': self.rail_env._max_episode_steps,
            'num_agents': self.rail_env.get_num_agents(),
            'agent_done': d.get(agent, False) and agent not in self.rail_env.active_agents,
            'agent_score': self._agent_scores.get(agent, 0),
            'agent_step': self._agent_steps.get(agent, 0),
        } for agent in o.keys()
    })