from collections import defaultdict
from copy import deepcopy
from time import time
from typing import Dict, Optional

import numpy as np
from tqdm import tqdm

from flatland.envs.agent_utils import RailAgentStatus
from flatland.envs.rail_env import RailEnv, RailEnvActions
from flatland.utils.rendertools import RenderTool

# Project-local helpers and classes (SUBMISSIONS, init_run, get_agent, get_env,
# RENDER, is_done, get_agent_pos, promising, the agent classes, priorizers and
# Gym wrappers) come from the surrounding repo modules.


def evaluate(n_episodes):
    """Evaluate the "rlps-tcpr" submission: an RLlib shortest-path agent whose
    actions are filtered through the robust action wrapper. Returns per-episode
    percentage-complete, normalized returns and malfunction counts."""
    run = SUBMISSIONS["rlps-tcpr"]
    config, run = init_run(run)
    agent = ShortestPathRllibAgent(get_agent(config, run))
    env = get_env(config, rl=True)
    env_renderer = RenderTool(env, screen_width=8800)
    returns = []
    pcs = []
    malfs = []

    for _ in tqdm(range(n_episodes)):
        obs, _ = env.reset(regenerate_schedule=True, regenerate_rail=True)
        if RENDER:
            env_renderer.reset()
            env_renderer.render_env(show=True, frames=True, show_observations=False)
        if not obs:
            break

        steps = 0
        ep_return = 0
        done = defaultdict(lambda: False)
        robust_env = RobustFlatlandGymEnv(rail_env=env,
                                          max_nr_active_agents=200,
                                          observation_space=None,
                                          priorizer=DistToTargetPriorizer(),
                                          allow_noop=True)
        # fix the agent ordering once per episode, by distance to target
        sorted_handles = robust_env.priorizer.priorize(handles=list(obs.keys()),
                                                       rail_env=env)

        while not done['__all__']:
            actions = agent.compute_actions(obs, env)
            robust_actions = robust_env.get_robust_actions(actions, sorted_handles)
            obs, all_rewards, done, info = env.step(robust_actions)
            if RENDER:
                env_renderer.render_env(show=True, frames=True, show_observations=False)
            print('.', end='', flush=True)
            steps += 1
            ep_return += np.sum(list(all_rewards.values()))

        pc = np.sum(np.array([1 for a in env.agents if is_done(a)])) / env.get_num_agents()
        print("EPISODE PC:", pc)
        pcs.append(pc)
        returns.append(ep_return / (env._max_episode_steps * env.get_num_agents()))
        malfs.append(np.sum([a.malfunction_data['nr_malfunctions'] for a in env.agents]))
    return pcs, returns, malfs
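# Hedged usage sketch: all three evaluate() variants in this file return the
# same three lists. `summarize_evaluation` is an illustrative helper, not part
# of the original benchmark scripts.
def summarize_evaluation(pcs, returns, malfs):
    # mean percentage-complete, mean normalized return, total malfunction count
    print(f"PC {np.mean(pcs):.3f} | return {np.mean(returns):.3f} "
          f"| malfunctions {int(np.sum(malfs))}")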
def explorative_plan(env: RailEnv, obs_dict, budget_seconds=60, exploring_agent=None):
    """Repeatedly roll out `exploring_agent` on copies of `env` until the time
    budget is exhausted; keep the action sequence with the best percentage
    complete (PC). An exploring agent must be passed in; the None default is a
    placeholder."""
    start_t = time()
    initial_obs = obs_dict
    best_actions = []
    best_return = -np.inf
    best_pc = -np.inf
    all_returns = []
    all_pcs = []
    plan_step = 0
    budget_used = False

    while not budget_used:
        local_env = deepcopy(env)
        obs_dict = initial_obs  # each planning step restarts from the same state
        episode_return = 0
        action_memory = []
        dones = defaultdict(lambda: False)
        print(f'\nPlanning step {plan_step + 1}')

        while not dones['__all__'] and not budget_used:
            actions = defaultdict(
                lambda: None,
                exploring_agent.compute_actions(obs_dict, env=local_env))
            action_memory.append(actions)
            obs_dict, all_rewards, dones, info = local_env.step(actions)
            # sum rewards over agents (the dict's values, not its keys)
            episode_return += np.sum(list(all_rewards.values()))
            budget_used = (time() - start_t) > budget_seconds

        if not budget_used:
            all_returns.append(episode_return)
            pc = np.sum(np.array([1 for a in local_env.agents if is_done(a)
                                  ])) / local_env.get_num_agents()
            all_pcs.append(pc)
            if pc > best_pc:
                best_return = episode_return
                best_pc = pc
                best_actions = action_memory
                if pc == 1.0:
                    # all agents arrived: no better plan exists, stop early
                    print(f'MAX PC: {best_pc}, MIN PC: {np.min(all_pcs)}, '
                          f'MAX RETURN: {best_return}\n')
                    return best_actions
        plan_step += 1

    if len(all_pcs) > 0:
        print(f'MAX PC: {best_pc}, MIN PC: {np.min(all_pcs)}, '
              f'MAX RETURN: {best_return}\n')
    else:
        print('Budget reached before any planning step could finish!')
    return best_actions if len(best_actions) > 0 else None
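# Hedged usage sketch: the planners above return a list of per-step action
# dicts (or None). Replaying that plan on the real env is an assumption about
# how the planner output is meant to be consumed; `replay_plan` is
# illustrative, not from the original repo.
def replay_plan(env, plan):
    # step the environment through the recorded action dicts, stopping early
    # once all agents are done
    if plan is None:
        return
    for actions in plan:
        _, _, dones, _ = env.step(actions)
        if dones['__all__']:
            break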
def evaluate(n_episodes):
    """Evaluate the "ato" submission: the trained policy acts directly on the
    environment, deterministically (explore=False)."""
    run = SUBMISSIONS["ato"]
    config, run = init_run(run)
    agent = get_agent(config, run)
    env = get_env(config, rl=True)
    env_renderer = RenderTool(env, screen_width=8800)
    returns = []
    pcs = []
    malfs = []

    for _ in tqdm(range(n_episodes)):
        obs, _ = env.reset(regenerate_schedule=True, regenerate_rail=True)
        if RENDER:
            env_renderer.reset()
            env_renderer.render_env(show=True, frames=True, show_observations=False)
        if not obs:
            break

        steps = 0
        ep_return = 0
        done = defaultdict(lambda: False)

        while not done['__all__']:
            actions = agent.compute_actions(obs, explore=False)
            obs, all_rewards, done, info = env.step(actions)
            if RENDER:
                env_renderer.render_env(show=True, frames=True, show_observations=False)
            print('.', end='', flush=True)
            steps += 1
            ep_return += np.sum(list(all_rewards.values()))

        pc = np.sum(np.array([1 for a in env.agents if is_done(a)])) / env.get_num_agents()
        print("EPISODE PC:", pc)
        pcs.append(pc)
        returns.append(ep_return / (env._max_episode_steps * env.get_num_agents()))
        malfs.append(np.sum([a.malfunction_data['nr_malfunctions'] for a in env.agents]))
    return pcs, returns, malfs
def evaluate(n_episodes, rl_prio=True):
    """Evaluate a greedy shortest-path agent under the CPR robust action
    wrapper. With rl_prio=True an RL run is loaded; it is currently only used
    to build the environment (see the disabled prioritization block below)."""
    agent = None
    if rl_prio:
        config, run = init_run()
        agent = get_agent(config, run)  # kept for the RL prioritization variant
        env = get_env(config, rl=True)
    else:
        env = get_env(rl=False)
    env_renderer = RenderTool(env, screen_width=8800)
    returns = []
    pcs = []
    malfs = []

    for _ in tqdm(range(n_episodes)):
        obs, _ = env.reset(regenerate_schedule=True, regenerate_rail=True)
        if RENDER:
            env_renderer.reset()
            env_renderer.render_env(show=True, frames=True, show_observations=False)
        if not obs:
            break

        steps = 0
        ep_return = 0
        done = defaultdict(lambda: False)
        robust_env = CprFlatlandGymEnv(rail_env=env,
                                       max_nr_active_agents=200,
                                       observation_space=None,
                                       priorizer=NrAgentsSameStart(),
                                       allow_noop=True)
        # if rl_prio:
        #     priorities = prio_agent.compute_actions(obs, explore=False)
        #     sorted_actions = {k: v for k, v in sorted(priorities.items(), key=lambda item: item[1], reverse=True)}
        #     sorted_handles = list(sorted_actions.keys())
        # else:
        sorted_handles = robust_env.priorizer.priorize(handles=list(obs.keys()),
                                                       rail_env=env)

        while not done['__all__']:
            actions = ShortestPathAgent().compute_actions(obs, env)
            robust_actions = robust_env.get_robust_actions(actions, sorted_handles)
            obs, all_rewards, done, info = env.step(robust_actions)
            if RENDER:
                env_renderer.render_env(show=True, frames=True, show_observations=False)
            print('.', end='', flush=True)
            steps += 1
            ep_return += np.sum(list(all_rewards.values()))

        pc = np.sum(np.array([1 for a in env.agents if is_done(a)])) / env.get_num_agents()
        print("EPISODE PC:", pc)
        pcs.append(pc)
        returns.append(ep_return / (env._max_episode_steps * env.get_num_agents()))
        malfs.append(np.sum([a.malfunction_data['nr_malfunctions'] for a in env.agents]))
    return pcs, returns, malfs
def epsilon_greedy_plan(env: RailEnv, obs_dict, budget_seconds=60, epsilon=0.1,
                        policy_agent=HeuristicPriorityAgent()):
    """Like explorative_plan, but perturbs the policy: with probability epsilon
    a promising agent gets a random feasible action instead of the policy's
    default."""
    start_t = time()
    initial_obs = obs_dict
    best_actions = []
    best_return = -np.inf
    best_pc = -np.inf
    all_returns = []
    all_pcs = []
    plan_step = 0
    budget_used = False

    while not budget_used:
        local_env = deepcopy(env)
        obs_dict = initial_obs  # each planning step restarts from the same state
        episode_return = 0
        action_memory = []
        dones = defaultdict(lambda: False)
        print(f'\nPlanning step {plan_step + 1}')

        while not dones['__all__'] and not budget_used:
            actions = defaultdict(
                lambda: None,
                policy_agent.compute_actions(obs_dict, env=local_env))
            # iterate the copied env's agents; positions in `env` go stale
            # once the local copy has been stepped
            for agent in local_env.agents:
                pos = get_agent_pos(agent)
                next_possible_moves = local_env.rail.get_transitions(*pos, agent.direction)
                departed = agent.status.value != RailAgentStatus.READY_TO_DEPART.value
                if np.random.random() < epsilon and promising(next_possible_moves, departed):
                    # candidates: all feasible transitions plus STOP/FORWARD,
                    # minus the action the policy already chose
                    possible_actions = set(np.flatnonzero(next_possible_moves))
                    possible_actions = possible_actions.union({
                        RailEnvActions.STOP_MOVING.value,
                        RailEnvActions.MOVE_FORWARD.value
                    })
                    non_default_actions = possible_actions.difference({actions[agent.handle]})
                    actions[agent.handle] = np.random.choice(list(non_default_actions))
            action_memory.append(actions)
            obs_dict, all_rewards, dones, info = local_env.step(actions)
            # sum rewards over agents (the dict's values, not its keys)
            episode_return += np.sum(list(all_rewards.values()))
            budget_used = (time() - start_t) > budget_seconds

        if not budget_used:
            all_returns.append(episode_return)
            pc = np.sum(np.array([1 for a in local_env.agents if is_done(a)
                                  ])) / local_env.get_num_agents()
            all_pcs.append(pc)
            if pc > best_pc:
                best_return = episode_return
                best_pc = pc
                best_actions = action_memory
                if pc == 1.0:
                    # all agents arrived: no better plan exists, stop early
                    print(f'MAX PC: {best_pc}, MIN PC: {np.min(all_pcs)}, '
                          f'MAX RETURN: {best_return}\n')
                    return best_actions
        plan_step += 1

    if len(all_pcs) > 0:
        print(f'MAX PC: {best_pc}, MIN PC: {np.min(all_pcs)}, '
              f'MAX RETURN: {best_return}\n')
    else:
        print('Budget reached before any planning step could finish!')
    return best_actions if len(best_actions) > 0 else None
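# `promising` is used above but defined elsewhere in the repo. This is only a
# plausible sketch of its contract, inferred from the call site: mutate an
# agent only if it has departed and its current cell actually offers a choice.
# The real implementation may differ.
def promising_sketch(next_possible_moves, departed):
    # more than one outgoing transition means the random action can matter
    return departed and int(np.count_nonzero(next_possible_moves)) > 1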
def simulate(self):
    """Roll out this candidate: replay its genotype (recorded per-step action
    dicts), then continue with the epsilon-perturbed default behaviour until
    the episode ends or the budget is used, extending the genotype as we go.
    Fitness is the final percentage of agents that reached their target."""
    episode_return = 0
    dones = defaultdict(lambda: False)
    local_env = deepcopy(self.env)
    obs_dict = self.initial_obs

    # replay the actions recorded so far
    for actions in self.genotype:
        if not self.budget_function():
            obs_dict, all_rewards, dones, info = local_env.step(actions)
            episode_return += np.sum(list(all_rewards.values()))

    # continue the episode with the (possibly mutated) default behaviour
    while not dones['__all__'] and not self.budget_function():
        actions: Dict[int, Optional[int]] = defaultdict(
            lambda: None,
            self.default_behaviour.compute_actions(obs_dict, env=local_env))
        transitions = defaultdict(lambda: None)
        agents_departed = defaultdict(lambda: True)
        for agent in local_env.agents:
            pos = get_agent_pos(agent)
            next_possible_moves = local_env.rail.get_transitions(*pos, agent.direction)
            if np.random.random() < self.epsilon:
                # candidates: all feasible transitions plus STOP/FORWARD,
                # minus the default behaviour's action
                possible_actions = set(np.flatnonzero(next_possible_moves))
                possible_actions = possible_actions.union({
                    RailEnvActions.STOP_MOVING.value,
                    RailEnvActions.MOVE_FORWARD.value
                })
                non_default_actions = possible_actions.difference({actions[agent.handle]})
                actions[agent.handle] = np.random.choice(list(non_default_actions))
            transitions[agent.handle] = next_possible_moves
            agents_departed[agent.handle] = (
                agent.status.value != RailAgentStatus.READY_TO_DEPART.value)
        # record the step so later mutations know what was possible and legal
        self.genotype.append(actions)
        self.possible_mutations.append(transitions)
        self.departed.append(agents_departed)
        obs_dict, all_rewards, dones, info = local_env.step(actions)
        episode_return += np.sum(list(all_rewards.values()))

    if not self.budget_function():
        percentage_complete = np.sum(np.array(
            [1 for a in local_env.agents if is_done(a)])) / local_env.get_num_agents()
        self.fitness = percentage_complete
        print(f"Simulation of candidate finished with fitness (PC): {percentage_complete}")
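# Hedged sketch of the evolutionary loop this simulate() method implies:
# evaluate a population of candidates and keep the fittest. The candidate
# interface (env, initial_obs, genotype, epsilon, default_behaviour,
# budget_function, fitness) is inferred from simulate() above; `evolve` itself
# is illustrative, not from the original repo.
def evolve(population, n_generations):
    best = None
    for _ in range(n_generations):
        for candidate in population:
            candidate.simulate()  # fills candidate.fitness (PC)
        population.sort(key=lambda c: c.fitness, reverse=True)
        if best is None or population[0].fitness > best.fitness:
            best = population[0]
        if best.fitness == 1.0:  # all agents done, stop early
            break
    return best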