def env(self):
    # Observation builder
    obs_builder_object = self.obs_builder_dict[self.obs_builder]
    env = RailEnv(
        width=self.width,  # width and height are the grid dimensions (number of cells)
        height=self.height,
        rail_generator=sparse_rail_generator(
            max_num_cities=self.max_num_cities,  # Number of cities in map (where train stations are)
            seed=19,  # Random seed
            grid_mode=True,
            max_rails_between_cities=2,
            max_rails_in_city=2,
        ),
        schedule_generator=sparse_schedule_generator(self.speed_ration_map),
        number_of_agents=self.number_of_agents,
        malfunction_generator_and_process_data=malfunction_from_params(
            self.stochastic_data),  # Malfunction data generator
        obs_builder_object=obs_builder_object,
        remove_agents_at_target=False,
        record_steps=True)
    return env
def create_rail_env(env_params, tree_observation):
    n_agents = env_params.n_agents
    x_dim = env_params.x_dim
    y_dim = env_params.y_dim
    n_cities = env_params.n_cities
    max_rails_between_cities = env_params.max_rails_between_cities
    max_rails_in_city = env_params.max_rails_in_city
    seed = env_params.seed

    # Break agents from time to time
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=env_params.malfunction_rate,
        min_duration=20,
        max_duration=50)

    return RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=n_cities,
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city),
        schedule_generator=sparse_schedule_generator(),
        number_of_agents=n_agents,
        malfunction_generator_and_process_data=malfunction_from_params(
            malfunction_parameters),
        obs_builder_object=tree_observation,
        random_seed=seed)
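# --- Illustrative usage sketch (not part of the original snippet) ---
# Shows how create_rail_env above might be called; the argparse-style Namespace and
# its field values are assumptions chosen to match the attributes the function reads.
from argparse import Namespace
from flatland.envs.observations import TreeObsForRailEnv

example_params = Namespace(n_agents=2, x_dim=30, y_dim=30, n_cities=2,
                           max_rails_between_cities=2, max_rails_in_city=3,
                           seed=42, malfunction_rate=1. / 10000)
example_env = create_rail_env(example_params, TreeObsForRailEnv(max_depth=2))
obs, info = example_env.reset(regenerate_rail=True, regenerate_schedule=True)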
def _create_single_agent_environment(seed, x_dim, y_dim, n_agents, n_cities, timed,
                                     max_rails_between_cities, max_rails_in_city,
                                     observation_builder):
    # Set the seeds
    random.seed(seed)
    np.random.seed(seed)

    # Setup the environment
    env = RailEnv(width=x_dim,
                  height=y_dim,
                  rail_generator=sparse_rail_generator(
                      max_num_cities=n_cities,
                      seed=seed,
                      grid_mode=False,
                      max_rails_between_cities=max_rails_between_cities,
                      max_rails_in_city=max_rails_in_city),
                  schedule_generator=sparse_schedule_generator(timed=timed),
                  number_of_agents=n_agents,
                  obs_builder_object=observation_builder,
                  random_seed=seed)

    # Compute the maximum number of steps allowed
    max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities)))

    # Return the produced environment
    return env, max_steps, x_dim, y_dim
def train_validate_env_generator_params(train_set, n_agents, x_dim, y_dim, observation,
                                        stochastic_data, speed_ration_map, seed=1):
    if train_set:
        random_seed = np.random.randint(1000)
    else:
        random_seed = np.random.randint(1000, 2000)
    random.seed(random_seed)
    np.random.seed(random_seed)

    env = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=3,  # Number of cities in map (where train stations are)
            seed=seed,  # Random seed
            grid_mode=False,
            max_rails_between_cities=2,
            max_rails_in_city=3),
        schedule_generator=sparse_schedule_generator(speed_ration_map),
        number_of_agents=n_agents,
        malfunction_generator_and_process_data=malfunction_from_params(
            stochastic_data),  # Malfunction data generator
        obs_builder_object=observation)
    return env, random_seed
def get_rail_generator(self):
    rail_generator = sparse_rail_generator(
        seed=self._config['seed'],
        max_num_cities=self._config['max_num_cities'],
        grid_mode=self._config['grid_mode'],
        max_rails_between_cities=self._config['max_rails_between_cities'],
        max_rails_in_city=self._config['max_rails_in_city'])
    return rail_generator
def test_static_env(self):
    rail_generator = sparse_rail_generator(seed=42,
                                           max_num_cities=3,
                                           grid_mode=True,
                                           max_rails_between_cities=4,
                                           max_rails_in_city=4)
    rail = rail_generator(25, 25, 5, 0, RandomState(None))[0]
    rail_from_grid_transition_map(rail)
def create_rail_env(args, load_env=""):
    '''
    Build a RailEnv object with the specified parameters, as described in the .yml file
    '''
    # Check if an environment file is provided
    if load_env:
        rail_generator = rail_from_file(load_env)
    else:
        rail_generator = sparse_rail_generator(
            max_num_cities=args.env.max_cities,
            grid_mode=args.env.grid,
            max_rails_between_cities=args.env.max_rails_between_cities,
            max_rails_in_city=args.env.max_rails_in_cities,
            seed=args.env.seed)

    # Build predictor and observation builder
    obs_type = args.policy.type.get_true_key()
    if PREDICTORS[obs_type] is ShortestDeviationPathPredictor:
        predictor = PREDICTORS[obs_type](
            max_depth=args.observator.max_depth,
            max_deviations=args.predictor.max_depth)
    else:
        predictor = PREDICTORS[obs_type](max_depth=args.predictor.max_depth)
    observator = OBSERVATORS[obs_type](args.observator.max_depth, predictor)

    # Initialize malfunctions
    malfunctions = None
    if args.env.malfunctions.enabled:
        malfunctions = ParamMalfunctionGen(
            MalfunctionParameters(
                malfunction_rate=args.env.malfunctions.rate,
                min_duration=args.env.malfunctions.min_duration,
                max_duration=args.env.malfunctions.max_duration))

    # Initialize agents speeds
    speed_map = None
    if args.env.variable_speed:
        speed_map = {1.: 0.25, 1. / 2.: 0.25, 1. / 3.: 0.25, 1. / 4.: 0.25}
    schedule_generator = sparse_schedule_generator(speed_map, seed=args.env.seed)

    # Build the environment
    return RailEnvWrapper(params=args,
                          width=args.env.width,
                          height=args.env.height,
                          rail_generator=rail_generator,
                          schedule_generator=schedule_generator,
                          number_of_agents=args.env.num_trains,
                          obs_builder_object=observator,
                          malfunction_generator=malfunctions,
                          remove_agents_at_target=True,
                          random_seed=args.env.seed)
def _launch(self):
    rail_generator = sparse_rail_generator(
        seed=self._config['seed'],
        max_num_cities=self._config['max_num_cities'],
        grid_mode=self._config['grid_mode'],
        max_rails_between_cities=self._config['max_rails_between_cities'],
        max_rails_in_city=self._config['max_rails_in_city'])

    malfunction_generator = no_malfunction_generator()
    if {'malfunction_rate', 'malfunction_min_duration',
            'malfunction_max_duration'} <= self._config.keys():
        stochastic_data = {
            'malfunction_rate': self._config['malfunction_rate'],
            'min_duration': self._config['malfunction_min_duration'],
            'max_duration': self._config['malfunction_max_duration']
        }
        malfunction_generator = malfunction_from_params(stochastic_data)

    speed_ratio_map = None
    if 'speed_ratio_map' in self._config:
        speed_ratio_map = {
            float(k): float(v)
            for k, v in self._config['speed_ratio_map'].items()
        }
    schedule_generator = sparse_schedule_generator(speed_ratio_map)

    env = None
    try:
        env = RailEnv(
            width=self._config['width'],
            height=self._config['height'],
            rail_generator=rail_generator,
            schedule_generator=schedule_generator,
            number_of_agents=self._config['number_of_agents'],
            malfunction_generator_and_process_data=malfunction_generator,
            obs_builder_object=self._observation.builder(),
            remove_agents_at_target=False,
            random_seed=self._config['seed'],
            # Note: the line below could be commented out, since this method tries
            # different configs; keeping the renderer open can be wasteful, and it
            # has to be closed explicitly.
            use_renderer=self._env_config.get('render'))
        env.reset()
    except ValueError as e:
        logging.error("=" * 50)
        logging.error(f"Error while creating env: {e}")
        logging.error("=" * 50)
    return env
def _launch(self):
    print("NEW ENV LAUNCHED")
    n_agents, n_cities, dim = get_round_2_env()
    rail_generator = sparse_rail_generator(
        seed=self._config['seed'],
        max_num_cities=n_cities,
        grid_mode=self._config['grid_mode'],
        max_rails_between_cities=self._config['max_rails_between_cities'],
        max_rails_in_city=self._config['max_rails_in_city'])

    malfunction_generator = NoMalfunctionGen()
    if {'malfunction_rate', 'malfunction_min_duration',
            'malfunction_max_duration'} <= self._config.keys():
        stochastic_data = {
            'malfunction_rate': self._config['malfunction_rate'],
            'min_duration': self._config['malfunction_min_duration'],
            'max_duration': self._config['malfunction_max_duration']
        }
        malfunction_generator = ParamMalfunctionGen(stochastic_data)

    speed_ratio_map = None
    if 'speed_ratio_map' in self._config:
        speed_ratio_map = {
            float(k): float(v)
            for k, v in self._config['speed_ratio_map'].items()
        }
    schedule_generator = sparse_schedule_generator(speed_ratio_map)

    env = None
    try:
        env = RailEnv(width=dim,
                      height=dim,
                      rail_generator=rail_generator,
                      schedule_generator=schedule_generator,
                      number_of_agents=n_agents,
                      malfunction_generator=malfunction_generator,
                      obs_builder_object=self._observation.builder(),
                      remove_agents_at_target=False,
                      random_seed=self._config['seed'],
                      use_renderer=self._env_config.get('render'))
        env.reset()
    except ValueError as e:
        logging.error("=" * 50)
        logging.error(f"Error while creating env: {e}")
        logging.error("=" * 50)
    return env
def create_random_railways(project_root):
    speed_ration_map = {
        1 / 1: 1.0,   # Fast passenger train
        1 / 2.: 0.0,  # Fast freight train
        1 / 3.: 0.0,  # Slow commuter train
        1 / 4.: 0.0   # Slow freight train
    }
    rail_generator = sparse_rail_generator(grid_mode=False,
                                           max_num_cities=3,
                                           max_rails_between_cities=2,
                                           max_rails_in_city=3)
    schedule_generator = sparse_schedule_generator(speed_ration_map)
    return rail_generator, schedule_generator
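# --- Illustrative usage sketch (not part of the original snippet) ---
# The generator pair returned by create_random_railways can be plugged directly into
# a RailEnv; the width/height/agent values here are assumptions for demonstration.
from flatland.envs.rail_env import RailEnv

rail_gen, sched_gen = create_random_railways(project_root=".")
demo_env = RailEnv(width=40, height=40,
                   rail_generator=rail_gen,
                   schedule_generator=sched_gen,
                   number_of_agents=2)
demo_env.reset()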
def _launch(self, env_params, observation):
    return RailEnv(
        width=env_params.x_dim,
        height=env_params.y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=env_params.n_cities,
            grid_mode=False,
            max_rails_between_cities=env_params.max_rails_between_cities,
            max_rails_in_city=env_params.max_rails_in_city,
            seed=env_params.seed
        ),
        schedule_generator=sparse_schedule_generator(env_params.speed_profiles),
        number_of_agents=env_params.n_agents,
        malfunction_generator_and_process_data=malfunction_from_params(
            env_params.malfunction_parameters),
        obs_builder_object=observation,
        random_seed=env_params.seed
    )
def random_sparse_env_small(random_seed, max_width, max_height, observation_builder):
    random.seed(random_seed)
    size = random.randint(0, 5)
    width = 20 + size * 5
    height = 20 + size * 5
    nr_cities = 2 + size // 2 + random.randint(0, 2)
    nr_trains = min(nr_cities * 5, 5 + random.randint(0, 5))  # , 10 + random.randint(0, 10))
    max_rails_between_cities = 2
    max_rails_in_cities = 3 + random.randint(0, size)
    malfunction_rate = 30 + random.randint(0, 100)
    malfunction_min_duration = 3 + random.randint(0, 7)
    malfunction_max_duration = 20 + random.randint(0, 80)

    rail_generator = sparse_rail_generator(max_num_cities=nr_cities,
                                           seed=random_seed,
                                           grid_mode=False,
                                           max_rails_between_cities=max_rails_between_cities,
                                           max_rails_in_city=max_rails_in_cities)
    # new version:
    # stochastic_data = MalfunctionParameters(malfunction_rate, malfunction_min_duration, malfunction_max_duration)
    stochastic_data = {'malfunction_rate': malfunction_rate,
                       'min_duration': malfunction_min_duration,
                       'max_duration': malfunction_max_duration}

    schedule_generator = sparse_schedule_generator(
        {1.: 0.25, 1. / 2.: 0.25, 1. / 3.: 0.25, 1. / 4.: 0.25})

    while width <= max_width and height <= max_height:
        try:
            env = RailEnv(width=width,
                          height=height,
                          rail_generator=rail_generator,
                          schedule_generator=schedule_generator,
                          number_of_agents=nr_trains,
                          malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
                          obs_builder_object=observation_builder,
                          remove_agents_at_target=False)
            print("[{}] {}x{} {} cities {} trains, max {} rails between cities, "
                  "max {} rails in cities. Malfunction rate {}, {} to {} steps.".format(
                      random_seed, width, height, nr_cities, nr_trains,
                      max_rails_between_cities, max_rails_in_cities,
                      malfunction_rate, malfunction_min_duration, malfunction_max_duration))
            return env
        except ValueError as e:
            logging.error(f"Error: {e}")
            width += 5
            height += 5
            logging.info(f"Try again with larger env: (w,h): ({width},{height})")
    logging.error(f"Unable to generate env with seed={random_seed}, max_width={max_width}, max_height={max_height}")
    return None
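# --- Sketch of the "new version" hinted at in the comment above ---
# An assumption about newer flatland-rl releases, where malfunction settings are passed
# as a MalfunctionParameters namedtuple instead of a dict; the values are hypothetical
# and only mirror the ranges drawn in random_sparse_env_small.
from flatland.envs.malfunction_generators import MalfunctionParameters, malfunction_from_params

stochastic_data_new = MalfunctionParameters(malfunction_rate=1. / 100,  # hypothetical rate
                                            min_duration=3,
                                            max_duration=20)
malfunction_generator_new = malfunction_from_params(stochastic_data_new)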
def test_schedule_from_file_sparse():
    """
    Test to see that all parameters are loaded as expected
    Returns
    -------
    """
    # Different agent types (trains) with different speeds.
    speed_ration_map = {
        1.: 0.25,       # Fast passenger train
        1. / 2.: 0.25,  # Fast freight train
        1. / 3.: 0.25,  # Slow commuter train
        1. / 4.: 0.25   # Slow freight train
    }

    # Generate sparse test env
    rail_generator = sparse_rail_generator(
        max_num_cities=5,
        seed=1,
        grid_mode=False,
        max_rails_between_cities=3,
        max_rails_in_city=6,
    )
    schedule_generator = sparse_schedule_generator(speed_ration_map)
    create_and_save_env(file_name="./sparse_env_test.pkl",
                        rail_generator=rail_generator,
                        schedule_generator=schedule_generator)

    # Sparse generator
    rail_generator = rail_from_file("./sparse_env_test.pkl")
    schedule_generator = schedule_from_file("./sparse_env_test.pkl")
    sparse_env_from_file = RailEnv(width=1,
                                   height=1,
                                   rail_generator=rail_generator,
                                   schedule_generator=schedule_generator)
    sparse_env_from_file.reset(True, True)

    # Assert loaded agent number is correct
    assert sparse_env_from_file.get_num_agents() == 10

    # Assert max steps is correct
    assert sparse_env_from_file._max_episode_steps == 500
def _thunk():
    env_Orig = RailEnvWrapper(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=3,  # Number of cities in map (where train stations are)
            seed=1,  # Random seed
            grid_mode=False,
            max_rails_between_cities=2,
            max_rails_in_city=3),
        schedule_generator=sparse_schedule_generator(speed_ration_map),
        number_of_agents=n_agents,
        stochastic_data=stochastic_data,  # Malfunction data generator
        obs_builder_object=TreeObservation)

    env = copy.deepcopy(env_Orig)

    # After training we want to render the results so we also load a renderer
    # env_renderer = RenderTool(env, gl="PILSVG", )
    return env
def get_env(config=None, rl=False):
    n_agents = 16
    schedule_generator = sparse_schedule_generator(None)
    rail_generator = sparse_rail_generator(
        seed=seed,
        max_num_cities=3,
        grid_mode=False,
        max_rails_between_cities=2,
        max_rails_in_city=4,
    )
    if rl:
        obs_builder = make_obs("combined", {
            "path": None,
            "simple_meta": None
        }).builder()
    else:
        obs_builder = DummyObs()

    params = MalfunctionParameters(malfunction_rate=1 / 1000,
                                   max_duration=50,
                                   min_duration=20)
    malfunction_generator = ParamMalfunctionGen(params)

    env = RailEnv(
        width=28,
        height=28,
        rail_generator=rail_generator,
        schedule_generator=schedule_generator,
        number_of_agents=n_agents,
        malfunction_generator=malfunction_generator,
        obs_builder_object=obs_builder,
        remove_agents_at_target=True,
        random_seed=seed,
    )
    return env
def demo_lpg_planing():
    from flatland.envs.rail_generators import sparse_rail_generator
    from flatland.envs.schedule_generators import sparse_schedule_generator
    from flatland.envs.observations import TreeObsForRailEnv

    n_agents = 1
    x_dim = 25
    y_dim = 25
    n_cities = 4
    max_rails_between_cities = 2
    max_rails_in_city = 3
    seed = 42

    # Observation parameters
    observation_tree_depth = 2

    domain_file = "./pddl/flatland.pddl"
    problem_dir = "./pddl/flatland"
    num_problems = 6

    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth)
    env = PDDLFlatlandEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=n_cities,
            seed=seed,
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city),
        schedule_generator=sparse_schedule_generator(),
        number_of_agents=n_agents,
        obs_builder_object=tree_observation,
        domain_file=domain_file,
        problem_dir=problem_dir)

    for problem_index in range(num_problems):
        env.fix_problem_index(problem_index)
        run_planning_flatland_demo(env, 'lpg')
def get_env(config=None, rl=False):
    n_agents = 32
    schedule_generator = sparse_schedule_generator(None)
    rail_generator = sparse_rail_generator(
        seed=seed,
        max_num_cities=4,
        grid_mode=False,
        max_rails_between_cities=2,
        max_rails_in_city=4,
    )
    if rl:
        obs_builder = make_obs(
            config["env_config"]['observation'],
            config["env_config"].get('observation_config')).builder()
    else:
        obs_builder = DummyObs()

    params = MalfunctionParameters(malfunction_rate=1 / 1000,
                                   max_duration=50,
                                   min_duration=20)
    malfunction_generator = ParamMalfunctionGen(params)

    env = RailEnv(
        width=32,
        height=32,
        rail_generator=rail_generator,
        schedule_generator=schedule_generator,
        number_of_agents=n_agents,
        malfunction_generator=malfunction_generator,
        obs_builder_object=obs_builder,
        remove_agents_at_target=True,
        random_seed=seed,
    )
    return env
def __init__(self, n_cars=3, n_acts=5, min_obs=-1, max_obs=1, n_nodes=2,
             ob_radius=10, x_dim=36, y_dim=36, feats='all'):
    self.tree_obs = tree_observation.TreeObservation(n_nodes)
    self.n_cars = n_cars
    self.n_nodes = n_nodes
    self.ob_radius = ob_radius
    self.feats = feats

    rail_gen = sparse_rail_generator(max_num_cities=3,
                                     seed=666,
                                     grid_mode=False,
                                     max_rails_between_cities=2,
                                     max_rails_in_city=3)

    self._rail_env = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=rail_gen,
        schedule_generator=sparse_schedule_generator(speed_ration_map),
        number_of_agents=n_cars,
        malfunction_generator_and_process_data=malfunction_from_params(
            stochastic_data),
        obs_builder_object=self.tree_obs)

    self.renderer = RenderTool(self._rail_env, gl="PILSVG")
    self.action_dict = dict()
    self.info = dict()
    self.old_obs = dict()
def update_env_with_params(self, width, height, num_agents, max_steps, rail_type,
                           rail_gen_params, seed=-1):
    if seed == -1:
        seed = random.randint(0, 100000)

    self.num_agents = num_agents
    self.max_steps = max_steps

    if rail_type == 'complex':
        self.rail_gen = complex_rail_generator(
            nr_start_goal=rail_gen_params['nr_start_goal'],
            nr_extra=rail_gen_params['nr_extra'],
            min_dist=rail_gen_params['min_dist'],
            max_dist=rail_gen_params['max_dist'],
            seed=seed)
        # self.schedule_gen = complex_schedule_generator()
    elif rail_type == 'sparse':
        self.rail_gen = sparse_rail_generator(
            max_num_cities=rail_gen_params['num_cities'],
            seed=seed,
            grid_mode=rail_gen_params['grid_mode'],
            max_rails_between_cities=rail_gen_params['max_rails_between_cities'],
            max_rails_in_city=rail_gen_params['max_rails_in_city'])
    else:
        raise ValueError('Please specify either "complex" or "sparse" as rail_type')

    self.generate_env(width, height)
def create_save_env(path, width, height, num_trains, max_cities,
                    max_rails_between_cities, max_rails_in_cities,
                    grid=False, seed=0):
    '''
    Create a RailEnv environment with the given settings and save it as pickle
    '''
    rail_generator = sparse_rail_generator(
        max_num_cities=max_cities,
        seed=seed,
        grid_mode=grid,
        max_rails_between_cities=max_rails_between_cities,
        max_rails_in_city=max_rails_in_cities,
    )
    env = RailEnv(width=width,
                  height=height,
                  rail_generator=rail_generator,
                  number_of_agents=num_trains)
    env.save(path)
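# --- Illustrative round trip (not part of the original snippet) ---
# A saved environment can be reloaded with rail_from_file / schedule_from_file, mirroring
# the pattern in test_schedule_from_file_sparse above; the path here is hypothetical.
from flatland.envs.rail_env import RailEnv
from flatland.envs.rail_generators import rail_from_file
from flatland.envs.schedule_generators import schedule_from_file

create_save_env("./saved_env.pkl", width=40, height=40, num_trains=5,
                max_cities=3, max_rails_between_cities=2, max_rails_in_cities=3)
reloaded_env = RailEnv(width=1, height=1,
                       rail_generator=rail_from_file("./saved_env.pkl"),
                       schedule_generator=schedule_from_file("./saved_env.pkl"))
reloaded_env.reset(True, True)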
def create_multi_agent_rail_env(seed, timed):
    n_agents = 4

    # Environment parameters
    x_dim = 25
    y_dim = 25
    n_cities = 4
    max_rails_between_cities = 2
    max_rails_in_city = 3

    # Default observation parameters
    observation_tree_depth = 2
    observation_max_path_depth = 30

    # Default (tree) observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = TreeObsForRailEnvExtended(max_depth=observation_tree_depth,
                                                 predictor=predictor)

    random.seed(seed)
    np.random.seed(seed)

    env = RailEnv(width=x_dim,
                  height=y_dim,
                  rail_generator=sparse_rail_generator(
                      max_num_cities=n_cities,
                      seed=seed,
                      grid_mode=False,
                      max_rails_between_cities=max_rails_between_cities,
                      max_rails_in_city=max_rails_in_city),
                  schedule_generator=sparse_schedule_generator(timed=timed),
                  number_of_agents=n_agents,
                  malfunction_generator_and_process_data=None,
                  obs_builder_object=tree_observation,
                  random_seed=seed)

    max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities)))

    return env, max_steps, x_dim, y_dim, observation_tree_depth, observation_max_path_depth
# Custom observation builder
TreeObservation = TreeObsForRailEnv(max_depth=2)

# Different agent types (trains) with different speeds.
speed_ration_map = {1.: 1.,        # Fast passenger train
                    1. / 2.: 0.0,  # Fast freight train
                    1. / 3.: 0.0,  # Slow commuter train
                    1. / 4.: 0.0}  # Slow freight train

env = RailEnv(width=x_dim,
              height=y_dim,
              rail_generator=sparse_rail_generator(
                  max_num_cities=3,  # Number of cities in map (where train stations are)
                  seed=1,  # Random seed
                  grid_mode=False,
                  max_rails_between_cities=2,
                  max_rails_in_city=4),
              schedule_generator=sparse_schedule_generator(speed_ration_map),
              number_of_agents=n_agents,
              malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
              obs_builder_object=TreeObservation)
env.reset()

env_renderer = RenderTool(env, gl="PILSVG", )

num_features_per_node = env.obs_builder.observation_dim
tree_depth = 2
nr_nodes = 0
for i in range(tree_depth + 1):
    nr_nodes += np.power(4, i)
tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth,
                                     predictor=predictor)

# Fraction of trains with each speed
speed_profiles = {
    1.: 1.0,       # Fast passenger train
    1. / 2.: 0.0,  # Fast freight train
    1. / 3.: 0.0,  # Slow commuter train
    1. / 4.: 0.0   # Slow freight train
}

# Setup the environment
env = RailEnv(width=x_dim,
              height=y_dim,
              rail_generator=sparse_rail_generator(
                  max_num_cities=n_cities,
                  grid_mode=False,
                  max_rails_between_cities=max_rails_between_cities,
                  max_rails_in_city=max_rails_in_city),
              schedule_generator=sparse_schedule_generator(speed_profiles),
              number_of_agents=n_agents,
              malfunction_generator_and_process_data=malfunction_from_params(
                  malfunction_parameters),
              obs_builder_object=tree_observation,
              random_seed=seed)
env.reset(regenerate_schedule=True, regenerate_rail=True)

# Setup renderer
env_renderer = RenderTool(env)
######### Set parameters for rail map generation #########
width = 16 * 4    # 16 * 7  # Width of map
height = 9 * 4    # 9 * 7   # Height of map
nr_trains = 10    # nr_trains = 50  # Number of trains that have an assigned task in the env
cities_in_map = 6  # 20  # Number of cities where agents can start or end
seed = 14  # Random seed
grid_distribution_of_cities = False  # Type of city distribution; if False, cities are randomly placed
max_rails_between_cities = 2  # Max number of tracks allowed between cities (number of entry points to a city)
max_rail_in_cities = 6  # Max number of parallel tracks within a city, representing a realistic train station

######### Initialize railway #########
rail_generator = sparse_rail_generator(
    max_num_cities=cities_in_map,
    seed=seed,
    grid_mode=grid_distribution_of_cities,
    max_rails_between_cities=max_rails_between_cities,
    max_rails_in_city=max_rail_in_cities,
)

# Different agent types (trains) with different speeds.
speed_ration_map = {
    1.: 0.25,       # Fast passenger train
    1. / 2.: 0.25,  # Fast freight train
    1. / 3.: 0.25,  # Slow commuter train
    1. / 4.: 0.25   # Slow freight train
}
schedule_generator = sparse_schedule_generator(speed_ration_map)

stochastic_data = {
    "malfunction_rate": 10000,  # Rate of malfunction occurrence
    "min_duration": 15,         # Minimal duration of malfunction
    "max_duration": 50          # Max duration of malfunction
}
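# --- Assembly sketch (not part of the original excerpt) ---
# Shows how the parameters above are typically combined into a RailEnv, following the
# same pattern as the other snippets in this section; treat it as a sketch, not as the
# original author's exact continuation.
from flatland.envs.rail_env import RailEnv
from flatland.envs.malfunction_generators import malfunction_from_params

demo_env = RailEnv(width=width, height=height,
                   rail_generator=rail_generator,
                   schedule_generator=schedule_generator,
                   number_of_agents=nr_trains,
                   malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
                   remove_agents_at_target=False)
demo_env.reset()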
def main(args, dir):
    '''
    :param args:
    :return:
    Episodes to debug (set breakpoint in episodes loop to debug):
    - ep = 3, agent 1 spawns in front of 3, blocking its path; 0 and 2 are in a deadlock since they have same priority
    - ep = 4, agents stop because of wrong priorities even though the conflict zone wasn't entered,
    - ep = 14,
    '''
    rail_generator = sparse_rail_generator(
        max_num_cities=args.max_num_cities,
        seed=args.seed,
        grid_mode=args.grid_mode,
        max_rails_between_cities=args.max_rails_between_cities,
        max_rails_in_city=args.max_rails_in_city,
    )

    # Maps speeds to % of appearance in the env
    speed_ration_map = {
        1.: 0.25,       # Fast passenger train
        1. / 2.: 0.25,  # Fast freight train
        1. / 3.: 0.25,  # Slow commuter train
        1. / 4.: 0.25   # Slow freight train
    }

    observation_builder = GraphObsForRailEnv(
        predictor=ShortestPathPredictorForRailEnv(max_depth=args.prediction_depth),
        bfs_depth=4)

    env = RailEnv(
        width=args.width,
        height=args.height,
        rail_generator=rail_generator,
        schedule_generator=sparse_schedule_generator(speed_ration_map),
        number_of_agents=args.num_agents,
        obs_builder_object=observation_builder,
        malfunction_generator_and_process_data=malfunction_from_params(
            parameters={
                'malfunction_rate': args.malfunction_rate,  # Rate of malfunction occurrence
                'min_duration': args.min_duration,  # Minimal duration of malfunction
                'max_duration': args.max_duration  # Max duration of malfunction
            }))

    if args.render:
        env_renderer = RenderTool(env,
                                  agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX,
                                  show_debug=True)

    sm = stateMachine()
    tb = TestBattery(env, observation_builder)

    state_machine_action_dict = {}
    railenv_action_dict = {}
    # max_time_steps = env.compute_max_episode_steps(args.width, args.height)
    max_time_steps = 200
    T_rewards = []  # List of episodes rewards
    T_Qs = []  # List of q values
    T_num_done_agents = []  # List of number of done agents for each episode
    T_all_done = []  # If all agents completed in each episode
    T_episodes = []  # Time taken for each episode

    if args.save_image and not os.path.isdir("image_dump"):
        os.makedirs("image_dump")

    step_taken = 0
    total_step_taken = 0
    total_episodes = 0
    step_times = []  # Time taken for each step

    for ep in range(args.num_episodes):
        # Reset info at the beginning of an episode
        start_time = time.time()  # Take time of one episode

        if args.generate_baseline:
            if not os.path.isdir("image_dump/" + str(dir)) and args.save_image:
                os.makedirs("image_dump/" + str(dir))
        else:
            if not os.path.isdir("image_dump/" + str(ep)) and args.save_image:
                os.makedirs("image_dump/" + str(ep))

        state, info = env.reset()
        tb.reset()

        if args.render:
            env_renderer.reset()
        reward_sum, all_done = 0, False  # reward_sum contains the cumulative reward obtained as sum during the steps
        num_done_agents = 0

        state_machine_action = {}
        for i in range(env.number_of_agents):
            state_machine_action[i] = 0

        for step in range(max_time_steps):
            start_step_time = time.time()
            # if step % 10 == 0:
            #     print(step)

            # Test battery - see test_battery.py
            triggers = tb.tests(state, args.prediction_depth, state_machine_action)
            # State machine based on triggers of the test battery - see state_machine.py
            state_machine_action = sm.act(triggers)  # State machine picks action

            for a in range(env.get_num_agents()):
                # if info['action_required'][a]:
                #     railenv_action = observation_builder.choose_railenv_action(a, state_machine_action[a])
                #     state_machine_action_dict.update({a: state_machine_action})
                #     railenv_action_dict.update({a: railenv_action})
                railenv_action = observation_builder.choose_railenv_action(a, state_machine_action[a])
                state_machine_action_dict.update({a: state_machine_action})
                railenv_action_dict.update({a: railenv_action})

            state, reward, done, info = env.step(railenv_action_dict)  # Env step

            if args.generate_baseline:
                # env_renderer.render_env(show=True, show_observations=False, show_predictions=True)
                env_renderer.render_env(show=False, show_observations=False, show_predictions=True)
            else:
                env_renderer.render_env(show=True, show_observations=False, show_predictions=True)

            if args.generate_baseline:
                if args.save_image:
                    env_renderer.save_image("image_dump/" + str(dir) + "/image_" + str(step) + "_.png")
            else:
                if args.save_image:
                    env_renderer.save_image("image_dump/" + str(ep) + "/image_" + str(step) + "_.png")

            if args.debug:
                for a in range(env.get_num_agents()):
                    log('\n\n#########################################')
                    log('\nInfo for agent {}'.format(a))
                    # log('\npath : {}'.format(state[a]["path"]))
                    log('\noverlap : {}'.format(state[a]["overlap"]))
                    log('\ndirection : {}'.format(state[a]["direction"]))
                    log('\nOccupancy, first layer: {}'.format(state[a]["occupancy"]))
                    log('\nOccupancy, second layer: {}'.format(state[a]["conflict"]))
                    log('\nForks: {}'.format(state[a]["forks"]))
                    log('\nTarget: {}'.format(state[a]["target"]))
                    log('\nPriority: {}'.format(state[a]["priority"]))
                    log('\nMax priority encountered: {}'.format(state[a]["max_priority"]))
                    log('\nNum malfunctioning agents (globally): {}'.format(state[a]["n_malfunction"]))
                    log('\nNum agents ready to depart (globally): {}'.format(state[a]["ready_to_depart"]))
                    log('\nStatus: {}'.format(info['status'][a]))
                    log('\nPosition: {}'.format(env.agents[a].position))
                    log('\nTarget: {}'.format(env.agents[a].target))
                    log('\nMoving? {} at speed: {}'.format(env.agents[a].moving, info['speed'][a]))
                    log('\nAction required? {}'.format(info['action_required'][a]))
                    log('\nState machine action: {}'.format(state_machine_action_dict[a]))
                    log('\nRailenv action: {}'.format(railenv_action_dict[a]))
                    log('\nRewards: {}'.format(reward[a]))
                    log('\n\n#########################################')

            reward_sum += sum(reward[a] for a in range(env.get_num_agents()))
            step_taken = step
            time_taken_step = time.time() - start_step_time
            step_times.append(time_taken_step)

            if done['__all__']:
                all_done = True
                break

        total_step_taken += step_taken
        time_taken = time.time() - start_time  # Time taken for one episode
        total_episodes = ep

        # Time metrics - too precise
        avg_time_step = sum(step_times) / step_taken
        # print("Avg time step: " + str(avg_time_step))

        # No need to close the renderer since env parameter sizes stay the same
        T_rewards.append(reward_sum)

        # Compute num of agents that reached their target
        for a in range(env.get_num_agents()):
            if done[a]:
                num_done_agents += 1
        percentage_done_agents = num_done_agents / env.get_num_agents()
        log("\nDone agents in episode: {}".format(percentage_done_agents))
        T_num_done_agents.append(percentage_done_agents)  # In proportion to total
        T_all_done.append(all_done)

    # Average number of agents that reached their target
    avg_done_agents = sum(T_num_done_agents) / len(T_num_done_agents) if len(T_num_done_agents) > 0 else 0
    avg_reward = sum(T_rewards) / len(T_rewards) if len(T_rewards) > 0 else 0
    avg_norm_reward = avg_reward / (max_time_steps / env.get_num_agents())
    avg_ep_time = sum(T_episodes) / args.num_episodes

    if total_episodes == 0:
        total_episodes = 1

    log("\nSeed: " + str(args.seed)
        + "\t | Avg_done_agents: " + str(avg_done_agents)
        + "\t | Avg_reward: " + str(avg_reward)
        + "\t | Avg_norm_reward: " + str(avg_norm_reward)
        + "\t | Max_num_time_steps: " + str(max_time_steps)
        + "\t | Avg_num_time_steps: " + str(total_step_taken / total_episodes)
        + "\t | Avg episode time: " + str(avg_ep_time))
def test(args, T, ep, dqn, val_mem, metrics, results_dir, evaluate=False):
    # Init env and set in evaluation mode
    # Maps speeds to % of appearance in the env
    speed_ration_map = {
        1.: 0.25,       # Fast passenger train
        1. / 2.: 0.25,  # Fast freight train
        1. / 3.: 0.25,  # Slow commuter train
        1. / 4.: 0.25   # Slow freight train
    }

    schedule_generator = sparse_schedule_generator(speed_ration_map)

    observation_builder = GraphObsForRailEnv(
        predictor=ShortestPathPredictorForRailEnv(max_depth=args.prediction_depth))

    env = RailEnv(
        width=args.width,
        height=args.height,
        rail_generator=sparse_rail_generator(
            max_num_cities=args.max_num_cities,
            seed=ep,  # Use episode as seed when evaluation is performed during training
            grid_mode=args.grid_mode,
            max_rails_between_cities=args.max_rails_between_cities,
            max_rails_in_city=args.max_rails_in_city,
        ),
        schedule_generator=schedule_generator,
        number_of_agents=args.num_agents,
        obs_builder_object=observation_builder,
        malfunction_generator_and_process_data=malfunction_from_params(
            parameters={
                'malfunction_rate': args.malfunction_rate,
                'min_duration': args.min_duration,
                'max_duration': args.max_duration
            }),
    )

    if args.render:
        env_renderer = RenderTool(env,
                                  gl="PILSVG",
                                  agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX,
                                  show_debug=True,
                                  screen_height=1080,
                                  screen_width=1920)

    # max_time_steps = env.compute_max_episode_steps(env.width, env.height)
    max_time_steps = 200  # TODO Debug

    # metrics['steps'].append(T)
    metrics['episodes'].append(ep)
    T_rewards = []  # List of episodes rewards
    T_Qs = []  # List of q values
    T_num_done_agents = []  # List of number of done agents for each episode
    T_all_done = []  # If all agents completed in each episode
    network_action_dict = dict()
    railenv_action_dict = dict()
    qvalues = {}

    # Test performance over several episodes
    for ep in range(args.evaluation_episodes):
        # Reset info
        state, info = env.reset()
        reward_sum, all_done = 0, False  # reward_sum contains the cumulative reward obtained as sum during the steps
        num_done_agents = 0
        if args.render:
            env_renderer.reset()

        # Choose first action - decide entering of agents into the environment
        for a in range(env.get_num_agents()):
            action = np.random.choice((0, 2))
            railenv_action_dict.update({a: action})
        state, reward, done, info = env.step(railenv_action_dict)  # Env step
        reward_sum += sum(reward[a] for a in range(env.get_num_agents()))

        if args.render:
            env_renderer.render_env(show=True, show_observations=False, show_predictions=True)

        for step in range(max_time_steps - 1):
            # Choose actions
            for a in range(env.get_num_agents()):
                if info['action_required'][a]:
                    network_action = dqn.act(state[a])  # Choose an action greedily (with noisy weights)
                    # network_action = 0
                    railenv_action = observation_builder.choose_railenv_action(a, network_action)
                    qvalues.update({a: dqn.get_q_values(state[a])})
                else:
                    network_action = 0
                    railenv_action = 0
                    qvalues.update({a: [0, 0]})  # '0' if wasn't updated
                railenv_action_dict.update({a: railenv_action})
                network_action_dict.update({a: network_action})

            if args.debug:
                for a in range(env.get_num_agents()):
                    print('#########################################')
                    print('Info for agent {}'.format(a))
                    print('Occupancy, first layer: {}'.format(state[a][:args.prediction_depth]))
                    print('Occupancy, second layer: {}'.format(state[a][args.prediction_depth:args.prediction_depth * 2]))
                    print('Forks: {}'.format(state[a][args.prediction_depth * 2:args.prediction_depth * 3]))
                    print('Target: {}'.format(state[a][args.prediction_depth * 3:args.prediction_depth * 4]))
                    print('Priority: {}'.format(state[a][args.prediction_depth * 4]))
                    print('Max priority encountered: {}'.format(state[a][args.prediction_depth * 4 + 1]))
                    print('Num malfunctioning agents (globally): {}'.format(state[a][args.prediction_depth * 4 + 2]))
                    print('Num agents ready to depart (globally): {}'.format(state[a][args.prediction_depth * 4 + 3]))
                    print('Status: {}'.format(info['status'][a]))
                    print('Position: {}'.format(env.agents[a].position))
                    print('Moving? {} at speed: {}'.format(env.agents[a].moving, info['speed'][a]))
                    print('Action required? {}'.format(info['action_required'][a]))
                    print('Network action: {}'.format(network_action_dict[a]))
                    print('Railenv action: {}'.format(railenv_action_dict[a]))
                    print('Q values: {}'.format(qvalues[a]))
                    # print('QValues: {}'.format(qvalues))
                    print('Rewards: {}'.format(reward[a]))
                    # Breakpoint for debugging here

            state, reward, done, info = env.step(railenv_action_dict)  # Env step
            if args.render:
                env_renderer.render_env(show=True, show_observations=False, show_predictions=True)
            reward_sum += sum(reward[a] for a in range(env.get_num_agents()))

            if done['__all__']:
                all_done = True
                break

        # No need to close the renderer since env parameter sizes stay the same
        T_rewards.append(reward_sum)
        # Compute num of agents that reached their target
        for a in range(env.get_num_agents()):
            if done[a]:
                num_done_agents += 1
        T_num_done_agents.append(num_done_agents / env.get_num_agents())  # In proportion to total
        T_all_done.append(all_done)

    # Test Q-values over validation memory
    for state in val_mem:  # Iterate over valid states
        T_Qs.append(dqn.evaluate_q(state))
    if args.debug:
        print('T_Qs: {}'.format(T_Qs))  # These are Qs from a single agent TODO

    avg_done_agents = sum(T_num_done_agents) / len(T_num_done_agents)  # Average number of agents that reached their target
    avg_reward = sum(T_rewards) / len(T_rewards)
    avg_norm_reward = avg_reward / (max_time_steps / env.get_num_agents())
    # avg_reward, avg_Q = sum(T_rewards) / len(T_rewards), sum(T_Qs) / len(T_Qs)

    if not evaluate:
        # Save model parameters if improved
        if avg_done_agents > metrics['best_avg_done_agents']:
            metrics['best_avg_done_agents'] = avg_done_agents
            dqn.save(results_dir)

        # Append to results and save metrics
        metrics['rewards'].append(T_rewards)
        metrics['Qs'].append(T_Qs)
        torch.save(metrics, os.path.join(results_dir, 'metrics.pth'))

        # Plot HTML
        _plot_line(metrics['episodes'], metrics['rewards'], 'Reward', path=results_dir)  # Plot rewards in episodes
        _plot_line(metrics['episodes'], metrics['Qs'], 'Q', path=results_dir)

    # Return average number of done agents (in proportion) and average reward
    return avg_done_agents, avg_reward, avg_norm_reward
def train(env):
    n_agents = env["n_agents"]
    x_dim = env["x_dim"]
    y_dim = env["y_dim"]
    n_cities = env["n_cities"]
    max_rails_between_cities = env["max_rails_between_cities"]
    max_rails_in_city = env["max_rails_in_city"]
    seed = 0
    use_fast_tree_obs = False

    # Observation parameters
    observation_tree_depth = 4
    observation_radius = 10
    observation_max_path_depth = 30

    # Set the seeds
    random.seed(seed)
    np.random.seed(seed)

    # Break agents from time to time
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 10000,  # Rate of malfunctions
        min_duration=15,  # Minimal duration
        max_duration=50  # Max duration
    )

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = None
    if use_fast_tree_obs:
        tree_observation = FastTreeObs(max_depth=observation_tree_depth)
        print("Using FastTreeObs")
    else:
        tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth,
                                             predictor=predictor)
        print("Using StandardTreeObs")

    speed_profiles = {
        1.: 1.0,       # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0   # Slow freight train
    }

    env = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=n_cities,
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city),
        schedule_generator=sparse_schedule_generator(speed_profiles),
        number_of_agents=n_agents,
        malfunction_generator_and_process_data=malfunction_from_params(malfunction_parameters),
        obs_builder_object=tree_observation,
        random_seed=seed)

    rewards = []
    obs, info = env.reset()

    if use_fast_tree_obs:
        state_size = tree_observation.observation_dim
    else:
        # Calculate the state size given the depth of the tree observation and the
        # number of features
        n_features_per_node = env.obs_builder.observation_dim
        n_nodes = 0
        for i in range(observation_tree_depth + 1):
            n_nodes += np.power(4, i)
        state_size = n_features_per_node * n_nodes

    action_size = 5

    DEVICE = 'cpu'
    # if torch.cuda.is_available():
    #     DEVICE = 'gpu'

    buffer_length = 10000
    steps_to_save_model = 10
    step_size = 100
    num_steps = 100  # update every 100 steps
    avg_steps = 20  # num steps to average and plot rewards
    reward_q = []
    batch_size = 100

    agent_obs = np.array([None] * env.get_num_agents())

    max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities)))
    num_episodes = 100000

    agent_init_params = []
    sa_size = []
    for i in range(n_agents):
        agent_init_params.append({
            'num_in_pol': state_size,
            'num_out_pol': action_size,
            'init_weights': 'model.pt'
        })
        sa_size.append((state_size, action_size))

    hyperparams = {
        "tau": 0.01,
        "pi_lr": 0.00001,
        "q_lr": 0.00005,
        "pol_hidden_dim": 256,
        "critic_hidden_dim": 256,
        "attend_heads": 8
    }

    model = AttentionSAC(agent_init_params=agent_init_params,
                         sa_size=sa_size,
                         tau=hyperparams["tau"],
                         pi_lr=hyperparams["pi_lr"],
                         q_lr=hyperparams["q_lr"],
                         pol_hidden_dim=hyperparams["pol_hidden_dim"],
                         critic_hidden_dim=hyperparams["critic_hidden_dim"],
                         attend_heads=hyperparams["attend_heads"])
    model.init_dict = {}

    replay_buffer = ReplayBuffer(buffer_length, n_agents,
                                 [state_size for i in range(n_agents)],
                                 [action_size for i in range(n_agents)])

    print("MAX STEPS: " + str(max_steps))
    print("NUM EPISODES: ", num_episodes)
    print("HYPERPARAMS: ")
    print(hyperparams)

    start_time = time.time()

    for ep in range(num_episodes):
        print("Episode " + str(ep) + ":", flush=True)
        obs, info = env.reset(True, True)
        model.prep_rollouts(device=DEVICE)
        reward_sum_for_this_episode = 0

        for steps in range(max_steps):
            if steps % step_size == 0:
                print("=", end="", flush=True)

            for agent in env.get_agent_handles():
                if obs[agent] is not None:
                    if use_fast_tree_obs:
                        agent_obs[agent] = obs[agent]
                    else:
                        agent_obs[agent] = normalize_observation(
                            obs[agent],
                            observation_tree_depth,
                            observation_radius=observation_radius)
                else:
                    agent_obs[agent] = np.array([0.] * state_size)

            action_dict = {}
            agent_actions = []

            torch_obs = [
                Variable(torch.Tensor([agent_obs[i]]), requires_grad=False)
                for i in range(n_agents)
            ]
            torch_agent_actions = model.step(torch_obs, explore=True)
            agent_actions = [ac.data.numpy() for ac in torch_agent_actions]

            for i in range(n_agents):
                dist = torch_agent_actions[i][0]
                idx = -1
                for j in range(action_size):
                    if dist[j] != 0:
                        idx = j
                        break
                action_dict[i] = idx

            next_obs, all_rewards, done, info = env.step(action_dict)

            rewards = []
            dones = []
            next_agent_obs = np.array([None] * env.get_num_agents())

            for agent in env.get_agent_handles():
                if next_obs[agent] is not None:
                    if use_fast_tree_obs:
                        next_agent_obs[agent] = next_obs[agent]
                    else:
                        next_agent_obs[agent] = normalize_observation(
                            obs[agent],
                            observation_tree_depth,
                            observation_radius=observation_radius)
                else:
                    next_agent_obs[agent] = np.array([0.] * state_size)

            for i in range(n_agents):
                reward_sum_for_this_episode += all_rewards[i]
                rewards.append(all_rewards[i])
                all_rewards[i] += augment_reward(agent_obs[agent])
                dones.append(done[i])

            replay_buffer.push(np.array([agent_obs]), np.array(agent_actions),
                               np.array([rewards]), np.array([next_agent_obs]),
                               np.array([dones]))

            if steps % num_steps == 0:
                model.prep_training(device=DEVICE)
                sample = replay_buffer.sample(batch_size, norm_rews=False)
                # print(sample)
                model.update_critic(sample)
                model.update_policies(sample)
                model.update_all_targets()
                model.prep_rollouts(device=DEVICE)

        reward_sum_for_this_episode /= n_agents
        reward_q.append(reward_sum_for_this_episode)

        if len(reward_q) == avg_steps:
            wandb.log({'reward': np.mean(reward_q)})
            reward_q = []

        print()

        if ep % steps_to_save_model == 0:
            print("\nSaving model")
            model.save(os.getcwd() + "/model.pt")
            cur_time = time.time()
            time_elapsed = (cur_time - start_time) // 60
            print("Time Elapsed: " + str(time_elapsed) + "\n")
def main(argv):
    try:
        opts, args = getopt.getopt(argv, "n:", ["n_trials="])
    except getopt.GetoptError:
        print('test_navigation_single_agent.py -n <n_trials>')
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-n', '--n_trials'):
            n_trials = int(arg)

    random.seed(1)
    np.random.seed(1)

    ######## TEST SET SELECTION - PARAMETERS ########
    test_multi_agent_setup = 1        # 1 for Medium size test, 2 for Big size test
    test_n_agents = 5                 # Number of agents to test (3 - 5 - 7 for Medium, 5 - 7 - 10 for Big)
    test_malfunctions_enabled = True  # Malfunctions enabled?
    test_agents_one_speed = True      # Test agents with the same speed (1) or with 4 different speeds?
    #################################################

    # Medium size
    if test_multi_agent_setup == 1:
        x_dim = 16 * 3
        y_dim = 9 * 3
        max_num_cities = 5
        max_rails_between_cities = 2
        max_rails_in_city = 3

    # Big size
    if test_multi_agent_setup == 2:
        x_dim = 16 * 4
        y_dim = 9 * 4
        max_num_cities = 9
        max_rails_between_cities = 5
        max_rails_in_city = 5

    stochastic_data = {'malfunction_rate': 80,  # Rate of malfunction occurrence of single agent
                       'min_duration': 15,      # Minimal duration of malfunction
                       'max_duration': 50       # Max duration of malfunction
                       }

    # Custom observation builder
    tree_depth = 2
    TreeObservation = TreeObsForRailEnv(max_depth=tree_depth,
                                        predictor=ShortestPathPredictorForRailEnv(20))

    np.savetxt(fname=path.join('NetsTest', 'info.txt'),
               X=[x_dim, y_dim, test_n_agents, max_num_cities,
                  max_rails_between_cities, max_rails_in_city, tree_depth],
               delimiter=';')

    # Different agent types (trains) with different speeds.
    if test_agents_one_speed:
        speed_ration_map = {1.: 1.,        # Fast passenger train
                            1. / 2.: 0.0,  # Fast freight train
                            1. / 3.: 0.0,  # Slow commuter train
                            1. / 4.: 0.0}  # Slow freight train
    else:
        speed_ration_map = {1.: 0.25,       # Fast passenger train
                            1. / 2.: 0.25,  # Fast freight train
                            1. / 3.: 0.25,  # Slow commuter train
                            1. / 4.: 0.25}  # Slow freight train

    if test_malfunctions_enabled:
        env = RailEnv(width=x_dim,
                      height=y_dim,
                      rail_generator=sparse_rail_generator(
                          max_num_cities=max_num_cities,  # Number of cities in map (where train stations are)
                          seed=14,  # Random seed
                          grid_mode=False,
                          max_rails_between_cities=max_rails_between_cities,
                          max_rails_in_city=max_rails_in_city),
                      schedule_generator=sparse_schedule_generator(speed_ration_map),
                      malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
                      number_of_agents=test_n_agents,
                      obs_builder_object=TreeObservation)
    else:
        env = RailEnv(width=x_dim,
                      height=y_dim,
                      rail_generator=sparse_rail_generator(
                          max_num_cities=max_num_cities,  # Number of cities in map (where train stations are)
                          seed=14,  # Random seed
                          grid_mode=False,
                          max_rails_between_cities=max_rails_between_cities,
                          max_rails_in_city=max_rails_in_city),
                      schedule_generator=sparse_schedule_generator(speed_ration_map),
                      number_of_agents=test_n_agents,
                      obs_builder_object=TreeObservation)
    env.reset()

    # env_renderer = RenderTool(env, gl="PILSVG", )
    env_renderer = RenderTool(env,
                              gl="PILSVG",
                              agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX,
                              show_debug=False,
                              screen_height=(1080 * 0.8),  # Adjust these parameters to fit your resolution
                              screen_width=(1920 * 0.8))

    num_features_per_node = env.obs_builder.observation_dim
    nr_nodes = 0
    for i in range(tree_depth + 1):
        nr_nodes += np.power(4, i)
    state_size = num_features_per_node * nr_nodes
    action_size = 5

    # We set the number of episodes we would like to train on
    if 'n_trials' not in locals():
        n_trials = 15000

    # max_steps computation
    speed_weighted_mean = 0
    for key in speed_ration_map.keys():
        speed_weighted_mean += key * speed_ration_map[key]
    # max_steps = int(3 * (env.height + env.width))
    max_steps = int((1 / speed_weighted_mean) * 3 * (env.height + env.width))

    # eps = 1.
    # eps_end = 0.005
    # eps_decay = 0.9995

    # And some variables to keep track of the performance
    action_dict = dict()
    final_action_dict = dict()
    action_prob_list = []
    scores_window = deque(maxlen=100)
    done_window = deque(maxlen=100)
    scores = []
    scores_list = []
    deadlock_list = []
    dones_list_window = []
    dones_list = []
    action_prob = [0] * action_size
    agent_obs = [None] * env.get_num_agents()
    agent_next_obs = [None] * env.get_num_agents()  # Useless

    agent = Agent(state_size, action_size)

    # LOAD MODEL WEIGHTS TO TEST
    agent.qnetwork_local.load_state_dict(
        torch.load(path.join('NetsTest', 'navigator_checkpoint3800_multi10_deadlock_global10.pth')))

    record_images = False
    frame_step = 0

    for trials in range(1, n_trials + 1):
        # Reset environment
        obs, info = env.reset()  # (True, True)
        env_renderer.reset()

        # Build agent specific observations
        for a in range(env.get_num_agents()):
            agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)

        # Reset score and done
        score = 0
        env_done = 0

        # Run episode
        for step in range(max_steps):
            # Action
            for a in range(env.get_num_agents()):
                if info['action_required'][a]:
                    action = agent.act(agent_obs[a], eps=0.)
                    action_prob[action] += 1
                else:
                    action = 0
                action_dict.update({a: action})

            # Environment step
            obs, all_rewards, done, deadlocks, info = env.step(action_dict)

            env_renderer.render_env(show=True, show_predictions=True, show_observations=False)

            # Build agent specific observations and normalize
            for a in range(env.get_num_agents()):
                if obs[a]:
                    agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
                score += all_rewards[a] / env.get_num_agents()

            if done['__all__']:
                break

        # Collect information about training
        tasks_finished = 0
        for _idx in range(env.get_num_agents()):
            if done[_idx] == 1:
                tasks_finished += 1
        done_window.append(tasks_finished / max(1, env.get_num_agents()))
        scores_window.append(score / max_steps)  # Save most recent score
        scores.append(np.mean(scores_window))
        dones_list.append(tasks_finished / max(1, env.get_num_agents()))
        dones_list_window.append(np.mean(done_window))
        scores_list.append(score / max_steps)
        deadlock_list.append(deadlocks.count(1) / max(1, env.get_num_agents()))

        if np.sum(action_prob) == 0:
            action_prob_normalized = [0] * action_size
        else:
            action_prob_normalized = action_prob / np.sum(action_prob)

        print(
            '\rTesting {} Agents on ({},{}).\t Episode {}\t Score: {:.3f}\tDones: {:.2f}%\tDeadlocks: {:.2f}\t Action Probabilities: \t {}'.format(
                env.get_num_agents(), x_dim, y_dim,
                trials,
                score / max_steps,
                100 * tasks_finished / max(1, env.get_num_agents()),
                deadlocks.count(1) / max(1, env.get_num_agents()),
                action_prob_normalized), end=" ")

        # if trials % 100 == 0:
        action_prob_list.append(action_prob_normalized)
        action_prob = [0] * action_size

        if trials % 50 == 0:
            np.savetxt(fname=path.join('NetsTest', 'test_metrics.csv'),
                       X=np.transpose(np.asarray([scores_list, scores, dones_list,
                                                  dones_list_window, deadlock_list])),
                       delimiter=';', newline='\n')
            np.savetxt(fname=path.join('NetsTest', 'test_action_prob.csv'),
                       X=np.asarray(action_prob_list),
                       delimiter=';', newline='\n')
def train_agent(env_params, train_params):
    # Environment parameters
    n_agents = env_params.n_agents
    x_dim = env_params.x_dim
    y_dim = env_params.y_dim
    n_cities = env_params.n_cities
    max_rails_between_cities = env_params.max_rails_between_cities
    max_rails_in_city = env_params.max_rails_in_city
    seed = env_params.seed

    # Observation parameters
    observation_tree_depth = env_params.observation_tree_depth
    observation_radius = env_params.observation_radius
    observation_max_path_depth = env_params.observation_max_path_depth

    # Training parameters
    eps_start = train_params.eps_start
    eps_end = train_params.eps_end
    eps_decay = train_params.eps_decay
    n_episodes = train_params.n_episodes
    checkpoint_interval = train_params.checkpoint_interval
    n_eval_episodes = train_params.n_evaluation_episodes

    # Set the seeds
    random.seed(seed)
    np.random.seed(seed)

    # Break agents from time to time
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 10000,  # Rate of malfunctions
        min_duration=15,  # Minimal duration
        max_duration=50  # Max duration
    )

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth, predictor=predictor)

    # Fraction of trains with each speed
    speed_profiles = {
        1.: 1.0,       # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0   # Slow freight train
    }

    # Setup the environment
    env = RailEnv(
        width=x_dim,
        height=y_dim,
        rail_generator=sparse_rail_generator(
            max_num_cities=n_cities,
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=max_rails_in_city),
        schedule_generator=sparse_schedule_generator(speed_profiles),
        number_of_agents=n_agents,
        malfunction_generator_and_process_data=malfunction_from_params(malfunction_parameters),
        obs_builder_object=tree_observation,
        random_seed=seed)
    env.reset(regenerate_schedule=True, regenerate_rail=True)

    # Setup renderer
    if train_params.render:
        env_renderer = RenderTool(env, gl="PGL")

    # Calculate the state size given the depth of the tree observation and the number of features
    n_features_per_node = env.obs_builder.observation_dim
    n_nodes = 0
    for i in range(observation_tree_depth + 1):
        n_nodes += np.power(4, i)
    state_size = n_features_per_node * n_nodes

    # The action space of flatland is 5 discrete actions
    action_size = 5

    # Max number of steps per episode
    # This is the official formula used during evaluations
    # See details in flatland.envs.schedule_generators.sparse_schedule_generator
    max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities)))

    action_count = [0] * action_size
    action_dict = dict()
    agent_obs = [None] * env.get_num_agents()
    agent_prev_obs = [None] * env.get_num_agents()
    agent_prev_action = [2] * env.get_num_agents()
    update_values = False
    smoothed_normalized_score = -1.0
    smoothed_eval_normalized_score = -1.0
    smoothed_completion = 0.0
    smoothed_eval_completion = 0.0

    # Double Dueling DQN policy
    policy = DDDQNPolicy(state_size, action_size, train_params)

    # TensorBoard writer
    writer = SummaryWriter()
    writer.add_hparams(vars(train_params), {})
    writer.add_hparams(vars(env_params), {})

    training_timer = Timer()
    training_timer.start()

    print("\n🚉 Training {} trains on {}x{} grid for {} episodes, evaluating on {} episodes every {} episodes.\n"
          .format(env.get_num_agents(), x_dim, y_dim, n_episodes, n_eval_episodes, checkpoint_interval))

    for episode_idx in range(n_episodes + 1):
        # Timers
        step_timer = Timer()
        reset_timer = Timer()
        learn_timer = Timer()
        preproc_timer = Timer()

        # Reset environment
        reset_timer.start()
        obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True)
        reset_timer.end()

        if train_params.render:
            env_renderer.set_new_rail()

        score = 0
        nb_steps = 0
        actions_taken = []

        # Build agent specific observations
        for agent in env.get_agent_handles():
            if obs[agent]:
                agent_obs[agent] = normalize_observation(obs[agent], observation_tree_depth,
                                                         observation_radius=observation_radius)
                agent_prev_obs[agent] = agent_obs[agent].copy()

        # Run episode
        for step in range(max_steps - 1):
            for agent in env.get_agent_handles():
                if info['action_required'][agent]:
                    # If an action is required, we want to store the obs at that step as well as the action
                    update_values = True
                    action = policy.act(agent_obs[agent], eps=eps_start)
                    action_count[action] += 1
                    actions_taken.append(action)
                else:
                    update_values = False
                    action = 0
                action_dict.update({agent: action})

            # Environment step
            step_timer.start()
            next_obs, all_rewards, done, info = env.step(action_dict)
            step_timer.end()

            if train_params.render and episode_idx % checkpoint_interval == 0:
                env_renderer.render_env(show=True, frames=False,
                                        show_observations=False, show_predictions=False)

            for agent in range(env.get_num_agents()):
                # Update replay buffer and train agent
                # Only update the values when we are done or when an action was taken
                # and thus relevant information is present
                if update_values or done[agent]:
                    learn_timer.start()
                    policy.step(agent_prev_obs[agent], agent_prev_action[agent],
                                all_rewards[agent], agent_obs[agent], done[agent])
                    learn_timer.end()

                    agent_prev_obs[agent] = agent_obs[agent].copy()
                    agent_prev_action[agent] = action_dict[agent]

                # Preprocess the new observations
                if next_obs[agent]:
                    preproc_timer.start()
                    agent_obs[agent] = normalize_observation(next_obs[agent], observation_tree_depth,
                                                             observation_radius=observation_radius)
                    preproc_timer.end()

                score += all_rewards[agent]

            nb_steps = step

            if done['__all__']:
                break

        # Epsilon decay
        eps_start = max(eps_end, eps_decay * eps_start)

        # Collect information about training
        tasks_finished = sum(done[idx] for idx in env.get_agent_handles())
        completion = tasks_finished / max(1, env.get_num_agents())
        normalized_score = score / (max_steps * env.get_num_agents())
        action_probs = action_count / np.sum(action_count)
        action_count = [1] * action_size

        # Smoothed values for terminal display and for more stable hyper-parameter tuning
        smoothing = 0.99
        smoothed_normalized_score = smoothed_normalized_score * smoothing + normalized_score * (1.0 - smoothing)
        smoothed_completion = smoothed_completion * smoothing + completion * (1.0 - smoothing)

        # Print logs
        if episode_idx % checkpoint_interval == 0:
            torch.save(policy.qnetwork_local,
                       './checkpoints/origin_multi-' + str(episode_idx) + '.pth')
            if train_params.render:
                env_renderer.close_window()

        print('\r🚂 Episode {}'
              '\t 🏆 Score: {:.3f}'
              ' Avg: {:.3f}'
              '\t 💯 Done: {:.2f}%'
              ' Avg: {:.2f}%'
              '\t 🎲 Epsilon: {:.2f} '
              '\t 🔀 Action Probs: {}'.format(episode_idx,
                                              normalized_score, smoothed_normalized_score,
                                              100 * completion, 100 * smoothed_completion,
                                              eps_start,
                                              format_action_prob(action_probs)), end=" ")

        # Evaluate policy
        if episode_idx % train_params.checkpoint_interval == 0:
            scores, completions, nb_steps_eval = eval_policy(env, policy, n_eval_episodes, max_steps)
            writer.add_scalar("evaluation/scores_min", np.min(scores), episode_idx)
            writer.add_scalar("evaluation/scores_max", np.max(scores), episode_idx)
            writer.add_scalar("evaluation/scores_mean", np.mean(scores), episode_idx)
            writer.add_scalar("evaluation/scores_std", np.std(scores), episode_idx)
            writer.add_histogram("evaluation/scores", np.array(scores), episode_idx)
            writer.add_scalar("evaluation/completions_min", np.min(completions), episode_idx)
            writer.add_scalar("evaluation/completions_max", np.max(completions), episode_idx)
            writer.add_scalar("evaluation/completions_mean", np.mean(completions), episode_idx)
            writer.add_scalar("evaluation/completions_std", np.std(completions), episode_idx)
            writer.add_histogram("evaluation/completions", np.array(completions), episode_idx)
            writer.add_scalar("evaluation/nb_steps_min", np.min(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_max", np.max(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_mean", np.mean(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_std", np.std(nb_steps_eval), episode_idx)
            writer.add_histogram("evaluation/nb_steps", np.array(nb_steps_eval), episode_idx)

            smoothing = 0.9
            smoothed_eval_normalized_score = smoothed_eval_normalized_score * smoothing + np.mean(scores) * (1.0 - smoothing)
            smoothed_eval_completion = smoothed_eval_completion * smoothing + np.mean(completions) * (1.0 - smoothing)
            writer.add_scalar("evaluation/smoothed_score", smoothed_eval_normalized_score, episode_idx)
            writer.add_scalar("evaluation/smoothed_completion", smoothed_eval_completion, episode_idx)

        # Save logs to tensorboard
        writer.add_scalar("training/score", normalized_score, episode_idx)
        writer.add_scalar("training/smoothed_score", smoothed_normalized_score, episode_idx)
        writer.add_scalar("training/completion", np.mean(completion), episode_idx)
        writer.add_scalar("training/smoothed_completion", np.mean(smoothed_completion), episode_idx)
        writer.add_scalar("training/nb_steps", nb_steps, episode_idx)
        writer.add_histogram("actions/distribution", np.array(actions_taken), episode_idx)
        writer.add_scalar("actions/nothing", action_probs[RailEnvActions.DO_NOTHING], episode_idx)
        writer.add_scalar("actions/left", action_probs[RailEnvActions.MOVE_LEFT], episode_idx)
        writer.add_scalar("actions/forward", action_probs[RailEnvActions.MOVE_FORWARD], episode_idx)
        writer.add_scalar("actions/right", action_probs[RailEnvActions.MOVE_RIGHT], episode_idx)
        writer.add_scalar("actions/stop", action_probs[RailEnvActions.STOP_MOVING], episode_idx)
        writer.add_scalar("training/epsilon", eps_start, episode_idx)
        writer.add_scalar("training/buffer_size", len(policy.memory), episode_idx)
        writer.add_scalar("training/loss", policy.loss, episode_idx)
        writer.add_scalar("timer/reset", reset_timer.get(), episode_idx)
        writer.add_scalar("timer/step", step_timer.get(), episode_idx)
        writer.add_scalar("timer/learn", learn_timer.get(), episode_idx)
        writer.add_scalar("timer/preproc", preproc_timer.get(), episode_idx)
        writer.add_scalar("timer/total", training_timer.get_current(), episode_idx)
scores = []
losses = []
env_count = 0
best_reward = -np.inf

for episode_count in range(n_episodes):
    # print("EPISODE: ", episode_count)
    env_params = get_env(env_count)
    env = RailEnv(
        width=env_params['x_dim'],
        height=env_params['y_dim'],
        rail_generator=sparse_rail_generator(
            max_num_cities=env_params['n_cities'],
            grid_mode=False,
            max_rails_between_cities=max_rails_between_cities,
            max_rails_in_city=env_params['max_rails_in_city']),
        schedule_generator=sparse_schedule_generator(speed_profiles),
        number_of_agents=env_params['n_agents'],
        malfunction_generator_and_process_data=malfunction_from_params(malfunction_parameters),
        obs_builder_object=tree_observation,
        random_seed=42)
    env_count += 1
    obs, info = env.reset(True, True)
    max_steps = int(4 * 2 * (env.height + env.width + (env_params['n_agents'] / env_params['n_cities'])))