Example #1
def test_get_shortest_paths():
    env = load_flatland_environment_from_file('test_002.pkl', 'env_data.tests')
    env.reset()
    actual = get_shortest_paths(env.distance_map)

    expected = {
        0: [
            WayPoint(position=(1, 1), direction=1),
            WayPoint(position=(1, 2), direction=1),
            WayPoint(position=(1, 3), direction=1),
            WayPoint(position=(2, 3), direction=2),
            WayPoint(position=(2, 4), direction=1),
            WayPoint(position=(2, 5), direction=1),
            WayPoint(position=(2, 6), direction=1),
            WayPoint(position=(2, 7), direction=1),
            WayPoint(position=(2, 8), direction=1),
            WayPoint(position=(2, 9), direction=1),
            WayPoint(position=(2, 10), direction=1),
            WayPoint(position=(2, 11), direction=1),
            WayPoint(position=(2, 12), direction=1),
            WayPoint(position=(2, 13), direction=1),
            WayPoint(position=(2, 14), direction=1),
            WayPoint(position=(2, 15), direction=1),
            WayPoint(position=(2, 16), direction=1),
            WayPoint(position=(2, 17), direction=1),
            WayPoint(position=(2, 18), direction=1)
        ],
        1: [
            WayPoint(position=(3, 18), direction=3),
            WayPoint(position=(3, 17), direction=3),
            WayPoint(position=(3, 16), direction=3),
            WayPoint(position=(2, 16), direction=0),
            WayPoint(position=(2, 15), direction=3),
            WayPoint(position=(2, 14), direction=3),
            WayPoint(position=(2, 13), direction=3),
            WayPoint(position=(2, 12), direction=3),
            WayPoint(position=(2, 11), direction=3),
            WayPoint(position=(2, 10), direction=3),
            WayPoint(position=(2, 9), direction=3),
            WayPoint(position=(2, 8), direction=3),
            WayPoint(position=(2, 7), direction=3),
            WayPoint(position=(2, 6), direction=3),
            WayPoint(position=(2, 5), direction=3),
            WayPoint(position=(2, 4), direction=3),
            WayPoint(position=(2, 3), direction=3),
            WayPoint(position=(2, 2), direction=3),
            WayPoint(position=(2, 1), direction=3)
        ]
    }

    for agent_handle in expected:
        assert np.array_equal(actual[agent_handle], expected[agent_handle]), \
            "[{}] actual={},expected={}".format(agent_handle, actual[agent_handle], expected[agent_handle])
Example #2
def test_get_shortest_paths_agent_handle():
    env = load_flatland_environment_from_file(
        'Level_distance_map_shortest_path.pkl', 'env_data.tests')
    env.reset()
    actual = get_shortest_paths(env.distance_map, agent_handle=6)

    print(actual, file=sys.stderr)

    expected = {
        6: [
            WayPoint(position=(5, 5), direction=0),
            WayPoint(position=(4, 5), direction=0),
            WayPoint(position=(3, 5), direction=0),
            WayPoint(position=(2, 5), direction=0),
            WayPoint(position=(1, 5), direction=0),
            WayPoint(position=(0, 5), direction=0),
            WayPoint(position=(0, 6), direction=1),
            WayPoint(position=(0, 7), direction=1),
            WayPoint(position=(0, 8), direction=1),
            WayPoint(position=(0, 9), direction=1),
            WayPoint(position=(0, 10), direction=1),
            WayPoint(position=(1, 10), direction=2),
            WayPoint(position=(2, 10), direction=2),
            WayPoint(position=(3, 10), direction=2),
            WayPoint(position=(4, 10), direction=2),
            WayPoint(position=(5, 10), direction=2),
            WayPoint(position=(6, 10), direction=2),
            WayPoint(position=(7, 10), direction=2),
            WayPoint(position=(8, 10), direction=2),
            WayPoint(position=(9, 10), direction=2),
            WayPoint(position=(10, 10), direction=2),
            WayPoint(position=(11, 10), direction=2),
            WayPoint(position=(12, 10), direction=2),
            WayPoint(position=(13, 10), direction=2),
            WayPoint(position=(14, 10), direction=2),
            WayPoint(position=(15, 10), direction=2),
            WayPoint(position=(16, 10), direction=2),
            WayPoint(position=(17, 10), direction=2),
            WayPoint(position=(18, 10), direction=2),
            WayPoint(position=(19, 10), direction=2),
            WayPoint(position=(20, 10), direction=2),
            WayPoint(position=(20, 9), direction=3),
            WayPoint(position=(20, 8), direction=3),
            WayPoint(position=(21, 8), direction=2),
            WayPoint(position=(21, 7), direction=3),
            WayPoint(position=(21, 6), direction=3),
            WayPoint(position=(21, 5), direction=3)
        ]
    }

    for agent_handle in expected:
        assert np.array_equal(actual[agent_handle], expected[agent_handle]), \
            "[{}] actual={},expected={}".format(agent_handle, actual[agent_handle], expected[agent_handle])
Example #3
def test_get_shortest_paths_max_depth():
    env = load_flatland_environment_from_file('test_002.pkl', 'env_data.tests')
    env.reset()
    actual = get_shortest_paths(env.distance_map, max_depth=2)

    expected = {
        0: [
            Waypoint(position=(1, 1), direction=1),
            Waypoint(position=(1, 2), direction=1)
        ],
        1: [
            Waypoint(position=(3, 18), direction=3),
            Waypoint(position=(3, 17), direction=3),
        ]
    }

    for agent_handle in expected:
        assert np.array_equal(actual[agent_handle], expected[agent_handle]), \
            "[{}] actual={},expected={}".format(agent_handle, actual[agent_handle], expected[agent_handle])
Example #4
from flatland.envs.rail_env_utils import load_flatland_environment_from_file

env = load_flatland_environment_from_file(
    "scratch/test-envs/Test_13/Level_0.pkl")

local_observation, info = env.reset(regenerate_rail=True,
                                    regenerate_schedule=True,
                                    activate_agents=False,
                                    random_seed=3835)

pass
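
Once the environment has been loaded and reset, it can be driven with a dictionary mapping agent handles to actions. A minimal sketch of one way to step it (the action choice is illustrative and not part of the original snippet):

from flatland.envs.rail_env import RailEnvActions

# push every agent forward for a few steps
for _ in range(5):
    action_dict = {handle: RailEnvActions.MOVE_FORWARD
                   for handle in env.get_agent_handles()}
    local_observation, all_rewards, done, info = env.step(action_dict)
    if done['__all__']:
        break
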
Example #5
def train_agent(train_params):

    env = load_flatland_environment_from_file("scratch/test-envs/Test_2/Level_1.pkl")
    env.reset(regenerate_schedule=True, regenerate_rail=True)
    # Environment parameters
    n_agents = len(env.agents)
    print("n_agents= ", n_agents)
    print("env.get_num_agents(): ",env.get_num_agents())
    x_dim = env.width
    y_dim = env.height
    n_cities = 3
    #max_rails_between_cities = env_params.max_rails_between_cities
    #max_rails_in_city = env_params.max_rails_in_city
    seed = 2125

    # Observation parameters
    # observation_tree_depth = env_params.observation_tree_depth
    # observation_radius = env_params.observation_radius
    # observation_max_path_depth = env_params.observation_max_path_depth
    observation_tree_depth = 2
    observation_radius = 10
    observation_max_path_depth = 30

    # Training parameters
    eps_start = train_params.eps_start
    eps_end = train_params.eps_end
    eps_decay = train_params.eps_decay
    n_episodes = train_params.n_episodes
    checkpoint_interval = train_params.checkpoint_interval
    n_eval_episodes = train_params.n_evaluation_episodes

    # Set the seeds
    random.seed(seed)
    np.random.seed(seed)

    # Break agents from time to time
    malfunction_parameters = MalfunctionParameters(
        malfunction_rate=1. / 10000,  # Rate of malfunctions
        min_duration=15,  # Minimal duration
        max_duration=50  # Max duration
    )

    # Observation builder
    predictor = ShortestPathPredictorForRailEnv(observation_max_path_depth)
    tree_observation = TreeObsForRailEnv(max_depth=observation_tree_depth, predictor=predictor)

    # Fraction of trains with each speed profile
    speed_profiles = {
        1.: 1.0,  # Fast passenger train
        1. / 2.: 0.0,  # Fast freight train
        1. / 3.: 0.0,  # Slow commuter train
        1. / 4.: 0.0  # Slow freight train
    }

    # Setup the environment
    # env = RailEnv(
    #     width=x_dim,
    #     height=y_dim,
    #     rail_generator=sparse_rail_generator(
    #         max_num_cities=n_cities,
    #         grid_mode=False,
    #         max_rails_between_cities=max_rails_between_cities,
    #         max_rails_in_city=max_rails_in_city
    #     ),
    #     schedule_generator=sparse_schedule_generator(speed_profiles),
    #     number_of_agents=n_agents,
    #     malfunction_generator_and_process_data=malfunction_from_params(malfunction_parameters),
    #     obs_builder_object=tree_observation,
    #     random_seed=seed
    # )

    # env.reset(regenerate_schedule=True, regenerate_rail=True)

    # Setup renderer
    if train_params.render:
        env_renderer = RenderTool(env, gl="PGL")

    # Calculate the state size given the depth of the tree observation and the number of features
    n_features_per_node = env.obs_builder.observation_dim
    n_nodes = 0
    for i in range(observation_tree_depth + 1):
        n_nodes += np.power(4, i)
    state_size = n_features_per_node * n_nodes
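    # For example, with a tree observation of 11 features per node (the usual
    # value in recent flatland-rl releases, assumed here) and depth 2:
    # n_nodes = 1 + 4 + 16 = 21, so state_size = 11 * 21 = 231.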

    # The action space of flatland is 5 discrete actions
    action_size = 5

    # Max number of steps per episode
    # This is the official formula used during evaluations
    # See details in flatland.envs.schedule_generators.sparse_schedule_generator
    max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities)))
    #max_steps = env._max_episode_steps
    print("max_steps = ",  max_steps)
    print("env._max_episode_steps= ",env._max_episode_steps)

    action_count = [0] * action_size
    action_dict = dict()
    agent_obs = [None] * env.get_num_agents()
    agent_prev_obs = [None] * env.get_num_agents()
    agent_prev_action = [2] * env.get_num_agents()
    update_values = False
    smoothed_normalized_score = -1.0
    smoothed_eval_normalized_score = -1.0
    smoothed_completion = 0.0
    smoothed_eval_completion = 0.0

    # Double Dueling DQN policy
    policy = DDDQNPolicy(state_size, action_size, train_params)

    # TensorBoard writer
    writer = SummaryWriter()
    writer.add_hparams(vars(train_params), {})
    #writer.add_hparams(vars(env_params), {})

    training_timer = Timer()
    training_timer.start()

    print("\n🚉 Training {} trains on {}x{} grid for {} episodes, evaluating on {} episodes every {} episodes.\n"
          .format(env.get_num_agents(), x_dim, y_dim, n_episodes, n_eval_episodes, checkpoint_interval))

    model = policy.qnetwork_local
    optimizer = policy.optimizer

    # Resume training from a previously saved checkpoint
    checkpoint = torch.load('./checkpoints/test_multi-6500.pth')
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    checkepisode_idx = checkpoint['episode_idx']
    eps_start = checkpoint['eps_start']
    smoothed_completion = checkpoint['Avg']
    print(checkepisode_idx)
    #for episode_idx in range(n_episodes+1):
    for episode_idx in range(checkepisode_idx,(n_episodes + 1)):
        # Timers
        step_timer = Timer()
        reset_timer = Timer()
        learn_timer = Timer()
        preproc_timer = Timer()

        # Reset environment
        reset_timer.start()
        obs, info = env.reset(regenerate_rail=True, regenerate_schedule=True)
        reset_timer.end()

        if train_params.render:
            env_renderer.set_new_rail()

        score = 0
        nb_steps = 0
        actions_taken = []

        # Build agent specific observations
        for agent in env.get_agent_handles():
            if obs[agent]:
                agent_obs[agent] = normalize_observation(obs[agent], observation_tree_depth, observation_radius=observation_radius)
                agent_prev_obs[agent] = agent_obs[agent].copy()

        # Run episode
        for step in range(max_steps - 1):
            for agent in env.get_agent_handles():
                if info['action_required'][agent]:
                    # If an action is required, we want to store the obs at that step as well as the action
                    update_values = True
                    action = policy.act(agent_obs[agent], eps=eps_start)
                    action_count[action] += 1
                    actions_taken.append(action)
                else:
                    update_values = False
                    action = 0
                action_dict.update({agent: action})

            # Environment step
            step_timer.start()
            next_obs, all_rewards, done, info = env.step(action_dict)
            step_timer.end()

            if train_params.render and episode_idx % checkpoint_interval == 0:
                env_renderer.render_env(
                    show=True,
                    frames=False,
                    show_observations=False,
                    show_predictions=False
                )

            for agent in range(env.get_num_agents()):
                # Update replay buffer and train agent
                # Only update the values when we are done or when an action was taken and thus relevant information is present
                if update_values or done[agent]:
                    learn_timer.start()
                    policy.step(agent_prev_obs[agent], agent_prev_action[agent], all_rewards[agent], agent_obs[agent], done[agent])
                    learn_timer.end()

                    agent_prev_obs[agent] = agent_obs[agent].copy()
                    agent_prev_action[agent] = action_dict[agent]

                # Preprocess the new observations
                if next_obs[agent]:
                    preproc_timer.start()
                    agent_obs[agent] = normalize_observation(next_obs[agent], observation_tree_depth, observation_radius=observation_radius)
                    preproc_timer.end()

                score += all_rewards[agent]

            nb_steps = step

            if done['__all__']:
                break

        # Epsilon decay
        eps_start = max(eps_end, eps_decay * eps_start)

        # Collect information about training
        tasks_finished = sum(done[idx] for idx in env.get_agent_handles())
        completion = tasks_finished / max(1, env.get_num_agents())
        normalized_score = score / (max_steps * env.get_num_agents())
        action_probs = action_count / np.sum(action_count)
        action_count = [1] * action_size

        # Smoothed values for terminal display and for more stable hyper-parameter tuning
        smoothing = 0.99
        smoothed_normalized_score = smoothed_normalized_score * smoothing + normalized_score * (1.0 - smoothing)
        smoothed_completion = smoothed_completion * smoothing + completion * (1.0 - smoothing)

        # Print logs
        if episode_idx % checkpoint_interval == 0:
            #save_checkpoint(episode_idx, policy.qnetwork_local, policy.qnetwork_local.optimizer, './checkpoints/test'+str(episode_idx)+'.pth')
            torch.save(policy.qnetwork_local, './checkpoints/run_test_multi-' + str(episode_idx) + '.pth')
            torch.save(
                {'episode_idx':episode_idx,
                'model_state_dict':policy.qnetwork_local.state_dict(),
                'optimizer_state_dict':policy.optimizer.state_dict(),
                'eps_start':eps_start,
                'Avg':smoothed_completion
                #'loss': policy.qnetwork_local.loss
            }, './checkpoints/test_multi-' + str(episode_idx) + '.pth')
            if train_params.render:
                env_renderer.close_window()

        print(
            '\r🚂 Episode {}'
            '\t 🏆 Score: {:.3f}'
            ' Avg: {:.3f}'
            '\t 💯 Done: {:.2f}%'
            ' Avg: {:.2f}%'
            '\t 🎲 Epsilon: {:.2f} '
            '\t 🔀 Action Probs: {}'.format(
                episode_idx,
                normalized_score,
                smoothed_normalized_score,
                100 * completion,
                100 * smoothed_completion,
                eps_start,
                format_action_prob(action_probs)
            ), end=" ")

        # Evaluate policy
        if episode_idx % train_params.checkpoint_interval == 0:
            scores, completions, nb_steps_eval = eval_policy(env, policy, n_eval_episodes, max_steps)
            writer.add_scalar("evaluation/scores_min", np.min(scores), episode_idx)
            writer.add_scalar("evaluation/scores_max", np.max(scores), episode_idx)
            writer.add_scalar("evaluation/scores_mean", np.mean(scores), episode_idx)
            writer.add_scalar("evaluation/scores_std", np.std(scores), episode_idx)
            writer.add_histogram("evaluation/scores", np.array(scores), episode_idx)
            writer.add_scalar("evaluation/completions_min", np.min(completions), episode_idx)
            writer.add_scalar("evaluation/completions_max", np.max(completions), episode_idx)
            writer.add_scalar("evaluation/completions_mean", np.mean(completions), episode_idx)
            writer.add_scalar("evaluation/completions_std", np.std(completions), episode_idx)
            writer.add_histogram("evaluation/completions", np.array(completions), episode_idx)
            writer.add_scalar("evaluation/nb_steps_min", np.min(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_max", np.max(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_mean", np.mean(nb_steps_eval), episode_idx)
            writer.add_scalar("evaluation/nb_steps_std", np.std(nb_steps_eval), episode_idx)
            writer.add_histogram("evaluation/nb_steps", np.array(nb_steps_eval), episode_idx)

            smoothing = 0.9
            smoothed_eval_normalized_score = smoothed_eval_normalized_score * smoothing + np.mean(scores) * (1.0 - smoothing)
            smoothed_eval_completion = smoothed_eval_completion * smoothing + np.mean(completions) * (1.0 - smoothing)
            writer.add_scalar("evaluation/smoothed_score", smoothed_eval_normalized_score, episode_idx)
            writer.add_scalar("evaluation/smoothed_completion", smoothed_eval_completion, episode_idx)

        # Save logs to tensorboard
        writer.add_scalar("training/score", normalized_score, episode_idx)
        writer.add_scalar("training/smoothed_score", smoothed_normalized_score, episode_idx)
        writer.add_scalar("training/completion", np.mean(completion), episode_idx)
        writer.add_scalar("training/smoothed_completion", np.mean(smoothed_completion), episode_idx)
        writer.add_scalar("training/nb_steps", nb_steps, episode_idx)
        writer.add_histogram("actions/distribution", np.array(actions_taken), episode_idx)
        writer.add_scalar("actions/nothing", action_probs[RailEnvActions.DO_NOTHING], episode_idx)
        writer.add_scalar("actions/left", action_probs[RailEnvActions.MOVE_LEFT], episode_idx)
        writer.add_scalar("actions/forward", action_probs[RailEnvActions.MOVE_FORWARD], episode_idx)
        writer.add_scalar("actions/right", action_probs[RailEnvActions.MOVE_RIGHT], episode_idx)
        writer.add_scalar("actions/stop", action_probs[RailEnvActions.STOP_MOVING], episode_idx)
        writer.add_scalar("training/epsilon", eps_start, episode_idx)
        writer.add_scalar("training/buffer_size", len(policy.memory), episode_idx)
        writer.add_scalar("training/loss", policy.loss, episode_idx)
        writer.add_scalar("timer/reset", reset_timer.get(), episode_idx)
        writer.add_scalar("timer/step", step_timer.get(), episode_idx)
        writer.add_scalar("timer/learn", learn_timer.get(), episode_idx)
        writer.add_scalar("timer/preproc", preproc_timer.get(), episode_idx)
        writer.add_scalar("timer/total", training_timer.get_current(), episode_idx)
Example #6
def test_load():
    load_flatland_environment_from_file('test_001.pkl', 'env_data.tests')