Example #1
if 'n_trials' not in locals():
    n_trials = 60000
max_steps = int(3 * (env.height + env.width))
eps = 1.
eps_end = 0.005
eps_decay = 0.9995
action_dict = dict()
final_action_dict = dict()
scores_window = deque(maxlen=100)
done_window = deque(maxlen=100)
scores = []
dones_list = []
action_prob = [0] * action_size
agent_obs = [None] * env.get_num_agents()
agent_next_obs = [None] * env.get_num_agents()
agent = Agent(state_size, action_size)
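# Load pretrained weights bundled with the torch_training.Nets package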
with path(torch_training.Nets, "navigator_checkpoint1000.pth") as file_in:
    agent.qnetwork_local.load_state_dict(torch.load(file_in))

record_images = False
frame_step = 0

for trials in range(1, n_trials + 1):

    # Reset environment
    obs, info = env.reset(True, True)
    env_renderer.reset()
    # Build agent specific observations
    for a in range(env.get_num_agents()):
        agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
    # Reset score and done
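Example #2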
def main(argv):
    try:
        opts, args = getopt.getopt(argv, "n:", ["n_trials="])
    except getopt.GetoptError:
        print('test_navigation_single_agent.py -n <n_trials>')
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-n', '--n_trials'):
            n_trials = int(arg)

    random.seed(1)
    np.random.seed(1)

    ######## TEST SET SELECTION - PARAMETERS ########
    
    test_multi_agent_setup = 1             # 1 for Medium size test, 2 for Big size test
    test_n_agents = 5                      # Number of agents to test (3 - 5 - 7 for Medium, 5 - 7 - 10 for Big)
    test_malfunctions_enabled = True       # Malfunctions enabled?
    test_agents_one_speed = True           # Test agents with the same speed (1) or with 4 different speeds?

    #################################################

    # Medium size
    if test_multi_agent_setup == 1:
        x_dim = 16*3
        y_dim = 9*3
        max_num_cities = 5
        max_rails_between_cities = 2
        max_rails_in_city = 3

    # Big size
    if test_multi_agent_setup == 2:
        x_dim = 16*4
        y_dim = 9*4
        max_num_cities = 9
        max_rails_between_cities = 5
        max_rails_in_city = 5


    stochastic_data = {'malfunction_rate': 80,  # Rate of malfunction occurrence of a single agent
                       'min_duration': 15,  # Minimal duration of malfunction
                       'max_duration': 50  # Max duration of malfunction
                       }

    # Custom observation builder
    tree_depth = 2
    TreeObservation = TreeObsForRailEnv(max_depth=tree_depth, predictor = ShortestPathPredictorForRailEnv(20))
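    # ShortestPathPredictorForRailEnv(20) predicts up to 20 steps ahead along each agent's shortest path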

    np.savetxt(fname=path.join('NetsTest', 'info.txt'), X=[x_dim, y_dim, test_n_agents, max_num_cities, max_rails_between_cities, max_rails_in_city, tree_depth], delimiter=';')

    # Different agent types (trains) with different speeds.
    if test_agents_one_speed:
        speed_ration_map = {1.: 1.,  # Fast passenger train
                            1. / 2.: 0.0,  # Fast freight train
                            1. / 3.: 0.0,  # Slow commuter train
                            1. / 4.: 0.0}  # Slow freight train
    else:
        speed_ration_map = {1.: 0.25,  # Fast passenger train
                            1. / 2.: 0.25,  # Fast freight train
                            1. / 3.: 0.25,  # Slow commuter train
                            1. / 4.: 0.25}  # Slow freight train

    
    if test_malfunctions_enabled:
        env = RailEnv(width=x_dim,
                      height=y_dim,
                      rail_generator=sparse_rail_generator(max_num_cities=max_num_cities,
                                                           # Number of cities in map (where train stations are)
                                                           seed=14,  # Random seed
                                                           grid_mode=False,
                                                           max_rails_between_cities=max_rails_between_cities,
                                                           max_rails_in_city=max_rails_in_city),
                      schedule_generator=sparse_schedule_generator(speed_ration_map),
                      malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
                      number_of_agents=test_n_agents,
                      obs_builder_object=TreeObservation)
    else:
        env = RailEnv(width=x_dim,
                      height=y_dim,
                      rail_generator=sparse_rail_generator(max_num_cities=max_num_cities,
                                                           # Number of cities in map (where train stations are)
                                                           seed=14,  # Random seed
                                                           grid_mode=False,
                                                           max_rails_between_cities=max_rails_between_cities,
                                                           max_rails_in_city=max_rails_in_city),
                      schedule_generator=sparse_schedule_generator(speed_ration_map),
                      number_of_agents=test_n_agents,
                      obs_builder_object=TreeObservation)
    
    env.reset()

    #env_renderer = RenderTool(env, gl="PILSVG", )
    env_renderer = RenderTool(env, gl="PILSVG",
                          agent_render_variant=AgentRenderVariant.AGENT_SHOWS_OPTIONS_AND_BOX,
                          show_debug=False,
                          screen_height=(1080*0.8),  # Adjust these parameters to fit your resolution
                          screen_width=(1920*0.8))
    num_features_per_node = env.obs_builder.observation_dim

    
    nr_nodes = 0
    for i in range(tree_depth + 1):
        nr_nodes += np.power(4, i)
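    # For tree_depth = 2 this sums to 1 + 4 + 16 = 21 nodes, so state_size = 21 * num_features_per_node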
    state_size = num_features_per_node * nr_nodes
    action_size = 5

    # We set the number of episodes we would like to train on
    if 'n_trials' not in locals():
        n_trials = 15000
    
    # max_steps computation
    speed_weighted_mean = 0

    for key, value in speed_ration_map.items():
        speed_weighted_mean += key * value
    
    #max_steps = int(3 * (env.height + env.width))
    max_steps = int((1/speed_weighted_mean) * 3 * (env.height + env.width))
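    # With a single speed of 1.0 the weighted mean is 1 and this reduces to 3 * (height + width);
    # with four equally likely speeds the mean is (1 + 1/2 + 1/3 + 1/4) / 4, about 0.52, roughly doubling the step budget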
    #eps = 1.
    #eps_end = 0.005
    #eps_decay = 0.9995

    # And some variables to keep track of the performance
    action_dict = dict()
    final_action_dict = dict()
    action_prob_list = []
    scores_window = deque(maxlen=100)
    done_window = deque(maxlen=100)
    scores = []
    scores_list = []
    deadlock_list = []
    dones_list_window = []
    dones_list = []
    action_prob = [0] * action_size
    agent_obs = [None] * env.get_num_agents()
    agent_next_obs = [None] * env.get_num_agents()  # Not used in this test script
    agent = Agent(state_size, action_size)
    
    # LOAD MODEL WEIGHTS TO TEST
    agent.qnetwork_local.load_state_dict(torch.load(path.join('NetsTest' , 'navigator_checkpoint3800_multi10_deadlock_global10.pth')))
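    # Note: when testing on a CPU-only machine, torch.load may need map_location='cpu' for a checkpoint trained on GPU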

    record_images = False
    frame_step = 0

    for trials in range(1, n_trials + 1):

        # Reset environment
        obs, info = env.reset()#(True, True)
        env_renderer.reset()
        # Build agent specific observations
        for a in range(env.get_num_agents()):
            agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
        # Reset score and done
        score = 0
        env_done = 0

        # Run episode
        for step in range(max_steps):

            # Action
            for a in range(env.get_num_agents()):
                if info['action_required'][a]:
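                    # Evaluate greedily: eps=0 disables exploration, so the learned policy is followed deterministically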
                    action = agent.act(agent_obs[a], eps=0.)
                    action_prob[action] += 1

                else:
                    action = 0

                action_dict.update({a: action})
            # Environment step
            obs, all_rewards, done, deadlocks, info = env.step(action_dict)

            env_renderer.render_env(show=True, show_predictions=True, show_observations=False)
            # Build agent specific observations and normalize
            for a in range(env.get_num_agents()):
                if obs[a]:
                    agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)

                score += all_rewards[a] / env.get_num_agents()


            if done['__all__']:
                break

        # Collect information about the test episode
        tasks_finished = 0
        for _idx in range(env.get_num_agents()):
            if done[_idx] == 1:
                tasks_finished += 1
        done_window.append(tasks_finished / max(1, env.get_num_agents()))
        scores_window.append(score / max_steps)  # save most recent score
        scores.append(np.mean(scores_window))
        dones_list.append(tasks_finished / max(1, env.get_num_agents()))
        dones_list_window.append((np.mean(done_window)))
        scores_list.append(score / max_steps)
        deadlock_list.append(deadlocks.count(1)/max(1, env.get_num_agents()))

        if np.sum(action_prob) == 0:
            action_prob_normalized = [0] * action_size
        else:
            action_prob_normalized = action_prob / np.sum(action_prob)



        print(
                '\rTesting {} Agents on ({},{}).\t Episode {}\t Score: {:.3f}\tDones: {:.2f}%\tDeadlocks: {:.2f}\t Action Probabilities: \t {}'.format(
                    env.get_num_agents(), x_dim, y_dim,
                    trials,
                    score / max_steps,
                    100 * tasks_finished / max(1, env.get_num_agents()),
                    deadlocks.count(1)/max(1, env.get_num_agents()),
                    action_prob_normalized), end=" ")

        #if trials % 100 == 0:
        action_prob_list.append(action_prob_normalized)
        action_prob = [0] * action_size

        if trials % 50 == 0:
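            # Persist metrics every 50 episodes: per-episode score, running-mean score, completion rate,
            # windowed completion rate and deadlock fraction (plus the action distribution in a separate file)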

            np.savetxt(fname=path.join('NetsTest', 'test_metrics.csv'), X=np.transpose(np.asarray([scores_list, scores, dones_list, dones_list_window, deadlock_list])), delimiter=';', newline='\n')
            np.savetxt(fname=path.join('NetsTest', 'test_action_prob.csv'), X=np.asarray(action_prob_list), delimiter=';', newline='\n')
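Example #3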
def main(argv):
    try:
        opts, args = getopt.getopt(argv, "n:", ["n_trials="])
    except getopt.GetoptError:
        print('training_navigation.py -n <n_trials>')
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-n', '--n_trials'):
            n_trials = int(arg)

    random.seed(1)
    np.random.seed(1)

    #### Choose the desired setup ####

    multi_agent_setup = 1
    malfunctions_enabled = False
    agents_one_speed = True

    ##################################

    # Single agent (1)
    if multi_agent_setup == 1:
        x_dim = 35
        y_dim = 35
        n_agents = 1
        max_num_cities = 3
        max_rails_between_cities = 2
        max_rails_in_city = 3

    # Multi agent (3)
    if multi_agent_setup == 3:
        x_dim = 40
        y_dim = 40
        n_agents = 3
        max_num_cities = 4
        max_rails_between_cities = 2
        max_rails_in_city = 3

    # Multi agent (5)
    if multi_agent_setup == 5:
        x_dim = 16 * 3
        y_dim = 9 * 3
        n_agents = 5
        max_num_cities = 5
        max_rails_between_cities = 2
        max_rails_in_city = 3

    # Multi agent (10)
    if multi_agent_setup == 10:
        x_dim = 16 * 4
        y_dim = 9 * 4
        n_agents = 10
        max_num_cities = 9
        max_rails_between_cities = 5
        max_rails_in_city = 5

    # Use the malfunction generator to break agents from time to time
    stochastic_data = {
        'malfunction_rate': 80,  # Rate of malfunction occurrence of a single agent
        'min_duration': 15,  # Minimal duration of malfunction
        'max_duration': 50  # Max duration of malfunction
    }

    # Custom observation builder
    tree_depth = 2
    TreeObservation = TreeObsForRailEnv(
        max_depth=tree_depth, predictor=ShortestPathPredictorForRailEnv(20))

    np.savetxt(fname=path.join('Nets', 'info.txt'),
               X=[
                   x_dim, y_dim, n_agents, max_num_cities,
                   max_rails_between_cities, max_rails_in_city, tree_depth
               ],
               delimiter=';')

    # Different agent types (trains) with different speeds.
    if agents_one_speed:
        speed_ration_map = {
            1.: 1.,  # Fast passenger train
            1. / 2.: 0.0,  # Fast freight train
            1. / 3.: 0.0,  # Slow commuter train
            1. / 4.: 0.0
        }  # Slow freight train
    else:
        speed_ration_map = {
            1.: 0.25,  # Fast passenger train
            1. / 2.: 0.25,  # Fast freight train
            1. / 3.: 0.25,  # Slow commuter train
            1. / 4.: 0.25
        }  # Slow freight train

    if malfunctions_enabled:
        env = RailEnv(
            width=x_dim,
            height=y_dim,
            rail_generator=sparse_rail_generator(
                max_num_cities=max_num_cities,
                # Number of cities in map (where train stations are)
                seed=14,  # Random seed
                grid_mode=False,
                max_rails_between_cities=max_rails_between_cities,
                max_rails_in_city=max_rails_in_city),
            schedule_generator=sparse_schedule_generator(speed_ration_map),
            malfunction_generator_and_process_data=malfunction_from_params(
                stochastic_data),
            number_of_agents=n_agents,
            obs_builder_object=TreeObservation)
    else:
        env = RailEnv(
            width=x_dim,
            height=y_dim,
            rail_generator=sparse_rail_generator(
                max_num_cities=max_num_cities,
                # Number of cities in map (where train stations are)
                seed=14,  # Random seed
                grid_mode=False,
                max_rails_between_cities=max_rails_between_cities,
                max_rails_in_city=max_rails_in_city),
            schedule_generator=sparse_schedule_generator(speed_ration_map),
            number_of_agents=n_agents,
            obs_builder_object=TreeObservation)

    env.reset(True, True)

    # After training we want to render the results so we also load a renderer
    env_renderer = RenderTool(
        env,
        gl="PILSVG",
        screen_height=800,  # Adjust these parameters to fit your resolution
        screen_width=900)
    # Given the depth of the tree observation and the number of features per node we get the following state_size
    num_features_per_node = env.obs_builder.observation_dim

    nr_nodes = 0
    for i in range(tree_depth + 1):
        nr_nodes += np.power(4, i)
    state_size = num_features_per_node * nr_nodes

    # The action space of flatland is 5 discrete actions
    action_size = 5
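    # Flatland's discrete actions: 0 = do nothing, 1 = turn left, 2 = move forward, 3 = turn right, 4 = stop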

    # We set the number of episodes we would like to train on
    if 'n_trials' not in locals():
        n_trials = 15000

    # And the max number of steps we want to take per episode
    max_steps = int(3 * (env.height + env.width))

    # Define training parameters
    eps = 1.
    eps_end = 0.005
    eps_decay = 0.998
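    # With eps_decay = 0.998, epsilon hits the eps_end floor of 0.005 after roughly ln(0.005)/ln(0.998), i.e. about 2,650 episodes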

    # And some variables to keep track of the progress
    action_dict = dict()
    final_action_dict = dict()
    scores_window = deque(maxlen=100)
    done_window = deque(maxlen=100)
    deadlock_window = deque(maxlen=100)
    deadlock_average = []
    scores = []
    dones_list = []
    # Metrics
    eps_list = []
    action_prob_list = []
    action_prob = [0] * action_size
    agent_obs = [None] * env.get_num_agents()
    agent_next_obs = [None] * env.get_num_agents()
    agent_obs_buffer = [None] * env.get_num_agents()
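    # The action buffer starts at 2, which is move-forward in Flatland's action space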
    agent_action_buffer = [2] * env.get_num_agents()
    cummulated_reward = np.zeros(env.get_num_agents())
    update_values = False
    # Now we load a Double dueling DQN agent
    agent = Agent(state_size, action_size)

    for trials in range(1, n_trials + 1):

        #print(torch.cuda.current_device())
        # Reset environment
        obs, info = env.reset(True, True)
        #env_renderer.reset()
        # Build agent specific observations
        for a in range(env.get_num_agents()):
            if obs[a]:
                agent_obs[a] = normalize_observation(obs[a],
                                                     tree_depth,
                                                     observation_radius=10)
                agent_obs_buffer[a] = agent_obs[a].copy()

        # Reset score and done
        score = 0
        env_done = 0

        # Run episode
        for step in range(max_steps):
            # Action
            for a in range(env.get_num_agents()):
                if info['action_required'][a]:
                    # If an action is required, we want to store the obs at that step as well as the action
                    update_values = True
                    action = agent.act(agent_obs[a], eps=eps)
                    action_prob[action] += 1
                else:
                    update_values = False
                    action = 0
                action_dict.update({a: action})

            # Environment step
            next_obs, all_rewards, done, deadlocks, info = env.step(
                action_dict)
            #env_renderer.render_env(show=True, show_predictions=True, show_observations=True)
            # Update replay buffer and train agent
            for a in range(env.get_num_agents()):
                # Only update the values when we are done or when an action was taken and thus relevant information is present
                if update_values or done[a]:
                    agent.step(agent_obs_buffer[a], agent_action_buffer[a],
                               all_rewards[a], agent_obs[a], done[a])
                    cummulated_reward[a] = 0.

                    agent_obs_buffer[a] = agent_obs[a].copy()
                    agent_action_buffer[a] = action_dict[a]
                if next_obs[a]:
                    agent_obs[a] = normalize_observation(next_obs[a],
                                                         tree_depth,
                                                         observation_radius=10)

                score += all_rewards[a] / env.get_num_agents()

            # End the episode once all agents are done
            if done['__all__']:
                env_done = 1
                break

        # Epsilon decay
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon

        # Collect information about training
        tasks_finished = 0
        for _idx in range(env.get_num_agents()):
            if done[_idx] == 1:
                tasks_finished += 1
        done_window.append(tasks_finished / max(1, env.get_num_agents()))
        scores_window.append(score / max_steps)  # save most recent score
        scores.append(np.mean(scores_window))
        deadlock_window.append(
            deadlocks.count(1) / max(1, env.get_num_agents()))
        deadlock_average.append(np.mean(deadlock_window))
        dones_list.append((np.mean(done_window)))

        eps_list.append(eps)
        action_prob_list.append(action_prob / np.sum(action_prob))
        print(
            '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f} %\tDeadlocks: {:.2f} \tEpsilon: {:.2f} \t Action Probabilities: \t {}'
            .format(env.get_num_agents(), x_dim, y_dim, trials,
                    np.mean(scores_window), 100 * np.mean(done_window),
                    np.mean(deadlock_window), eps,
                    action_prob / np.sum(action_prob)),
            end=" ")

        if trials % 100 == 0:
            print(
                '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'
                .format(env.get_num_agents(), x_dim, y_dim, trials,
                        np.mean(scores_window), 100 * np.mean(done_window),
                        eps, action_prob / np.sum(action_prob)))
            torch.save(
                agent.qnetwork_local.state_dict(),
                path.join('Nets',
                          ('navigator_checkpoint' + str(trials) + '.pth')))

            action_prob = [1] * action_size

        if trials % 50 == 0:

            np.savetxt(fname=path.join('Nets', 'metrics.csv'),
                       X=np.transpose(
                           np.asarray([
                               scores, dones_list, deadlock_average, eps_list
                           ])),
                       delimiter=';',
                       newline='\n')
            np.savetxt(fname=path.join('Nets', 'action_prob.csv'),
                       X=np.asarray(action_prob_list),
                       delimiter=';',
                       newline='\n')

    # Plot overall training progress at the end
    plt.plot(scores)
    plt.show()
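Example #4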
def main(argv):
    try:
        opts, args = getopt.getopt(argv, "n:", ["n_trials="])
    except getopt.GetoptError:
        print('training_navigation.py -n <n_trials>')
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-n', '--n_trials'):
            n_trials = int(arg)

    random.seed(1)
    np.random.seed(1)

    # Parameters for the Environment
    x_dim = 35
    y_dim = 35
    n_agents = 10


    # Use the malfunction generator to break agents from time to time
    stochastic_data = {'malfunction_rate': 8000,  # Rate of malfunction occurrence of a single agent
                       'min_duration': 15,  # Minimal duration of malfunction
                       'max_duration': 50  # Max duration of malfunction
                       }

    # Custom observation builder
    TreeObservation = TreeObsForRailEnv(max_depth=2, predictor=ShortestPathPredictorForRailEnv(30))

    # Different agent types (trains) with different speeds.
    speed_ration_map = {1.: 0.25,  # Fast passenger train
                        1. / 2.: 0.25,  # Fast freight train
                        1. / 3.: 0.25,  # Slow commuter train
                        1. / 4.: 0.25}  # Slow freight train

    env = RailEnv(width=x_dim,
                  height=y_dim,
                  rail_generator=sparse_rail_generator(max_num_cities=3,
                                                       # Number of cities in map (where train stations are)
                                                       seed=1,  # Random seed
                                                       grid_mode=False,
                                                       max_rails_between_cities=2,
                                                       max_rails_in_city=3),
                  schedule_generator=sparse_schedule_generator(speed_ration_map),
                  number_of_agents=n_agents,
                  malfunction_generator_and_process_data=malfunction_from_params(stochastic_data),
                  obs_builder_object=TreeObservation)

    # After training we want to render the results so we also load a renderer
    env_renderer = RenderTool(env, gl="PILSVG", )
    # Given the depth of the tree observation and the number of features per node we get the following state_size
    num_features_per_node = env.obs_builder.observation_dim
    tree_depth = 2
    nr_nodes = 0
    for i in range(tree_depth + 1):
        nr_nodes += np.power(4, i)
    state_size = num_features_per_node * nr_nodes

    # The action space of flatland is 5 discrete actions
    action_size = 5

    # We set the number of episodes we would like to train on
    if 'n_trials' not in locals():
        n_trials = 15000

    # And the max number of steps we want to take per episode
    max_steps = int(4 * 2 * (20 + env.height + env.width))
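    # For a 35x35 map this gives 4 * 2 * (20 + 35 + 35) = 720 steps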

    # Define training parameters
    eps = 1.
    eps_end = 0.005
    eps_decay = 0.998

    # And some variables to keep track of the progress
    action_dict = dict()
    final_action_dict = dict()
    scores_window = deque(maxlen=100)
    done_window = deque(maxlen=100)
    scores = []
    dones_list = []
    action_prob = [0] * action_size
    agent_obs = [None] * env.get_num_agents()
    agent_next_obs = [None] * env.get_num_agents()
    agent_obs_buffer = [None] * env.get_num_agents()
    agent_action_buffer = [2] * env.get_num_agents()
    cummulated_reward = np.zeros(env.get_num_agents())
    update_values = [False] * env.get_num_agents()
    # Now we load a Double dueling DQN agent
    agent = Agent(state_size, action_size)

    for trials in range(1, n_trials + 1):

        # Reset environment
        obs, info = env.reset(True, True)
        env_renderer.reset()
        # Build agent specific observations
        for a in range(env.get_num_agents()):
            if obs[a]:
                agent_obs[a] = normalize_observation(obs[a], tree_depth, observation_radius=10)
                agent_obs_buffer[a] = agent_obs[a].copy()

        # Reset score and done
        score = 0
        env_done = 0

        # Run episode
        while True:
            # Action
            for a in range(env.get_num_agents()):
                if info['action_required'][a]:
                    # If an action is required, we want to store the obs at that step as well as the action
                    update_values[a] = True
                    action = agent.act(agent_obs[a], eps=eps)
                    action_prob[action] += 1
                else:
                    update_values[a] = False
                    action = 0
                action_dict.update({a: action})

            # Environment step
            next_obs, all_rewards, done, info = env.step(action_dict)
            # Update replay buffer and train agent
            for a in range(env.get_num_agents()):
                # Only update the values when we are done or when an action was taken and thus relevant information is present
                if update_values[a] or done[a]:
                    agent.step(agent_obs_buffer[a], agent_action_buffer[a], all_rewards[a],
                               agent_obs[a], done[a])
                    cummulated_reward[a] = 0.

                    agent_obs_buffer[a] = agent_obs[a].copy()
                    agent_action_buffer[a] = action_dict[a]
                if next_obs[a]:
                    agent_obs[a] = normalize_observation(next_obs[a], tree_depth, observation_radius=10)

                score += all_rewards[a] / env.get_num_agents()

            # End the episode once all agents are done
            if done['__all__']:
                env_done = 1
                break

        # Epsilon decay
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon

        # Collect information about training
        tasks_finished = 0
        for current_agent in env.agents:
            if current_agent.status == RailAgentStatus.DONE_REMOVED:
                tasks_finished += 1
        done_window.append(tasks_finished / max(1, env.get_num_agents()))
        scores_window.append(score / max_steps)  # save most recent score
        scores.append(np.mean(scores_window))
        dones_list.append((np.mean(done_window)))

        print(
            '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
                env.get_num_agents(), x_dim, y_dim,
                trials,
                np.mean(scores_window),
                100 * np.mean(done_window),
                eps, action_prob / np.sum(action_prob)), end=" ")

        if trials % 100 == 0:
            print(
                '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
                    env.get_num_agents(), x_dim, y_dim,
                    trials,
                    np.mean(scores_window),
                    100 * np.mean(done_window),
                    eps, action_prob / np.sum(action_prob)))
            torch.save(agent.qnetwork_local.state_dict(),
                       './Nets/navigator_checkpoint' + str(trials) + '.pth')
            action_prob = [1] * action_size

    # Plot overall training progress at the end
    plt.plot(scores)
    plt.show()
Example #5
def main(argv):
    try:
        opts, args = getopt.getopt(argv, "n:", ["n_episodes="])
    except getopt.GetoptError:
        print('training_navigation.py -n <n_episodes>')
        sys.exit(2)
    for opt, arg in opts:
        if opt in ('-n', '--n_episodes'):
            n_episodes = int(arg)

    # Initialize the random seeds
    random.seed(1)
    np.random.seed(1)

    # Initialize a random map with a random number of agents
    x_dim = np.random.randint(8, 20)
    y_dim = np.random.randint(8, 20)
    n_agents = np.random.randint(3, 8)
    n_goals = n_agents + np.random.randint(0, 3)
    min_dist = int(0.75 * min(x_dim, y_dim))
    tree_depth = 2
    print("main2")
    demo = False

    # Get an observation builder and predictor
    observation_helper = TreeObsForRailEnv(max_depth=tree_depth, predictor=ShortestPathPredictorForRailEnv())

    env = RailEnv(width=x_dim,
                  height=y_dim,
                  rail_generator=complex_rail_generator(nr_start_goal=n_goals, nr_extra=5, min_dist=min_dist,
                                                        max_dist=99999,
                                                        seed=0),
                  schedule_generator=complex_schedule_generator(),
                  obs_builder_object=observation_helper,
                  number_of_agents=n_agents)
    env.reset(True, True)

    handle = env.get_agent_handles()
    features_per_node = env.obs_builder.observation_dim
    nr_nodes = 0
    for i in range(tree_depth + 1):
        nr_nodes += np.power(4, i)
    state_size = 2 * features_per_node * nr_nodes  # We will use two time steps per observation --> 2x state_size
    action_size = 5

    # We set the number of episodes we would like to train on
    if 'n_episodes' not in locals():
        n_episodes = 60000

    # Set max number of steps per episode as well as other training relevant parameter
    max_steps = int(3 * (env.height + env.width))
    eps = 1.
    eps_end = 0.005
    eps_decay = 0.9995
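    # With eps_decay = 0.9995, epsilon only reaches the 0.005 floor after about ln(0.005)/ln(0.9995), i.e. roughly 10,600 episodes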
    action_dict = dict()
    final_action_dict = dict()
    scores_window = deque(maxlen=100)
    done_window = deque(maxlen=100)
    time_obs = deque(maxlen=2)
    scores = []
    dones_list = []
    action_prob = [0] * action_size
    agent_obs = [None] * env.get_num_agents()
    agent_next_obs = [None] * env.get_num_agents()
    # Initialize the agent
    agent = Agent(state_size, action_size)

    # Here you can pre-load an agent
    if False:
        with path(torch_training.Nets, "avoid_checkpoint500.pth") as file_in:
            agent.qnetwork_local.load_state_dict(torch.load(file_in))

    # Do training over n_episodes
    for episodes in range(1, n_episodes + 1):
        """
        Training Curriculum: In order to get good generalization we change the number of agents
        and the size of the levels every 50 episodes.
        """
        if episodes % 50 == 0:
            x_dim = np.random.randint(8, 20)
            y_dim = np.random.randint(8, 20)
            n_agents = np.random.randint(3, 8)
            n_goals = n_agents + np.random.randint(0, 3)
            min_dist = int(0.75 * min(x_dim, y_dim))
            env = RailEnv(width=x_dim,
                          height=y_dim,
                          rail_generator=complex_rail_generator(nr_start_goal=n_goals, nr_extra=5, min_dist=min_dist,
                                                                max_dist=99999,
                                                                seed=0),
                          schedule_generator=complex_schedule_generator(),
                          obs_builder_object=TreeObsForRailEnv(max_depth=3,
                                                               predictor=ShortestPathPredictorForRailEnv()),
                          number_of_agents=n_agents)

            # Adjust the parameters according to the new env.
            max_steps = int(3 * (env.height + env.width))
            agent_obs = [None] * env.get_num_agents()
            agent_next_obs = [None] * env.get_num_agents()

        # Reset environment
        obs, info = env.reset(True, True)

        # Set up placeholders for each agent's final observation. This is necessary because agents terminate at
        # different times during an episode
        final_obs = agent_obs.copy()
        final_obs_next = agent_next_obs.copy()

        # Build agent specific observations
        for a in range(env.get_num_agents()):
            data, distance, agent_data = split_tree_into_feature_groups(obs[a], tree_depth)
            data = norm_obs_clip(data)
            distance = norm_obs_clip(distance)
            agent_data = np.clip(agent_data, -1, 1)
            obs[a] = np.concatenate((np.concatenate((data, distance)), agent_data))

        # Accumulate two time steps of observation (Here just twice the first state)
        for i in range(2):
            time_obs.append(obs)

        # Build the agent-specific observation by stacking the two time steps
        for a in range(env.get_num_agents()):
            agent_obs[a] = np.concatenate((time_obs[0][a], time_obs[1][a]))

        score = 0
        env_done = 0
        # Run episode
        for step in range(max_steps):

            # Action
            for a in range(env.get_num_agents()):
                if demo:
                    eps = 0
                # action = agent.act(np.array(obs[a]), eps=eps)
                action = agent.act(agent_obs[a], eps=eps)
                action_prob[action] += 1
                action_dict.update({a: action})
            # Environment step

            next_obs, all_rewards, done, _ = env.step(action_dict)
            for a in range(env.get_num_agents()):
                data, distance, agent_data = split_tree_into_feature_groups(next_obs[a], tree_depth)
                data = norm_obs_clip(data)
                distance = norm_obs_clip(distance)
                agent_data = np.clip(agent_data, -1, 1)
                next_obs[a] = np.concatenate((np.concatenate((data, distance)), agent_data))
            time_obs.append(next_obs)

            # Update replay buffer and train agent
            for a in range(env.get_num_agents()):
                agent_next_obs[a] = np.concatenate((time_obs[0][a], time_obs[1][a]))
                if done[a]:
                    final_obs[a] = agent_obs[a].copy()
                    final_obs_next[a] = agent_next_obs[a].copy()
                    final_action_dict.update({a: action_dict[a]})
                if not demo and not done[a]:
                    agent.step(agent_obs[a], action_dict[a], all_rewards[a], agent_next_obs[a], done[a])
                score += all_rewards[a] / env.get_num_agents()

            agent_obs = agent_next_obs.copy()
            if done['__all__']:
                env_done = 1
                for a in range(env.get_num_agents()):
                    agent.step(final_obs[a], final_action_dict[a], all_rewards[a], final_obs_next[a], done[a])
                break
        # Epsilon decay
        eps = max(eps_end, eps_decay * eps)  # decrease epsilon

        done_window.append(env_done)
        scores_window.append(score / max_steps)  # save most recent score
        scores.append(np.mean(scores_window))
        dones_list.append((np.mean(done_window)))

        print(
            '\rTraining {} Agents on ({},{}).\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
                env.get_num_agents(), x_dim, y_dim,
                episodes,
                np.mean(scores_window),
                100 * np.mean(done_window),
                eps, action_prob / np.sum(action_prob)), end=" ")

        if episodes % 100 == 0:
            print(
                '\rTraining {} Agents.\t Episode {}\t Average Score: {:.3f}\tDones: {:.2f}%\tEpsilon: {:.2f} \t Action Probabilities: \t {}'.format(
                    env.get_num_agents(),
                    episodes,
                    np.mean(scores_window),
                    100 * np.mean(done_window),
                    eps,
                    action_prob / np.sum(action_prob)))
            torch.save(agent.qnetwork_local.state_dict(),
                       './Nets/avoid_checkpoint' + str(episodes) + '.pth')
            action_prob = [1] * action_size
    plt.plot(scores)
    plt.show()
Example #6
features_per_node = 9
tree_depth = 3
nodes = 0
for i in range(tree_depth + 1):
    nodes += np.power(4, i)
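# With tree_depth = 3 this sums to 1 + 4 + 16 + 64 = 85 nodes, so state_size = 9 * 85 = 765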
state_size = features_per_node * nodes
action_size = 5
action_dict = dict()
nr_trials_per_test = 100
test_results = []
test_times = []
test_dones = []
sequential_agent_test = False

# Load your agent
agent = Agent(state_size, action_size)
agent.qnetwork_local.load_state_dict(
    torch.load('../torch_training/Nets/avoid_checkpoint500.pth'))

# Load the necessary Observation Builder and Predictor
predictor = ShortestPathPredictorForRailEnv()
observation_builder = TreeObsForRailEnv(max_depth=tree_depth,
                                        predictor=predictor)

start_time_scoring = time.time()

score_board = []
for test_nr in parameters:
    current_parameters = parameters[test_nr]
    test_score, test_dones, test_time = run_test(
        current_parameters,