Example #1
    def __init__(self, **mpe_args):
        """Create a new Multi-Agent Particle env compatible with RLlib.
        Arguments:
            mpe_args (dict): Arguments to pass to the underlying
                make_env.make_env instance.
        Examples:
            from rllib_env import RLlibMultiAgentParticleEnv
            env = RLlibMultiAgentParticleEnv(scenario_name="simple_reference")
            print(env.reset())
        """

        self._env = make_env(**mpe_args)
        self.num_agents = self._env.n
        self.agent_ids = list(range(self.num_agents))

        self.observation_space_dict = self._make_dict(
            self._env.observation_space)
        self.action_space_dict = self._make_dict(self._env.action_space)
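
A note on the `_make_dict` helper called above: it is not included in this excerpt. As a hedged sketch, it presumably pairs the integer agent ids with the corresponding per-agent values (observation spaces, action spaces, and later per-agent observations and rewards). The standalone function below and the commented RLlib registration hint are illustrative assumptions, not code from the source.

# Hypothetical sketch (not from the source): key a per-agent list, such as
# the observation or action spaces, by the integer agent ids built in __init__.
def make_agent_dict(agent_ids, values):
    return dict(zip(agent_ids, values))


if __name__ == "__main__":
    # e.g. {0: 'obs_space_0', 1: 'obs_space_1'}
    print(make_agent_dict([0, 1], ["obs_space_0", "obs_space_1"]))

    # Registering the wrapper with RLlib would then look roughly like:
    #   from ray.tune.registry import register_env
    #   register_env("mpe", lambda cfg: RLlibMultiAgentParticleEnv(**cfg))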
Example #2
def train(arglist):
    """
    Run MADDPG algorithm using passed in commandline arguments

    Args:
        arglist (argparse.Namespace): Parsed commandline arguments object
    """
    tf.reset_default_graph()

    if arglist.seed is not None:
        np.random.seed(arglist.seed)
        tf.set_random_seed(arglist.seed)

    with tf_util.make_session(config=None,
                              num_cpu=1,
                              make_default=False,
                              graph=None):
        # with tf_util.single_threaded_session():
        ###########################################
        #         Create environment              #
        ###########################################
        env = make_env(arglist.scenario,
                       arglist=arglist,
                       done=arglist.done_callback,
                       logging=arglist.logging,
                       benchmark=arglist.benchmark)

        ###########################################
        #        Create agent trainers            #
        ###########################################
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)

        print("Number of Adversaries: {}".format(num_adversaries))
        print('Experiment: {}. Using good policy {} and adv policy {}'.format(
            arglist.exp_name, arglist.good_policy, arglist.adv_policy))

        ###########################################
        #              Initialize                 #
        ###########################################
        tf_util.initialize()

        ###########################################
        #   Load previous results, if necessary   #
        ###########################################
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir

        # if arglist.display or arglist.restore or arglist.benchmark or arglist.load_dir is not None:
        if arglist.restore or arglist.benchmark or arglist.load_dir is not None:
            print('Loading previous state...')

            # Set model file
            if arglist.model_file == "":
                arglist.model_file = arglist.exp_name

            print("Model File: " + arglist.load_dir + arglist.model_file)
            tf_util.load_state(arglist.load_dir + arglist.model_file)

        ###########################################
        #       Create the save directory         #
        ###########################################
        if not os.path.exists(arglist.save_dir):
            os.makedirs(arglist.save_dir, exist_ok=True)

        if not os.path.exists(arglist.plots_dir):
            os.makedirs(arglist.plots_dir, exist_ok=True)

        ###########################################
        #             Set parameters              #
        ###########################################
        # Sum of rewards for all agents
        episode_rewards = [0.0]

        # Individual agent rewards. Note: this iterates over env.world.agents
        # (rather than range(env.n)) so that rewards can be tracked for
        # fixed-policy agents as well as learning agents.
        agent_rewards = [[0.0] for _ in range(len(env.world.agents))]

        # Retrieve previous episode count
        try:
            prev_ep_ct = int(arglist.model_file.split("_")[-1])
        except ValueError:
            print("Starting from untrained network...")
            prev_ep_ct = 0
        ep_ct = prev_ep_ct + arglist.num_episodes

        # Sum of rewards for training curve
        final_ep_rewards = []

        # Agent rewards for training curve
        final_ep_ag_rewards = []

        # Placeholder for benchmarking info
        agent_info = [[[]]]

        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()
        progress = False

        # Save more often if you have fewer episodes
        arglist.save_rate = min(arglist.save_rate, arglist.num_episodes)

        # Initialize loss file for each agent
        if arglist.log_loss:
            for i in range(len(env.world.agents)):
                log_loss(arglist, ep_ct, "agent_{}".format(i), initialize=True)

        ###########################################
        #                 Start                   #
        ###########################################
        print('Starting iterations...')
        while True:
            # TODO: Switch to isinstance()
            # if type(env.world.scripted_agents[0].action) == type(None):
            #     print("Error")

            # Get action
            action_n = [
                agent.action(obs) for agent, obs in zip(trainers, obs_n)
            ]

            # Environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)

            # Logging step
            if arglist.logging:
                env.log(
                    len(episode_rewards) + prev_ep_ct, episode_step, new_obs_n,
                    rew_n, done_n, info_n)

            # Update information
            episode_step += 1

            # Check if all agents are done
            # done = all(done_n)

            # Check if any agents are done
            done = any(done_n)

            terminal = (episode_step >= arglist.max_episode_len)

            # Collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                 done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            # For displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                if done or terminal:
                    print('Episode Reward: {}'.format(
                        [rew[-1] for rew in agent_rewards]))
                    time.sleep(0.5)
                    obs_n = env.reset()
                    episode_step = 0
                    episode_rewards.append(0)
                    for a in agent_rewards:
                        a.append(0)
                    agent_info.append([[]])
                continue

            if done or terminal:
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # Increment global step counter
            train_step += 1

            # For benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])

                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # In testing mode, don't perform model updates
            if arglist.testing:
                if len(episode_rewards) > arglist.num_episodes:
                    print("episodes: {}, "
                          "mean episode reward: {}, time: {}".format(
                              len(episode_rewards),
                              np.mean(episode_rewards[-arglist.save_rate:]),
                              round(time.time() - t_start, 3)))
                    env.logger.save("State",
                                    arglist.save_dir,
                                    filename=arglist.exp_name + '_state' +
                                    '_' + str(prev_ep_ct) + arglist.log_append)
                    break
                continue

            # Update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for i, agent in enumerate(trainers):
                loss = agent.update(trainers, train_step)
                if arglist.log_loss and loss is not None:
                    log_loss(arglist,
                             ep_ct,
                             "agent_{}".format(i),
                             loss=loss[1])

            if len(episode_rewards) % 100 == 0 and progress:
                print("Episode {} Reached. Time: {}".format(
                    len(episode_rewards),
                    time.time() - t_start))
                progress = False
            elif len(episode_rewards) % 100 != 0 and not progress:
                progress = True

            # Save model, display training output
            if (terminal or done) and (len(episode_rewards) % arglist.save_rate
                                       == 0):
                # TODO: Implement some checks so that we don't overwrite old networks unintentionally?

                # Save model state
                tf_util.save_state(arglist.save_dir + arglist.exp_name + '_' +
                                   str(len(episode_rewards) + prev_ep_ct),
                                   saver=saver)

                # Print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, time: {}"
                        .format(train_step,
                                len(episode_rewards) + prev_ep_ct,
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                round(time.time() - t_start, 3)))
                else:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}"
                        .format(train_step,
                                len(episode_rewards) + prev_ep_ct,
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                [
                                    np.mean(reward[-arglist.save_rate:])
                                    for reward in agent_rewards
                                ], round(time.time() - t_start, 3)))

                # Reset start time to current time
                t_start = time.time()

                # Keep track of final episode reward
                final_ep_rewards.append(
                    np.mean(episode_rewards[-arglist.save_rate:]))
                for reward in agent_rewards:
                    final_ep_ag_rewards.append(
                        np.mean(reward[-arglist.save_rate:]))

            # Saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)

                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)

                # Log agent data for run
                env.logger.save("State",
                                arglist.save_dir,
                                filename=arglist.exp_name + '_state' + '_' +
                                str(len(episode_rewards) + prev_ep_ct))

                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards)))
                break
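
The train() function above reads many attributes off arglist, but the argument parser itself is not part of this excerpt. Below is a minimal sketch of the command-line interface implied by those attribute accesses; the flag names mirror the attributes used in the function, while the defaults are illustrative assumptions rather than the project's actual values.

import argparse


def parse_args():
    # Sketch of the CLI implied by train(); only a representative subset of
    # the attributes accessed above is covered, and all defaults are assumed.
    parser = argparse.ArgumentParser("MADDPG training")
    parser.add_argument("--scenario", type=str, default="simple")
    parser.add_argument("--exp-name", type=str, default="maddpg_run")
    parser.add_argument("--seed", type=int, default=None)
    parser.add_argument("--num-episodes", type=int, default=60000)
    parser.add_argument("--max-episode-len", type=int, default=25)
    parser.add_argument("--num-adversaries", type=int, default=0)
    parser.add_argument("--good-policy", type=str, default="maddpg")
    parser.add_argument("--adv-policy", type=str, default="maddpg")
    parser.add_argument("--save-rate", type=int, default=1000)
    parser.add_argument("--save-dir", type=str, default="./policy/")
    parser.add_argument("--load-dir", type=str, default="")
    parser.add_argument("--model-file", type=str, default="")
    parser.add_argument("--plots-dir", type=str, default="./learning_curves/")
    parser.add_argument("--restore", action="store_true")
    parser.add_argument("--display", action="store_true")
    parser.add_argument("--benchmark", action="store_true")
    parser.add_argument("--testing", action="store_true")
    return parser.parse_args()


# train(parse_args()) would then drive the training loop above.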
Example #3
def train(arglist):
    """
    Run MADDPG algorithm using passed in commandline arguments

    Args:
        arglist (argparse.Namespace): Parsed commandline arguments object
    """
    # Assign roles
    training_role = arglist.training_role[0]
    if arglist.training_role[0] == "defender":
        opponent_role = "attacker"
        opponent_index = 0
    elif arglist.training_role[0] == "attacker":
        opponent_role = "defender"
        opponent_index = 1
    else:
        raise ValueError("Unknown training role: {}".format(arglist.training_role[0]))

    # suppress tensorflow warnings
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

    tf.reset_default_graph()

    if arglist.seed is not None:
        np.random.seed(arglist.seed)
        tf.set_random_seed(arglist.seed)

    with tf_util.make_session(config=None, num_cpu=1, make_default=False, graph=None):
        # with tf_util.single_threaded_session():
        ###########################################
        #         Create environment              #
        ###########################################
        env = make_env(arglist.scenario, arglist=arglist, done=arglist.done_callback,
                       logging=arglist.logging, benchmark=arglist.benchmark)

        ###########################################
        #        Create agent trainers            #
        ###########################################
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)

        print("Training super {} against level 0 to {} opponent."
              .format(arglist.training_role[0], arglist.level))
        print("Number of Adversaries: {}".format(num_adversaries))
        print('Experiment: {}. Using good policy {} and adv policy {}'.format(arglist.exp_name,
                                                                              arglist.good_policy,
                                                                              arglist.adv_policy))

        ###########################################
        #              Initialize                 #
        ###########################################
        tf_util.initialize()

        ###########################################
        #   Load previous results, if necessary   #
        ###########################################
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir

        # if arglist.display or arglist.restore or arglist.benchmark or arglist.load_dir is not None:
        if ((arglist.restore or arglist.load_dir is not None) and arglist.level != 0) or arglist.benchmark:
            print('Loading previous state...')

            print("Level-k folder: " + arglist.load_dir)

            for opp_level in range(0, arglist.level + 1):
                opp_model_file = "level_{}_{}".format(opp_level, opponent_role)
                tf_util.load_state(fname=arglist.load_dir + opp_model_file,
                                   var_prefix="level_{}_{}_{}".format(opp_level, opponent_role, opponent_index))

        ###########################################
        #       Create the save directory         #
        ###########################################
        if not os.path.exists(arglist.save_dir):
            os.makedirs(arglist.save_dir, exist_ok=True)
        if not os.path.exists(arglist.plots_dir):
            os.makedirs(arglist.plots_dir, exist_ok=True)

        ###########################################
        #             Set parameters              #
        ###########################################
        # Sum of rewards for all agents
        episode_rewards = [0.0]

        # Individual agent rewards. Note: this iterates over env.world.agents
        # (rather than range(env.n)) so that rewards can be tracked for
        # fixed-policy agents as well as learning agents.
        agent_rewards = [[0.0] for _ in range(len(env.world.agents))]

        # Retrieve previous episode count
        try:
            prev_ep_ct = int(arglist.model_file.split("_")[-1])
        except ValueError:
            print("Starting from untrained network...")
            prev_ep_ct = 0
        ep_ct = prev_ep_ct + arglist.num_episodes

        # Sum of rewards for training curve
        final_ep_rewards = []

        # Agent rewards for training curve
        final_ep_ag_rewards = []

        # Placeholder for benchmarking info
        agent_info = [[[]]]

        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()
        progress = False

        # Save more often if you have fewer episodes
        arglist.save_rate = min(arglist.save_rate, arglist.num_episodes)

        # Initialize loss file for each agent
        if arglist.log_loss:
            for i in range(len(env.world.agents)):
                log_loss(arglist, ep_ct, "agent_{}".format(i), initialize=True)

        ###########################################
        #                 Start                   #
        ###########################################
        print('Starting iterations...')

        # Initialize opponent selection distribution to uniform
        p_opponent_selection = np.ones(arglist.level + 1) / (arglist.level + 1)

        # Initialize evaluate_flag
        evaluate_flag = False
        evaluation_done = False

        # initialize worst performing level list
        worst_performing_levels = []

        while True:
            # TODO: Switch to isinstance()
            # if type(env.world.scripted_agents[0].action) == type(None):
            #     print("Error")


            # Get opponent and training agents' indices
            good_update_index, opponent_select_index, opponent_select_level = get_update_indices(training_role,
                                                                                                 opponent_role,
                                                                                                 p_opponent_selection,
                                                                                                 arglist)
            updating_indices = [good_update_index]

            # Get action
            good_trainer = trainers[good_update_index]
            opp_trainer = trainers[opponent_select_index]

            if good_update_index > opponent_select_index:
                selected_trainers = [opp_trainer, good_trainer]
            elif good_update_index < opponent_select_index:
                selected_trainers = [good_trainer, opp_trainer]
            else:
                raise Exception("Trainer index selection error!")

            action_n = [agent.action(obs) for agent, obs in zip(selected_trainers, obs_n)]

            # Environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)

            # Logging step
            if arglist.logging:
                env.log(len(episode_rewards) + prev_ep_ct, episode_step, new_obs_n, rew_n, done_n, info_n)

            # Update information
            episode_step += 1

            # Check if all agents are done
            # done = all(done_n)

            # Check if any agents are done
            done = any(done_n)

            terminal = (episode_step >= arglist.max_episode_len)

            # Collect experience
            for i, agent in enumerate(selected_trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            # For displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                if done or terminal:
                    print('Episode Reward: {}'.format([rew[-1] for rew in agent_rewards]))
                    time.sleep(0.5)
                    obs_n = env.reset()
                    episode_step = 0
                    episode_rewards.append(0)
                    for a in agent_rewards:
                        a.append(0)
                    agent_info.append([[]])
                continue

            if done or terminal:
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # Increment global step counter
            train_step += 1

            # For benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])

                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # In testing mode, don't perform model updates
            if arglist.testing:
                if len(episode_rewards) > arglist.num_episodes:
                    print("episodes: {}, "
                          "mean episode reward: {}, time: {}".format(len(episode_rewards),
                                                                     np.mean(episode_rewards[-arglist.save_rate:]),
                                                                     round(time.time() - t_start, 3)))
                    env.logger.save("State",
                                    arglist.save_dir,
                                    filename=arglist.exp_name + '_state' + '_' + str(prev_ep_ct) + arglist.log_append)
                    break
                continue

            # Test current super agent's performance against all other agents
            # Check if all level has been evaluated and update p_select
            if (terminal or done) and evaluate_flag and evaluation_done:
                evaluate_flag = False
                level_performances = level_performances / arglist.evaluate_length
                np.set_printoptions(precision=2)
                print("Evaluation complete, against level 0 to {} performances: {}".format(arglist.level,
                                                                                           level_performances))

                worst_level = np.argmin(level_performances)
                worst_performing_levels.append(worst_level)
                print("Worst performing level is {}".format(worst_level))

                # Update p_select. TODO: check some other distributions
                # p_opponent_selection = np.ones(arglist.level + 1) * 0.6 / arglist.level
                # Reset to a one-hot distribution on the worst-performing level
                # so that the selection probabilities still sum to 1.
                p_opponent_selection = np.zeros(arglist.level + 1)
                p_opponent_selection[worst_level] = 1
                print("Opponent selection probability set to: {}".format(p_opponent_selection))

            # Pop evaluation list and update current evaluate level
            if (terminal or done) and evaluate_flag:
                last_episode_agent_reward = agent_rewards[get_role_index(arglist.training_role[0])][-2]
                level_performances[evaluate_level] += last_episode_agent_reward

                if len(evaluate_levels) == 0:
                    evaluation_done = True
                else:
                    evaluate_level = evaluate_levels.pop(0)  # get the level to evaluate next

                # set up p_selection distribution
                p_opponent_selection = np.zeros(arglist.level + 1)
                p_opponent_selection[evaluate_level] = 1

            # set up evaluate schedules
            if (terminal or done) and (len(episode_rewards) % arglist.evaluate_rate == 0):
                print("Freezing current super-agent's network and performing evaluation.")
                evaluate_flag = True
                evaluation_done = False
                eval_len = arglist.evaluate_length
                evaluate_levels = []
                level_performances = np.zeros(arglist.level + 1)
                for level in range(arglist.level + 1):
                    for i in range(eval_len):
                        evaluate_levels.append(level)
                evaluate_level = evaluate_levels.pop(0)



            # In evaluate mode, don't perform model updates
            if evaluate_flag:
                continue



            # If not in display or benchmark mode, update trainers with index in updating_indices.
            loss = None
            for i, agent in enumerate(trainers):
                if i in updating_indices:
                    agent.preupdate()
            for i, agent in enumerate(trainers):
                if i in updating_indices:
                    loss = agent.update(selected_trainers, train_step)
                if arglist.log_loss and loss is not None:
                    log_loss(arglist, ep_ct, "agent_{}".format(i), loss=loss[1])

            if len(episode_rewards) % 100 == 0 and progress:
                print("Episode {} Reached. Time: {:.2f} s".format(len(episode_rewards), time.time() - t_start))
                progress = False
            elif len(episode_rewards) % 100 != 0 and not progress:
                progress = True

            # Save model, display training output
            if (terminal or done) and (len(episode_rewards) % arglist.save_rate == 0):
                # TODO: Implement some checks so that we don't overwrite old networks unintentionally?

                # Save model state
                tf_util.save_state(arglist.save_dir + arglist.exp_name + '_' + str(len(episode_rewards) + prev_ep_ct),
                                   saver=saver)

                # Print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step,
                        len(episode_rewards) + prev_ep_ct,
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        round(time.time() - t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                        train_step,
                        len(episode_rewards) + prev_ep_ct,
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        [np.mean(reward[-arglist.save_rate:]) for reward in agent_rewards],
                        round(time.time() - t_start, 3)))
                    if arglist.level_k_select_print:
                        print("Opponent selection probability: {}".format(p_opponent_selection))

                # Reset start time to current time
                t_start = time.time()

                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for reward in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(reward[-arglist.save_rate:]))


                # Pickle dump training curve info
                if not os.path.exists(arglist.plots_dir):
                    os.makedirs(arglist.plots_dir)

                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)

                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)

                worst_level_file_name = arglist.plots_dir + arglist.exp_name + '_worst_performing_level.pkl'
                with open(worst_level_file_name, 'wb') as fp:
                    pickle.dump(worst_performing_levels, fp)


            # Saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                if not os.path.exists(arglist.plots_dir):
                    os.makedirs(arglist.plots_dir)

                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)

                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)

                # Log agent data for run
                env.logger.save("State", arglist.save_dir,
                                filename=arglist.exp_name + '_state' + '_' + str(len(episode_rewards) + prev_ep_ct))

                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                print('...Worst performing history: {}'.format(worst_performing_levels))
                break
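
The helpers get_update_indices() and get_role_index() are not shown in this excerpt. As a hedged sketch, the opponent-sampling step they presumably perform is a categorical draw from p_opponent_selection over levels 0..k; the function name and its standalone form below are assumptions for illustration only.

import numpy as np


def sample_opponent_level(p_opponent_selection, rng=None):
    # Draw one opponent level from the categorical distribution over
    # levels 0..k held in p_opponent_selection (assumed to sum to 1).
    rng = np.random.default_rng() if rng is None else rng
    levels = np.arange(len(p_opponent_selection))
    return int(rng.choice(levels, p=p_opponent_selection))


if __name__ == "__main__":
    # Uniform over levels 0..3, matching the initialization inside train()
    p = np.ones(4) / 4
    print(sample_opponent_level(p))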