Пример #1
0
def random_maddpg_test(arglist):
    """Run freshly initialised MADDPG actors on several task environments and log metrics.

    A private ``tf.Graph`` and a single-threaded session are created, one set
    of actors is built with ``get_trainers(..., "actor_", ..., type=0)`` and
    ``U.initialize()`` is called.  The checkpoint-loading code below is
    commented out, so no saved weights are restored in this function.

    ``arglist.num_task_transfer`` environments are then stepped round-robin
    (one step per task per round).  At the end of every episode the final
    energy, fairness index, coverage, disconnection count, out-of-map count,
    accumulated reward and energy efficiency (coverage * fairness / energy)
    of that task are recorded and printed.  When ``arglist.draw_picture_test``
    is set, a text report is appended to ``random_model_test.log`` in the
    task's picture directory, and ``draw_util.drawTest`` is called once the
    episode number equals ``arglist.num_test_episodes``.

    Args:
        arglist: parsed options.  Fields read here: ``num_task_transfer``,
            ``scenario``, ``reward_type``, ``test_data_dir``,
            ``test_data_name``, ``num_adversaries``, ``good_policy``,
            ``adv_policy``, ``load_dir``, ``pictures_dir_transfer_test``,
            ``history_length``, ``max_episode_len``, ``draw_picture_test``
            and ``num_test_episodes``.

    Returns:
        None.  Results are printed and, optionally, written to disk.
    """
    debug = False
    num_tasks = arglist.num_task_transfer  # total number of tasks
    list_of_taskenv = []  # env list
    graph = tf.Graph()
    with graph.as_default():
        with U.single_threaded_session():
            if debug:
                begin = time_begin()
            # 1.1 create the common actors (shared by every task)
            # This template env is only used for its shapes and agent count;
            # the per-task envs that are actually stepped are created below.
            env = make_env(arglist.scenario, reward_type=arglist.reward_type)
            env.set_map(sample_map(arglist.test_data_dir + arglist.test_data_name + "_1.h5"))
            # Create agent trainers
            obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
            num_adversaries = min(env.n, arglist.num_adversaries)
            actors = get_trainers(env, "actor_", num_adversaries, obs_shape_n, arglist, type=0)
            for i in range(num_tasks):
                list_of_taskenv.append(make_env(arglist.scenario, reward_type=arglist.reward_type))
            print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy))

            # 1.2 Initialize
            U.initialize()

            # Output layout: <pictures_dir_transfer_test><model_name>task_<i>
            # model_name is the second-to-last '/'-separated component of load_dir.
            model_name = arglist.load_dir.split('/')[-2] + '/'
            path = arglist.pictures_dir_transfer_test + model_name
            mkdir(path)
            for i in range(num_tasks):
                mkdir(os.path.join(path, "task_" + str(i)))
            # 2.1 load checkpoints (disabled: the actors keep the values set by U.initialize())
            # model_load_dir = os.path.join(arglist.load_dir, str(model_number * arglist.save_rate), 'model.ckpt')
            # print('From ', model_load_dir, ' Loading previous state...')
            # U.load_state(model_load_dir)

            # 3.1 initialise the global (whole-run) variables
            global_steps = np.zeros(num_tasks)  # global timesteps for each env
            episodes_rewards = [[0.0] for _ in range(num_tasks)]  # each element is the sum of all agents' rewards in one episode
            # each element of agent_rewards[i] records a single agent's total reward in one episode
            agent_rewards = [[[0.0] for _ in range(env.n)] for _ in range(num_tasks)]

            # Per-task series with one entry per finished episode.
            energy_consumptions_for_test = [[] for _ in range(num_tasks)]
            j_index = [[] for _ in range(num_tasks)]
            aver_cover = [[] for _ in range(num_tasks)]
            instantaneous_dis = [[] for _ in range(num_tasks)]
            instantaneous_out_the_map = [[] for _ in range(num_tasks)]
            energy_efficiency = [[] for _ in range(num_tasks)]
            instantaneous_accmulated_reward = [[] for _ in range(num_tasks)]

            # 3.2 initialise the local (per-episode) variables
            # Per-task series with one entry per step of the running episode.
            local_steps = np.zeros(num_tasks)  # local timesteps for each env
            energy_one_episode = [[] for _ in range(num_tasks)]
            j_index_one_episode = [[] for _ in range(num_tasks)]
            aver_cover_one_episode = [[] for _ in range(num_tasks)]
            over_map_counter = np.zeros(num_tasks)
            over_map_one_episode = [[] for _ in range(num_tasks)]
            disconnected_number_counter = np.zeros(num_tasks)
            disconnected_number_one_episode = [[] for _ in range(num_tasks)]
            episode_reward_step = np.zeros(num_tasks)  # running sum, over the episode, of the per-step mean reward of all agents
            accmulated_reward_one_episode = [[] for _ in range(num_tasks)]
            route_one_episode = [[] for _ in range(num_tasks)]

            # Values forwarded unchanged to draw_util.drawTest.
            bl_coverage = 0.8
            bl_jainindex = 0.8
            bl_loss = 100

            # 3.3 initialise the environments
            # Each env is reset first and only then given its map "<name>_<i+1>.h5".
            obs_n_list = []
            for i in range(num_tasks):
                obs_n = list_of_taskenv[i].reset()
                list_of_taskenv[i].set_map(
                    sample_map(arglist.test_data_dir + arglist.test_data_name + "_" + str(i + 1) + ".h5", random=False))
                obs_n_list.append(obs_n)

            # 3.4 per task and per agent, a bounded queue pre-filled with
            # history_length copies of that agent's initial observation
            history_n = [[queue.Queue(arglist.history_length) for _ in range(env.n)] for _ in range(num_tasks)]
            for i in range(num_tasks):
                for j in range(env.n):
                    for _ in range(arglist.history_length):
                        history_n[i][j].put(obs_n_list[i][j])
            # 4 test
            episode_start_time = time.time()
            print('Starting iterations...')
            episode_number = 0
            while True:
                # One round: every task environment advances by exactly one step.
                for task_index in range(num_tasks):
                    # 3.1 step the environment
                    current_env = list_of_taskenv[task_index]
                    # get action
                    # NOTE(review): history_n is only written in section 3.4; new
                    # observations are stored in obs_n_list below but never put into
                    # these queues, so the actors always see the initial history.
                    # Confirm this is intended.
                    action_n = [agent.action(obs) for agent, obs in zip(actors, history_n[task_index])]
                    # environment step
                    new_obs_n, rew_n, done_n, info_n = current_env.step(action_n)
                    local_steps[task_index] += 1  # update the local (per-episode) counter
                    global_steps[task_index] += 1  # update the global counter
                    done = all(done_n)
                    terminal = (local_steps[task_index] >= arglist.max_episode_len)

                    # update obs
                    obs_n_list[task_index] = new_obs_n
                    # update reward
                    for i, rew in enumerate(rew_n):
                        episodes_rewards[task_index][-1] += rew
                        agent_rewards[task_index][i][-1] += rew
                    # energy
                    energy_one_episode[task_index].append(current_env.get_energy())
                    # fair index
                    j_index_one_episode[task_index].append(current_env.get_jain_index())
                    # coverage
                    aver_cover_one_episode[task_index].append(current_env.get_aver_cover())
                    # over map counter
                    over_map_counter[task_index] += current_env.get_over_map()
                    over_map_one_episode[task_index].append(over_map_counter[task_index])
                    # disconnected counter
                    disconnected_number_counter[task_index] += current_env.get_dis()
                    disconnected_number_one_episode[task_index].append(disconnected_number_counter[task_index])
                    # reward
                    episode_reward_step[task_index] += np.mean(rew_n)
                    accmulated_reward_one_episode[task_index].append(episode_reward_step[task_index])
                    route = current_env.get_agent_pos()
                    route_one_episode[task_index].append(route)

                    # Episode number is derived from the step count, i.e. it assumes
                    # every episode lasts exactly max_episode_len steps.
                    episode_number = math.ceil(global_steps[task_index] / arglist.max_episode_len)
                    if done or terminal:
                        # record the per-episode values (last step of the episode)
                        energy_consumptions_for_test[task_index].append(energy_one_episode[task_index][-1])  # energy
                        j_index[task_index].append(j_index_one_episode[task_index][-1])  # fairness index
                        aver_cover[task_index].append(aver_cover_one_episode[task_index][-1])  # coverage
                        instantaneous_dis[task_index].append(
                            disconnected_number_one_episode[task_index][-1])  # disconnected
                        instantaneous_out_the_map[task_index].append(
                            over_map_one_episode[task_index][-1])  # out of the map
                        instantaneous_accmulated_reward[task_index].append(
                            accmulated_reward_one_episode[task_index][-1])  # reward
                        energy_efficiency[task_index].append(aver_cover_one_episode[task_index][-1]
                                                             * j_index_one_episode[task_index][-1] /
                                                             energy_one_episode[task_index][-1])  # efficiency

                        # Wall-clock time since the previous episode end of any task.
                        episode_end_time = time.time()
                        episode_time = episode_end_time - episode_start_time
                        episode_start_time = episode_end_time
                        print('Task %d, Episode: %d - energy_consumptions: %s, efficiency: %s, time %s' % (
                            task_index,
                            episode_number,
                            str(current_env.get_energy_origin()),
                            str(energy_efficiency[task_index][-1]),
                            str(round(episode_time, 3))))
                        current_path = os.path.join(path, "task_" + str(task_index))
                        if arglist.draw_picture_test:
                            file_path = os.path.join(current_path,
                                                     "random_model_test.log")
                            if episode_number == arglist.num_test_episodes:
                                # Final episode: summary over all recorded episodes of this task.
                                report = '\nOK===============report=====================\nRadom maddpg Model-testing ' \
                                         + str(arglist.num_test_episodes) + ' episodes\'s result:' \
                                         + '\n!!!Max energy efficiency: ' \
                                         + str(np.max(energy_efficiency[task_index])) \
                                         + '\n!!!Average energy efficiency:' \
                                         + str(np.mean(energy_efficiency[task_index])) \
                                         + '\nAverage average attained coverage: ' \
                                         + str(np.mean(aver_cover[task_index])) + \
                                         '\nAverage Jaint\'s fairness index: ' \
                                         + str(np.mean(j_index[task_index])) + \
                                         '\nAverage normalized average energy consumptions:' \
                                         + str(np.mean(energy_consumptions_for_test[task_index])) \
                                         + "\n" + "==========================end=============================\n"

                                # NOTE(review): current_path has no trailing separator, so this
                                # argument is "<path>task_<i>random_maddpg" rather than a location
                                # inside the task directory. Confirm drawTest expects that prefix.
                                draw_util.drawTest("random",
                                                   current_path+"random_maddpg",
                                                   energy_efficiency[task_index],
                                                   energy_consumptions_for_test[task_index],
                                                   aver_cover[task_index],
                                                   j_index[task_index],
                                                   instantaneous_accmulated_reward[task_index],
                                                   instantaneous_dis[task_index],
                                                   instantaneous_out_the_map[task_index],
                                                   len(aver_cover[task_index]),
                                                   bl_coverage,
                                                   bl_jainindex,
                                                   bl_loss,
                                                   False)
                            else:
                                # Any other episode: report only this episode's values.
                                report = '\nRandom maddpg Model-' \
                                         + '-episode ' + str(episode_number) + ' result:' \
                                         + '\n!!!Energy efficiency: ' \
                                         + str(energy_efficiency[task_index][-1]) \
                                         + '\nAverage attained coverage: ' \
                                         + str(aver_cover[task_index][-1]) + \
                                         '\nJaint\'s fairness index: ' \
                                         + str(j_index[task_index][-1]) + \
                                         '\nnormalized average energy consumptions: ' \
                                         + str(energy_consumptions_for_test[task_index][-1]) \
                                         + "\n"

                            with open(file_path, 'a+') as file:
                                file.write(report)

                        # reset custom statistics variables between episode and epoch-----------------------------------

                        # The per-episode accumulators of ALL tasks are cleared, but only when
                        # the last task finishes an episode.
                        # NOTE(review): this relies on every task ending its episode in the same
                        # round; if `done` fires early for one task the series get out of step.
                        if task_index == num_tasks - 1:
                            energy_one_episode = [[] for _ in range(num_tasks)]
                            j_index_one_episode = [[] for _ in range(num_tasks)]
                            aver_cover_one_episode = [[] for _ in range(num_tasks)]
                            over_map_counter = np.zeros(num_tasks)
                            over_map_one_episode = [[] for _ in range(num_tasks)]
                            disconnected_number_counter = np.zeros(num_tasks)
                            disconnected_number_one_episode = [[] for _ in range(num_tasks)]
                            episode_reward_step = np.zeros(num_tasks)
                            accmulated_reward_one_episode = [[] for _ in range(num_tasks)]
                            route_one_episode = [[] for _ in range(num_tasks)]

                        # reset the local variables
                        obs_n_list[task_index] = current_env.reset()  # reset env
                        current_env.set_map(
                            sample_map(
                                arglist.test_data_dir + arglist.test_data_name + "_" + str(task_index + 1) + ".h5",
                                random=False))
                        local_steps[task_index] = 0  # reset the local counter

                        # update the global variables
                        episodes_rewards[task_index].append(0)  # open a slot for the next episode
                        for reward in agent_rewards[task_index]:
                            reward.append(0)

                # episode_number here is the value left by the last task of the round.
                if episode_number > arglist.num_test_episodes:
                    break
Пример #2
0
def _cross_task_mean(per_task_series):
    """Average per-episode values across tasks.

    ``per_task_series[t][k]`` is the value task ``t`` recorded at the end of
    its ``k``-th test episode.  The result holds one float per episode and is
    truncated to the shortest task history, so ragged input cannot raise.
    """
    if not per_task_series:
        return []
    n_episodes = min(len(series) for series in per_task_series)
    return [float(np.mean([series[k] for series in per_task_series]))
            for k in range(n_episodes)]


def test(arglist):
    """Evaluate the saved checkpoints of a multi-task model, one after another.

    For every checkpoint index ``1 .. train_num_episodes / save_rate`` the
    function (optionally) restores the checkpoint, resets all task
    environments and steps them round-robin (one step per task per round)
    until each task has played ``arglist.num_episodes`` episodes.  At the end
    of every episode the final energy, fairness index, coverage,
    disconnection count, out-of-map count, accumulated reward and energy
    efficiency (coverage * fairness / energy) of that task are recorded.

    When ``arglist.draw_picture_test`` is set, a report is appended to
    ``test_report.txt`` and ``draw_util.drawTest`` is called whenever the
    episode counter reaches a multiple of ``arglist.save_rate``; the
    checkpoint with the highest average energy efficiency is remembered and
    written to the same report once all checkpoints have been tested.
    Otherwise the (empty) reward curves are pickled to ``arglist.plots_dir``
    as in the original code.

    Changes relative to the previous version, which could not run:
      * ``global_steps`` is now defined; the undefined ``episode_rewards`` and
        ``train_step`` were replaced by ``episodes_rewards[task_index]`` and
        the per-task global step counter.
      * Actions come from the current task's trainers and current
        observations instead of the variables left over from set-up.
      * The per-checkpoint path ``model_load_dir`` is what gets loaded.
      * Episode statistics are stored per task; the report and plots use the
        per-episode average across tasks (see ``_cross_task_mean``).
      * Environments, step counters and reward slots are reset after every
        episode, and the evaluation loop of a checkpoint now terminates.
      * The ``load_dir`` fallback runs before ``load_dir`` is parsed.

    Args:
        arglist: parsed options.  Fields read here: ``num_task``,
            ``scenario``, ``benchmark``, ``num_adversaries``, ``good_policy``,
            ``adv_policy``, ``num_episodes``, ``save_rate``,
            ``train_num_episodes``, ``load_dir``, ``save_dir``,
            ``pictures_dir_test``, ``display``, ``restore``,
            ``max_episode_len``, ``draw_picture_test``, ``plots_dir`` and
            ``exp_name``.

    Returns:
        None.  Results are printed and written to disk.
    """
    debug = False
    num_tasks = arglist.num_task  # total number of tasks
    list_of_taskenv = []  # one environment per task
    with U.single_threaded_session():
        if debug:
            begin = time_begin()
        # 1.1 create the actor and critic trainers of every task
        trainers_list = []
        # Template env: only used for observation shapes and the agent count.
        env = make_env(arglist.scenario, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        for i in range(num_tasks):
            list_of_taskenv.append(make_env(arglist.scenario))
            trainers = get_trainers(list_of_taskenv[i],
                                    "task_" + str(i + 1) + "_",
                                    num_adversaries,
                                    obs_shape_n,
                                    arglist)
            trainers_list.append(trainers)

        print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy))

        # These graph nodes are not used below; they are kept so the set of
        # variables in the graph (and therefore what a Saver sees) is unchanged.
        global_steps_tensor = tf.Variable(tf.zeros(num_tasks), trainable=False)  # global timesteps for each env
        global_steps_ph = tf.placeholder(tf.float32, [num_tasks])
        global_steps_assign_op = tf.assign(global_steps_tensor, global_steps_ph)
        model_number = int(arglist.num_episodes / arglist.save_rate)
        saver = tf.train.Saver(max_to_keep=model_number)

        efficiency_list = []
        for i in range(num_tasks):
            efficiency_list.append(tf.placeholder(tf.float32, shape=None, name="efficiency_placeholder" + str(i)))
        efficiency_summary_list = []
        for i in range(num_tasks):
            efficiency_summary_list.append(tf.summary.scalar("efficiency_%s" % i, efficiency_list[i]))
        writer = tf.summary.FileWriter("../summary/efficiency")

        # Initialize
        U.initialize()
        for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
            print(var)

        if debug:
            print(time_end(begin, "initialize"))
            begin = time_begin()

        # Resolve the fallback before load_dir is parsed; with an empty
        # load_dir the split below would raise IndexError.
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        model_name = arglist.load_dir.split('/')[-2] + '/'
        mkdir(arglist.pictures_dir_test + model_name)
        report_path = arglist.pictures_dir_test + model_name + 'test_report' + '.txt'
        model_index_step = 0
        model_number_total = arglist.train_num_episodes / arglist.save_rate
        max_model_index = 0
        max_average_energy_efficiency = 0

        # Values forwarded unchanged to draw_util.drawTest.
        bl_coverage = 0.8
        bl_jainindex = 0.8
        bl_loss = 100

        while model_index_step < model_number_total:
            model_index_step += 1
            model_tag = model_index_step * arglist.save_rate - 1

            # 1.4 load the checkpoint under test
            if arglist.display or arglist.restore or arglist.benchmark:
                print('Loading previous state...')
                # NOTE(review): assumes checkpoints were saved under
                # "<load_dir><episode>/" - confirm against the training code.
                model_load_dir = arglist.load_dir + str(model_tag) + '/'
                U.load_state(model_load_dir)

            # 1.5 reset every environment
            obs_n_list = [task_env.reset() for task_env in list_of_taskenv]

            # 1.2 whole-run variables (re-created for every checkpoint)
            global_steps = np.zeros(num_tasks)  # global timesteps for each env
            # each element is the sum of all agents' rewards in one episode;
            # the last element is the slot of the episode currently running
            episodes_rewards = [[0.0] for _ in range(num_tasks)]
            # each element of agent_rewards[t][i] is agent i's total reward in one episode
            agent_rewards = [[[0.0] for _ in range(env.n)] for _ in range(num_tasks)]
            final_ep_rewards = [[] for _ in range(num_tasks)]  # sum of rewards for training curve
            final_ep_ag_rewards = [[] for _ in range(num_tasks)]  # agent rewards for training curve

            # Per-task series with one entry per finished episode.
            energy_consumptions_for_test = [[] for _ in range(num_tasks)]
            j_index = [[] for _ in range(num_tasks)]
            aver_cover = [[] for _ in range(num_tasks)]
            instantaneous_dis = [[] for _ in range(num_tasks)]
            instantaneous_out_the_map = [[] for _ in range(num_tasks)]
            energy_efficiency = [[] for _ in range(num_tasks)]
            instantaneous_accmulated_reward = [[] for _ in range(num_tasks)]

            # 1.3 per-episode variables: one entry per step of the running episode
            local_steps = np.zeros(num_tasks)  # local timesteps for each env
            energy_one_episode = [[] for _ in range(num_tasks)]
            j_index_one_episode = [[] for _ in range(num_tasks)]
            aver_cover_one_episode = [[] for _ in range(num_tasks)]
            over_map_counter = np.zeros(num_tasks)
            over_map_one_episode = [[] for _ in range(num_tasks)]
            disconnected_number_counter = np.zeros(num_tasks)
            disconnected_number_one_episode = [[] for _ in range(num_tasks)]
            episode_reward_step = np.zeros(num_tasks)  # running sum of the per-step mean reward
            accmulated_reward_one_episode = [[] for _ in range(num_tasks)]
            route_one_episode = [[] for _ in range(num_tasks)]

            print('Starting iterations...')
            while True:
                # One round: every task environment advances by exactly one step.
                for task_index in range(num_tasks):
                    # 2.1 step the environment and collect the sample
                    current_env = list_of_taskenv[task_index]
                    current_trainers = trainers_list[task_index]
                    # get action from this task's trainers and latest observations
                    action_n = [agent.action(obs)
                                for agent, obs in zip(current_trainers, obs_n_list[task_index])]
                    # environment step
                    new_obs_n, rew_n, done_n, info_n = current_env.step(action_n)
                    if debug:
                        print(time_end(begin, "env.step"))
                        begin = time_begin()
                    local_steps[task_index] += 1  # update the per-episode counter
                    global_steps[task_index] += 1  # update the global counter
                    done = all(done_n)
                    terminal = (local_steps[task_index] >= arglist.max_episode_len)
                    # collect experience (must happen before obs_n_list is overwritten)
                    for i in range(env.n):
                        current_trainers[i].experience(obs_n_list[task_index][i], action_n[i], rew_n[i], new_obs_n[i],
                                                       done_n[i], terminal)

                    # update obs
                    obs_n_list[task_index] = new_obs_n
                    # update reward
                    for i, rew in enumerate(rew_n):
                        episodes_rewards[task_index][-1] += rew
                        agent_rewards[task_index][i][-1] += rew
                    # energy
                    energy_one_episode[task_index].append(current_env.get_energy())
                    # fair index
                    j_index_one_episode[task_index].append(current_env.get_jain_index())
                    # coverage
                    aver_cover_one_episode[task_index].append(current_env.get_aver_cover())
                    # over map counter
                    over_map_counter[task_index] += current_env.get_over_map()
                    over_map_one_episode[task_index].append(over_map_counter[task_index])
                    # disconnected counter
                    disconnected_number_counter[task_index] += current_env.get_dis()
                    disconnected_number_one_episode[task_index].append(disconnected_number_counter[task_index])
                    # reward
                    episode_reward_step[task_index] += np.mean(rew_n)
                    accmulated_reward_one_episode[task_index].append(episode_reward_step[task_index])
                    route = current_env.get_agent_pos()
                    route_one_episode[task_index].append(route)

                    if not (done or terminal):
                        continue

                    # ---- end of an episode for this task -------------------
                    final_cover = aver_cover_one_episode[task_index][-1]
                    final_jain = j_index_one_episode[task_index][-1]
                    final_energy = energy_one_episode[task_index][-1]
                    instantaneous_accmulated_reward[task_index].append(
                        accmulated_reward_one_episode[task_index][-1])
                    j_index[task_index].append(final_jain)
                    instantaneous_dis[task_index].append(disconnected_number_one_episode[task_index][-1])
                    instantaneous_out_the_map[task_index].append(over_map_one_episode[task_index][-1])
                    aver_cover[task_index].append(final_cover)
                    energy_consumptions_for_test[task_index].append(final_energy)
                    energy_efficiency[task_index].append(final_cover * final_jain / final_energy)
                    # Read the energy from the env that was actually stepped,
                    # not from the template env.
                    print('Episode: %d - energy_consumptions: %s ' % (
                        global_steps[task_index] / arglist.max_episode_len,
                        str(current_env._get_energy_origin())))

                    # clear this task's per-episode accumulators
                    energy_one_episode[task_index] = []
                    j_index_one_episode[task_index] = []
                    aver_cover_one_episode[task_index] = []
                    over_map_counter[task_index] = 0
                    over_map_one_episode[task_index] = []
                    disconnected_number_counter[task_index] = 0
                    disconnected_number_one_episode[task_index] = []
                    episode_reward_step[task_index] = 0
                    accmulated_reward_one_episode[task_index] = []
                    route_one_episode[task_index] = []

                    # start the next episode of this task
                    obs_n_list[task_index] = current_env.reset()
                    local_steps[task_index] = 0
                    episodes_rewards[task_index].append(0)  # open a slot for the next episode
                    for reward in agent_rewards[task_index]:
                        reward.append(0)

                    # Report once per round (after the last task), so every
                    # task has contributed the same number of episodes.
                    report_due = (arglist.draw_picture_test
                                  and task_index == num_tasks - 1
                                  and len(episodes_rewards[task_index]) % arglist.save_rate == 0)
                    if report_due:
                        mean_efficiency = _cross_task_mean(energy_efficiency)
                        mean_energy = _cross_task_mean(energy_consumptions_for_test)
                        mean_cover = _cross_task_mean(aver_cover)
                        mean_jain = _cross_task_mean(j_index)
                        average_efficiency = np.mean(mean_efficiency)
                        if average_efficiency > max_average_energy_efficiency:
                            max_model_index = model_tag
                            max_average_energy_efficiency = average_efficiency
                        with open(report_path, 'a+') as file:
                            report = '\nModel-' + str(model_tag) + \
                                     '-testing ' + str(arglist.num_episodes) + ' episodes\'s result:' + \
                                     '\nAverage average attained coverage: ' + str(np.mean(mean_cover)) + \
                                     '\nAverage Jaint\'s fairness index: ' + str(np.mean(mean_jain)) + \
                                     '\nAverage normalized average energy consumptions:' + str(np.mean(mean_energy)) + \
                                     '\nAverage energy efficiency:' + str(average_efficiency) + '\n'
                            file.write(report)
                        # Same argument order as before; each series is now
                        # the per-episode average across tasks.
                        draw_util.drawTest(model_tag, arglist.pictures_dir_test + model_name,
                                           mean_energy, mean_cover, mean_jain,
                                           _cross_task_mean(instantaneous_accmulated_reward),
                                           _cross_task_mean(instantaneous_dis),
                                           _cross_task_mean(instantaneous_out_the_map),
                                           len(mean_cover), bl_coverage, bl_jainindex, bl_loss,
                                           mean_efficiency, False)

                # Each reward list holds the finished episodes plus one open
                # slot, so "> num_episodes" means num_episodes were completed.
                # Tasks advance in lockstep; the last task is checked.
                if len(episodes_rewards[-1]) > arglist.num_episodes:
                    if not arglist.draw_picture_test:
                        # saves final episode reward for plotting training curve later
                        rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                        with open(rew_file_name, 'wb') as fp:
                            pickle.dump(final_ep_rewards, fp)
                        agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                        with open(agrew_file_name, 'wb') as fp:
                            pickle.dump(final_ep_ag_rewards, fp)
                        print('...Finished total of {} episodes.'.format(len(episodes_rewards[-1])))
                    break

        # All checkpoints tested: record which one performed best.
        with open(report_path, 'a+') as file:
            report = '\nModel ' + str(max_model_index) + ' attained max average energy efficiency' + \
                     '\nMax average energy efficiency:' + str(max_average_energy_efficiency)
            file.write(report)
Пример #3
0
def train(arglist):
    debug = False
    arglist.save_dir = arglist.save_dir + "_batch_size_" + str(
        arglist.batch_size) + "_buffer_size_" + str(arglist.buffer_size)
    with U.single_threaded_session():
        if debug:
            begin = time_begin()
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        if debug:
            print(time_end(begin, "step 0"))
            begin = time_begin()
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        if debug:
            print(time_end(begin, "step 1"))
            begin = time_begin()
        trainers = get_trainers(env, "task_", num_adversaries, obs_shape_n,
                                arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))
        if debug:
            print(time_end(begin, "step2"))
            begin = time_begin()

        efficiency = tf.placeholder(tf.float32,
                                    shape=None,
                                    name="efficiency_placeholder")
        efficiency_summary = tf.summary.scalar("efficiency", efficiency)
        p_losses_ph = tf.placeholder(tf.float32, shape=[env.n], name="p_loss")
        p_losses_summary = tf.summary.histogram("loss", p_losses_ph)
        q_losses_ph = tf.placeholder(tf.float32, shape=[env.n], name="q_loss")
        q_losses_summary = tf.summary.histogram("loss", q_losses_ph)
        loss_summary = tf.summary.merge([q_losses_summary, p_losses_summary],
                                        name="loss")
        writer = tf.summary.FileWriter("../summary/efficiency")
        writer2 = tf.summary.FileWriter("../summary/loss")

        # Initialize
        U.initialize()
        for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
            print(var)
        if debug:
            print(time_end(begin, "step3"))
            begin = time_begin()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)
        if not os.path.exists(arglist.save_dir):
            os.makedirs(arglist.save_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0]
                         for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        model_number = int(arglist.num_episodes / arglist.save_rate)
        saver = tf.train.Saver(max_to_keep=model_number)
        episode_step = 0
        train_step = 0
        t_start = time.time()
        # custom statistics variable------------------------------------------------------------------------------------
        loss_all = []
        aver_cover = []
        j_index = []
        instantaneous_accmulated_reward = []
        instantaneous_dis = []
        instantaneous_out_the_map = []
        # q_value = []
        energy_consumptions_for_test = []
        bl_coverage = 0.8
        bl_jainindex = 0.8
        bl_loss = 100
        energy_efficiency = []

        over_map_counter = 0
        over_map_one_episode = []
        aver_cover_one_episode = []
        j_index_one_episode = []
        disconnected_number_counter = 0
        disconnected_number_one_episode = []
        accmulated_reward_one_episode = []
        actions = []
        energy_one_episode = []
        route = []
        obs_n = env.reset()

        episode_reward_step = 0

        model_name = arglist.load_dir.split(
            '/')[-3] + '/' + arglist.load_dir.split('/')[-2] + '/'
        if FLAGS.greedy_action:
            model_name = model_name + 'greedy/'
        elif FLAGS.random_action:
            model_name = model_name + 'random/'

        # if debug:
        #     print(time_end(begin, "initialize"))
        #     begin = time_begin()
        print('Starting iterations...')
        episode_begin_time = time.time()
        while True:
            # get action
            action_n = [
                agent.action(obs) for agent, obs in zip(trainers, obs_n)
            ]
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)

            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                 done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            # increment custom statistics variables in the epoch--------------------------------------------------------
            episode_reward_step += np.mean(rew_n)
            j_index_one_episode.append(env.get_jain_index())
            over_map_counter += env.get_over_map()
            over_map_one_episode.append(over_map_counter)
            disconnected_number_counter += env.get_dis()
            disconnected_number_one_episode.append(disconnected_number_counter)
            aver_cover_one_episode.append(env.get_aver_cover())
            energy_one_episode.append(env.get_energy())
            s_route = env.get_state()
            for route_i in range(0, FLAGS.num_uav * 2, 2):
                tmp = [s_route[route_i], s_route[route_i + 1]]
                route.append(tmp)
            accmulated_reward_one_episode.append(episode_reward_step)
            # if debug:
            #     print(time_end(begin, "others"))
            #     begin = time_begin()
            if done or terminal:
                model_name = arglist.save_dir.split('/')[-1] + '/'
                episode_number = int(train_step / arglist.max_episode_len)
                temp_efficiency = np.array(aver_cover_one_episode) * np.array(
                    j_index_one_episode) / np.array(energy_one_episode)
                draw_util.draw_single_episode(
                    arglist.pictures_dir_train + model_name +
                    "single_episode/", episode_number, temp_efficiency,
                    aver_cover_one_episode, j_index_one_episode,
                    energy_one_episode, disconnected_number_one_episode,
                    over_map_one_episode, accmulated_reward_one_episode)

                # reset custom statistics variables between episode and epoch---------------------------------------------
                instantaneous_accmulated_reward.append(
                    accmulated_reward_one_episode[-1])
                j_index.append(j_index_one_episode[-1])
                instantaneous_dis.append(disconnected_number_one_episode[-1])
                instantaneous_out_the_map.append(over_map_one_episode[-1])
                aver_cover.append(aver_cover_one_episode[-1])
                energy_consumptions_for_test.append(energy_one_episode[-1])
                energy_efficiency.append(aver_cover_one_episode[-1] *
                                         j_index_one_episode[-1] /
                                         energy_one_episode[-1])
                episode_end_time = time.time()

                # plot fig
                efficiency_s = tf.get_default_session().run(
                    efficiency_summary,
                    feed_dict={efficiency: energy_efficiency[episode_number]})
                writer.add_summary(efficiency_s, global_step=episode_number)
                # print per-episode statistics (energy consumption, efficiency, elapsed time)
                print(
                    'Episode: %d - energy_consumptions: %s, efficiency: %s, time %s'
                    %
                    (train_step / arglist.max_episode_len,
                     str(env.get_energy_origin()), str(energy_efficiency[-1]),
                     str(round(episode_end_time - episode_begin_time, 3))))
                episode_begin_time = episode_end_time
                # draw picture of this episode
                if arglist.draw_picture_test and aver_cover[-1] >= bl_coverage and j_index[-1] >= bl_jainindex \
                        and instantaneous_dis[-1] <= bl_loss:
                    episode_number_name = 'episode_' + str(episode_number)
                    draw_util.draw(episode_number_name,
                                   arglist.pictures_dir_test + model_name,
                                   energy_one_episode, route, actions,
                                   aver_cover_one_episode, j_index_one_episode,
                                   accmulated_reward_one_episode,
                                   disconnected_number_one_episode,
                                   over_map_one_episode,
                                   arglist.max_episode_len)

                j_index_one_episode = []
                over_map_counter = 0
                over_map_one_episode = []
                disconnected_number_counter = 0
                disconnected_number_one_episode = []
                aver_cover_one_episode = []
                energy_one_episode = []
                route = []
                episode_reward_step = 0
                accmulated_reward_one_episode = []

                if arglist.draw_picture_test:
                    if len(episode_rewards) % arglist.save_rate == 0:
                        episode_number_name = train_step / arglist.max_episode_len
                        draw_util.drawTest(
                            episode_number_name,
                            arglist.pictures_dir_train + model_name,
                            energy_consumptions_for_test, aver_cover, j_index,
                            instantaneous_accmulated_reward,
                            instantaneous_dis, instantaneous_out_the_map,
                            len(aver_cover), bl_coverage, bl_jainindex,
                            bl_loss, energy_efficiency, False)
                # reset custom statistics variables between episode and epoch---------------------------------------------

                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.draw_picture_test:
                if len(episode_rewards) > arglist.num_episodes:
                    break
                continue
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            p_loss_list = []
            q_loss_list = []
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                temp = agent.update(trainers, train_step)
                if temp is not None:
                    p_loss_list.append(temp[1])
                    q_loss_list.append(temp[0])
            if len(p_loss_list) == env.n:
                loss_s = tf.get_default_session().run(loss_summary,
                                                      feed_dict={
                                                          p_losses_ph:
                                                          p_loss_list,
                                                          q_losses_ph:
                                                          q_loss_list
                                                      })
                writer2.add_summary(loss_s, global_step=train_step)

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                episode_number_name = train_step / arglist.max_episode_len
                save_dir_custom = arglist.save_dir + "/" + str(
                    episode_number_name) + '/'
                # save_dir
                U.save_state(save_dir_custom, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, time: {}"
                        .format(train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                round(time.time() - t_start, 3)))
                else:
                    print(
                        "steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}"
                        .format(train_step, len(episode_rewards),
                                np.mean(episode_rewards[-arglist.save_rate:]),
                                [
                                    np.mean(rew[-arglist.save_rate:])
                                    for rew in agent_rewards
                                ], round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(
                    np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(
                        np.mean(rew[-arglist.save_rate:]))
                # draw custom statistics picture when save the model----------------------------------------------------
                if arglist.draw_picture_train:
                    episode_number_name = train_step / arglist.max_episode_len
                    model_name = arglist.save_dir.split('/')[-1] + '/'
                    draw_util.draw_episodes(
                        episode_number_name, arglist.pictures_dir_train +
                        model_name + "all_episodes/", aver_cover, j_index,
                        energy_consumptions_for_test, instantaneous_dis,
                        instantaneous_out_the_map, energy_efficiency,
                        instantaneous_accmulated_reward, len(aver_cover))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards)))
                break