def train(arglist):
    debug = False
    arglist.save_dir = arglist.save_dir + "batch_size" + str(arglist.batch_size) + "buffer_size" + str(arglist.buffer_size)
    with U.single_threaded_session():
        if debug:
            begin = time_begin()
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        if debug:
            print(time_end(begin, "step 0"))
            begin = time_begin()
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        if debug:
            print(time_end(begin, "step 1"))
            begin = time_begin()
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy))
        if debug:
            print(time_end(begin, "step 2"))
            begin = time_begin()
        efficiency = tf.placeholder(tf.float32, shape=None, name="efficiency_placeholder")
        efficiency_summary = tf.summary.scalar("efficiency", efficiency)
        p_losses_ph = tf.placeholder(tf.float32, shape=[env.n], name="p_loss")
        p_losses_summary = tf.summary.histogram("loss", p_losses_ph)
        q_losses_ph = tf.placeholder(tf.float32, shape=[env.n], name="q_loss")
        q_losses_summary = tf.summary.histogram("loss", q_losses_ph)
        loss_summary = tf.summary.merge([q_losses_summary, p_losses_summary], name="loss")
        writer = tf.summary.FileWriter("../summary/efficiency")
        writer2 = tf.summary.FileWriter("../summary/loss")

        # Initialize
        U.initialize()
        if debug:
            print(time_end(begin, "step 3"))
            begin = time_begin()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)
        if not os.path.exists(arglist.save_dir):
            os.makedirs(arglist.save_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        model_number = int(arglist.num_episodes / arglist.save_rate)
        saver = tf.train.Saver(max_to_keep=model_number)
        episode_step = 0
        train_step = 0
        t_start = time.time()

        # custom statistics variables --------------------------------------------------------------------------
        loss_all = []
        aver_cover = []
        j_index = []
        instantaneous_accmulated_reward = []
        instantaneous_dis = []
        instantaneous_out_the_map = []
        # q_value = []
        energy_consumptions_for_test = []
        bl_coverage = 0.8
        bl_jainindex = 0.8
        bl_loss = 100
        energy_efficiency = []
        over_map_counter = 0
        over_map_one_episode = []
        aver_cover_one_episode = []
        j_index_one_episode = []
        disconnected_number_counter = 0
        disconnected_number_one_episode = []
        accmulated_reward_one_episode = []
        actions = []
        energy_one_episode = []
        route = []
        obs_n = env.reset()
        episode_reward_step = 0

        model_name = arglist.load_dir.split('/')[-3] + '/' + arglist.load_dir.split('/')[-2] + '/'
        if FLAGS.greedy_action:
            model_name = model_name + 'greedy/'
        elif FLAGS.random_action:
            model_name = model_name + 'random/'

        # if debug:
        #     print(time_end(begin, "initialize"))
        #     begin = time_begin()
        print('Starting iterations...')
        episode_begin_time = time.time()
        while True:
            # get action
            action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i], terminal)
            obs_n = new_obs_n
            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            # increment custom statistics variables in the epoch -----------------------------------------------
            episode_reward_step += np.mean(rew_n)
            j_index_one_episode.append(env._get_jain_index())
            over_map_counter += env._get_over_map()
            over_map_one_episode.append(over_map_counter)
            disconnected_number_counter += env._get_dis()
            disconnected_number_one_episode.append(disconnected_number_counter)
            aver_cover_one_episode.append(env._get_aver_cover())
            energy_one_episode.append(env._get_energy())
            s_route = env._get_state()
            for route_i in range(0, FLAGS.num_uav * 2, 2):
                tmp = [s_route[route_i], s_route[route_i + 1]]
                route.append(tmp)
            accmulated_reward_one_episode.append(episode_reward_step)
            # if debug:
            #     print(time_end(begin, "others"))
            #     begin = time_begin()

            if done or terminal:
                model_name = arglist.save_dir.split('/')[-1] + '/'
                episode_number = int(train_step / arglist.max_episode_len)
                temp_efficiency = np.array(aver_cover_one_episode) * np.array(j_index_one_episode) / np.array(energy_one_episode)
                draw_util.draw_single_episode(arglist.pictures_dir_train + model_name + "single_episode/",
                                              episode_number, temp_efficiency, aver_cover_one_episode,
                                              j_index_one_episode, energy_one_episode,
                                              disconnected_number_one_episode, over_map_one_episode,
                                              accmulated_reward_one_episode)
                # record custom statistics variables between episode and epoch ----------------------------------
                instantaneous_accmulated_reward.append(accmulated_reward_one_episode[-1])
                j_index.append(j_index_one_episode[-1])
                instantaneous_dis.append(disconnected_number_one_episode[-1])
                instantaneous_out_the_map.append(over_map_one_episode[-1])
                aver_cover.append(aver_cover_one_episode[-1])
                energy_consumptions_for_test.append(energy_one_episode[-1])
                energy_efficiency.append(aver_cover_one_episode[-1] * j_index_one_episode[-1] / energy_one_episode[-1])
                episode_end_time = time.time()

                # plot fig
                efficiency_s = tf.get_default_session().run(
                    efficiency_summary, feed_dict={efficiency: energy_efficiency[episode_number]})
                writer.add_summary(efficiency_s, global_step=episode_number)

                print('Episode: %d - energy_consumptions: %s, efficiency: %s, time %s'
                      % (train_step / arglist.max_episode_len, str(env._get_energy_origin()),
                         str(energy_efficiency[-1]), str(round(episode_end_time - episode_begin_time, 3))))
                episode_begin_time = episode_end_time

                # draw picture of this episode
                if arglist.draw_picture_test and aver_cover[-1] >= bl_coverage and j_index[-1] >= bl_jainindex \
                        and instantaneous_dis[-1] <= bl_loss:
                    episode_number_name = 'episode_' + str(episode_number)
                    draw_util.draw(episode_number_name, arglist.pictures_dir_test + model_name,
                                   energy_one_episode, route, actions, aver_cover_one_episode,
                                   j_index_one_episode, accmulated_reward_one_episode,
                                   disconnected_number_one_episode, over_map_one_episode,
                                   arglist.max_episode_len)

                j_index_one_episode = []
                over_map_counter = 0
                over_map_one_episode = []
                disconnected_number_counter = 0
                disconnected_number_one_episode = []
                aver_cover_one_episode = []
                energy_one_episode = []
                route = []
                episode_reward_step = 0
                accmulated_reward_one_episode = []

                if arglist.draw_picture_test:
                    if len(episode_rewards) % arglist.save_rate == 0:
                        episode_number_name = train_step / arglist.max_episode_len
                        draw_util.drawTest(episode_number_name, arglist.pictures_dir_train + model_name,
                                           energy_consumptions_for_test, aver_cover, j_index,
                                           instantaneous_accmulated_reward, instantaneous_dis,
                                           instantaneous_out_the_map, len(aver_cover),
                                           bl_coverage, bl_jainindex, bl_loss, energy_efficiency, False)
                # reset custom statistics variables between episode and epoch -----------------------------------
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.draw_picture_test:
                if len(episode_rewards) > arglist.num_episodes:
                    break
                continue
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            p_loss_list = []
            q_loss_list = []
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                temp = agent.update(trainers, train_step)
                if temp is not None:
                    p_loss_list.append(temp[1])
                    q_loss_list.append(temp[0])
            if len(p_loss_list) == env.n:
                loss_s = tf.get_default_session().run(
                    loss_summary, feed_dict={p_losses_ph: p_loss_list, q_losses_ph: q_loss_list})
                writer2.add_summary(loss_s, global_step=train_step)

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                episode_number_name = train_step / arglist.max_episode_len
                save_dir_custom = arglist.save_dir + "/" + str(episode_number_name) + '/'  # save_dir
                U.save_state(save_dir_custom, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]),
                        round(time.time() - t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]),
                        [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards],
                        round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))
                # draw custom statistics picture when saving the model ------------------------------------------
                if arglist.draw_picture_train:
                    episode_number_name = train_step / arglist.max_episode_len
                    model_name = arglist.save_dir.split('/')[-1] + '/'
                    draw_util.draw_episodes(episode_number_name,
                                            arglist.pictures_dir_train + model_name + "all_episodes/",
                                            aver_cover, j_index, energy_consumptions_for_test,
                                            instantaneous_dis, instantaneous_out_the_map, energy_efficiency,
                                            instantaneous_accmulated_reward, len(aver_cover))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                break
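

# --- Hypothetical helper (not part of the original training loop) -------------------------------
# The per-episode efficiency logged above is coverage * Jain index / energy, evaluated
# element-wise over the per-step statistics (the `temp_efficiency` expression in train()).
# The sketch below only isolates that computation; the function and argument names are
# assumptions introduced for illustration, not names used elsewhere in this project.
def _episode_efficiency_sketch(aver_cover_steps, jain_index_steps, energy_steps):
    """Return the per-step energy-efficiency curve for one episode."""
    import numpy as np
    cover = np.asarray(aver_cover_steps, dtype=np.float64)
    jain = np.asarray(jain_index_steps, dtype=np.float64)
    energy = np.asarray(energy_steps, dtype=np.float64)
    # Same expression as temp_efficiency in train(): coverage * fairness / energy, per step.
    return cover * jain / energy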
def test(arglist):
    debug = False
    num_tasks = arglist.num_task  # total number of tasks
    list_of_taskenv = []  # env list
    load_path = arglist.load_dir
    with U.single_threaded_session():
        if debug:
            begin = time_begin()
        # 1.1 create the actor trainer and critic trainer for each task
        trainers_list = []
        env = make_env(arglist.scenario, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        for i in range(num_tasks):
            list_of_taskenv.append(make_env(arglist.scenario))
            trainers = get_trainers(list_of_taskenv[i], "task_" + str(i + 1) + "_",
                                    num_adversaries, obs_shape_n, arglist)
            trainers_list.append(trainers)
        print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy))

        global_steps_tensor = tf.Variable(tf.zeros(num_tasks), trainable=False)  # global timesteps for each env
        global_steps_ph = tf.placeholder(tf.float32, [num_tasks])
        global_steps_assign_op = tf.assign(global_steps_tensor, global_steps_ph)

        model_number = int(arglist.num_episodes / arglist.save_rate)
        saver = tf.train.Saver(max_to_keep=model_number)

        efficiency_list = []
        for i in range(num_tasks):
            efficiency_list.append(tf.placeholder(tf.float32, shape=None, name="efficiency_placeholder" + str(i)))
        efficiency_summary_list = []
        for i in range(num_tasks):
            efficiency_summary_list.append(tf.summary.scalar("efficiency_%s" % i, efficiency_list[i]))
        writer = tf.summary.FileWriter("../summary/efficiency")

        # Initialize
        U.initialize()
        for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
            print(var)
        if debug:
            print(time_end(begin, "initialize"))
            begin = time_begin()

        model_name = arglist.load_dir.split('/')[-2] + '/'
        mkdir(arglist.pictures_dir_test + model_name)
        model_index_step = 0
        model_number_total = arglist.train_num_episodes / arglist.save_rate
        max_model_index = 0
        max_average_energy_efficiency = 0

        while True:
            if model_index_step >= model_number_total:
                with open(arglist.pictures_dir_test + model_name + 'test_report' + '.txt', 'a+') as file:
                    report = '\nModel ' + str(max_model_index) + ' attained max average energy efficiency' + \
                             '\nMax average energy efficiency:' + str(max_average_energy_efficiency)
                    file.write(report)
                break
            else:
                model_index_step += 1

            # 1.4 load checkpoints
            if arglist.load_dir == "":
                arglist.load_dir = arglist.save_dir
            if arglist.display or arglist.restore or arglist.benchmark:
                print('Loading previous state...')
                model_load_dir = arglist.load_dir + str(model_index_step * arglist.save_rate - 1) + '/'
                U.load_state(model_load_dir)
            # global_steps = tf.get_default_session().run(global_steps_tensor)

            # 1.5 initialize the envs
            obs_n_list = []
            for i in range(num_tasks):
                obs_n = list_of_taskenv[i].reset()
                obs_n_list.append(obs_n)

            # 1.2 initialize global variables
            episodes_rewards = [[0.0] for _ in range(num_tasks)]  # each element is the sum of all agents' rewards in one episode
            # each element of agent_rewards[i] records a single agent's total reward in one episode
            agent_rewards = [[[0.0] for _ in range(env.n)] for _ in range(num_tasks)]
            final_ep_rewards = [[] for _ in range(num_tasks)]  # sum of rewards for training curve
            final_ep_ag_rewards = [[] for _ in range(num_tasks)]  # agent rewards for training curve
            energy_consumptions_for_test = [[] for _ in range(num_tasks)]
            j_index = [[] for _ in range(num_tasks)]
            aver_cover = [[] for _ in range(num_tasks)]
            instantaneous_dis = [[] for _ in range(num_tasks)]
            instantaneous_out_the_map = [[] for _ in range(num_tasks)]
            energy_efficiency = [[] for _ in range(num_tasks)]
            instantaneous_accmulated_reward = [[] for _ in range(num_tasks)]

            # 1.3 initialize local variables
            local_steps = np.zeros(num_tasks)  # local timesteps for each env
            global_steps = np.zeros(num_tasks)  # global timesteps for each env
            energy_one_episode = [[] for _ in range(num_tasks)]
            j_index_one_episode = [[] for _ in range(num_tasks)]
            aver_cover_one_episode = [[] for _ in range(num_tasks)]
            over_map_counter = np.zeros(num_tasks)
            over_map_one_episode = [[] for _ in range(num_tasks)]
            disconnected_number_counter = np.zeros(num_tasks)
            disconnected_number_one_episode = [[] for _ in range(num_tasks)]
            episode_reward_step = np.zeros(num_tasks)  # accumulates the per-step mean reward over all agents in one episode
            accmulated_reward_one_episode = [[] for _ in range(num_tasks)]
            route_one_episode = [[] for _ in range(num_tasks)]
            bl_coverage = 0.8
            bl_jainindex = 0.8
            bl_loss = 100

            print('Starting iterations...')
            while True:
                for task_index in range(num_tasks):
                    # 2.1 step the environment and collect samples
                    current_env = list_of_taskenv[task_index]
                    current_trainers = trainers_list[task_index]
                    # get action
                    action_n = [agent.action(obs) for agent, obs in zip(current_trainers, obs_n_list[task_index])]
                    # environment step
                    new_obs_n, rew_n, done_n, info_n = current_env.step(action_n)
                    if debug:
                        print(time_end(begin, "env.step"))
                        begin = time_begin()
                    local_steps[task_index] += 1  # update the local counter
                    global_steps[task_index] += 1  # update the global counter
                    done = all(done_n)
                    terminal = (local_steps[task_index] >= arglist.max_episode_len)
                    # collect experience
                    for i in range(env.n):
                        current_trainers[i].experience(obs_n_list[task_index][i], action_n[i], rew_n[i],
                                                       new_obs_n[i], done_n[i], terminal)
                    # update obs
                    obs_n_list[task_index] = new_obs_n
                    # update rewards
                    for i, rew in enumerate(rew_n):
                        episodes_rewards[task_index][-1] += rew
                        agent_rewards[task_index][i][-1] += rew
                    # energy
                    energy_one_episode[task_index].append(current_env.get_energy())
                    # fair index
                    j_index_one_episode[task_index].append(current_env.get_jain_index())
                    # coverage
                    aver_cover_one_episode[task_index].append(current_env.get_aver_cover())
                    # over map counter
                    over_map_counter[task_index] += current_env.get_over_map()
                    over_map_one_episode[task_index].append(over_map_counter[task_index])
                    # disconnected counter
                    disconnected_number_counter[task_index] += current_env.get_dis()
                    disconnected_number_one_episode[task_index].append(disconnected_number_counter[task_index])
                    # reward
                    episode_reward_step[task_index] += np.mean(rew_n)
                    accmulated_reward_one_episode[task_index].append(episode_reward_step[task_index])
                    route = current_env.get_agent_pos()
                    route_one_episode[task_index].append(route)

                    episode_number = math.ceil(global_steps[task_index] / arglist.max_episode_len)
                    if done or terminal:
                        # record custom statistics variables between episode and epoch --------------------------
                        instantaneous_accmulated_reward[task_index].append(accmulated_reward_one_episode[task_index][-1])
                        j_index[task_index].append(j_index_one_episode[task_index][-1])
                        instantaneous_dis[task_index].append(disconnected_number_one_episode[task_index][-1])
                        instantaneous_out_the_map[task_index].append(over_map_one_episode[task_index][-1])
                        aver_cover[task_index].append(aver_cover_one_episode[task_index][-1])
                        energy_consumptions_for_test[task_index].append(energy_one_episode[task_index][-1])
                        energy_efficiency[task_index].append(aver_cover_one_episode[task_index][-1] *
                                                             j_index_one_episode[task_index][-1] /
                                                             energy_one_episode[task_index][-1])
                        print('Episode: %d - energy_consumptions: %s '
                              % (episode_number, str(current_env.get_energy_origin())))

                        if task_index == num_tasks - 1:
                            energy_one_episode = [[] for _ in range(num_tasks)]
                            j_index_one_episode = [[] for _ in range(num_tasks)]
                            aver_cover_one_episode = [[] for _ in range(num_tasks)]
                            over_map_counter = np.zeros(num_tasks)
                            over_map_one_episode = [[] for _ in range(num_tasks)]
                            disconnected_number_counter = np.zeros(num_tasks)
                            disconnected_number_one_episode = [[] for _ in range(num_tasks)]
                            episode_reward_step = np.zeros(num_tasks)
                            accmulated_reward_one_episode = [[] for _ in range(num_tasks)]
                            route_one_episode = [[] for _ in range(num_tasks)]

                        if arglist.draw_picture_test:
                            if episode_number % arglist.save_rate == 0:
                                if np.mean(energy_efficiency[task_index]) > max_average_energy_efficiency:
                                    max_model_index = model_index_step * arglist.save_rate - 1
                                    max_average_energy_efficiency = np.mean(energy_efficiency[task_index])
                                with open(arglist.pictures_dir_test + model_name + 'test_report' + '.txt', 'a+') as file:
                                    report = '\nModel-' + str(model_index_step * arglist.save_rate - 1) + \
                                             '-testing ' + str(arglist.num_episodes) + ' episodes\' result:' + \
                                             '\nAverage average attained coverage: ' + str(np.mean(aver_cover[task_index])) + \
                                             '\nAverage Jain\'s fairness index: ' + str(np.mean(j_index[task_index])) + \
                                             '\nAverage normalized average energy consumptions:' + str(np.mean(energy_consumptions_for_test[task_index])) + \
                                             '\nAverage energy efficiency:' + str(np.mean(energy_efficiency[task_index])) + '\n'
                                    file.write(report)
                                draw_util.drawTest(model_index_step * arglist.save_rate - 1,
                                                   arglist.pictures_dir_test + model_name,
                                                   energy_consumptions_for_test[task_index], aver_cover[task_index],
                                                   j_index[task_index], instantaneous_accmulated_reward[task_index],
                                                   instantaneous_dis[task_index], instantaneous_out_the_map[task_index],
                                                   len(aver_cover[task_index]), bl_coverage, bl_jainindex, bl_loss,
                                                   energy_efficiency[task_index], False)
                        # reset custom statistics variables between episode and epoch ---------------------------

                    # for displaying learned policies
                    if arglist.draw_picture_test:
                        if episode_number > arglist.num_episodes:
                            break
                        continue

                    # saves final episode reward for plotting training curve later
                    if episode_number > arglist.num_episodes:
                        rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                        with open(rew_file_name, 'wb') as fp:
                            pickle.dump(final_ep_rewards, fp)
                        agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                        with open(agrew_file_name, 'wb') as fp:
                            pickle.dump(final_ep_ag_rewards, fp)
                        print('...Finished total of {} episodes.'.format(episode_number))
                        break
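

# --- Hypothetical reference implementation -------------------------------------------------------
# env.get_jain_index() / env._get_jain_index() is provided by the environment and is not shown in
# this file. For reference, Jain's fairness index over per-agent values x_1..x_n is
# (sum x)^2 / (n * sum x^2). The helper below is only an illustrative sketch of that formula,
# not the environment's actual implementation; its name and signature are assumptions.
def _jain_fairness_index_sketch(values):
    """Return Jain's fairness index for a sequence of non-negative values."""
    import numpy as np
    x = np.asarray(values, dtype=np.float64)
    denom = len(x) * np.sum(x ** 2)
    # Ranges from 1/n (one agent gets everything) to 1.0 (perfectly fair).
    return float(np.sum(x) ** 2 / denom) if denom > 0 else 0.0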
def test(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy))

        model_name = arglist.load_dir.split('/')[-2] + '/'
        if FLAGS.greedy_action:
            model_name = model_name + 'greedy/'
        elif FLAGS.random_action:
            model_name = model_name + 'random/'
        model_index_step = 0
        model_number_total = arglist.train_num_episodes / arglist.save_rate
        max_model_index = 0
        max_average_energy_efficiency = 0
        draw_util.mkdir(arglist.pictures_dir_test + model_name)

        while True:
            # Initialize
            U.initialize()
            if model_index_step >= model_number_total:
                with open(arglist.pictures_dir_test + model_name + 'test_report' + '.txt', 'a+') as file:
                    report = '\nModel ' + str(max_model_index) + ' attained max average energy efficiency' + \
                             '\nMax average energy efficiency:' + str(max_average_energy_efficiency)
                    file.write(report)
                break
            else:
                model_index_step += 1

            # Load previous results, if necessary
            if arglist.load_dir == "":
                arglist.load_dir = arglist.save_dir
            if arglist.display or arglist.restore or arglist.benchmark:
                print('Loading previous state...')
                # model_load_dir = arglist.load_dir + str(model_index_step * arglist.save_rate - 1) + '/'
                model_load_dir = arglist.load_dir + str(3299) + '/'
                U.load_state(model_load_dir)

            episode_rewards = [0.0]  # sum of rewards for all agents
            agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
            final_ep_rewards = []  # sum of rewards for training curve
            final_ep_ag_rewards = []  # agent rewards for training curve
            agent_info = [[[]]]  # placeholder for benchmarking info
            saver = tf.train.Saver()
            obs_n = env.reset()
            episode_step = 0
            train_step = 0
            t_start = time.time()

            # custom statistics variables ----------------------------------------------------------------------
            loss_all = []
            aver_cover = []
            j_index = []
            instantaneous_accmulated_reward = []
            instantaneous_dis = []
            instantaneous_out_the_map = []
            # q_value = []
            energy_consumptions_for_test = []
            bl_coverage = 0.8
            bl_jainindex = 0.8
            bl_loss = 100
            energy_efficiency = []
            over_map_counter = 0
            over_map_one_episode = []
            aver_cover_one_episode = []
            j_index_one_episode = []
            disconnected_number_counter = 0
            disconnected_number_one_episode = []
            accmulated_reward_one_episode = []
            actions = []
            energy_one_episode = []
            route = []
            episode_reward_step = 0

            print('Starting iterations...')
            route_dict = {}
            for i in range(FLAGS.num_uav):
                key_temp = "UAV" + str(i + 1)
                route_dict[key_temp] = []

            while True:
                # get action
                action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]
                # environment step
                new_obs_n, rew_n, done_n, info_n = env.step(action_n)
                episode_step += 1
                done = all(done_n)
                terminal = (episode_step >= arglist.max_episode_len)
                # collect experience
                for i, agent in enumerate(trainers):
                    agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i], terminal)
                obs_n = new_obs_n
                for i, rew in enumerate(rew_n):
                    episode_rewards[-1] += rew
                    agent_rewards[i][-1] += rew

                # increment custom statistics variables in the epoch -------------------------------------------
                episode_reward_step += np.mean(rew_n)
                j_index_one_episode.append(env._get_jain_index())
                over_map_counter += env._get_over_map()
                over_map_one_episode.append(over_map_counter)
                disconnected_number_counter += env._get_dis()
                disconnected_number_one_episode.append(disconnected_number_counter)
                aver_cover_one_episode.append(env._get_aver_cover())
                energy_one_episode.append(env._get_energy())
                s_route = env._get_state()
                for index, route_i in enumerate(range(0, FLAGS.num_uav * 2, 2)):  # for piao zong
                    route_dict["UAV" + str(index + 1)].append([s_route[route_i], s_route[route_i + 1]])  # for piao zong
                accmulated_reward_one_episode.append(episode_reward_step)

                if done or terminal:
                    ### for piaozong
                    uav_poss_file = "~/UAVNumber_" + str(FLAGS.num_uav) + ".json"
                    route_str = json.dumps(route_dict)
                    with open(uav_poss_file, "w+") as f:
                        f.write(route_str)
                    ### for piaozong

                    # record custom statistics variables between episode and epoch ------------------------------
                    instantaneous_accmulated_reward.append(accmulated_reward_one_episode[-1])
                    j_index.append(j_index_one_episode[-1])
                    instantaneous_dis.append(disconnected_number_one_episode[-1])
                    instantaneous_out_the_map.append(over_map_one_episode[-1])
                    aver_cover.append(aver_cover_one_episode[-1])
                    energy_consumptions_for_test.append(energy_one_episode[-1])
                    energy_efficiency.append(aver_cover_one_episode[-1] * j_index_one_episode[-1] / energy_one_episode[-1])
                    print('Episode: %d - energy_consumptions: %s '
                          % (train_step / arglist.max_episode_len, str(env._get_energy_origin())))

                    j_index_one_episode = []
                    over_map_counter = 0
                    over_map_one_episode = []
                    disconnected_number_counter = 0
                    disconnected_number_one_episode = []
                    aver_cover_one_episode = []
                    energy_one_episode = []
                    route = []
                    episode_reward_step = 0
                    accmulated_reward_one_episode = []

                    if arglist.draw_picture_test:
                        if len(episode_rewards) % arglist.save_rate == 0:
                            if np.mean(energy_efficiency) > max_average_energy_efficiency:
                                max_model_index = model_index_step * arglist.save_rate - 1
                                max_average_energy_efficiency = np.mean(energy_efficiency)
                            with open(arglist.pictures_dir_test + model_name + 'test_report' + '.txt', 'a+') as file:
                                report = '\nModel-' + str(model_index_step * arglist.save_rate - 1) + \
                                         '-testing ' + str(arglist.num_episodes) + ' episodes\' result:' + \
                                         '\nAverage average attained coverage: ' + str(np.mean(aver_cover)) + \
                                         '\nAverage Jain\'s fairness index: ' + str(np.mean(j_index)) + \
                                         '\nAverage normalized average energy consumptions:' + str(np.mean(energy_consumptions_for_test)) + \
                                         '\nAverage energy efficiency:' + str(np.mean(energy_efficiency)) + '\n'
                                file.write(report)
                            draw_util.drawTest(model_index_step * arglist.save_rate - 1,
                                               arglist.pictures_dir_test + model_name,
                                               energy_consumptions_for_test, aver_cover, j_index,
                                               instantaneous_accmulated_reward, instantaneous_dis,
                                               instantaneous_out_the_map, len(aver_cover),
                                               bl_coverage, bl_jainindex, bl_loss, energy_efficiency, False)
                    # reset custom statistics variables between episode and epoch -------------------------------
                    obs_n = env.reset()
                    episode_step = 0
                    episode_rewards.append(0)
                    for a in agent_rewards:
                        a.append(0)
                    agent_info.append([[]])

                # increment global step counter
                train_step += 1

                # for benchmarking learned policies
                if arglist.benchmark:
                    for i, info in enumerate(info_n):
                        agent_info[-1][i].append(info_n['n'])
                    if train_step > arglist.benchmark_iters and (done or terminal):
                        file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                        print('Finished benchmarking, now saving...')
                        with open(file_name, 'wb') as fp:
                            pickle.dump(agent_info[:-1], fp)
                        break
                    continue

                # for displaying learned policies
                if arglist.draw_picture_test:
                    if len(episode_rewards) > arglist.num_episodes:
                        break
                    continue
                if arglist.display:
                    time.sleep(0.1)
                    env.render()
                    continue

                # update all trainers, if not in display or benchmark mode
                loss = None
                for agent in trainers:
                    agent.preupdate()
                for agent in trainers:
                    loss = agent.update(trainers, train_step)

                # save model, display training output
                if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                    episode_number_name = train_step / arglist.max_episode_len
                    save_dir_custom = arglist.save_dir + str(episode_number_name) + '/'
                    U.save_state(save_dir_custom, saver=saver)
                    # print statement depends on whether or not there are adversaries
                    if num_adversaries == 0:
                        print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                            train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]),
                            round(time.time() - t_start, 3)))
                    else:
                        print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                            train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]),
                            [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards],
                            round(time.time() - t_start, 3)))
                    t_start = time.time()
                    # Keep track of final episode reward
                    final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                    for rew in agent_rewards:
                        final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))
                    # draw custom statistics picture when saving the model --------------------------------------
                    if arglist.draw_picture_train:
                        episode_number_name = train_step / arglist.max_episode_len
                        model_name = arglist.save_dir.split('/')[-2] + '/'
                        draw_util.draw_episode(episode_number_name, arglist.pictures_dir_train + model_name,
                                               aver_cover, j_index, instantaneous_accmulated_reward,
                                               instantaneous_dis, instantaneous_out_the_map, loss_all,
                                               len(aver_cover))

                # saves final episode reward for plotting training curve later
                if len(episode_rewards) > arglist.num_episodes:
                    rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                    with open(rew_file_name, 'wb') as fp:
                        pickle.dump(final_ep_rewards, fp)
                    agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                    with open(agrew_file_name, 'wb') as fp:
                        pickle.dump(final_ep_ag_rewards, fp)
                    print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                    break
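

# --- Hypothetical sketch of the checkpoint sweep --------------------------------------------------
# The test loop above restores one saved model per outer iteration (checkpoint directories named
# model_index_step * save_rate - 1) and keeps the index with the best mean energy efficiency in
# max_model_index / max_average_energy_efficiency. The stand-alone sketch below shows only that
# selection logic; the function name, argument, and directory-naming convention are assumptions
# taken from the code above, not an API of this project.
def _best_checkpoint_sketch(mean_efficiency_per_model, save_rate):
    """Return (best_checkpoint_index, best_mean_efficiency) for a sweep of saved models."""
    best_idx, best_eff = 0, float('-inf')
    for step, eff in enumerate(mean_efficiency_per_model, start=1):
        if eff > best_eff:
            # Checkpoints are saved every `save_rate` episodes, so the directory name is
            # step * save_rate - 1, mirroring the naming used in the loop above.
            best_idx, best_eff = step * save_rate - 1, eff
    return best_idx, best_eff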
def train(arglist, restore_model_number):
    debug = False
    multi_process = arglist.mp
    num_tasks = arglist.num_task_transfer  # total number of tasks
    list_of_taskenv = []  # env list
    save_path = arglist.save_dir
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    with U.single_threaded_session():
        sess = tf.get_default_session()
        if debug:
            begin = time_begin()
        # 1.1 create the shared actor trainer
        env = make_env(arglist.scenario, reward_type=arglist.reward_type)
        env.set_map(sample_map(arglist.test_data_dir + arglist.test_data_name + "_1.h5"))
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        actor_0 = get_trainers(env, "actor_", num_adversaries, obs_shape_n, arglist, type=0, session=sess)

        # 1.2 create the actor trainer and critic trainer for each task
        critic_list = []  # critics of all tasks
        actor_list = []
        print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy))

        # 1.2 initialize global variables
        episodes_rewards = [[0.0] for _ in range(num_tasks)]  # each element is the sum of all agents' rewards in one episode
        # each element of agent_rewards[i] records a single agent's total reward in one episode
        agent_rewards = [[[0.0] for _ in range(env.n)] for _ in range(num_tasks)]
        final_ep_rewards = [[] for _ in range(num_tasks)]  # sum of rewards for training curve
        final_ep_ag_rewards = [[] for _ in range(num_tasks)]  # agent rewards for training curve
        energy_consumptions_for_test = [[] for _ in range(num_tasks)]
        j_index = [[] for _ in range(num_tasks)]
        aver_cover = [[] for _ in range(num_tasks)]
        instantaneous_dis = [[] for _ in range(num_tasks)]
        instantaneous_out_the_map = [[] for _ in range(num_tasks)]
        energy_efficiency = [[] for _ in range(num_tasks)]
        instantaneous_accmulated_reward = [[] for _ in range(num_tasks)]
        model_number = int(arglist.num_train_episodes / arglist.save_rate)
        saver = tf.train.Saver(max_to_keep=model_number)

        # 1.3 initialize local variables
        global_steps = np.zeros(num_tasks)
        local_steps = np.zeros(num_tasks)  # local timesteps for each env
        energy_one_episode = [[] for _ in range(num_tasks)]
        j_index_one_episode = [[] for _ in range(num_tasks)]
        aver_cover_one_episode = [[] for _ in range(num_tasks)]
        over_map_counter = np.zeros(num_tasks)
        over_map_one_episode = [[] for _ in range(num_tasks)]
        disconnected_number_counter = np.zeros(num_tasks)
        disconnected_number_one_episode = [[] for _ in range(num_tasks)]
        episode_reward_step = np.zeros(num_tasks)  # accumulates the per-step mean reward over all agents in one episode
        accmulated_reward_one_episode = [[] for _ in range(num_tasks)]
        route_one_episode = [[] for _ in range(num_tasks)]
        if debug:
            print(time_end(begin, "step 3"))
            begin = time_begin()

        # 1.4 load checkpoints
        if arglist.load_dir == "":
            arglist.load_dir = os.path.join(save_path, str(restore_model_number), "model.ckpt")
        if arglist.transfer_restore:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        for i in range(num_tasks):
            list_of_taskenv.append(make_env(arglist.scenario, reward_type=arglist.reward_type))
            critic_trainers = get_trainers(list_of_taskenv[i], "task_" + str(i + 1) + "_", num_adversaries,
                                           obs_shape_n, arglist, actors=actor_0, type=1, session=sess)
            actor_trainers = get_trainers(list_of_taskenv[i], "task_" + str(i + 1) + "_", num_adversaries,
                                          obs_shape_n, arglist, actor_env_name="actor_", type=2, session=sess)
            actor_list.append(actor_trainers)
            critic_list.append(critic_trainers)

        # Initialize
        U.initialize()
        for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
            print(var)

        # 1.5 initialize the envs
        obs_n_list = []
        for i in range(num_tasks):
            obs_n = list_of_taskenv[i].reset()
            list_of_taskenv[i].set_map(
                sample_map(arglist.test_data_dir + arglist.test_data_name + "_" + str(i + 1) + ".h5"))
            obs_n_list.append(obs_n)
        if debug:
            print(time_end(begin, "initialize"))
            begin = time_begin()

        # 2. training
        t_start = time.time()
        print('Starting iterations...')
        episode_start_time = time.time()
        state_dim = obs_shape_n[0][0]
        history_n = [[queue.Queue(arglist.history_length) for _ in range(env.n)] for _ in range(num_tasks)]
        for i in range(num_tasks):
            for j in range(env.n):
                for _ in range(arglist.history_length):
                    history_n[i][j].put(obs_n_list[i][j])

        while True:
            for task_index in range(num_tasks):
                # 2.1 step the environment and collect samples
                current_env = list_of_taskenv[task_index]
                # get action
                # action_n = [agent.action(obs) for agent, obs in zip(actor_0, obs_n_list[task_index])]
                action_n = [agent.action(obs) for agent, obs in zip(actor_0, history_n[task_index])]
                # environment step
                new_obs_n, rew_n, done_n, info_n = current_env.step(action_n)
                current_critics = critic_list[task_index]
                current_actors = actor_list[task_index]
                if debug:
                    print(time_end(begin, "env.step"))
                    begin = time_begin()
                local_steps[task_index] += 1  # update the local counter
                global_steps[task_index] += 1  # update the global counter
                done = all(done_n)
                terminal = (local_steps[task_index] >= arglist.max_episode_len)
                # collect experience
                for i in range(env.n):
                    current_critics[i].experience(obs_n_list[task_index][i], action_n[i], rew_n[i],
                                                  new_obs_n[i], done_n[i], terminal)
                # update obs
                obs_n_list[task_index] = new_obs_n
                for i in range(env.n):
                    history_n[task_index][i].get()
                    history_n[task_index][i].put(new_obs_n[i])
                # update rewards
                for i, rew in enumerate(rew_n):
                    episodes_rewards[task_index][-1] += rew
                    agent_rewards[task_index][i][-1] += rew

                # 2.2 optimize every task's critic and actor
                for critic in current_critics:
                    critic.preupdate()
                for critic in current_critics:
                    critic.update(current_critics, global_steps[task_index])
                for index, actor in enumerate(current_actors):
                    actor.update(current_actors, current_critics, global_steps[task_index], index)
                if debug:
                    print(time_end(begin, "update actor"))
                    begin = time_begin()

                # 2.4 record and update training statistics
                # energy
                energy_one_episode[task_index].append(current_env.get_energy())
                # fair index
                j_index_one_episode[task_index].append(current_env.get_jain_index())
                # coverage
                aver_cover_one_episode[task_index].append(current_env.get_aver_cover())
                # over map counter
                over_map_counter[task_index] += current_env.get_over_map()
                over_map_one_episode[task_index].append(over_map_counter[task_index])
                # disconnected counter
                disconnected_number_counter[task_index] += current_env.get_dis()
                disconnected_number_one_episode[task_index].append(disconnected_number_counter[task_index])
                # reward
                episode_reward_step[task_index] += np.mean(rew_n)
                accmulated_reward_one_episode[task_index].append(episode_reward_step[task_index])
                route = current_env.get_agent_pos()
                route_one_episode[task_index].append(route)
                if debug:
                    print(time_end(begin, "others"))
                    begin = time_begin()

                episode_number = math.ceil(global_steps[task_index] / arglist.max_episode_len)
                if done or terminal:
                    model_name = save_path.split('/')[-2] + '/'
                    temp_efficiency = np.array(aver_cover_one_episode[task_index]) * \
                        np.array(j_index_one_episode[task_index]) / np.array(energy_one_episode[task_index])
                    draw_util.draw_single_episode(
                        arglist.pictures_dir_transfer_train + model_name + "single_episode_task_" + str(task_index) + "/",
                        episode_number, temp_efficiency, aver_cover_one_episode[task_index],
                        j_index_one_episode[task_index], energy_one_episode[task_index],
                        disconnected_number_one_episode[task_index], over_map_one_episode[task_index],
                        accmulated_reward_one_episode[task_index])
                    # record the per-episode variables
                    energy_consumptions_for_test[task_index].append(energy_one_episode[task_index][-1])  # energy
                    j_index[task_index].append(j_index_one_episode[task_index][-1])  # fairness index
                    aver_cover[task_index].append(aver_cover_one_episode[task_index][-1])  # coverage
                    instantaneous_dis[task_index].append(disconnected_number_one_episode[task_index][-1])  # disconnected
                    instantaneous_out_the_map[task_index].append(over_map_one_episode[task_index][-1])  # out of the map
                    instantaneous_accmulated_reward[task_index].append(accmulated_reward_one_episode[task_index][-1])  # reward
                    energy_efficiency[task_index].append(
                        aver_cover_one_episode[task_index][-1] * j_index_one_episode[task_index][-1] /
                        energy_one_episode[task_index][-1])  # efficiency
                    episode_end_time = time.time()
                    episode_time = episode_end_time - episode_start_time
                    episode_start_time = episode_end_time
                    with open(arglist.pictures_dir_transfer_train + model_name + "task_" + str(task_index) +
                              '_train_info' + '.txt', 'a+') as f:
                        info = "Task index: %d, Episode number %d, energy consumption: %s, efficiency: %s, time: %s" % (
                            task_index, episode_number, str(current_env.get_energy_origin()),
                            str(energy_efficiency[task_index][-1]), str(round(episode_time, 3)))
                        f.write(info + "\n")
                        print(info)

                    # per-episode local variables should be reset here ----------------------------------
                    if task_index == num_tasks - 1:
                        energy_one_episode = [[] for _ in range(num_tasks)]
                        j_index_one_episode = [[] for _ in range(num_tasks)]
                        aver_cover_one_episode = [[] for _ in range(num_tasks)]
                        over_map_counter = np.zeros(num_tasks)
                        over_map_one_episode = [[] for _ in range(num_tasks)]
                        disconnected_number_counter = np.zeros(num_tasks)
                        disconnected_number_one_episode = [[] for _ in range(num_tasks)]
                        episode_reward_step = np.zeros(num_tasks)
                        accmulated_reward_one_episode = [[] for _ in range(num_tasks)]
                        route_one_episode = [[] for _ in range(num_tasks)]
                    # reset local variables
                    obs_n_list[task_index] = current_env.reset()  # reset the env
                    current_env.set_map(
                        sample_map(arglist.test_data_dir + arglist.test_data_name + "_" + str(task_index + 1) + ".h5"))
                    local_steps[task_index] = 0  # reset the local counter
                    # update global variables
                    episodes_rewards[task_index].append(0)  # start a new entry
                    for reward in agent_rewards[task_index]:
                        reward.append(0)

                # save model, display training output
                if terminal and (episode_number % arglist.save_rate == 0):
                    # tf.get_default_session().run(global_steps_assign_op, feed_dict={global_steps_ph: global_steps})
                    # save_dir_custom = os.path.join(save_path, str(episode_number), 'model.ckpt')
                    # U.save_state(save_dir_custom, saver=saver)
                    # print statement depends on whether or not there are adversaries
                    # mean reward over the latest save_rate episodes
                    save_rate_mean_reward = np.mean(episodes_rewards[task_index][-arglist.save_rate:])
                    if num_adversaries == 0:
                        print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                            global_steps[task_index], episode_number, save_rate_mean_reward,
                            round(time.time() - t_start, 3)))
                    else:
                        print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                            global_steps[task_index], episode_number, save_rate_mean_reward,
                            [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards[task_index]],
                            round(time.time() - t_start, 3)))
                    t_start = time.time()
                    final_ep_rewards[task_index].append(save_rate_mean_reward)
                    for rew in agent_rewards[task_index]:
                        final_ep_ag_rewards[task_index].append(np.mean(rew[-arglist.save_rate:]))
                    # save the training curves
                    if arglist.draw_picture_train:
                        # model_name = save_path.split('/')[-2] + '/'
                        draw_util.draw_episodes(
                            episode_number,
                            arglist.pictures_dir_transfer_train + model_name + "all_episodes_task_" + str(task_index) + "/",
                            aver_cover[task_index], j_index[task_index], energy_consumptions_for_test[task_index],
                            instantaneous_dis[task_index], instantaneous_out_the_map[task_index],
                            energy_efficiency[task_index], instantaneous_accmulated_reward[task_index],
                            len(aver_cover[task_index]))

                # saves final episode reward for plotting training curve later
                if episode_number > arglist.num_train_episodes:
                    mkdir(arglist.plots_dir)
                    rew_file_name = arglist.plots_dir + arglist.exp_name + str(task_index) + '_rewards.pkl'
                    with open(rew_file_name, 'wb') as fp:
                        pickle.dump(final_ep_rewards, fp)
                    agrew_file_name = arglist.plots_dir + arglist.exp_name + str(task_index) + '_agrewards.pkl'
                    with open(agrew_file_name, 'wb') as fp:
                        pickle.dump(final_ep_ag_rewards, fp)
                    print('...Finished total of {} episodes.'.format(episode_number))

            if episode_number > arglist.num_train_episodes:
                break
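

# --- Hypothetical alternative for the observation history -----------------------------------------
# The transfer-training loop above keeps a fixed-length observation history per agent with
# queue.Queue(arglist.history_length), popping the oldest entry and pushing the newest one every
# step. A collections.deque with maxlen performs the same rotation implicitly. This is only a
# sketch of that idea under the assumption that a plain FIFO of length history_length is all the
# actors need; the helper name is an assumption and it is not used by the code above.
from collections import deque


def _make_history_sketch(initial_obs, history_length):
    """Return a deque pre-filled with the initial observation, bounded to history_length items."""
    return deque([initial_obs] * history_length, maxlen=history_length)

# Usage sketch: history = _make_history_sketch(obs, arglist.history_length);
# history.append(new_obs) then drops the oldest observation automatically.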
def random_maddpg_test(arglist):
    debug = False
    num_tasks = arglist.num_task_transfer  # total number of tasks
    list_of_taskenv = []  # env list
    graph = tf.Graph()
    with graph.as_default():
        with U.single_threaded_session():
            if debug:
                begin = time_begin()
            # 1.1 create the common actor
            env = make_env(arglist.scenario, reward_type=arglist.reward_type)
            env.set_map(sample_map(arglist.test_data_dir + arglist.test_data_name + "_1.h5"))
            # Create agent trainers
            obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
            num_adversaries = min(env.n, arglist.num_adversaries)
            actors = get_trainers(env, "actor_", num_adversaries, obs_shape_n, arglist, type=0)
            for i in range(num_tasks):
                list_of_taskenv.append(make_env(arglist.scenario, reward_type=arglist.reward_type))
            print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy))

            # 1.2 Initialize
            U.initialize()
            model_name = arglist.load_dir.split('/')[-2] + '/'
            path = arglist.pictures_dir_transfer_test + model_name
            mkdir(path)
            for i in range(num_tasks):
                mkdir(os.path.join(path, "task_" + str(i)))

            # 2.1 load checkpoints
            # model_load_dir = os.path.join(arglist.load_dir, str(model_number * arglist.save_rate), 'model.ckpt')
            # print('From ', model_load_dir, ' Loading previous state...')
            # U.load_state(model_load_dir)

            # 3.1 initialize global variables
            global_steps = np.zeros(num_tasks)  # global timesteps for each env
            episodes_rewards = [[0.0] for _ in range(num_tasks)]  # each element is the sum of all agents' rewards in one episode
            # each element of agent_rewards[i] records a single agent's total reward in one episode
            agent_rewards = [[[0.0] for _ in range(env.n)] for _ in range(num_tasks)]
            energy_consumptions_for_test = [[] for _ in range(num_tasks)]
            j_index = [[] for _ in range(num_tasks)]
            aver_cover = [[] for _ in range(num_tasks)]
            instantaneous_dis = [[] for _ in range(num_tasks)]
            instantaneous_out_the_map = [[] for _ in range(num_tasks)]
            energy_efficiency = [[] for _ in range(num_tasks)]
            instantaneous_accmulated_reward = [[] for _ in range(num_tasks)]

            # 3.2 initialize local variables
            local_steps = np.zeros(num_tasks)  # local timesteps for each env
            energy_one_episode = [[] for _ in range(num_tasks)]
            j_index_one_episode = [[] for _ in range(num_tasks)]
            aver_cover_one_episode = [[] for _ in range(num_tasks)]
            over_map_counter = np.zeros(num_tasks)
            over_map_one_episode = [[] for _ in range(num_tasks)]
            disconnected_number_counter = np.zeros(num_tasks)
            disconnected_number_one_episode = [[] for _ in range(num_tasks)]
            episode_reward_step = np.zeros(num_tasks)  # accumulates the per-step mean reward over all agents in one episode
            accmulated_reward_one_episode = [[] for _ in range(num_tasks)]
            route_one_episode = [[] for _ in range(num_tasks)]
            bl_coverage = 0.8
            bl_jainindex = 0.8
            bl_loss = 100

            # 3.3 initialize the envs
            obs_n_list = []
            for i in range(num_tasks):
                obs_n = list_of_taskenv[i].reset()
                list_of_taskenv[i].set_map(
                    sample_map(arglist.test_data_dir + arglist.test_data_name + "_" + str(i + 1) + ".h5", random=False))
                obs_n_list.append(obs_n)

            # 3.4 initialize the observation history
            history_n = [[queue.Queue(arglist.history_length) for _ in range(env.n)] for _ in range(num_tasks)]
            for i in range(num_tasks):
                for j in range(env.n):
                    for _ in range(arglist.history_length):
                        history_n[i][j].put(obs_n_list[i][j])

            # 4 test
            episode_start_time = time.time()
            print('Starting iterations...')
            episode_number = 0
            while True:
                for task_index in range(num_tasks):
                    # 3.1 step the environment
                    current_env = list_of_taskenv[task_index]
                    # get action
                    action_n = [agent.action(obs) for agent, obs in zip(actors, history_n[task_index])]
                    # environment step
                    new_obs_n, rew_n, done_n, info_n = current_env.step(action_n)
                    local_steps[task_index] += 1  # update the local counter
                    global_steps[task_index] += 1  # update the global counter
                    done = all(done_n)
                    terminal = (local_steps[task_index] >= arglist.max_episode_len)
                    # update obs
                    obs_n_list[task_index] = new_obs_n
                    # update rewards
                    for i, rew in enumerate(rew_n):
                        episodes_rewards[task_index][-1] += rew
                        agent_rewards[task_index][i][-1] += rew
                    # energy
                    energy_one_episode[task_index].append(current_env.get_energy())
                    # fair index
                    j_index_one_episode[task_index].append(current_env.get_jain_index())
                    # coverage
                    aver_cover_one_episode[task_index].append(current_env.get_aver_cover())
                    # over map counter
                    over_map_counter[task_index] += current_env.get_over_map()
                    over_map_one_episode[task_index].append(over_map_counter[task_index])
                    # disconnected counter
                    disconnected_number_counter[task_index] += current_env.get_dis()
                    disconnected_number_one_episode[task_index].append(disconnected_number_counter[task_index])
                    # reward
                    episode_reward_step[task_index] += np.mean(rew_n)
                    accmulated_reward_one_episode[task_index].append(episode_reward_step[task_index])
                    route = current_env.get_agent_pos()
                    route_one_episode[task_index].append(route)

                    episode_number = math.ceil(global_steps[task_index] / arglist.max_episode_len)
                    if done or terminal:
                        # record the per-episode variables
                        energy_consumptions_for_test[task_index].append(energy_one_episode[task_index][-1])  # energy
                        j_index[task_index].append(j_index_one_episode[task_index][-1])  # fairness index
                        aver_cover[task_index].append(aver_cover_one_episode[task_index][-1])  # coverage
                        instantaneous_dis[task_index].append(disconnected_number_one_episode[task_index][-1])  # disconnected
                        instantaneous_out_the_map[task_index].append(over_map_one_episode[task_index][-1])  # out of the map
                        instantaneous_accmulated_reward[task_index].append(accmulated_reward_one_episode[task_index][-1])  # reward
                        energy_efficiency[task_index].append(
                            aver_cover_one_episode[task_index][-1] * j_index_one_episode[task_index][-1] /
                            energy_one_episode[task_index][-1])  # efficiency
                        episode_end_time = time.time()
                        episode_time = episode_end_time - episode_start_time
                        episode_start_time = episode_end_time
                        print('Task %d, Episode: %d - energy_consumptions: %s, efficiency: %s, time %s' % (
                            task_index, episode_number, str(current_env.get_energy_origin()),
                            str(energy_efficiency[task_index][-1]), str(round(episode_time, 3))))

                        current_path = os.path.join(path, "task_" + str(task_index))
                        if arglist.draw_picture_test:
                            file_path = os.path.join(current_path, "random_model_test.log")
                            if episode_number == arglist.num_test_episodes:
                                report = '\nOK===============report=====================\nRandom maddpg Model-testing ' \
                                         + str(arglist.num_test_episodes) + ' episodes\' result:' \
                                         + '\n!!!Max energy efficiency: ' + str(np.max(energy_efficiency[task_index])) \
                                         + '\n!!!Average energy efficiency:' + str(np.mean(energy_efficiency[task_index])) \
                                         + '\nAverage average attained coverage: ' + str(np.mean(aver_cover[task_index])) \
                                         + '\nAverage Jain\'s fairness index: ' + str(np.mean(j_index[task_index])) \
                                         + '\nAverage normalized average energy consumptions:' \
                                         + str(np.mean(energy_consumptions_for_test[task_index])) \
                                         + "\n" + "==========================end=============================\n"
                                draw_util.drawTest("random", current_path + "random_maddpg",
                                                   energy_efficiency[task_index],
                                                   energy_consumptions_for_test[task_index],
                                                   aver_cover[task_index], j_index[task_index],
                                                   instantaneous_accmulated_reward[task_index],
                                                   instantaneous_dis[task_index],
                                                   instantaneous_out_the_map[task_index],
                                                   len(aver_cover[task_index]),
                                                   bl_coverage, bl_jainindex, bl_loss, False)
                            else:
                                report = '\nRandom maddpg Model-' \
                                         + '-episode ' + str(episode_number) + ' result:' \
                                         + '\n!!!Energy efficiency: ' + str(energy_efficiency[task_index][-1]) \
                                         + '\nAverage attained coverage: ' + str(aver_cover[task_index][-1]) \
                                         + '\nJain\'s fairness index: ' + str(j_index[task_index][-1]) \
                                         + '\nnormalized average energy consumptions: ' \
                                         + str(energy_consumptions_for_test[task_index][-1]) \
                                         + "\n"
                            with open(file_path, 'a+') as file:
                                file.write(report)

                        # reset custom statistics variables between episode and epoch ---------------------------
                        if task_index == num_tasks - 1:
                            energy_one_episode = [[] for _ in range(num_tasks)]
                            j_index_one_episode = [[] for _ in range(num_tasks)]
                            aver_cover_one_episode = [[] for _ in range(num_tasks)]
                            over_map_counter = np.zeros(num_tasks)
                            over_map_one_episode = [[] for _ in range(num_tasks)]
                            disconnected_number_counter = np.zeros(num_tasks)
                            disconnected_number_one_episode = [[] for _ in range(num_tasks)]
                            episode_reward_step = np.zeros(num_tasks)
                            accmulated_reward_one_episode = [[] for _ in range(num_tasks)]
                            route_one_episode = [[] for _ in range(num_tasks)]
                        # reset local variables
                        obs_n_list[task_index] = current_env.reset()  # reset the env
                        current_env.set_map(
                            sample_map(arglist.test_data_dir + arglist.test_data_name + "_" + str(task_index + 1) + ".h5",
                                       random=False))
                        local_steps[task_index] = 0  # reset the local counter
                        # update global variables
                        episodes_rewards[task_index].append(0)  # start a new entry
                        for reward in agent_rewards[task_index]:
                            reward.append(0)

                if episode_number > arglist.num_test_episodes:
                    break