def test(arglist):
    debug = False
    num_tasks = arglist.num_task  # total number of tasks
    list_of_taskenv = []  # list of per-task environments
    load_path = arglist.load_dir
    with U.single_threaded_session():
        if debug:
            begin = time_begin()
        # 1.1 Create the actor trainer and critic trainer for each task.
        trainers_list = []
        env = make_env(arglist.scenario, arglist.benchmark)
        # Create agent trainers.
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        for i in range(num_tasks):
            list_of_taskenv.append(make_env(arglist.scenario))
            trainers = get_trainers(list_of_taskenv[i], "task_" + str(i + 1) + "_",
                                    num_adversaries, obs_shape_n, arglist)
            trainers_list.append(trainers)
        print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy))

        global_steps_tensor = tf.Variable(tf.zeros(num_tasks), trainable=False)  # global timesteps for each env
        global_steps_ph = tf.placeholder(tf.float32, [num_tasks])
        global_steps_assign_op = tf.assign(global_steps_tensor, global_steps_ph)

        model_number = int(arglist.num_episodes / arglist.save_rate)
        saver = tf.train.Saver(max_to_keep=model_number)

        efficiency_list = []
        for i in range(num_tasks):
            efficiency_list.append(tf.placeholder(tf.float32, shape=None, name="efficiency_placeholder" + str(i)))
        efficiency_summary_list = []
        for i in range(num_tasks):
            efficiency_summary_list.append(tf.summary.scalar("efficiency_%s" % i, efficiency_list[i]))
        writer = tf.summary.FileWriter("../summary/efficiency")

        # Initialize.
        U.initialize()
        for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
            print(var)
        if debug:
            print(time_end(begin, "initialize"))
            begin = time_begin()

        model_name = arglist.load_dir.split('/')[-2] + '/'
        mkdir(arglist.pictures_dir_test + model_name)
        model_index_step = 0
        model_number_total = arglist.train_num_episodes / arglist.save_rate
        max_model_index = 0
        max_average_energy_efficiency = 0

        while True:
            if model_index_step >= model_number_total:
                with open(arglist.pictures_dir_test + model_name + 'test_report' + '.txt', 'a+') as file:
                    report = '\nModel ' + str(max_model_index) + ' attained max average energy efficiency' + \
                             '\nMax average energy efficiency:' + str(max_average_energy_efficiency)
                    file.write(report)
                break
            else:
                model_index_step += 1

            # 1.4 Load checkpoints.
            if arglist.load_dir == "":
                arglist.load_dir = arglist.save_dir
            if arglist.display or arglist.restore or arglist.benchmark:
                print('Loading previous state...')
                model_load_dir = arglist.load_dir + str(model_index_step * arglist.save_rate - 1) + '/'
                U.load_state(arglist.load_dir)
                # global_steps = tf.get_default_session().run(global_steps_tensor)

            # 1.5 Initialize the environments.
            obs_n_list = []
            for i in range(num_tasks):
                obs_n = list_of_taskenv[i].reset()
                obs_n_list.append(obs_n)

            # 1.2 Initialize global statistics.
            episodes_rewards = [[0.0] for _ in range(num_tasks)]  # each element is the sum of all agents' rewards in one episode
            # Each element of agent_rewards[i] records the sum of a single agent's rewards in one episode.
            agent_rewards = [[[0.0] for _ in range(env.n)] for _ in range(num_tasks)]
            final_ep_rewards = [[] for _ in range(num_tasks)]  # sum of rewards for training curve
            final_ep_ag_rewards = [[] for _ in range(num_tasks)]  # agent rewards for training curve
            energy_consumptions_for_test = [[] for _ in range(num_tasks)]
            j_index = [[] for _ in range(num_tasks)]
            aver_cover = [[] for _ in range(num_tasks)]
            instantaneous_dis = [[] for _ in range(num_tasks)]
            instantaneous_out_the_map = [[] for _ in range(num_tasks)]
            energy_efficiency = [[] for _ in range(num_tasks)]
            instantaneous_accmulated_reward = [[] for _ in range(num_tasks)]

            # 1.3 Initialize per-episode (local) statistics.
            global_steps = np.zeros(num_tasks)  # global timesteps for each env
            local_steps = np.zeros(num_tasks)   # local timesteps for each env
            energy_one_episode = [[] for _ in range(num_tasks)]
            j_index_one_episode = [[] for _ in range(num_tasks)]
            aver_cover_one_episode = [[] for _ in range(num_tasks)]
            over_map_counter = np.zeros(num_tasks)
            over_map_one_episode = [[] for _ in range(num_tasks)]
            disconnected_number_counter = np.zeros(num_tasks)
            disconnected_number_one_episode = [[] for _ in range(num_tasks)]
            episode_reward_step = np.zeros(num_tasks)  # accumulates the per-step mean reward over all agents within one episode
            accmulated_reward_one_episode = [[] for _ in range(num_tasks)]
            route_one_episode = [[] for _ in range(num_tasks)]
            bl_coverage = 0.8
            bl_jainindex = 0.8
            bl_loss = 100

            print('Starting iterations...')
            while True:
                for task_index in range(num_tasks):
                    # 2.1 Step the environment and collect samples.
                    current_env = list_of_taskenv[task_index]
                    current_trainers = trainers_list[task_index]
                    # Get actions from the current task's trainers.
                    action_n = [agent.action(obs) for agent, obs in zip(current_trainers, obs_n_list[task_index])]
                    # Environment step.
                    new_obs_n, rew_n, done_n, info_n = current_env.step(action_n)
                    if debug:
                        print(time_end(begin, "env.step"))
                        begin = time_begin()
                    local_steps[task_index] += 1   # update the local step counter
                    global_steps[task_index] += 1  # update the global step counter
                    done = all(done_n)
                    terminal = (local_steps[task_index] >= arglist.max_episode_len)
                    # Collect experience.
                    for i in range(env.n):
                        current_trainers[i].experience(obs_n_list[task_index][i], action_n[i], rew_n[i],
                                                       new_obs_n[i], done_n[i], terminal)
                    # Update observations.
                    obs_n_list[task_index] = new_obs_n
                    # Update rewards.
                    for i, rew in enumerate(rew_n):
                        episodes_rewards[task_index][-1] += rew
                        agent_rewards[task_index][i][-1] += rew
                    # Energy.
                    energy_one_episode[task_index].append(current_env.get_energy())
                    # Fairness index.
                    j_index_one_episode[task_index].append(current_env.get_jain_index())
                    # Coverage.
                    aver_cover_one_episode[task_index].append(current_env.get_aver_cover())
                    # Out-of-map counter.
                    over_map_counter[task_index] += current_env.get_over_map()
                    over_map_one_episode[task_index].append(over_map_counter[task_index])
                    # Disconnection counter.
                    disconnected_number_counter[task_index] += current_env.get_dis()
                    disconnected_number_one_episode[task_index].append(disconnected_number_counter[task_index])
                    # Reward.
                    episode_reward_step[task_index] += np.mean(rew_n)
                    accmulated_reward_one_episode[task_index].append(episode_reward_step[task_index])
                    route = current_env.get_agent_pos()
                    route_one_episode[task_index].append(route)

                    if done or terminal:
                        # Record per-episode statistics. ---------------------------------------------------------
                        instantaneous_accmulated_reward[task_index].append(accmulated_reward_one_episode[task_index][-1])
                        j_index[task_index].append(j_index_one_episode[task_index][-1])
                        instantaneous_dis[task_index].append(disconnected_number_one_episode[task_index][-1])
                        instantaneous_out_the_map[task_index].append(over_map_one_episode[task_index][-1])
                        aver_cover[task_index].append(aver_cover_one_episode[task_index][-1])
                        energy_consumptions_for_test[task_index].append(energy_one_episode[task_index][-1])
                        energy_efficiency[task_index].append(aver_cover_one_episode[task_index][-1] *
                                                             j_index_one_episode[task_index][-1] /
                                                             energy_one_episode[task_index][-1])
                        episode_number = int(global_steps[task_index] / arglist.max_episode_len)
                        print('Episode: %d - energy_consumptions: %s ' %
                              (episode_number, str(current_env.get_energy_origin())))

                        if task_index == num_tasks - 1:
                            energy_one_episode = [[] for _ in range(num_tasks)]
                            j_index_one_episode = [[] for _ in range(num_tasks)]
                            aver_cover_one_episode = [[] for _ in range(num_tasks)]
                            over_map_counter = np.zeros(num_tasks)
                            over_map_one_episode = [[] for _ in range(num_tasks)]
                            disconnected_number_counter = np.zeros(num_tasks)
                            disconnected_number_one_episode = [[] for _ in range(num_tasks)]
                            episode_reward_step = np.zeros(num_tasks)
                            accmulated_reward_one_episode = [[] for _ in range(num_tasks)]
                            route_one_episode = [[] for _ in range(num_tasks)]

                        if arglist.draw_picture_test:
                            if len(episodes_rewards[task_index]) % arglist.save_rate == 0:
                                if np.mean(energy_efficiency) > max_average_energy_efficiency:
                                    max_model_index = model_index_step * arglist.save_rate - 1
                                    max_average_energy_efficiency = np.mean(energy_efficiency)
                                with open(arglist.pictures_dir_test + model_name + 'test_report' + '.txt', 'a+') as file:
                                    report = '\nModel-' + str(model_index_step * arglist.save_rate - 1) + \
                                             '-testing ' + str(arglist.num_episodes) + ' episodes\'s result:' + \
                                             '\nAverage attained coverage: ' + str(np.mean(aver_cover)) + \
                                             '\nAverage Jain\'s fairness index: ' + str(np.mean(j_index)) + \
                                             '\nAverage normalized energy consumption: ' + str(np.mean(energy_consumptions_for_test)) + \
                                             '\nAverage energy efficiency: ' + str(np.mean(energy_efficiency)) + '\n'
                                    file.write(report)
                                draw_util.drawTest(model_index_step * arglist.save_rate - 1,
                                                   arglist.pictures_dir_test + model_name,
                                                   energy_consumptions_for_test, aver_cover, j_index,
                                                   instantaneous_accmulated_reward, instantaneous_dis,
                                                   instantaneous_out_the_map, len(aver_cover),
                                                   bl_coverage, bl_jainindex, bl_loss,
                                                   energy_efficiency, False)

                        # Reset the environment and the local counters for the next episode. ----------------------
                        obs_n_list[task_index] = current_env.reset()
                        local_steps[task_index] = 0
                        episodes_rewards[task_index].append(0)
                        for reward in agent_rewards[task_index]:
                            reward.append(0)

                # For displaying learned policies.
                if arglist.draw_picture_test:
                    if len(episodes_rewards[-1]) > arglist.num_episodes:
                        break
                    continue
                # Save final episode rewards for plotting training curves later.
                if len(episodes_rewards[-1]) > arglist.num_episodes:
                    rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                    with open(rew_file_name, 'wb') as fp:
                        pickle.dump(final_ep_rewards, fp)
                    agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                    with open(agrew_file_name, 'wb') as fp:
                        pickle.dump(final_ep_ag_rewards, fp)
                    print('...Finished total of {} episodes.'.format(len(episodes_rewards[-1])))
                    break
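

# The per-episode metric used above (and in the training loops below) is
# energy efficiency = average coverage * Jain fairness index / normalized energy consumption.
# A minimal standalone sketch of that computation; the helper name and the toy
# numbers are illustrative only and are not part of this project.
def _energy_efficiency_sketch(aver_cover, jain_index, energy):
    # All three inputs are scalars collected from the environment for one step or episode.
    return aver_cover * jain_index / energy


# Example: 0.9 coverage, 0.8 fairness, 1.25 normalized energy -> efficiency 0.576.
assert abs(_energy_efficiency_sketch(0.9, 0.8, 1.25) - 0.576) < 1e-9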
def train(arglist, restore_model_number):
    debug = False
    multi_process = arglist.mp
    num_tasks = arglist.num_task_transfer  # total number of tasks
    list_of_taskenv = []  # list of per-task environments
    save_path = arglist.save_dir
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    with U.single_threaded_session():
        sess = tf.get_default_session()
        if debug:
            begin = time_begin()
        # 1.1 Create the shared actor trainer.
        env = make_env(arglist.scenario, reward_type=arglist.reward_type)
        env.set_map(sample_map(arglist.test_data_dir + arglist.test_data_name + "_1.h5"))
        # Create agent trainers.
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        actor_0 = get_trainers(env, "actor_", num_adversaries, obs_shape_n, arglist, type=0, session=sess)
        # 1.2 Create the per-task actor and critic trainer lists.
        critic_list = []  # critics for all tasks
        actor_list = []
        print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy))

        # Initialize global statistics.
        episodes_rewards = [[0.0] for _ in range(num_tasks)]  # each element is the sum of all agents' rewards in one episode
        # Each element of agent_rewards[i] records the sum of a single agent's rewards in one episode.
        agent_rewards = [[[0.0] for _ in range(env.n)] for _ in range(num_tasks)]
        final_ep_rewards = [[] for _ in range(num_tasks)]  # sum of rewards for training curve
        final_ep_ag_rewards = [[] for _ in range(num_tasks)]  # agent rewards for training curve
        energy_consumptions_for_test = [[] for _ in range(num_tasks)]
        j_index = [[] for _ in range(num_tasks)]
        aver_cover = [[] for _ in range(num_tasks)]
        instantaneous_dis = [[] for _ in range(num_tasks)]
        instantaneous_out_the_map = [[] for _ in range(num_tasks)]
        energy_efficiency = [[] for _ in range(num_tasks)]
        instantaneous_accmulated_reward = [[] for _ in range(num_tasks)]

        model_number = int(arglist.num_train_episodes / arglist.save_rate)
        saver = tf.train.Saver(max_to_keep=model_number)

        # 1.3 Initialize per-episode (local) statistics.
        global_steps = np.zeros(num_tasks)  # global timesteps for each env
        local_steps = np.zeros(num_tasks)   # local timesteps for each env
        energy_one_episode = [[] for _ in range(num_tasks)]
        j_index_one_episode = [[] for _ in range(num_tasks)]
        aver_cover_one_episode = [[] for _ in range(num_tasks)]
        over_map_counter = np.zeros(num_tasks)
        over_map_one_episode = [[] for _ in range(num_tasks)]
        disconnected_number_counter = np.zeros(num_tasks)
        disconnected_number_one_episode = [[] for _ in range(num_tasks)]
        episode_reward_step = np.zeros(num_tasks)  # accumulates the per-step mean reward over all agents within one episode
        accmulated_reward_one_episode = [[] for _ in range(num_tasks)]
        route_one_episode = [[] for _ in range(num_tasks)]

        if debug:
            print(time_end(begin, "step3"))
            begin = time_begin()

        # 1.4 Load checkpoints.
        if arglist.load_dir == "":
            arglist.load_dir = os.path.join(save_path, str(restore_model_number), "model.ckpt")
        if arglist.transfer_restore:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        for i in range(num_tasks):
            list_of_taskenv.append(make_env(arglist.scenario, reward_type=arglist.reward_type))
            critic_trainers = get_trainers(list_of_taskenv[i], "task_" + str(i + 1) + "_",
                                           num_adversaries, obs_shape_n, arglist,
                                           actors=actor_0, type=1, session=sess)
            actor_trainers = get_trainers(list_of_taskenv[i], "task_" + str(i + 1) + "_",
                                          num_adversaries, obs_shape_n, arglist,
                                          actor_env_name="actor_", type=2, session=sess)
            actor_list.append(actor_trainers)
            critic_list.append(critic_trainers)

        # Initialize.
        U.initialize()
        for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
            print(var)

        # 1.5 Initialize the environments.
        obs_n_list = []
        for i in range(num_tasks):
            obs_n = list_of_taskenv[i].reset()
            list_of_taskenv[i].set_map(sample_map(arglist.test_data_dir + arglist.test_data_name +
                                                  "_" + str(i + 1) + ".h5"))
            obs_n_list.append(obs_n)

        if debug:
            print(time_end(begin, "initialize"))
            begin = time_begin()

        # 2. Training.
        t_start = time.time()
        print('Starting iterations...')
        episode_start_time = time.time()
        state_dim = obs_shape_n[0][0]
        # One fixed-length observation window per agent and per task.
        history_n = [[queue.Queue(arglist.history_length) for _ in range(env.n)] for _ in range(num_tasks)]
        for i in range(num_tasks):
            for j in range(env.n):
                for _ in range(arglist.history_length):
                    history_n[i][j].put(obs_n_list[i][j])

        while True:
            for task_index in range(num_tasks):
                # 2.1 Step the environment and collect samples.
                current_env = list_of_taskenv[task_index]
                # Get actions from the shared actor using each agent's observation history.
                # action_n = [agent.action(obs) for agent, obs in zip(actor_0, obs_n_list[task_index])]
                action_n = [agent.action(obs) for agent, obs in zip(actor_0, history_n[task_index])]
                # Environment step.
                new_obs_n, rew_n, done_n, info_n = current_env.step(action_n)
                current_critics = critic_list[task_index]
                current_actors = actor_list[task_index]
                if debug:
                    print(time_end(begin, "env.step"))
                    begin = time_begin()
                local_steps[task_index] += 1   # update the local step counter
                global_steps[task_index] += 1  # update the global step counter
                done = all(done_n)
                terminal = (local_steps[task_index] >= arglist.max_episode_len)
                # Collect experience.
                for i in range(env.n):
                    current_critics[i].experience(obs_n_list[task_index][i], action_n[i], rew_n[i],
                                                  new_obs_n[i], done_n[i], terminal)
                # Update observations and the rolling histories.
                obs_n_list[task_index] = new_obs_n
                for i in range(env.n):
                    history_n[task_index][i].get()      # drop the oldest observation
                    history_n[task_index][i].put(new_obs_n[i])
                # Update rewards.
                for i, rew in enumerate(rew_n):
                    episodes_rewards[task_index][-1] += rew
                    agent_rewards[task_index][i][-1] += rew

                # 2.2 Update each task's critics and actors.
                for critic in current_critics:
                    critic.preupdate()
                for critic in current_critics:
                    critic.update(current_critics, global_steps[task_index])
                for index, actor in enumerate(current_actors):
                    actor.update(current_actors, current_critics, global_steps[task_index], index)
                if debug:
                    print(time_end(begin, "update actor"))
                    begin = time_begin()

                # 2.4 Record and update training statistics.
                # Energy.
                energy_one_episode[task_index].append(current_env.get_energy())
                # Fairness index.
                j_index_one_episode[task_index].append(current_env.get_jain_index())
                # Coverage.
                aver_cover_one_episode[task_index].append(current_env.get_aver_cover())
                # Out-of-map counter.
                over_map_counter[task_index] += current_env.get_over_map()
                over_map_one_episode[task_index].append(over_map_counter[task_index])
                # Disconnection counter.
                disconnected_number_counter[task_index] += current_env.get_dis()
                disconnected_number_one_episode[task_index].append(disconnected_number_counter[task_index])
                # Reward.
                episode_reward_step[task_index] += np.mean(rew_n)
                accmulated_reward_one_episode[task_index].append(episode_reward_step[task_index])
                route = current_env.get_agent_pos()
                route_one_episode[task_index].append(route)
                if debug:
                    print(time_end(begin, "others"))
                    begin = time_begin()

                episode_number = math.ceil(global_steps[task_index] / arglist.max_episode_len)
                if done or terminal:
                    model_name = save_path.split('/')[-2] + '/'
                    temp_efficiency = np.array(aver_cover_one_episode[task_index]) * \
                                      np.array(j_index_one_episode[task_index]) / \
                                      np.array(energy_one_episode[task_index])
                    draw_util.draw_single_episode(
                        arglist.pictures_dir_transfer_train + model_name + "single_episode_task_" + str(task_index) + "/",
                        episode_number, temp_efficiency,
                        aver_cover_one_episode[task_index],
                        j_index_one_episode[task_index],
                        energy_one_episode[task_index],
                        disconnected_number_one_episode[task_index],
                        over_map_one_episode[task_index],
                        accmulated_reward_one_episode[task_index])
                    # Record per-episode statistics.
                    energy_consumptions_for_test[task_index].append(energy_one_episode[task_index][-1])  # energy
                    j_index[task_index].append(j_index_one_episode[task_index][-1])  # fairness index
                    aver_cover[task_index].append(aver_cover_one_episode[task_index][-1])  # coverage
                    instantaneous_dis[task_index].append(disconnected_number_one_episode[task_index][-1])  # disconnections
                    instantaneous_out_the_map[task_index].append(over_map_one_episode[task_index][-1])  # out of the map
                    instantaneous_accmulated_reward[task_index].append(accmulated_reward_one_episode[task_index][-1])  # reward
                    energy_efficiency[task_index].append(aver_cover_one_episode[task_index][-1] *
                                                         j_index_one_episode[task_index][-1] /
                                                         energy_one_episode[task_index][-1])  # efficiency
                    episode_end_time = time.time()
                    episode_time = episode_end_time - episode_start_time
                    episode_start_time = episode_end_time
                    with open(arglist.pictures_dir_transfer_train + model_name + "task_" + str(task_index) +
                              '_train_info' + '.txt', 'a+') as f:
                        info = "Task index: %d, Episode number %d, energy consumption: %s, efficiency: %s, time: %s" % (
                            task_index, episode_number,
                            str(current_env.get_energy_origin()),
                            str(energy_efficiency[task_index][-1]),
                            str(round(episode_time, 3)))
                        f.write(info + "\n")
                        print(info)

                    # Reset the per-episode local variables. ---------------------------------------------------
                    if task_index == num_tasks - 1:
                        energy_one_episode = [[] for _ in range(num_tasks)]
                        j_index_one_episode = [[] for _ in range(num_tasks)]
                        aver_cover_one_episode = [[] for _ in range(num_tasks)]
                        over_map_counter = np.zeros(num_tasks)
                        over_map_one_episode = [[] for _ in range(num_tasks)]
                        disconnected_number_counter = np.zeros(num_tasks)
                        disconnected_number_one_episode = [[] for _ in range(num_tasks)]
                        episode_reward_step = np.zeros(num_tasks)
                        accmulated_reward_one_episode = [[] for _ in range(num_tasks)]
                        route_one_episode = [[] for _ in range(num_tasks)]
                    # Reset the environment and the local counter.
                    obs_n_list[task_index] = current_env.reset()
                    current_env.set_map(sample_map(arglist.test_data_dir + arglist.test_data_name +
                                                   "_" + str(task_index + 1) + ".h5"))
                    local_steps[task_index] = 0  # reset the local step counter
                    # Update global statistics.
                    episodes_rewards[task_index].append(0)  # start a new episode total
                    for reward in agent_rewards[task_index]:
                        reward.append(0)

                # Save the model and display training output.
                if terminal and (episode_number % arglist.save_rate == 0):
                    # tf.get_default_session().run(global_steps_assign_op, feed_dict={global_steps_ph: global_steps})
                    # save_dir_custom = os.path.join(save_path, str(episode_number), 'model.ckpt')
                    # U.save_state(save_dir_custom, saver=saver)
                    # The print statement depends on whether or not there are adversaries.
                    # Mean reward over the most recent save_rate episodes.
                    save_rate_mean_reward = np.mean(episodes_rewards[task_index][-arglist.save_rate:])
                    if num_adversaries == 0:
                        print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                            global_steps[task_index], episode_number, save_rate_mean_reward,
                            round(time.time() - t_start, 3)))
                    else:
                        print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                            global_steps[task_index], episode_number, save_rate_mean_reward,
                            [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards[task_index]],
                            round(time.time() - t_start, 3)))
                    t_start = time.time()
                    final_ep_rewards[task_index].append(save_rate_mean_reward)
                    for rew in agent_rewards[task_index]:
                        final_ep_ag_rewards[task_index].append(np.mean(rew[-arglist.save_rate:]))
                    # Save the training curves.
                    if arglist.draw_picture_train:
                        # model_name = save_path.split('/')[-2] + '/'
                        draw_util.draw_episodes(
                            episode_number,
                            arglist.pictures_dir_transfer_train + model_name + "all_episodes_task_" + str(task_index) + "/",
                            aver_cover[task_index], j_index[task_index],
                            energy_consumptions_for_test[task_index],
                            instantaneous_dis[task_index], instantaneous_out_the_map[task_index],
                            energy_efficiency[task_index], instantaneous_accmulated_reward[task_index],
                            len(aver_cover[task_index]))

                # Save final episode rewards for plotting training curves later.
                if episode_number > arglist.num_train_episodes:
                    mkdir(arglist.plots_dir)
                    rew_file_name = arglist.plots_dir + arglist.exp_name + str(task_index) + '_rewards.pkl'
                    with open(rew_file_name, 'wb') as fp:
                        pickle.dump(final_ep_rewards, fp)
                    agrew_file_name = arglist.plots_dir + arglist.exp_name + str(task_index) + '_agrewards.pkl'
                    with open(agrew_file_name, 'wb') as fp:
                        pickle.dump(final_ep_ag_rewards, fp)
                    print('...Finished total of {} episodes.'.format(episode_number))

            if episode_number > arglist.num_train_episodes:
                break
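

# The transfer-training loop above keeps a fixed-length observation window per agent
# with queue.Queue: pre-fill the queue with the first observation after reset, then
# get() the oldest entry and put() the newest one on every environment step. A runnable
# sketch of that pattern; the toy sizes (history length 4, observation dim 3) and the
# helper name are illustrative only.
import queue
import numpy as np


def _history_window_sketch():
    history_length, obs_dim = 4, 3
    window = queue.Queue(history_length)
    first_obs = np.zeros(obs_dim)
    for _ in range(history_length):
        window.put(first_obs)            # pre-fill, as done right after env.reset()
    new_obs = np.ones(obs_dim)
    window.get()                         # drop the oldest observation
    window.put(new_obs)                  # append the newest observation
    return np.stack(list(window.queue))  # stacked window, shape (history_length, obs_dim)


assert _history_window_sketch().shape == (4, 3)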
def train(arglist):
    debug = False
    num_tasks = arglist.num_task  # total number of tasks
    list_of_taskenv = []  # list of per-task environments
    save_path = arglist.save_dir
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    print("ok")
    with U.single_threaded_session():
        if debug:
            begin = time_begin()
        # 1. Initialization.
        # 1.1 Create one shared actor.
        env = make_env(arglist.scenario, arglist)
        env.set_map(sample_map(arglist.data_path + "_1.h5"))
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        policy = get_trainers(env, "pi_0_", num_adversaries, obs_shape_n, arglist, is_actor=True, acotr=None)
        # 1.2 Create a critic for each task.
        model_list = []  # critics for all tasks
        for i in range(num_tasks):
            # Create the environment for each task.
            list_of_taskenv.append(make_env(arglist.scenario, arglist))
            trainers = get_trainers(list_of_taskenv[i], "task_" + str(i + 1) + "_",
                                    num_adversaries, obs_shape_n, arglist,
                                    is_actor=False, acotr=policy)
            model_list.append(trainers)
        # 1.3 Create p_train.
        for task_index in range(num_tasks):
            for actor, critic in zip(policy, model_list[task_index]):
                actor.add_p(critic.name)
                critic.p = actor.p_train
        print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy))

        # 1.4 Initialize global statistics.
        episodes_rewards = [[0.0] for _ in range(num_tasks)]  # each element is the sum of all agents' rewards in one episode
        # Each element of agent_rewards[i] records the sum of a single agent's rewards in one episode.
        agent_rewards = [[[0.0] for _ in range(env.n)] for _ in range(num_tasks)]
        final_ep_rewards = [[] for _ in range(num_tasks)]  # sum of rewards for training curve
        final_ep_ag_rewards = [[] for _ in range(num_tasks)]  # agent rewards for training curve
        energy_consumptions_for_test = [[] for _ in range(num_tasks)]
        j_index = [[] for _ in range(num_tasks)]
        aver_cover = [[] for _ in range(num_tasks)]
        instantaneous_dis = [[] for _ in range(num_tasks)]
        instantaneous_out_the_map = [[] for _ in range(num_tasks)]
        energy_efficiency = [[] for _ in range(num_tasks)]
        instantaneous_accmulated_reward = [[] for _ in range(num_tasks)]

        global_steps_tensor = tf.Variable(tf.zeros(num_tasks), trainable=False)  # global timesteps for each env
        global_steps_ph = tf.placeholder(tf.float32, [num_tasks])
        global_steps_assign_op = tf.assign(global_steps_tensor, global_steps_ph)

        model_number = int(arglist.num_episodes / arglist.save_rate)
        saver = tf.train.Saver(max_to_keep=model_number)

        efficiency_list = []
        for i in range(num_tasks):
            efficiency_list.append(tf.placeholder(tf.float32, shape=None, name="efficiency_placeholder" + str(i)))
        efficiency_summary_list = []
        for i in range(num_tasks):
            efficiency_summary_list.append(tf.summary.scalar("efficiency_%s" % i, efficiency_list[i]))
        writer = tf.summary.FileWriter("../summary/efficiency")

        # 1.5 Initialize per-episode (local) statistics.
        local_steps = np.zeros(num_tasks)  # local timesteps for each env
        t_start = time.time()
        energy_one_episode = [[] for _ in range(num_tasks)]
        j_index_one_episode = [[] for _ in range(num_tasks)]
        aver_cover_one_episode = [[] for _ in range(num_tasks)]
        over_map_counter = np.zeros(num_tasks)
        over_map_one_episode = [[] for _ in range(num_tasks)]
        disconnected_number_counter = np.zeros(num_tasks)
        disconnected_number_one_episode = [[] for _ in range(num_tasks)]
        episode_reward_step = np.zeros(num_tasks)  # accumulates the per-step mean reward over all agents within one episode
        accmulated_reward_one_episode = [[] for _ in range(num_tasks)]
        route_one_episode = [[] for _ in range(num_tasks)]

        U.initialize()

        # 1.6 Create the checkpoint directory, or restore from it.
        if arglist.load_dir == "":
            arglist.load_dir = save_path
        if arglist.display or arglist.restore or arglist.benchmark:
            file_list = []
            for f in os.listdir(arglist.load_dir):
                if os.path.isdir(os.path.join(arglist.save_dir, f)):
                    file_list.append(f)
            file_list.sort(key=lambda fn: os.path.getmtime(arglist.load_dir + "/" + fn))
            if len(file_list) > num_tasks:
                load_dir = os.path.join(arglist.load_dir, file_list[-1], "model.ckpt")
                U.load_state(load_dir)
                print('Loading previous state...')

        global_steps = tf.get_default_session().run(global_steps_tensor)

        # 1.7 Initialize the environments.
        obs_n_list = []
        for i in range(num_tasks):
            obs_n = list_of_taskenv[i].reset()
            list_of_taskenv[i].set_map(sample_map(arglist.data_path + "_" + str(i + 1) + ".h5"))
            obs_n_list.append(obs_n)

        # 1.8 Build the observation history sequences that feed the recurrent MADDPG input.
        history_n = [[] for _ in range(num_tasks)]
        for i in range(num_tasks):
            for j in range(len(obs_n_list[i])):
                # Build a history of length history_length for each agent's observations.
                history = History(arglist, [obs_shape_n[j][0]])
                history_n[i].append(history)
                for _ in range(arglist.history_length):
                    history_n[i][j].add(obs_n_list[i][j])

        if debug:
            print(time_end(begin, "initialize"))
            begin = time_begin()

        # 2. Training.
        print('Starting iterations...')
        episode_start_time = time.time()
        state_dim = obs_shape_n[0][0]
        while True:
            for task_index in range(num_tasks):
                current_env = list_of_taskenv[task_index]
                action_n = []
                # Query the shared policy with each agent's stacked observation history.
                for agent, his in zip(policy, history_n[task_index]):
                    hiss = his.obtain().reshape(1, state_dim, arglist.history_length)  # [1, state_dim, length]
                    action = agent.action([hiss], [1])
                    action_n.append(action[0])
                if debug:
                    print(time_end(begin, "action2"))
                    begin = time_begin()
                new_obs_n, rew_n, done_n = current_env.step(action_n)
                current_critics = model_list[task_index]
                if debug:
                    print(time_end(begin, "env.step"))
                    begin = time_begin()
                local_steps[task_index] += 1   # update the local step counter
                global_steps[task_index] += 1  # update the global step counter
                done = all(done_n)
                terminal = (local_steps[task_index] >= arglist.max_episode_len)
                # Collect experience.
                for i in range(env.n):
                    current_critics[i].experience(obs_n_list[task_index][i], action_n[i], rew_n[i],
                                                  done_n[i], terminal)
                    policy[i].experience(obs_n_list[task_index][i], action_n[i], rew_n[i],
                                         done_n[i], terminal)
                # Update observations and keep each agent's rolling history current.
                obs_n_list[task_index] = new_obs_n
                for i in range(env.n):
                    history_n[task_index][i].add(new_obs_n[i])
                if debug:
                    print(time_end(begin, "experience"))
                    begin = time_begin()

                # 2.2 Update each task's critics.
                for i, rew in enumerate(rew_n):
                    episodes_rewards[task_index][-1] += rew
                    agent_rewards[task_index][i][-1] += rew
                for critic in current_critics:
                    critic.preupdate()
                for critic in current_critics:
                    critic.update(current_critics, global_steps[task_index])
                if debug:
                    print(time_end(begin, "update critic"))
                    begin = time_begin()

                # 2.3 Update the shared actor.
                # policy_step += 1
                # print("policy steps: ", policy_step)
                for actor, critic in zip(policy, current_critics):
                    actor.change_p(critic.p)
                    actor.update(policy, global_steps[task_index])
                if debug:
                    print(time_end(begin, "update actor"))
                    begin = time_begin()

                # 2.4 Record and update training statistics.
                # Energy.
                energy_one_episode[task_index].append(current_env.get_energy())
                # Fairness index.
                j_index_one_episode[task_index].append(current_env.get_jain_index())
                # Coverage.
                aver_cover_one_episode[task_index].append(current_env.get_aver_cover())
                # Out-of-map counter.
                over_map_counter[task_index] += current_env.get_over_map()
                over_map_one_episode[task_index].append(over_map_counter[task_index])
                # Disconnection counter.
                disconnected_number_counter[task_index] += current_env.get_dis()
                disconnected_number_one_episode[task_index].append(disconnected_number_counter[task_index])
                # Reward.
                episode_reward_step[task_index] += np.mean(rew_n)
                accmulated_reward_one_episode[task_index].append(episode_reward_step[task_index])
                route = current_env.get_agent_pos()
                route_one_episode[task_index].append(route)
                if debug:
                    print(time_end(begin, "others"))
                    begin = time_begin()

                episode_number = math.ceil(global_steps[task_index] / arglist.max_episode_len)
                if done or terminal:
                    model_name = save_path.split('/')[-2] + '/'
                    temp_efficiency = np.array(aver_cover_one_episode[task_index]) * \
                                      np.array(j_index_one_episode[task_index]) / \
                                      np.array(energy_one_episode[task_index])
                    draw_util.draw_single_episode(
                        arglist.pictures_dir_train + model_name + "single_episode_task_" + str(task_index) + "/",
                        episode_number, temp_efficiency,
                        aver_cover_one_episode[task_index],
                        j_index_one_episode[task_index],
                        energy_one_episode[task_index],
                        disconnected_number_one_episode[task_index],
                        over_map_one_episode[task_index],
                        accmulated_reward_one_episode[task_index])
                    # Record per-episode statistics.
                    energy_consumptions_for_test[task_index].append(energy_one_episode[task_index][-1])  # energy
                    j_index[task_index].append(j_index_one_episode[task_index][-1])  # fairness index
                    aver_cover[task_index].append(aver_cover_one_episode[task_index][-1])  # coverage
                    instantaneous_dis[task_index].append(disconnected_number_one_episode[task_index][-1])  # disconnections
                    instantaneous_out_the_map[task_index].append(over_map_one_episode[task_index][-1])  # out of the map
                    instantaneous_accmulated_reward[task_index].append(accmulated_reward_one_episode[task_index][-1])  # reward
                    energy_efficiency[task_index].append(aver_cover_one_episode[task_index][-1] *
                                                         j_index_one_episode[task_index][-1] /
                                                         energy_one_episode[task_index][-1])  # efficiency
                    episode_end_time = time.time()
                    episode_time = episode_end_time - episode_start_time
                    episode_start_time = episode_end_time
                    print('Task %d, Episode: %d - energy_consumptions: %s, efficiency: %s, time %s' %
                          (task_index, episode_number, str(current_env.get_energy_origin()),
                           str(energy_efficiency[task_index][-1]), str(round(episode_time, 3))))
                    # Log the efficiency curve to TensorBoard.
                    efficiency_s = tf.get_default_session().run(
                        efficiency_summary_list[task_index],
                        feed_dict={efficiency_list[task_index]: energy_efficiency[task_index][-1]})
                    writer.add_summary(efficiency_s, global_step=episode_number)

                    # Reset the per-episode local variables. ---------------------------------------------------
                    if task_index == num_tasks - 1:
                        energy_one_episode = [[] for _ in range(num_tasks)]
                        j_index_one_episode = [[] for _ in range(num_tasks)]
                        aver_cover_one_episode = [[] for _ in range(num_tasks)]
                        over_map_counter = np.zeros(num_tasks)
                        over_map_one_episode = [[] for _ in range(num_tasks)]
                        disconnected_number_counter = np.zeros(num_tasks)
                        disconnected_number_one_episode = [[] for _ in range(num_tasks)]
                        episode_reward_step = np.zeros(num_tasks)
                        accmulated_reward_one_episode = [[] for _ in range(num_tasks)]
                        route_one_episode = [[] for _ in range(num_tasks)]
                    # Reset the environment and the local counter.
                    obs_n_list[task_index] = current_env.reset()
                    current_env.set_map(sample_map(arglist.data_path + "_" + str(task_index + 1) + ".h5"))
                    local_steps[task_index] = 0  # reset the local step counter
                    # Update global statistics.
                    episodes_rewards[task_index].append(0)  # start a new episode total
                    for reward in agent_rewards[task_index]:
                        reward.append(0)

                # Save the model and display training output.
                if terminal and (episode_number % arglist.save_rate == 0):
                    tf.get_default_session().run(global_steps_assign_op,
                                                 feed_dict={global_steps_ph: global_steps})
                    save_dir_custom = save_path + str(episode_number) + '/model.ckpt'
                    U.save_state(save_dir_custom, saver=saver)
                    # The print statement depends on whether or not there are adversaries.
                    # Mean reward over the most recent save_rate episodes.
                    save_rate_mean_reward = np.mean(episodes_rewards[task_index][-arglist.save_rate:])
                    if num_adversaries == 0:
                        print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                            global_steps[task_index], episode_number, save_rate_mean_reward,
                            round(time.time() - t_start, 3)))
                    else:
                        print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                            global_steps[task_index], episode_number, save_rate_mean_reward,
                            [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards[task_index]],
                            round(time.time() - t_start, 3)))
                    t_start = time.time()
                    final_ep_rewards[task_index].append(save_rate_mean_reward)
                    for rew in agent_rewards[task_index]:
                        final_ep_ag_rewards[task_index].append(np.mean(rew[-arglist.save_rate:]))
                    # Save the training curves.
                    if arglist.draw_picture_train:
                        # model_name = save_path.split('/')[-2] + '/'
                        draw_util.draw_episodes(
                            episode_number,
                            arglist.pictures_dir_train + model_name + "all_episodes_task_" + str(task_index) + "/",
                            aver_cover[task_index], j_index[task_index],
                            energy_consumptions_for_test[task_index],
                            instantaneous_dis[task_index], instantaneous_out_the_map[task_index],
                            energy_efficiency[task_index], instantaneous_accmulated_reward[task_index],
                            len(aver_cover[task_index]))

                # Save final episode rewards for plotting training curves later.
                if episode_number > arglist.num_episodes:
                    rew_file_name = arglist.plots_dir + arglist.exp_name + str(task_index) + '_rewards.pkl'
                    with open(rew_file_name, 'wb') as fp:
                        pickle.dump(final_ep_rewards, fp)
                    agrew_file_name = arglist.plots_dir + arglist.exp_name + str(task_index) + '_agrewards.pkl'
                    with open(agrew_file_name, 'wb') as fp:
                        pickle.dump(final_ep_ag_rewards, fp)
                    print('...Finished total of {} episodes.'.format(episode_number))

            if episode_number > arglist.num_episodes:
                break
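

# The loop above relies on a History buffer with add()/obtain() whose output can be
# reshaped to [1, state_dim, history_length]. That class is not defined in this file;
# the sketch below is an assumption consistent with the call sites above, not the
# project's implementation (hence the hypothetical name _HistorySketch).
import numpy as np


class _HistorySketch(object):
    def __init__(self, history_length, obs_shape):
        self.length = history_length
        # Rows are time steps (oldest first); columns are observation features.
        self.buffer = np.zeros([history_length] + list(obs_shape), dtype=np.float32)

    def add(self, obs):
        self.buffer[:-1] = self.buffer[1:]  # shift the window one step back in time
        self.buffer[-1] = obs               # the newest observation goes last

    def obtain(self):
        # Transposed to (state_dim, history_length) so that
        # obtain().reshape(1, state_dim, history_length) matches the usage above.
        return self.buffer.T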
def train(arglist):
    debug = False
    arglist.save_dir = arglist.save_dir + "_batch_size_" + str(arglist.batch_size) + \
                       "_buffer_size_" + str(arglist.buffer_size)
    with U.single_threaded_session():
        if debug:
            begin = time_begin()
        # Create environment.
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        if debug:
            print(time_end(begin, "step 0"))
            begin = time_begin()
        # Create agent trainers.
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        if debug:
            print(time_end(begin, "step 1"))
            begin = time_begin()
        trainers = get_trainers(env, "task_", num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy))
        if debug:
            print(time_end(begin, "step2"))
            begin = time_begin()

        efficiency = tf.placeholder(tf.float32, shape=None, name="efficiency_placeholder")
        efficiency_summary = tf.summary.scalar("efficiency", efficiency)
        p_losses_ph = tf.placeholder(tf.float32, shape=[env.n], name="p_loss")
        p_losses_summary = tf.summary.histogram("loss", p_losses_ph)
        q_losses_ph = tf.placeholder(tf.float32, shape=[env.n], name="q_loss")
        q_losses_summary = tf.summary.histogram("loss", q_losses_ph)
        loss_summary = tf.summary.merge([q_losses_summary, p_losses_summary], name="loss")
        writer = tf.summary.FileWriter("../summary/efficiency")
        writer2 = tf.summary.FileWriter("../summary/loss")

        # Initialize.
        U.initialize()
        for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
            print(var)
        if debug:
            print(time_end(begin, "step3"))
            begin = time_begin()

        # Load previous results, if necessary.
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)
        if not os.path.exists(arglist.save_dir):
            os.makedirs(arglist.save_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info

        model_number = int(arglist.num_episodes / arglist.save_rate)
        saver = tf.train.Saver(max_to_keep=model_number)
        episode_step = 0
        train_step = 0
        t_start = time.time()

        # Custom statistics variables. --------------------------------------------------------------------------
        loss_all = []
        aver_cover = []
        j_index = []
        instantaneous_accmulated_reward = []
        instantaneous_dis = []
        instantaneous_out_the_map = []
        # q_value = []
        energy_consumptions_for_test = []
        bl_coverage = 0.8
        bl_jainindex = 0.8
        bl_loss = 100
        energy_efficiency = []
        over_map_counter = 0
        over_map_one_episode = []
        aver_cover_one_episode = []
        j_index_one_episode = []
        disconnected_number_counter = 0
        disconnected_number_one_episode = []
        accmulated_reward_one_episode = []
        actions = []
        energy_one_episode = []
        route = []
        obs_n = env.reset()
        episode_reward_step = 0

        model_name = arglist.load_dir.split('/')[-3] + '/' + arglist.load_dir.split('/')[-2] + '/'
        if FLAGS.greedy_action:
            model_name = model_name + 'greedy/'
        elif FLAGS.random_action:
            model_name = model_name + 'random/'

        # if debug:
        #     print(time_end(begin, "initialize"))
        #     begin = time_begin()

        print('Starting iterations...')
        episode_begin_time = time.time()
        while True:
            # Get actions.
            action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]
            # Environment step.
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # Collect experience.
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i], terminal)
            obs_n = new_obs_n
            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            # Update the per-step custom statistics within the episode. ------------------------------------------
            episode_reward_step += np.mean(rew_n)
            j_index_one_episode.append(env.get_jain_index())
            over_map_counter += env.get_over_map()
            over_map_one_episode.append(over_map_counter)
            disconnected_number_counter += env.get_dis()
            disconnected_number_one_episode.append(disconnected_number_counter)
            aver_cover_one_episode.append(env.get_aver_cover())
            energy_one_episode.append(env.get_energy())
            s_route = env.get_state()
            for route_i in range(0, FLAGS.num_uav * 2, 2):
                tmp = [s_route[route_i], s_route[route_i + 1]]
                route.append(tmp)
            accmulated_reward_one_episode.append(episode_reward_step)

            # if debug:
            #     print(time_end(begin, "others"))
            #     begin = time_begin()

            if done or terminal:
                model_name = arglist.save_dir.split('/')[-1] + '/'
                episode_number = int(train_step / arglist.max_episode_len)
                temp_efficiency = np.array(aver_cover_one_episode) * np.array(j_index_one_episode) / \
                                  np.array(energy_one_episode)
                draw_util.draw_single_episode(
                    arglist.pictures_dir_train + model_name + "single_episode/",
                    episode_number, temp_efficiency, aver_cover_one_episode,
                    j_index_one_episode, energy_one_episode,
                    disconnected_number_one_episode, over_map_one_episode,
                    accmulated_reward_one_episode)
                # Record per-episode statistics. ------------------------------------------------------------------
                instantaneous_accmulated_reward.append(accmulated_reward_one_episode[-1])
                j_index.append(j_index_one_episode[-1])
                instantaneous_dis.append(disconnected_number_one_episode[-1])
                instantaneous_out_the_map.append(over_map_one_episode[-1])
                aver_cover.append(aver_cover_one_episode[-1])
                energy_consumptions_for_test.append(energy_one_episode[-1])
                energy_efficiency.append(aver_cover_one_episode[-1] * j_index_one_episode[-1] /
                                         energy_one_episode[-1])
                episode_end_time = time.time()
                # Log the efficiency curve to TensorBoard.
                efficiency_s = tf.get_default_session().run(
                    efficiency_summary,
                    feed_dict={efficiency: energy_efficiency[episode_number]})
                writer.add_summary(efficiency_s, global_step=episode_number)
                print('Episode: %d - energy_consumptions: %s, efficiency: %s, time %s' %
                      (train_step / arglist.max_episode_len, str(env.get_energy_origin()),
                       str(energy_efficiency[-1]), str(round(episode_end_time - episode_begin_time, 3))))
                episode_begin_time = episode_end_time
                # Draw a picture of this episode if it beats the baselines.
                if arglist.draw_picture_test and aver_cover[-1] >= bl_coverage and j_index[-1] >= bl_jainindex \
                        and instantaneous_dis[-1] <= bl_loss:
                    episode_number_name = 'episode_' + str(episode_number)
                    draw_util.draw(episode_number_name, arglist.pictures_dir_test + model_name,
                                   energy_one_episode, route, actions, aver_cover_one_episode,
                                   j_index_one_episode, accmulated_reward_one_episode,
                                   disconnected_number_one_episode, over_map_one_episode,
                                   arglist.max_episode_len)
                j_index_one_episode = []
                over_map_counter = 0
                over_map_one_episode = []
                disconnected_number_counter = 0
                disconnected_number_one_episode = []
                aver_cover_one_episode = []
                energy_one_episode = []
                route = []
                episode_reward_step = 0
                accmulated_reward_one_episode = []
                if arglist.draw_picture_test:
                    if len(episode_rewards) % arglist.save_rate == 0:
                        episode_number_name = train_step / arglist.max_episode_len
                        draw_util.drawTest(episode_number_name,
                                           arglist.pictures_dir_train + model_name,
                                           energy_consumptions_for_test, aver_cover, j_index,
                                           instantaneous_accmulated_reward, instantaneous_dis,
                                           instantaneous_out_the_map, len(aver_cover),
                                           bl_coverage, bl_jainindex, bl_loss,
                                           energy_efficiency, False)
                # Reset the custom statistics between episodes. ----------------------------------------------------
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # Increment the global step counter.
            train_step += 1

            # For benchmarking learned policies.
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # For displaying learned policies.
            if arglist.draw_picture_test:
                if len(episode_rewards) > arglist.num_episodes:
                    break
                continue
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue

            # Update all trainers, if not in display or benchmark mode.
            p_loss_list = []
            q_loss_list = []
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                temp = agent.update(trainers, train_step)
                if temp is not None:
                    p_loss_list.append(temp[1])
                    q_loss_list.append(temp[0])
            if len(p_loss_list) == env.n:
                loss_s = tf.get_default_session().run(loss_summary,
                                                      feed_dict={p_losses_ph: p_loss_list,
                                                                 q_losses_ph: q_loss_list})
                writer2.add_summary(loss_s, global_step=train_step)

            # Save the model and display training output.
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                episode_number_name = train_step / arglist.max_episode_len
                save_dir_custom = arglist.save_dir + "/" + str(episode_number_name) + '/'  # save_dir
                U.save_state(save_dir_custom, saver=saver)
                # The print statement depends on whether or not there are adversaries.
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        round(time.time() - t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards],
                        round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode rewards.
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))
                # Draw the custom statistics when the model is saved. ------------------------------------------
                if arglist.draw_picture_train:
                    episode_number_name = train_step / arglist.max_episode_len
                    model_name = arglist.save_dir.split('/')[-1] + '/'
                    draw_util.draw_episodes(
                        episode_number_name,
                        arglist.pictures_dir_train + model_name + "all_episodes/",
                        aver_cover, j_index, energy_consumptions_for_test,
                        instantaneous_dis, instantaneous_out_the_map,
                        energy_efficiency, instantaneous_accmulated_reward,
                        len(aver_cover))

            # Save final episode rewards for plotting training curves later.
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                break
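

# env.set_map(sample_map(...)) in the training functions above loads a task map from an
# *.h5 file. The loader itself is not part of this file; the sketch below shows what it
# is assumed to do with h5py. The dataset key "data", the random-pick behaviour, and the
# helper name are assumptions, not the project's actual implementation.
import h5py
import numpy as np


def _sample_map_sketch(path, key="data"):
    with h5py.File(path, "r") as f:
        maps = np.asarray(f[key])
    if maps.ndim > 2:
        # If the file stores a batch of candidate maps, return one of them at random.
        return maps[np.random.randint(maps.shape[0])]
    return maps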