# Assumed common imports for the train-loop variants in this file:
#   import os, sys, time, math, pickle, random, copy, collections, datetime
#   import numpy as np, tensorflow as tf, pandas as pd
#   import maddpg.common.tf_util as U
def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()

        # if not (arglist.display or arglist.restore or arglist.benchmark):
        #     U.save_state(arglist.save_dir, saver=saver)
        #     print("Saved first checkpoint")

        current_game_experiences = []
        t0 = time.time()
        print('Starting iterations...')
        while True:
            # pull in transitions produced elsewhere and feed each agent its slice
            new_experiences = load_new_experiences()
            for exp in new_experiences:
                obs_n, action_n, rew_n, new_obs_n, done_n, terminal = exp
                for i, agent in enumerate(trainers):
                    agent.experience(obs_n[i], action_n[i], rew_n[i],
                                     new_obs_n[i], done_n[i], terminal)

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)

            U.save_state(arglist.save_dir, saver=saver)
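# This learner-only variant polls for transitions produced by separate actor
# processes. `load_new_experiences` is not defined in the snippet; below is a
# minimal hedged sketch, assuming actors drop pickled lists of
# (obs_n, action_n, rew_n, new_obs_n, done_n, terminal) tuples into a
# directory -- the file layout and names here are assumptions, not the fork's
# actual protocol.
import glob
import os
import pickle

def load_new_experiences(exp_dir="experiences"):
    """Load and remove any experience files actors have written so far."""
    experiences = []
    for path in sorted(glob.glob(os.path.join(exp_dir, "*.pkl"))):
        with open(path, "rb") as fp:
            experiences.extend(pickle.load(fp))
        os.remove(path)  # consume the file so it is not replayed
    return experiences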
def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create experience buffer
        replay_buffer = ReplayBuffer(arglist.num_episodes * arglist.max_episode_len
                                     if arglist.benchmark and arglist.save_replay else 1e6)
        min_replay_buffer_len = arglist.batch_size * arglist.max_episode_len
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()

        # pick random agent from ensemble for each episode
        if arglist.ensemble_choice == 'episode':
            agent_ids = np.random.randint(arglist.ensemble_size, size=len(trainers))
            agents = [trainers[i][agent_id] for i, agent_id in enumerate(agent_ids)]

        print('Starting iterations...')
        while True:
            # pick random agent from ensemble for each timestep
            if arglist.ensemble_choice == 'timestep':
                agent_ids = np.random.randint(arglist.ensemble_size, size=len(trainers))
                agents = [trainers[i][agent_id] for i, agent_id in enumerate(agent_ids)]
            # get action
            action_n = [agent.action(obs) for agent, obs in zip(agents, obs_n)]
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            replay_buffer.add(obs_n, action_n, rew_n, new_obs_n, done_n, agent_ids)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew
            for i, info in enumerate(info_n):
                agent_info[-1][i].append(info_n['n'])

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()

            if done or terminal:
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])
                # pick random agent from ensemble for each episode
                if arglist.ensemble_choice == 'episode':
                    agent_ids = np.random.randint(arglist.ensemble_size, size=len(trainers))
                    agents = [trainers[i][agent_id] for i, agent_id in enumerate(agent_ids)]

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                if train_step >= arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                        if arglist.save_replay:
                            pickle.dump(replay_buffer._storage, fp)
                    break
                continue

            # update all trainers, if not in display or benchmark mode;
            # only update every 100 steps and if replay buffer is large enough
            if train_step % 100 == 0 and len(replay_buffer) >= min_replay_buffer_len:
                for i, ensemble in enumerate(trainers):
                    for agent in ensemble:
                        # sample a different batch for each agent in the ensemble
                        (batch_obs_n, batch_act_n, batch_rew_n, batch_obs_next_n,
                         batch_done_n, batch_agent_ids) = replay_buffer.sample(arglist.batch_size)
                        batch_obs_n = [batch_obs_n[:, j] for j in range(batch_obs_n.shape[1])]
                        batch_act_n = [batch_act_n[:, j] for j in range(batch_act_n.shape[1])]
                        batch_obs_next_n = [batch_obs_next_n[:, j] for j in range(batch_obs_next_n.shape[1])]
                        # choose random agent from ensemble for target action
                        batch_agents = [random.choice(ensemble) for ensemble in trainers]
                        loss = agent.update(batch_agents, batch_obs_n, batch_act_n,
                                            batch_rew_n[:, i], batch_obs_next_n, batch_done_n[:, i])

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        round(time.time() - t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards],
                        round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                break
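# In this ensemble variant a single shared buffer stores whole joint
# transitions rather than per-agent ones. Below is a minimal sketch of the
# sample() contract the update block assumes (stacked numpy arrays indexed
# [batch, agent, ...]); the fork's real ReplayBuffer may differ.
import numpy as np

class JointReplayBufferSketch:
    def __init__(self, size):
        self._storage, self._maxsize, self._next_idx = [], int(size), 0

    def __len__(self):
        return len(self._storage)

    def add(self, obs_n, action_n, rew_n, new_obs_n, done_n, agent_ids):
        data = (obs_n, action_n, rew_n, new_obs_n, done_n, agent_ids)
        if self._next_idx >= len(self._storage):
            self._storage.append(data)
        else:
            self._storage[self._next_idx] = data
        self._next_idx = (self._next_idx + 1) % self._maxsize

    def sample(self, batch_size):
        idxs = np.random.randint(len(self._storage), size=batch_size)
        cols = list(zip(*[self._storage[i] for i in idxs]))
        # each column becomes an array of shape [batch, n_agents, ...]
        return [np.array(col) for col in cols]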
# Loop-tail fragment (its enclosing train() is not part of this snippet):
# the display skip, trainer updates, and checkpoint/print logic shared by
# most of the variants in this file.
# pdb.set_trace()
if arglist.display:
    continue

# update all trainers, if not in display or benchmark mode
loss = None
for agent in trainers:
    agent.preupdate()
for agent in trainers:
    loss = agent.update(trainers, train_step)

# save model, display training output
if terminal and (len(episode_rewards) % arglist.save_rate == 0):
    U.save_state(arglist.save_dir, saver=saver)
    # print statement depends on whether or not there are adversaries
    if num_adversaries == 0:
        print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
            train_step, len(episode_rewards),
            np.mean(episode_rewards[-arglist.save_rate:]),
            round(time.time() - t_start, 3)))
    else:
        print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
            train_step, len(episode_rewards),
            np.mean(episode_rewards[-arglist.save_rate:]),
            [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards],
            round(time.time() - t_start, 3)))
    t_start = time.time()
    # Keep track of final episode reward
    final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
    for rew in agent_rewards:
        final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))
def train(arglist, extra_args=None):
    tf_graph = tf.Graph()
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)
    tf_config.gpu_options.allow_growth = True
    with tf.Session(graph=tf_graph, config=tf_config):
        # Create environment
        env = make_env(arglist.scenario, arglist)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        if arglist.num_adversaries is None:
            arglist.num_adversaries = len([
                agent for agent in env.agents
                if (hasattr(agent, "adversary") and agent.adversary)
            ])
        arglist.num_adversaries = min(env.n, arglist.num_adversaries)
        num_adversaries = arglist.num_adversaries
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()
        if os.environ.get("OUTPUT_GRAPH"):
            tf.summary.FileWriter(os.path.join(logger.get_dir(), "tb"),
                                  U.get_session().graph)

        # Load previous results, if necessary
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver(max_to_keep=None)
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()

        print('Starting iterations...')
        while True:
            # get action
            action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]
            # print("[action] " + ", ".join(["agent {i}: {action}".format(i=i, action=list(action_n[i])) for i in range(len(action_n))]))
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i],
                                 new_obs_n[i], done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                if arglist.save_render_images:
                    # stitch the per-step frames of this episode into a video,
                    # then delete the frames
                    input_file_name = os.path.join(
                        arglist.render_dir,
                        "image-episode_{}-step_%d.png".format(len(episode_rewards)))
                    output_file_name = os.path.join(
                        arglist.render_dir,
                        "video-episode_{}.mp4".format(len(episode_rewards)))
                    command = "ffmpeg -y -r 10 -i {} {}".format(input_file_name, output_file_name)
                    os.system(command)
                    print("Saved render video at {}".format(output_file_name))
                    for episode_step_ in range(episode_step):
                        file_name = os.path.join(
                            arglist.render_dir,
                            "image-episode_{}-step_{}.png".format(len(episode_rewards), episode_step_))
                        if os.path.exists(file_name):
                            os.remove(file_name)
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = os.path.join(arglist.benchmark_dir, 'benchmark.pkl')
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                if arglist.save_render_images:
                    images = env.render(mode="rgb_array")
                    image = images[0]
                    file_name = os.path.join(
                        arglist.render_dir,
                        "image-episode_{}-step_{}.png".format(len(episode_rewards), episode_step))
                    plt.imsave(file_name, image)
                    print("Saved render image at {}".format(file_name))
                else:
                    env.render(mode="human")
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)

            # save model
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(os.path.join(
                    arglist.save_dir,
                    "checkpoint-episode_{}".format(len(episode_rewards))),
                    saver=saver)

            # print training scalars
            if terminal and ((len(episode_rewards) % arglist.print_rate == 0)
                             or (len(episode_rewards) % arglist.save_rate == 0)):
                # print statement depends on whether or not there are adversaries
                logger.log("Time: {}".format(
                    datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
                logger.logkv("steps", train_step)
                logger.logkv("episodes", len(episode_rewards))
                logger.logkv("mean_episode_reward",
                             np.mean(episode_rewards[-arglist.save_rate:]))
                if num_adversaries == 0:
                    # print("[{}] steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                    #     time.strftime("%Y-%m-%dT%H:%M:%S", time.localtime()),
                    #     train_step, len(episode_rewards),
                    #     np.mean(episode_rewards[-arglist.save_rate:]), round(time.time()-t_start, 3)))
                    pass
                else:
                    for agent_index in range(len(agent_rewards)):
                        logger.logkv(
                            "agent_{}_episode_reward".format(agent_index),
                            np.mean(agent_rewards[agent_index][-arglist.save_rate:]))
                    # print("[{}] steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                    #     time.strftime("%Y-%m-%dT%H:%M:%S", time.localtime()),
                    #     train_step, len(episode_rewards), np.mean(episode_rewards[-arglist.save_rate:]),
                    #     [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards], round(time.time()-t_start, 3)))
                logger.logkv("time", round(time.time() - t_start, 3))
                logger.dumpkvs()
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = os.path.join(arglist.plots_dir, 'rewards.pkl')
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = os.path.join(arglist.plots_dir, 'average_rewards.pkl')
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                break
def train(arglist):
    with U.single_threaded_session() as sess:
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()
        summary_writer = tf.summary.FileWriter(arglist.summary_dir, sess.graph)
        summary_placeholders, update_ops, summary_op = setup_summary()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)
        # saver.restore(sess, "/home/sugon/Peixian/maddpg_peixian/maddpg/experiments/tmp/policy/simple_comm_-4166440")
        # print("successfully restored")

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver(max_to_keep=3)
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()
        adversary_rewards = 0.0
        goodagent_rewards = 0.0
        print('Starting iterations...')
        while True:
            # input('...')
            # get action
            action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i],
                                 new_obs_n[i], done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                # print(i, ":", rew_n[i])
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew
                if i < num_adversaries:
                    adversary_rewards += rew
                else:
                    goodagent_rewards += rew

            if done or terminal:
                if done:
                    print("*" * 20)
                    print("done:", episode_step)
                # write per-episode scalars to TensorBoard
                stats = [adversary_rewards, episode_step, goodagent_rewards]
                for i in range(len(stats)):
                    sess.run(update_ops[i],
                             feed_dict={summary_placeholders[i]: float(stats[i])})
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, len(episode_rewards) + 1)
                obs_n = env.reset()
                episode_step = 0
                adversary_rewards = 0.0
                goodagent_rewards = 0.0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)

            # save model, display training output
            if (done or terminal) and (len(episode_rewards) % arglist.save_rate == 0):
                # this fork's save_state also takes the global step for checkpoint numbering
                U.save_state(arglist.save_dir, train_step, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        round(time.time() - t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards],
                        round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                break
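# `setup_summary` is referenced above but not defined in the snippet. Below
# is a minimal sketch in the usual TF1 pattern, assuming three scalar
# summaries fed through placeholders via assign ops; the fork's actual tags
# and ordering may differ.
import tensorflow as tf

def setup_summary():
    names = ["adversary_rewards", "episode_steps", "goodagent_rewards"]
    summary_vars = [tf.Variable(0.0, trainable=False, name=n) for n in names]
    for name, var in zip(names, summary_vars):
        tf.summary.scalar(name, var)
    summary_placeholders = [tf.placeholder(tf.float32) for _ in summary_vars]
    update_ops = [var.assign(ph) for var, ph in zip(summary_vars, summary_placeholders)]
    summary_op = tf.summary.merge_all()
    return summary_placeholders, update_ops, summary_op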
def train(arglist):
    os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # reset so that env.observation_space is initialized and the trainers can be created
        obs_n = env.reset()
        # Create agent trainers
        num_adversaries = arglist.num_adversaries
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        print("env.observation_space:", env.observation_space)
        print("num adversaries: ", num_adversaries, ", env.n (num agents): ", env.n)
        # need to ensure that the trainers are in the correct order: pacman in front
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir + ("{}".format(arglist.load_episode))
        if arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)
        if arglist.display and arglist.load:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = [[] for i in range(env.n)]  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver(max_to_keep=None)
        episode_step = 0
        train_step = 0
        total_win = [0]
        final_win = []
        total_lose = [0]
        final_lose = []
        t_start = time.time()
        loss_list = {}
        for i in range(env.n):
            loss_list[i] = [[] for _ in range(6)]

        print('Starting iterations...')
        while True:
            # get action
            action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]
            # environment step (this fork's env also returns win/lose flags)
            new_obs_n, rew_n, done, info_n, win, lose = env.step(action_n)
            episode_step += 1
            terminal = (episode_step >= arglist.max_episode_len)
            # print("obs_n", obs_n)
            # print("new_obs_n", new_obs_n)
            # print("action_n", action_n)
            # print("rew_n", episode_step, rew_n)
            # print("done", done)
            # print("terminal", terminal)

            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i],
                                 new_obs_n[i], done, terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                if arglist.display:
                    env.render()
                obs_n = env.reset()
                episode_step = 0
                if win:
                    total_win[-1] += 1
                if lose:
                    total_lose[-1] += 1
                total_win.append(0)
                total_lose.append(0)
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1
            # if train_step % 1000 == 0:
            #     print(train_step)

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for ind, agent in enumerate(trainers):
                loss = agent.update(trainers, train_step)
                if train_step % 10000 == 0 and loss is not None:
                    for i in range(len(loss)):
                        loss_list[ind][i].append(loss[i])

            # save model, display training output
            if (terminal or done) and (len(episode_rewards) % arglist.save_rate == 0):
                saving = arglist.save_dir + ("{}".format(0 + len(episode_rewards)))  # TODO why append this
                U.save_state(saving, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        round(time.time() - t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, "
                          "number of wins {}, number of losses {}, time: {}".format(
                              train_step, len(episode_rewards),
                              np.mean(episode_rewards[-arglist.save_rate:]),
                              [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards],
                              np.sum(total_win[-arglist.save_rate:]),
                              np.sum(total_lose[-arglist.save_rate:]),
                              round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                final_win.append(np.sum(total_win[-arglist.save_rate:]))
                final_lose.append(np.sum(total_lose[-arglist.save_rate:]))
                ep_reward_df = pd.DataFrame(final_ep_rewards)
                ep_ag_reward_df = pd.DataFrame(final_ep_ag_rewards)
                win_df = pd.DataFrame(final_win)
                lose_df = pd.DataFrame(final_lose)
                for i in range(env.n):
                    trainer_loss_df = pd.DataFrame(loss_list[i]).transpose()
                    trainer_loss_df.to_csv(arglist.plots_dir + arglist.exp_name +
                                           '_trainer_loss_df_{}.csv'.format(i))
                ep_reward_df.to_csv(arglist.plots_dir + arglist.exp_name + '_rewards.csv')
                ep_ag_reward_df.to_csv(arglist.plots_dir + arglist.exp_name + '_agrewards.csv')
                win_df.to_csv(arglist.plots_dir + arglist.exp_name + '_win_df.csv')
                lose_df.to_csv(arglist.plots_dir + arglist.exp_name + '_lose_df.csv')
                for i, rew in enumerate(agent_rewards):
                    final_ep_ag_rewards[i].append(np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                # rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                # with open(rew_file_name, 'wb') as fp:
                #     pickle.dump(final_ep_rewards, fp)
                # agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                # with open(agrew_file_name, 'wb') as fp:
                #     pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                break
def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        board_write_path = './board/' + datetime.now().strftime("%Y%m%d_%H%M%S")
        os.makedirs(board_write_path)
        board_writer = tf.summary.FileWriter(board_write_path)
        trainers = get_trainers(env, obs_shape_n, arglist, board_writer)
        print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        evaluate_rewards = []
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()

        print('Starting iterations...')
        while True:
            # get action
            action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]
            # save the sampled actions; they are restored after env.step
            action_n_saved = deepcopy(action_n)
            if arglist.display:
                for idx, (agent, obs) in enumerate(zip(trainers, obs_n)):
                    action_result = agent.p_debug['p_values'](obs[None])[0]
                    print("agent_%d" % idx, action_result)
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            action_n = action_n_saved
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i],
                                 new_obs_n[i], done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            if arglist.display:
                continue

            # update all trainers, if not in display or benchmark mode;
            # all agents share one sample index so they read the same transitions
            if train_step % 100 == 0 and len(trainers[0].replay_buffer) >= trainers[0].max_replay_buffer_len:
                loss = None
                replay_sample_index = trainers[0].get_memory_index()
                obs_n_sampled = []
                obs_next_n_sampled = []
                act_n_sampled = []
                for agent in trainers:
                    agent.set_memory_index(replay_sample_index)
                    obs_sampled, act_sampled, _, obs_next_sampled, _ = agent.get_replay_data()
                    obs_n_sampled.append(obs_sampled)
                    obs_next_n_sampled.append(obs_next_sampled)
                    act_n_sampled.append(act_sampled)
                target_act_next_n = []
                for agent in trainers:
                    target_act_next_n.append(agent.get_target_act(obs_next_n_sampled))
                for agent in trainers:
                    loss = agent.update(train_step, obs_n_sampled, act_n_sampled,
                                        obs_next_n_sampled, target_act_next_n)

            import math  # kept inline, as in the original fork
            if math.isnan(episode_rewards[-1]):
                print("NaN occurred!")
                break

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                    train_step, len(episode_rewards),
                    np.mean(episode_rewards[-arglist.save_rate:]),
                    [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards],
                    round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))
                evaluate_rewards.append(evaluate(arglist, trainers, is_toy=True))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                with open(arglist.plots_dir + arglist.exp_name + "_evaluate_rewards.pkl", 'wb') as fp:
                    pickle.dump(evaluate_rewards, fp)
                break
def update(self, arglist, obs_n, rew_n, done_n, info_n, terminal):
    # info_n is False only when the very first data was created
    if info_n != False:
        done = all(done_n)
        # collect experience, do this every iteration; which LSTM states are
        # stored depends on whether the actor and/or critic are recurrent
        for i, agent in enumerate(self.trainers):
            if arglist.critic_lstm and arglist.actor_lstm:
                agent.experience(self.prev_obs_n[i], self.action_n[i], rew_n[i],
                                 obs_n[i], done_n[i],  # terminal,
                                 self.p_in_c_n[i][0], self.p_in_h_n[i][0],
                                 self.p_out_c_n[i][0], self.p_out_h_n[i][0],
                                 self.q_in_c_n[i][0], self.q_in_h_n[i][0],
                                 self.q_out_c_n[i][0], self.q_out_h_n[i][0],
                                 self.new_episode)
            elif arglist.critic_lstm:
                agent.experience(self.prev_obs_n[i], self.action_n[i], rew_n[i],
                                 obs_n[i], done_n[i],  # terminal,
                                 self.q_in_c_n[i][0], self.q_in_h_n[i][0],
                                 self.q_out_c_n[i][0], self.q_out_h_n[i][0],
                                 self.new_episode)
            elif arglist.actor_lstm:
                agent.experience(self.prev_obs_n[i], self.action_n[i], rew_n[i],
                                 obs_n[i], done_n[i],  # terminal,
                                 self.p_in_c_n[i][0], self.p_in_h_n[i][0],
                                 self.p_out_c_n[i][0], self.p_out_h_n[i][0],
                                 self.new_episode)
            else:
                agent.experience(self.prev_obs_n[i], self.action_n[i], rew_n[i],
                                 obs_n[i], done_n[i],  # terminal,
                                 self.new_episode)

        # Adding rewards
        if arglist.tracking:
            for i, a in enumerate(self.trainers):
                a.tracker.record_information("ag_reward", rew_n[i])
                a.tracker.record_information("team_dist_reward", info_n["team_dist"][i])
                a.tracker.record_information("team_diff_reward", info_n["team_diff"][i])

        # Closing graph writer
        if arglist.graph:
            self.writer.close()

        for i, rew in enumerate(rew_n):
            self.episode_rewards[-1] += rew
            self.agent_rewards[i][-1] += rew

        # If an episode was finished, reset internal values
        if done or terminal:
            self.new_episode = True
            # reset trainers
            if arglist.actor_lstm or arglist.critic_lstm:
                for agent in self.trainers:
                    agent.reset_lstm()
            if arglist.tracking:
                for agent in self.trainers:
                    agent.tracker.reset()
            self.episode_rewards.append(0)
            for a in self.agent_rewards:
                a.append(0)
            self.agent_info.append([[]])
        else:
            self.new_episode = False

        # increment global step counter
        self.train_step += 1

        # for benchmarking learned policies
        if arglist.benchmark:
            for i, info in enumerate(info_n):
                self.agent_info[-1][i].append(info_n['n'])
            if self.train_step > arglist.benchmark_iters and (done or terminal):
                file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                print('Finished benchmarking, now saving...')
                with open(file_name, 'wb') as fp:
                    pickle.dump(self.agent_info[:-1], fp)
                return
        # otherwise training
        else:
            # update all trainers, if not in display or benchmark mode
            loss = None
            # get same episode sampling
            if arglist.sync_sampling:
                inds = [random.randint(0, len(self.trainers[0].replay_buffer._storage) - 1)
                        for i in range(arglist.batch_size)]
            else:
                inds = None
            for agent in self.trainers:
                # if arglist.lstm:
                #     agent.preupdate(inds=inds)
                # else:
                agent.preupdate(inds)
            for agent in self.trainers:
                loss = agent.update(self.trainers, self.train_step)
                if loss is None:
                    continue

        # save model, display training output
        if terminal and (len(self.episode_rewards) % arglist.save_rate == 0):
            U.save_state(arglist.save_dir, saver=self.saver)
            # print statement depends on whether or not there are adversaries
            if self.num_adversaries == 0:
                print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                    self.train_step, len(self.episode_rewards),
                    np.mean(self.episode_rewards[-arglist.save_rate:]),
                    round(time.time() - self.t_start, 3)))
            else:
                print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                    self.train_step, len(self.episode_rewards),
                    np.mean(self.episode_rewards[-arglist.save_rate:]),
                    [np.mean(rew[-arglist.save_rate:]) for rew in self.agent_rewards],
                    round(time.time() - self.t_start, 3)))
            self.t_start = time.time()
            # Keep track of final episode reward
            self.final_ep_rewards.append(np.mean(self.episode_rewards[-arglist.save_rate:]))
            for rew in self.agent_rewards:
                self.final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

    if arglist.actor_lstm:
        # get actor input states
        self.p_in_c_n, self.p_in_h_n = get_lstm_states('p', self.trainers)  # num_trainers x 1 x 1 x 64
    if arglist.critic_lstm:
        # get critic input states
        self.q_in_c_n, self.q_in_h_n = get_lstm_states('q', self.trainers)  # num_trainers x 1 x 1 x 64

    # get action
    self.action_n = [agent.action(obs) for agent, obs in zip(self.trainers, obs_n)]

    if arglist.critic_lstm:
        # get critic output states
        p_states = [self.p_in_c_n, self.p_in_h_n] if arglist.actor_lstm else []
        update_critic_lstm(self.trainers, obs_n, self.action_n, p_states)
        self.q_out_c_n, self.q_out_h_n = get_lstm_states('q', self.trainers)  # num_trainers x 1 x 1 x 64
    if arglist.actor_lstm:
        self.p_out_c_n, self.p_out_h_n = get_lstm_states('p', self.trainers)  # num_trainers x 1 x 1 x 64

    self.prev_obs_n = obs_n
    return self.action_n
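# `get_lstm_states` and `update_critic_lstm` are not defined in this snippet.
# Below is a minimal sketch of the assumed accessor: it gathers the current
# actor ('p') or critic ('q') LSTM cell/hidden states from each trainer. The
# attribute names are assumptions about the fork's trainer class.
def get_lstm_states(kind, trainers):
    """Return ([cell state per trainer], [hidden state per trainer])."""
    assert kind in ('p', 'q')
    states = [t.p_lstm_state if kind == 'p' else t.q_lstm_state for t in trainers]
    c_n = [s.c for s in states]  # cell states, e.g. shape 1 x 1 x 64
    h_n = [s.h for s in states]  # hidden states
    return c_n, h_n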
def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print("Using good policy {} and adv policy {}".format(
            arglist.good_policy, arglist.adv_policy))
        np.seterr(all="raise")  # define before your code

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print("Loading previous state...")
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()

        print("making logger")
        # tb_configure / tb_log_value are assumed aliases for
        # tensorboard_logger.configure / log_value
        tb_configure("logs/" + str(arglist.exp_name) + "_" + str(datetime.now()))

        print("Starting iterations...")
        while True:
            # get action
            action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)
            terminal = episode_step >= arglist.max_episode_len
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i],
                                 new_obs_n[i], done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew  # / self.n (?) Do we want this to be average across all agents?
                agent_rewards[i][-1] += rew

            if done or terminal:
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n["n"])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + ".pkl"
                    print("Finished benchmarking, now saving...")
                    with open(file_name, "wb") as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)

            # log metrics (note: indexing the reward lists by train_step only
            # lines up while each episode lasts a single step, and will
            # eventually raise an IndexError otherwise)
            tb_log_value("episode_reward", episode_rewards[train_step - 1], train_step)
            tb_log_value("first_agent_reward", agent_rewards[0][train_step - 1], train_step)
            tb_log_value("second_agent_reward", agent_rewards[1][train_step], train_step)
            if loss is not None:
                loss_to_log = loss
            else:
                loss_to_log = -100
            tb_log_value("loss", loss_to_log, train_step)

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                print("made it into if terminal and len(episode)")
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        round(time.time() - t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards],
                        round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + "_rewards.pkl"
                with open(rew_file_name, "wb") as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + "_agrewards.pkl"
                with open(agrew_file_name, "wb") as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print("...Finished total of {} episodes.".format(len(episode_rewards)))
                break
def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        episode_accuracy = [[] for i in range(env.n)]
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        final_ep_accurancy = []
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()
        record_accurancy = [[]] * env.n

        # initialize action trajectories (one fixed-length deque per agent)
        act_trajs = []
        for i in range(env.n):
            act_trajs.append(collections.deque(
                np.zeros((arglist.timestep, env.action_space[0].n)),
                maxlen=arglist.timestep))

        print('Starting iterations...')
        while True:
            # get action
            act_traj_n = get_traj_n(act_trajs)
            if arglist.adv_i3 == 1 and arglist.good_i3 == 1:
                # all agents use I3: condition each action on an inferred intent
                intent_n = [agent.intent(obs, act_traj)
                            for agent, obs, act_traj in zip(trainers, obs_n, act_traj_n)]
                action_n = [agent.action(obs, intent)
                            for agent, obs, intent in zip(trainers, obs_n, intent_n)]
                # environment step
                new_obs_n, rew_n, done_n, info_n = env.step(action_n)
                episode_step += 1
                done = all(done_n)
                terminal = (episode_step >= arglist.max_episode_len)
                # collect experience
                for i in range(len(act_trajs)):
                    act_trajs[i].append(action_n[i])
                act_traj_next_n = get_traj_n(act_trajs)
                intent_next_n = [agent.intent(obs, act_traj)
                                 for agent, obs, act_traj in zip(trainers, new_obs_n, act_traj_next_n)]
                for i, agent in enumerate(trainers):
                    agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                     act_traj_n[i], intent_n[i], act_traj_next_n[i],
                                     intent_next_n[i], done_n[i], terminal)
                    if arglist.onpolicy_i == 1:
                        i_loss = agent.onpolicy_train_i(obs_n, act_traj_n, action_n)
                        episode_accuracy[i].append(i_loss)
            elif arglist.adv_i3 == 1 and arglist.good_i3 == 0:
                # adversaries use I3, good agents use plain MADDPG
                intent_n = []
                action_n = []
                for i in range(len(trainers)):
                    if i < arglist.num_adversaries:
                        intent = trainers[i].intent(obs_n[i], act_traj_n[i])
                        action = trainers[i].action(obs_n[i], intent)
                        action_n.append(action)
                        intent_n.append(intent)
                    else:
                        action = trainers[i].action(obs_n[i])
                        action_n.append(action)
                        intent_n.append(np.zeros((arglist.timestep * (env.action_space[0].n - 1))))
                # environment step
                new_obs_n, rew_n, done_n, info_n = env.step(action_n)
                episode_step += 1
                done = all(done_n)
                terminal = (episode_step >= arglist.max_episode_len)
                for i in range(len(act_trajs)):
                    act_trajs[i].append(action_n[i])
                act_traj_next_n = get_traj_n(act_trajs)
                intent_next_n = []
                for i in range(len(trainers)):
                    if i < arglist.num_adversaries:
                        intent_next_n.append(trainers[i].intent(new_obs_n[i], act_traj_next_n[i]))
                    else:
                        intent_next_n.append(np.zeros((arglist.timestep * (env.action_space[0].n - 1))))
                for i in range(len(trainers)):
                    if i < arglist.num_adversaries:
                        trainers[i].experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                               act_traj_n[i], intent_n[i], act_traj_next_n[i],
                                               intent_next_n[i], done_n[i], terminal)
                        if arglist.onpolicy_i == 1:
                            i_loss = trainers[i].onpolicy_train_i(obs_n, act_traj_n, action_n)
                            episode_accuracy[i].append(i_loss)
                    else:
                        trainers[i].experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                               done_n[i], terminal)
            elif arglist.good_i3 == 1 and arglist.adv_i3 == 0:
                # good agents use I3, adversaries use plain MADDPG
                intent_n = []
                action_n = []
                for i in range(len(trainers)):
                    if i >= arglist.num_adversaries:
                        intent = trainers[i].intent(obs_n[i], act_traj_n[i])
                        action = trainers[i].action(obs_n[i], intent)
                        action_n.append(action)
                        intent_n.append(intent)
                    else:
                        action = trainers[i].action(obs_n[i])
                        action_n.append(action)
                        intent_n.append(np.zeros((arglist.timestep * (env.action_space[0].n - 1))))
                # environment step
                new_obs_n, rew_n, done_n, info_n = env.step(action_n)
                episode_step += 1
                done = all(done_n)
                terminal = (episode_step >= arglist.max_episode_len)
                for i in range(len(act_trajs)):
                    act_trajs[i].append(action_n[i])
                act_traj_next_n = get_traj_n(act_trajs)
                intent_next_n = []
                for i in range(len(trainers)):
                    if i >= arglist.num_adversaries:
                        intent_next_n.append(trainers[i].intent(new_obs_n[i], act_traj_next_n[i]))
                    else:
                        intent_next_n.append(np.zeros((arglist.timestep * (env.action_space[0].n - 1))))
                for i in range(len(trainers)):
                    if i >= arglist.num_adversaries:
                        trainers[i].experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                               act_traj_n[i], intent_n[i], act_traj_next_n[i],
                                               intent_next_n[i], done_n[i], terminal)
                        if arglist.onpolicy_i == 1:
                            i_loss = trainers[i].onpolicy_train_i(obs_n, act_traj_n, action_n)
                            episode_accuracy[i].append(i_loss)
                    else:
                        trainers[i].experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                               done_n[i], terminal)
            else:
                # plain MADDPG for everyone
                action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]
                new_obs_n, rew_n, done_n, info_n = env.step(action_n)
                episode_step += 1
                done = all(done_n)
                terminal = (episode_step >= arglist.max_episode_len)
                # collect experience
                for i, agent in enumerate(trainers):
                    agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                     done_n[i], terminal)

            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.5)
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        round(time.time() - t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards],
                        round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))
                # print("-----------------------------")
                for iloss in episode_accuracy:
                    if len(iloss) < arglist.save_rate:
                        continue
                    else:
                        # mean intent loss over the last save_rate samples
                        final_ep_accurancy.append(np.mean(iloss[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + str(arglist.seed) + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + str(arglist.seed) + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                acc_file = arglist.plots_dir + arglist.exp_name + str(arglist.seed) + '_accurancy.pkl'
                with open(acc_file, 'wb') as fp:
                    pickle.dump(final_ep_accurancy, fp)
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                break
def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist, arglist.benchmark)
        # Create agent trainers (single shared observation space in this env)
        obs_shape_n = [env.observation_space.shape for i in range(env.n)]
        trainers = get_trainers(env, obs_shape_n, arglist)
        print('Using good policy {}'.format(arglist.good_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()
        done = 0
        current_player_index = 0
        no_op_actions = False

        print('Starting iterations...')
        while True:
            # get action for the player whose turn it is
            current_player_obs = np.asarray(obs_n)
            original_action = trainers[current_player_index].action(current_player_obs)
            if no_op_actions:
                # rejection-sample until a valid action comes up
                action = np.random.choice(
                    np.linspace(0, env.action_space.n - 1, num=env.action_space.n, dtype=int),
                    1, p=original_action)[0]
                mask = env.getValidActions()
                while mask[action] == 0:
                    action = np.random.choice(
                        np.linspace(0, env.action_space.n - 1, num=env.action_space.n, dtype=int),
                        1, p=original_action)[0]
            else:
                # get action mask
                mask = env.getValidActions()
                # zero out invalid options
                masked_actions = mask * original_action
                # normalize so the remaining probabilities sum to 1
                masked_actions = masked_actions / np.nansum(masked_actions)
                # Get action with given probability
                if np.isnan(masked_actions).any():
                    print(current_player_obs)
                    print(masked_actions)
                    print(np.nansum(masked_actions))
                    print(original_action)
                try:
                    action = np.random.choice(
                        np.linspace(0, env.action_space.n - 1, num=env.action_space.n, dtype=int),
                        1, p=masked_actions)[0]
                except ValueError:
                    print("Exception: choosing random action")
                    action = np.random.choice(
                        np.linspace(0, env.action_space.n - 1, num=env.action_space.n, dtype=int),
                        1)[0]

            new_obs, rew, done, info_n = env.step(action)
            # trainers[current_player_index].experience(current_player_obs, original_action, mask, rew, new_obs, done)
            trainers[current_player_index].experience(current_player_obs, masked_actions,
                                                      mask, rew, new_obs, done)
            current_player_index += 1
            if current_player_index >= len(trainers):
                current_player_index = 0
            obs_n = new_obs

            episode_rewards[-1] += rew
            # note: the index was already advanced, so this credits the reward
            # to the next player in turn order
            agent_rewards[current_player_index][-1] += rew

            if done:
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])
                current_player_index = 0

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and done:
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)
                # "sleep" updates: keep training this agent until its policy
                # loss rises past 105% of the original, or 10 extra iterations
                if (loss is not None and agent.sleep_regimen and agent.agent_mic != 0
                        and train_step % 100 == 0):  # change sleep frequency here if desired
                    original_policy_loss = loss[1]
                    new_loss = agent.update(trainers, train_step, sleeping=True)[1]
                    sleep_iteration = 0
                    while (sleep_iteration < 10) and (new_loss < original_policy_loss * 1.05):
                        new_loss = agent.update(trainers, train_step, sleeping=True)[1]
                        sleep_iteration += 1
                        # print("sleep walking")

            # save model, display training output
            if done and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                    train_step, len(episode_rewards),
                    np.mean(episode_rewards[-arglist.save_rate:]),
                    round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                print(arglist.plots_dir)
                print(arglist.exp_name)
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                break
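# The masked-sampling logic above distills into a small helper. A sketch,
# assuming a 0/1 validity mask and a (possibly unnormalized) policy
# distribution over discrete actions:
import numpy as np

def sample_masked_action(policy_probs, mask, rng=np.random):
    """Zero out invalid actions, renormalize, and sample an action index."""
    masked = np.asarray(policy_probs) * np.asarray(mask)
    total = masked.sum()
    if total <= 0 or np.isnan(total):
        # policy puts no mass on valid actions: fall back to uniform over valid ones
        return int(rng.choice(np.flatnonzero(mask)))
    return int(rng.choice(len(masked), p=masked / total))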
def play(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()
        print('Starting iterations...')

        # create world (`scenario` is assumed to be loaded at module level)
        world = scenario.make_world()
        # create multiagent environment
        env = MultiAgentEnv(world, scenario.reset_world, scenario.reward,
                            scenario.observation, info_callback=None,
                            shared_viewer=True)
        env.window_pos = 'right'
        # render call to create viewer window (necessary only for interactive policies)
        env.render()
        # create interactive policy for one agent (the last one)
        policy = InteractivePolicy(env, -1)
        # execution loop
        obs_n = env.reset()
        while True:
            # query for action from each agent's policy
            act_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]  # trained policies
            act_n[-1] = policy.action(obs_n[-1])  # interactive keyboard policy
            # step environment
            new_obs_n, reward_n, done_n, info_n = env.step(act_n)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], act_n[i], reward_n[i],
                                 new_obs_n[i], done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(reward_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for game over
            try:
                if scenario.game_over:
                    sys.exit(0)
            except AttributeError:
                pass

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # render all agent views
            time.sleep(arglist.delay)
            env.render()

            # display rewards
            for agent in env.world.agents:
                pass
                # print(agent.name + " reward: %0.3f" % env._get_reward(agent))

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        round(time.time() - t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards],
                        round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                U.save_state(arglist.plots_dir, saver=saver)
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                break
def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))
        agents = copy.copy(trainers)
        # Initialize
        U.initialize()
        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)
        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()
        print('Starting iterations...')
        if arglist.record != False:
            writer = skvideo.io.FFmpegWriter("{}.avi".format(arglist.record))
        while True:
            # shuffle agents to prevent them from learning a fixed strategy
            if not arglist.benchmark and arglist.shuffle == 'timestep':
                random.shuffle(agents)
            # get action
            action_n = [agent.action(obs) for agent, obs in zip(agents, obs_n)]
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            # print([ag.state.p_pos for ag in env.agents])
            for i, agent in enumerate(agents):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                 done_n[i], terminal)
            obs_n = new_obs_n
            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew
            for i, info in enumerate(info_n):
                agent_info[-1][i].append(info_n['n'])
            # for displaying learned policies
            if arglist.display:
                time.sleep(0.01)
                x = env.render(mode='rgb_array')
                if arglist.record != False:
                    LM = [ag.state.p_pos for ag in env.world.landmarks]
                    LM = [FixPosition(j, 10, 10) for j in LM]
                    AP = [ag.state.p_pos for ag in env.agents]
                    AP = [FixPosition(j) for j in AP]
                    img = np.copy(x[0])
                    img = AddTextToImage(img, text=['Agent {}', 'Agent {}', 'Agent {}'],
                                         color=(0, 0, 255), pos=AP)
                    img = AddTextToImage(img, text=['LM{}', 'LM{}', 'LM{}'],
                                         pos=LM, color=(255, 0, 0))
                    writer.writeFrame(img)
            if done or terminal:
                if arglist.record != False:
                    writer.close()
                    exit()
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])
                # shuffle agents to prevent them from learning a fixed strategy
                if not arglist.benchmark and arglist.shuffle == 'episode':
                    random.shuffle(agents)
            # increment global step counter
            train_step += 1
            # for benchmarking learned policies
            if arglist.benchmark:
                if train_step >= arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                        if arglist.save_replay:
                            # save in original order
                            for i, agent in enumerate(trainers):
                                pickle.dump(agent.replay_buffer._storage, fp)
                    break
                continue
            # update all agents, if not in display or benchmark mode
            loss = None
            for agent in agents:
                agent.preupdate()
            for agent in agents:
                loss = agent.update(agents, train_step)
                # if the model is shared, train only once
                if arglist.shared:
                    break
            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        round(time.time() - t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, "
                          "agent episode reward: {}, time: {}".format(
                              train_step, len(episode_rewards),
                              np.mean(episode_rewards[-arglist.save_rate:]),
                              [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards],
                              round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))
            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                break
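# The recording branch above relies on two helpers, FixPosition and
# AddTextToImage, that are not defined in this file. Hypothetical sketches
# under the assumption that world coordinates lie in [-1, 1] and frames are
# HxWx3 uint8 arrays; the originals may scale differently (note the extra
# 10, 10 offset arguments used for landmarks above).
import numpy as np
import cv2  # opencv-python

def FixPosition(pos, x_off=0, y_off=0, frame_w=700, frame_h=700):
    """Map a world-coordinate position to integer pixel coordinates."""
    px = int((pos[0] + 1.0) / 2.0 * frame_w) + x_off
    py = int((1.0 - (pos[1] + 1.0) / 2.0) * frame_h) + y_off  # flip the y axis
    return (px, py)

def AddTextToImage(img, text, pos, color):
    """Stamp one label per position onto a copy of the frame."""
    out = img.copy()
    for i, (label, xy) in enumerate(zip(text, pos)):
        cv2.putText(out, label.format(i), xy, cv2.FONT_HERSHEY_SIMPLEX,
                    0.5, color, 1, cv2.LINE_AA)
    return out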
def learn_old(env,
              arglist,
              seed=None,
              total_timesteps=None,
              nb_epochs=None,  # with default settings, perform 1M steps total
              nb_epoch_cycles=20,
              nb_rollout_steps=100,
              reward_scale=1.0,
              render=False,
              render_eval=False,
              noise_type='adaptive-param_0.2',
              normalize_returns=False,
              normalize_observations=True,
              critic_l2_reg=1e-2,
              actor_lr=1e-4,
              critic_lr=1e-3,
              popart=False,
              gamma=0.99,
              clip_norm=None,
              nb_train_steps=50,  # per epoch cycle and MPI worker
              nb_eval_steps=100,
              batch_size=64,  # per MPI worker
              tau=0.01,
              eval_env=None,
              param_noise_adaption_interval=50,
              save_interval=100,
              **network_kwargs):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))
        # Initialize
        U.initialize()
        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)
        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        # train_step = 0
        t_start = time.time()
        epinfobuf = deque(maxlen=100)
        print('Starting iterations...')
        total_timesteps = arglist.num_episodes * arglist.max_episode_len
        for train_step in range(1, total_timesteps + 1):
            # get action
            action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = any(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                 done_n[i], terminal)
            obs_n = new_obs_n
            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew
            if done or terminal:
                obs_n = env.reset()
                # save episode info
                epinfobuf.append({
                    "r": episode_rewards[-1],
                    "l": episode_step,
                    "t": round(time.time() - t_start, 6)
                })
                # reset episode variables
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])
            if train_step % arglist.log_interval == 0:
                # logger.logkv(Config.tensorboard_rootdir + "serial_timesteps", train_step)
                # logger.logkv(Config.tensorboard_rootdir + "num_update", update)
                logger.logkv(Config.tensorboard_rootdir + "total_timesteps", train_step)
                logger.logkv(Config.tensorboard_rootdir + "current_episode",
                             int(train_step / arglist.max_episode_len))
                # logger.logkv(Config.tensorboard_rootdir + "fps", fps)
                # logger.logkv(Config.tensorboard_rootdir + "explained_variance", float(ev))
                logger.logkv(Config.tensorboard_rootdir + 'ep_reward_mean',
                             safemean([epinfo['r'] for epinfo in epinfobuf]))
                logger.logkv(Config.tensorboard_rootdir + 'ep_length',
                             safemean([epinfo['l'] for epinfo in epinfobuf]))
                logger.logkv(Config.tensorboard_rootdir + 'time_elapsed',
                             round(time.time() - t_start, 6))
                # for (lossval, lossname) in zip(lossvals, model.loss_names):
                #     logger.logkv(Config.tensorboard_rootdir + lossname, lossval)
                if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
                    logger.dumpkvs()
            # increment global step counter
            # train_step += 1
            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue
            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue
            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)
                # log loss info for each agent
                if (train_step % arglist.log_interval == 0) and loss:
                    lossvals = [np.mean(data, axis=0) if isinstance(data, list) else data
                                for data in loss]
                    for (lossval, lossname) in zip(lossvals, agent.loss_names):
                        log_key = "{}{}/{}".format(Config.tensorboard_rootdir,
                                                   lossname, agent.name)
                        logger.logkv(log_key, lossval)
            # save model if at a save_rate step or at the final train_step
            save_model = arglist.save_rate and (
                (train_step % arglist.save_rate == 0) or (train_step == total_timesteps))
            # only save model if logger dir specified and current node rank is 0 (multithreading)
            save_model &= logger.get_dir() and (MPI is None or MPI.COMM_WORLD.Get_rank() == 0)
            if save_model:
                checkdir = osp.join(logger.get_dir(), 'checkpoints')
                os.makedirs(checkdir, exist_ok=True)
                savepath = osp.join(checkdir, '%.5i' % train_step)
                print('Saving to', savepath)
                U.save_state(savepath, saver=saver)
                # model.save(savepath)
        env.close()
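# learn_old() calls safemean() on the episode-info buffer. In OpenAI baselines
# this helper simply guards np.mean against an empty window so that early
# logging steps do not emit a NaN warning; reproduced here for reference.
import numpy as np

def safemean(xs):
    """Mean that degrades to NaN (instead of warning) on an empty list."""
    return np.nan if len(xs) == 0 else np.mean(xs)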
def train(env, arglist, trainers):
    num_adversaries = min(env.n, arglist.num_adversaries)  # needed by the print branch below
    episode_rewards = [0.0]  # sum of rewards for all agents
    agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
    agent_info = [[[]]]  # placeholder for benchmarking info
    saver = tf.train.Saver()
    obs_n = env.reset()
    episode_step = 0
    train_step = 0
    t_start = time.time()
    print('Starting iterations...')
    while True:
        # get action
        action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]
        # environment step
        new_obs_n, rew_n, done_n, info_n = env.step(action_n)
        episode_step += 1
        done = all(done_n)
        terminal = (episode_step >= arglist.max_episode_len)
        # collect experience
        for i, agent in enumerate(trainers):
            agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                             done_n[i], terminal)
        obs_n = new_obs_n
        for i, rew in enumerate(rew_n):
            episode_rewards[-1] += rew
            agent_rewards[i][-1] += rew
        if done or terminal:
            obs_n = env.reset()
            episode_step = 0
            episode_rewards.append(0)
            for a in agent_rewards:
                a.append(0)
            agent_info.append([[]])
        # increment global step counter
        train_step += 1
        # for benchmarking learned policies
        if arglist.benchmark:
            for i, info in enumerate(info_n):
                agent_info[-1][i].append(info_n['n'])
            if train_step > arglist.benchmark_iters and (done or terminal):
                file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                print('Finished benchmarking, now saving...')
                with open(file_name, 'wb') as fp:
                    pickle.dump(agent_info[:-1], fp)
                break
            continue
        # for displaying learned policies
        if arglist.display:
            time.sleep(0.1)
            env.render()
            continue
        # update all trainers, if not in display or benchmark mode
        for agent in trainers:
            agent.preupdate()
        for agent in trainers:
            loss = agent.update(trainers, train_step)
        episode = len(episode_rewards) + arglist.restore_episode
        # save model, display training output
        if (done or terminal) and (episode % arglist.save_rate == 0):
            mean_reward = np.mean(episode_rewards[-arglist.save_rate:])
            agents_episode_rewards = [np.mean(rew[-arglist.save_rate:])
                                      for rew in agent_rewards]
            time_spent = round(time.time() - t_start, 3)
            U.save_state(f"{arglist.save_dir}/{arglist.exp_name}/episode_{episode}/model",
                         saver=saver)
            # print statement depends on whether or not there are adversaries
            if num_adversaries == 0:
                print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                    train_step, episode, mean_reward, time_spent))
            else:
                print("steps: {}, episodes: {}, mean episode reward: {}, "
                      "agent episode reward: {}, time: {}".format(
                          train_step, episode, mean_reward,
                          agents_episode_rewards, time_spent))
            t_start = time.time()
            # Keep track of rewards
            save_rewards(arglist, episode, mean_reward, agents_episode_rewards)
        # stop once the requested number of episodes has been run
        if episode >= arglist.num_episodes:
            print('...Finished total of {} episodes.'.format(episode))
            break
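# This variant delegates reward bookkeeping to a save_rewards() helper that is
# not defined in this file. A plausible sketch, assuming it appends one row per
# save_rate window to a CSV under arglist.plots_dir; the real helper's file
# format and column layout are assumptions.
import csv
import os

def save_rewards(arglist, episode, mean_reward, agents_episode_rewards):
    """Append (episode, mean reward, per-agent means) to a CSV training log."""
    os.makedirs(arglist.plots_dir, exist_ok=True)
    path = os.path.join(arglist.plots_dir, arglist.exp_name + '_rewards.csv')
    write_header = not os.path.exists(path)
    with open(path, 'a', newline='') as f:
        writer = csv.writer(f)
        if write_header:
            header = ['episode', 'mean_reward'] + [
                'agent_{}'.format(i) for i in range(len(agents_episode_rewards))]
            writer.writerow(header)
        writer.writerow([episode, mean_reward] + list(agents_episode_rewards))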
def train(arglist):
    # To make sure that training and testing are based on different seeds
    if arglist.restore:
        create_seed(np.random.randint(2))
    else:
        create_seed(arglist.seed)
    with U.single_threaded_session() as sess:
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))
        # Initialize
        U.initialize()
        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)
        if arglist.analysis:
            print("Starting analysis on {}...".format(arglist.analysis))
            if arglist.analysis != 'video':
                analyze.run_analysis(arglist, env, trainers)
            return  # should be a single run
        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()
        new_episode = True  # start of a new episode (used for replay buffer)
        start_saving_comm = False
        if arglist.graph:
            print("Setting up graph writer!")
            writer = tf.summary.FileWriter("learning_curves/graph", sess.graph)
        print('Starting iterations...')
        while True:
            if arglist.actor_lstm:
                # get actor input states
                p_in_c_n, p_in_h_n = get_lstm_states('p', trainers)  # num_trainers x 1 x 1 x 64
            if arglist.critic_lstm:
                # get critic input states
                q_in_c_n, q_in_h_n = get_lstm_states('q', trainers)  # num_trainers x 1 x 1 x 64
            # get action
            action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]
            if arglist.critic_lstm:
                # get critic output states
                p_states = [p_in_c_n, p_in_h_n] if arglist.actor_lstm else []
                update_critic_lstm(trainers, obs_n, action_n, p_states)
                q_out_c_n, q_out_h_n = get_lstm_states('q', trainers)  # num_trainers x 1 x 1 x 64
            if arglist.actor_lstm:
                p_out_c_n, p_out_h_n = get_lstm_states('p', trainers)  # num_trainers x 1 x 1 x 64
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                num_episodes = len(episode_rewards)  # do this every iteration
                if arglist.critic_lstm and arglist.actor_lstm:
                    agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                     done_n[i],  # terminal,
                                     p_in_c_n[i][0], p_in_h_n[i][0],
                                     p_out_c_n[i][0], p_out_h_n[i][0],
                                     q_in_c_n[i][0], q_in_h_n[i][0],
                                     q_out_c_n[i][0], q_out_h_n[i][0], new_episode)
                elif arglist.critic_lstm:
                    agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                     done_n[i],  # terminal,
                                     q_in_c_n[i][0], q_in_h_n[i][0],
                                     q_out_c_n[i][0], q_out_h_n[i][0], new_episode)
                elif arglist.actor_lstm:
                    agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                     done_n[i],  # terminal,
                                     p_in_c_n[i][0], p_in_h_n[i][0],
                                     p_out_c_n[i][0], p_out_h_n[i][0], new_episode)
                else:
                    agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                     done_n[i],  # terminal,
                                     new_episode)
            obs_n = new_obs_n
            # Adding rewards
            if arglist.tracking:
                for i, a in enumerate(trainers):
                    if arglist.num_episodes - len(episode_rewards) <= 1000:
                        a.tracker.record_information(
                            "goal", np.array(env.world.landmarks[0].state.p_pos))
                        a.tracker.record_information(
                            "position", np.array(env.world.agents[i].state.p_pos))
                        a.tracker.record_information("ag_reward", rew_n[i])
                        a.tracker.record_information("team_dist_reward",
                                                     info_n["team_dist"][i])
                        a.tracker.record_information("team_diff_reward",
                                                     info_n["team_diff"][i])
            # Closing graph writer
            if arglist.graph:
                writer.close()
            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew
            if done or terminal:
                new_episode = True
                num_episodes = len(episode_rewards)
                obs_n = env.reset()
                # reset trainers
                if arglist.actor_lstm or arglist.critic_lstm:
                    for agent in trainers:
                        agent.reset_lstm()
                if arglist.tracking:
                    for agent in trainers:
                        agent.tracker.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])
            else:
                new_episode = False
            # increment global step counter
            train_step += 1
            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue
            # update all trainers, if not in display or benchmark mode
            loss = None
            # get same-episode sampling
            if arglist.sync_sampling:
                inds = [random.randint(0, len(trainers[0].replay_buffer._storage) - 1)
                        for i in range(arglist.batch_size)]
            else:
                inds = None
            for agent in trainers:
                # if arglist.lstm:
                #     agent.preupdate(inds=inds)
                # else:
                agent.preupdate(inds)
            for agent in trainers:
                loss = agent.update(trainers, train_step)
                if loss is None:
                    continue
            # for displaying learned policies
            if arglist.display:
                env.render()
                # continue
            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        round(time.time() - t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, "
                          "agent episode reward: {}, time: {}".format(
                              train_step, len(episode_rewards),
                              np.mean(episode_rewards[-arglist.save_rate:]),
                              [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards],
                              round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))
            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                # U.save_state(arglist.save_dir, saver=saver)
                if arglist.tracking:
                    for agent in trainers:
                        agent.tracker.save()
                rew_file_name = "rewards/" + arglist.commit_num + "_rewards.pkl"
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = "rewards/" + arglist.commit_num + "_agrewards.pkl"
                # agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                break
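# The seeding logic above distinguishes training from testing runs through a
# create_seed() helper that is not defined in this file. A minimal sketch,
# assuming the helper seeds the three RNG sources this script draws from
# (random, numpy, tensorflow); the original may also seed the environment.
import random
import numpy as np
import tensorflow as tf

def create_seed(seed):
    """Seed every RNG this training script draws from."""
    random.seed(seed)
    np.random.seed(seed)
    tf.set_random_seed(seed)  # TF1-style API, matching the rest of the file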
def train(arglist):
    with U.single_threaded_session():
        # [Initialization]
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))
        # Initialize (TensorFlow initialization procedure)
        U.initialize()
        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)
        # Parameter initialization
        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()
        print('Starting iterations...')
        while True:
            # get an action from each agent's policy
            action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]
            # environment step according to the actions: receive the observations,
            # rewards, done flags, and info from the simulation environment
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)  # check whether all tasks have been completed
            terminal = (episode_step >= arglist.max_episode_len)  # check for timeout
            # record each agent's transition for experience replay
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                 done_n[i], terminal)
            obs_n = new_obs_n  # adopt the new observation as the current one
            # update the total reward and each agent's reward
            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew
            if done or terminal:
                # task finished or timed out: restart the simulation environment
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])
            # increment global step counter
            train_step += 1
            # for benchmarking learned policies: save the agents' information
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue
            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)  # delay
                env.render()  # display the environment if necessary
                continue
            # update all trainers, if not in display or benchmark mode [Important]
            loss = None
            for agent in trainers:
                # clear the indices randomly chosen by 'make_index'
                # (sets agent.replay_sample_index = None)
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)
            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        round(time.time() - t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, "
                          "agent episode reward: {}, time: {}".format(
                              train_step, len(episode_rewards),
                              np.mean(episode_rewards[-arglist.save_rate:]),
                              [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards],
                              round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))
            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                break
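# Every variant in this file drives its agents through the same four calls:
# action(), experience(), preupdate(), update(). The maddpg package defines
# this contract as an AgentTrainer base class; an equivalent sketch follows so
# the loops above can be read against the expected interface (exact method
# docstrings here are paraphrases, not the upstream originals).
class AgentTrainer(object):
    def __init__(self, name, model, obs_shape, act_space, args):
        raise NotImplementedError()

    def action(self, obs):
        # one observation in, one (possibly stochastic) action out
        raise NotImplementedError()

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # store a transition in this agent's replay buffer
        raise NotImplementedError()

    def preupdate(self):
        # reset any cached replay-sample indices before a round of updates
        raise NotImplementedError()

    def update(self, agents, t):
        # train against all agents' buffers; returns loss values or None
        raise NotImplementedError()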
def train(arglist):
    ############################################
    @marlo.threaded
    def funcion(env, action, agent_num):
        contador = 0
        while True:
            # Execute the action, guarding against errors
            _, r, done, info, new_obs = env.step(np.argmax(action) + 1)
            new_obs = new_obs['observation']
            if new_obs is None:
                new_obs = last_obs[agent_num]
            else:
                new_obs = [new_obs.get('XPos'), new_obs.get('ZPos'), new_obs.get('Yaw')]
            contador += 1
            if r != 0:
                break
            elif info is not None:
                if "caught_the_Chicken" in info:
                    r += 1
                    print("SCORE WAS HARD-CODED ", done, " ", info)
                    break
                if "Agent0_defaulted" in info:
                    r = -0.02
                    break
                if "Agent1_defaulted" in info:
                    r = -0.02
                    break
            elif contador >= 100:
                print("THE ACTION TOOK TOO LONG TO EXECUTE")
                break
        return new_obs, r, done, info
    #######################################################
    with U.single_threaded_session():
        # Create environment
        client_pool = [('127.0.0.1', 10000), ('127.0.0.1', 10001)]
        join_tokens = marlo.make(
            "MarLo-MobchaseTrain1-v0",
            params=dict(client_pool=client_pool,
                        agent_names=["MarLo-Agent-0", "MarLo-Agent-1"],
                        videoResolution=[64, 64],
                        kill_clients_after_num_rounds=500,
                        forceWorldReset=False,
                        max_retries=500,
                        retry_sleep=0.1,
                        step_sleep=0.1,
                        prioritise_offscreen_rendering=False,
                        suppress_info=False))
        assert len(join_tokens) == 2
        # Create agent trainers
        # obs_shape_n = [(64,64,3,),(64,64,3,)]
        observation_space = [
            gym.spaces.Box(low=-np.inf, high=+np.inf, shape=(6,), dtype=np.float32),
            gym.spaces.Box(low=-np.inf, high=+np.inf, shape=(6,), dtype=np.float32)
        ]
        obs_shape_n = [observation_space[i].shape for i in range(2)]
        action_space = [gym.spaces.Discrete(4), gym.spaces.Discrete(4)]
        num_adversaries = 0
        trainers = get_trainers(num_adversaries, obs_shape_n, action_space, arglist)
        # Initialize
        U.initialize()
        epis_trans = 0
        epsilon = 0.0
        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.restore:
            print('Loading previous state...')
            resbuf = pickle.load(open("./saves/losbuffers.p", "rb"))
            epis_trans = resbuf[2]
            epsilon = resbuf[3]
            U.load_state(arglist.load_dir + str(epis_trans))
            trainers[0].replay_buffer = resbuf[0]
            trainers[1].replay_buffer = resbuf[1]
        episode_rewards = []
        agent_rewards = [[] for _ in range(2)]  # per-episode reward sums for each agent
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        saver = tf.train.Saver()
        t_start = time.time()
        # inicial0 = [1.5, 2.5, 270, 5.5, 6.5, 180]
        # inicial1 = [5.5, 6.5, 180, 1.5, 2.5, 270]
        inicial0 = [1.5, 2.5, 270, 3.5, 4.5, 180]
        inicial1 = [3.5, 4.5, 180, 1.5, 2.5, 270]
        while True:
            # NEW
            last_obs = []
            agent_rewards[0].append(0)
            agent_rewards[1].append(0)
            env0 = marlo.init(join_tokens[0])
            env1 = marlo.init(join_tokens[1])
            # Run agent-0
            agent_thread_0, res0 = reiniciar(env0)
            # Run agent-1
            agent_thread_1, res1 = reiniciar(env1)
            obs0 = res0.get()
            obs1 = res1.get()
            obs0 = inicial0
            obs1 = inicial1
            done0 = False
            done1 = False
            num_eps = 0
            # Run a block of episodes
            while True:
                if random() > epsilon:
                    # get the action chosen by the policy
                    action0 = trainers[0].action(np.array(obs0))
                else:
                    action0 = np.random.dirichlet(np.ones(4), size=1)[0]
                if random() > epsilon:
                    # get the action chosen by the policy
                    action1 = trainers[1].action(np.array(obs1))
                else:
                    action1 = np.random.dirichlet(np.ones(4), size=1)[0]
                # Run agent-0
                agent_thread_0, resul0 = funcion(env0, action0, 0)
                # Run agent-1
                agent_thread_1, resul1 = funcion(env1, action1, 1)
                # Wait for both threads to complete execution
                agent_thread_0.join()
                agent_thread_1.join()
                nob0, r0, done0, i0 = resul0.get()
                nob1, r1, done1, i1 = resul1.get()
                last_obs = [copy.deepcopy(nob0), copy.deepcopy(nob1)]  # the new observations
                varhelp = copy.deepcopy(nob0)
                nob0.extend(nob1)
                nob1.extend(varhelp)
                trainers[0].experience(np.array(obs0), action0, r0, np.array(nob0), done0, False)
                trainers[1].experience(np.array(obs1), action1, r1, np.array(nob1), done1, False)
                agent_rewards[0][-1] += r0
                agent_rewards[1][-1] += r1
                obs0 = nob0
                obs1 = nob1
                if done0 or done1:
                    print("EPISODE NUMBER:", num_eps)
                    # Run agent-0
                    agent_thread_0, res0 = reiniciar(env0)
                    # Run agent-1
                    agent_thread_1, res1 = reiniciar(env1)
                    obs0 = res0.get()
                    obs1 = res1.get()
                    obs0 = inicial0
                    obs1 = inicial1
                    done0 = False
                    done1 = False
                    num_eps += 1
                    loss = None
                    for agent in trainers:
                        agent.preupdate()
                    for agent in trainers:
                        loss = agent.update(trainers)
                    print("LOSS", loss)
                    if num_eps % epi_per_iter == 0:
                        break
                    agent_rewards[0].append(0)
                    agent_rewards[1].append(0)
            # End of the episode block
            print("END OF SAMPLE")
            # Pair both agents' rewards by episode using the episodes generated in
            # this iteration, sum each pair, and append the results to
            # episode_rewards. In short: the agents' latest per-episode rewards
            # are summed and added to the list.
            episode_rewards.extend(
                list(map(sumtuple,
                         list(zip(agent_rewards[0][epis_trans:],
                                  agent_rewards[1][epis_trans:])))))
            epis_trans += 10
            if epsilon > 0.1:
                epsilon -= 0.002
            print("TOTAL EPISODES ELAPSED: ", epis_trans, " Epsilon: ", epsilon)
            # update all trainers, if not in display or benchmark mode
            # save model, display training output
            if epis_trans % arglist.save_rate == 0:
                U.save_state(arglist.save_dir + str(epis_trans), saver=saver)
                losbuffers = [trainers[0].replay_buffer, trainers[1].replay_buffer,
                              epis_trans, epsilon]
                pickle.dump(losbuffers,
                            open("./saves/losbuffers" + str(epis_trans) + ".p", "wb"))
                pickle.dump(losbuffers, open("./saves/losbuffers.p", "wb"))
            if epis_trans % 1000 == 0:
                break
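# The episode-reward bookkeeping above maps a sumtuple() helper (not defined in
# this file) over zipped per-agent reward lists. Given how it is used, the
# helper presumably just sums one (reward_agent0, reward_agent1) pair:
def sumtuple(t):
    """Sum a tuple of per-agent episode rewards into a joint episode reward."""
    return sum(t)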
def train(arglist):
    """
    Run the MADDPG algorithm using the passed-in command-line arguments.

    Args:
        arglist (argparse.Namespace): Parsed command-line arguments object
    """
    tf.reset_default_graph()
    if arglist.seed is not None:
        np.random.seed(arglist.seed)
        tf.set_random_seed(arglist.seed)
    with tf_util.make_session(config=None, num_cpu=1, make_default=False, graph=None):
        # with tf_util.single_threaded_session():
        ###########################################
        #           Create environment            #
        ###########################################
        env = make_env(arglist.scenario, arglist=arglist, done=arglist.done_callback,
                       logging=arglist.logging, benchmark=arglist.benchmark)
        ###########################################
        #         Create agent trainers           #
        ###########################################
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print("Number of Adversaries: {}".format(num_adversaries))
        print('Experiment: {}. Using good policy {} and adv policy {}'.format(
            arglist.exp_name, arglist.good_policy, arglist.adv_policy))
        ###########################################
        #               Initialize                #
        ###########################################
        tf_util.initialize()
        ###########################################
        #   Load previous results, if necessary   #
        ###########################################
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        # if arglist.display or arglist.restore or arglist.benchmark or arglist.load_dir is not None:
        if arglist.restore or arglist.benchmark or arglist.load_dir is not None:
            print('Loading previous state...')
            # Set model file
            if arglist.model_file == "":
                arglist.model_file = arglist.exp_name
            print("Model File: " + arglist.load_dir + arglist.model_file)
            tf_util.load_state(arglist.load_dir + arglist.model_file)
        ###########################################
        #       Create the save directory         #
        ###########################################
        if not os.path.exists(arglist.save_dir):
            os.makedirs(arglist.save_dir, exist_ok=True)
        if not os.path.exists(arglist.plots_dir):
            os.makedirs(arglist.plots_dir, exist_ok=True)
        ###########################################
        #            Set parameters               #
        ###########################################
        # Sum of rewards for all agents
        episode_rewards = [0.0]
        # This was changed so that a reward can be tracked for fixed-policy agents
        # as well as learning agents
        # Individual agent reward
        # agent_rewards = [[0.0] for _ in range(env.n)]
        agent_rewards = [[0.0] for _ in range(len(env.world.agents))]
        # Retrieve previous episode count
        try:
            prev_ep_ct = int(arglist.model_file.split("_")[-1])
        except ValueError:
            print("Starting from untrained network...")
            prev_ep_ct = 0
        ep_ct = prev_ep_ct + arglist.num_episodes
        # Sum of rewards for training curve
        final_ep_rewards = []
        # Agent rewards for training curve
        final_ep_ag_rewards = []
        # Placeholder for benchmarking info
        agent_info = [[[]]]
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()
        progress = False
        # Save more often if you have fewer episodes
        arglist.save_rate = min(arglist.save_rate, arglist.num_episodes)
        # Initialize loss file for each agent
        if arglist.log_loss:
            for i in range(len(env.world.agents)):
                log_loss(arglist, ep_ct, "agent_{}".format(i), initialize=True)
        ###########################################
        #                 Start                   #
        ###########################################
        print('Starting iterations...')
        while True:
            # TODO: Switch to isinstance()
            # if type(env.world.scripted_agents[0].action) == type(None):
            #     print("Error")
            # Get action
            action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]
            # Environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            # Logging step
            if arglist.logging:
                env.log(len(episode_rewards) + prev_ep_ct, episode_step,
                        new_obs_n, rew_n, done_n, info_n)
            # Update information
            episode_step += 1
            # Check if all agents are done
            # done = all(done_n)
            # Check if any agents are done
            done = any(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # Collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                 done_n[i], terminal)
            obs_n = new_obs_n
            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew
            # For displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                if done or terminal:
                    print('Episode Reward: {}'.format([rew[-1] for rew in agent_rewards]))
                    time.sleep(0.5)
                    obs_n = env.reset()
                    episode_step = 0
                    episode_rewards.append(0)
                    for a in agent_rewards:
                        a.append(0)
                    agent_info.append([[]])
                continue
            if done or terminal:
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])
            # Increment global step counter
            train_step += 1
            # For benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue
            # In testing mode, don't perform model updates
            if arglist.testing:
                if len(episode_rewards) > arglist.num_episodes:
                    print("episodes: {}, mean episode reward: {}, time: {}".format(
                        len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        round(time.time() - t_start, 3)))
                    env.logger.save("State", arglist.save_dir,
                                    filename=arglist.exp_name + '_state' + '_' +
                                    str(prev_ep_ct) + arglist.log_append)
                    break
                continue
            # Update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for i, agent in enumerate(trainers):
                loss = agent.update(trainers, train_step)
                if arglist.log_loss and loss is not None:
                    log_loss(arglist, ep_ct, "agent_{}".format(i), loss=loss[1])
            if len(episode_rewards) % 100 == 0 and progress:
                print("Episode {} Reached. Time: {}".format(
                    len(episode_rewards), time.time() - t_start))
                progress = False
            elif len(episode_rewards) % 100 != 0 and not progress:
                progress = True
            # Save model, display training output
            if (terminal or done) and (len(episode_rewards) % arglist.save_rate == 0):
                # TODO: Implement some checks so that we don't overwrite old
                # networks unintentionally?
                # Save model state
                tf_util.save_state(arglist.save_dir + arglist.exp_name + '_' +
                                   str(len(episode_rewards) + prev_ep_ct),
                                   saver=saver)
                # Print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards) + prev_ep_ct,
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        round(time.time() - t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, "
                          "agent episode reward: {}, time: {}".format(
                              train_step, len(episode_rewards) + prev_ep_ct,
                              np.mean(episode_rewards[-arglist.save_rate:]),
                              [np.mean(reward[-arglist.save_rate:]) for reward in agent_rewards],
                              round(time.time() - t_start, 3)))
                # Reset start time to current time
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for reward in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(reward[-arglist.save_rate:]))
            # Saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                # Log agent data for run
                env.logger.save("State", arglist.save_dir,
                                filename=arglist.exp_name + '_state' + '_' +
                                str(len(episode_rewards) + prev_ep_ct))
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                break
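# Loss logging above goes through a log_loss() helper that is not defined in
# this file. A hypothetical sketch consistent with the call sites (calling
# with initialize=True starts a fresh file; later calls append one loss value
# per update); the real file layout and columns are assumptions.
import os

def log_loss(arglist, ep_ct, agent_name, loss=None, initialize=False):
    """Append per-agent loss values to a plain-text training log."""
    os.makedirs(arglist.plots_dir, exist_ok=True)
    path = os.path.join(arglist.plots_dir,
                        '{}_{}_{}_loss.txt'.format(arglist.exp_name, ep_ct, agent_name))
    mode = 'w' if initialize else 'a'
    with open(path, mode) as f:
        if not initialize:
            f.write('{}\n'.format(loss))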
def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = StarCraft2Env(map_name=arglist.scenario, reward_only_positive=False,
                            obs_last_action=True, obs_timestep_number=True,
                            reward_scale_rate=200)
        # Create agent trainers
        env_info = env.get_env_info()
        num_agents = env_info["n_agents"]
        num_adversaries = num_agents
        obs_shape_n = [(env_info["obs_shape"],) for i in range(num_adversaries)]
        action_space_n = [env_info["n_actions"] for i in range(num_adversaries)]
        buffer_size = arglist.buffer_size
        trainers = get_trainers(num_adversaries, obs_shape_n, action_space_n,
                                arglist, buffer_size)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))
        # Initialize
        U.initialize()
        logdir = "./tensorboard/"
        Logger.DEFAULT \
            = Logger.CURRENT \
            = Logger(dir=None, output_formats=[TensorBoardOutputFormat(logdir)])
        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)
        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(num_agents)]  # individual agent reward
        saver = tf.train.Saver(max_to_keep=100000000)
        n_actions_no_attack = 6
        env.reset()
        obs_n = []
        reward_hl_own_old = []
        reward_hl_en_old = []
        # this first loop gathers the initial state/observation/health information
        for agent_id in range(num_agents):
            obs = env.get_obs_agent(agent_id)
            obs_n.append(obs)
            reward_hl_own_old.append(env.get_agent_health(agent_id))
            reward_hl_en_old.append(env.get_enemy_health(agent_id))
        episode_step = 0
        step = 0
        print('Starting iterations...')
        while True:
            # get action
            action_set_actual = []
            action_set_execute = []
            action_n = []
            dead_unit = []
            for agent_id in range(num_agents):
                action_output = trainers[agent_id].action(obs_n[agent_id])
                action_n.append(action_output)
                action_prob = action_output
                action_to_choose = np.argmax(action_prob)
                action_set_actual.append(action_to_choose)
                avail_actions = env.get_avail_agent_actions(agent_id)
                avail_actions_ind = np.nonzero(avail_actions)[0]
                if action_to_choose in avail_actions_ind:
                    action_set_execute.append(action_to_choose)
                elif avail_actions[0] == 1:
                    # the chosen action is infeasible and the agent is dead,
                    # so substitute NO_OP for the current action
                    action_set_execute.append(0)
                else:
                    # the chosen action is infeasible, so substitute STOP
                    action_set_execute.append(1)
                # check whether this agent has died
                if len(avail_actions_ind) == 1 and avail_actions_ind[0] == 0:
                    dead_unit.append(agent_id)
            rew_base, done, _ = env.step(action_set_execute)
            episode_rewards[-1] += rew_base
            new_obs_n = []
            reward_hl_own_new = []
            reward_hl_en_new = []
            rew_n = []
            for agent_id in range(num_agents):
                obs_next = env.get_obs_agent(agent_id=agent_id)
                new_obs_n.append(obs_next)
                reward_hl_own_new.append(env.get_agent_health(agent_id))
                reward_hl_en_new.append(env.get_enemy_health(agent_id))
            for agent_id in range(num_agents):
                if agent_id in dead_unit:
                    reward = 0
                elif action_set_execute[agent_id] != action_set_actual[agent_id]:
                    # the chosen action could not be executed, so a substitute ran
                    # instead; keep the chosen action but give a negative reward
                    reward = -2
                elif action_set_execute[agent_id] > 5:
                    target_id = action_set_execute[agent_id] - n_actions_no_attack
                    health_reduce_en = reward_hl_en_old[target_id] - reward_hl_en_new[target_id]
                    if health_reduce_en > 0:
                        if rew_base > 0:
                            reward = 2 + rew_base
                        else:
                            reward = 2
                    else:
                        reward = 1
                else:
                    reward = (reward_hl_own_new[agent_id] - reward_hl_own_old[agent_id]) * 5
                rew_n.append(reward)
            episode_step += 1
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done)
            obs_n = new_obs_n
            reward_hl_own_old = reward_hl_own_new
            reward_hl_en_old = reward_hl_en_new
            for i, rew in enumerate(rew_n):
                agent_rewards[i][-1] += rew
            if done:
                print("steps until now : %s, episode: %s, episode reward: %s" %
                      (step, len(episode_rewards), episode_rewards[-1]))
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular("episode reward", episode_rewards[-1])
                for i in range(num_agents):
                    logger.record_tabular("agent" + str(i) + " episode reward",
                                          agent_rewards[i][-1])
                logger.dump_tabular()
                env.reset()
                obs_n = []
                reward_hl_own_old = []
                reward_hl_en_old = []
                # gather the initial state/observation/health information again
                for agent_id in range(num_agents):
                    obs = env.get_obs_agent(agent_id)
                    obs_n.append(obs)
                    reward_hl_own_old.append(env.get_agent_health(agent_id))
                    reward_hl_en_old.append(env.get_enemy_health(agent_id))
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
            # increment global step counter
            step += 1
            if step == arglist.buffer_size:
                print("Training starts.")
            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, step)
            # save model, display training output
            if done and (len(episode_rewards) % arglist.save_rate == 0):
                save_dir = arglist.save_dir + "/model_" + str(step) + "steps/" + arglist.exp_name
                U.save_state(save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}".format(
                        step, len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:])))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, "
                          "agent episode reward: {}".format(
                              step, len(episode_rewards),
                              np.mean(episode_rewards[-arglist.save_rate:]),
                              [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards]))
            # stop once the requested number of episodes has been run
            if len(episode_rewards) > arglist.num_episodes:
                print('...Finished total of {} episodes.'.format(len(episode_rewards) - 1))
                break
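# The per-agent shaped reward above is easier to audit as a pure function.
# This restates the branch logic from the loop: dead units get 0, infeasible
# actions -2, attacks that reduced enemy health 2 (plus any positive base
# reward), failed attacks 1, and everything else 5x the agent's own health
# change. The function and its name are an editorial refactoring, not part of
# the original script.
def shaped_reward(agent_id, actual, executed, rew_base, dead_unit,
                  hl_en_old, hl_en_new, hl_own_old, hl_own_new,
                  n_actions_no_attack=6):
    """Per-agent shaped reward, mirroring the inline logic above."""
    if agent_id in dead_unit:
        return 0
    if executed[agent_id] != actual[agent_id]:
        return -2  # chosen action was infeasible; a substitute ran instead
    if executed[agent_id] > n_actions_no_attack - 1:  # attack actions
        target_id = executed[agent_id] - n_actions_no_attack
        if hl_en_old[target_id] - hl_en_new[target_id] > 0:
            return 2 + rew_base if rew_base > 0 else 2
        return 1
    return (hl_own_new[agent_id] - hl_own_old[agent_id]) * 5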
def train(arglist):
    with U.make_session(8):
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [[29] for i in range(env.n)]
        obs_map_shape_n = [[56 * 86] for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, obs_map_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))
        # Initialize
        U.initialize()
        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)
        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env._reset()
        episode_step = 13000
        train_step = 0
        t_start = time.time()
        print('Starting iterations...')
        while True:
            # get action, occasionally swapping the argmax slot for exploration
            # action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]
            action_n = []
            for agent, obs in zip(trainers, obs_n):
                t = agent.action(obs)
                d = np.argmax(t)
                if d % 5 == 4:
                    rt = random.randint(0, 20)
                    if rt < 4:
                        swap = t[d]
                        t[d] = t[d - rt - 1]
                        t[d - rt - 1] = swap
                else:
                    rt = random.randint(0, 80)
                    if rt < 4:
                        swap = t[d]
                        t[d] = t[d // 5 * 5 + rt]
                        t[d // 5 * 5 + rt] = swap
                action_n.append(t)
            # environment step
            new_obs_n, rew_n, done_n, info_n = env._step(action_n)
            episode_step += 1
            env.training_episode = episode_step
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                 done_n[i], terminal)
            obs_n = new_obs_n
            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew
            if done or terminal:
                obs_n = env._reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])
            # increment global step counter
            train_step += 1
            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue
            # for displaying learned policies
            if arglist.online_display or arglist.display:
                time.sleep(0.01)
                # if rew_n[2] > 0: pdb.set_trace()
                env._render(close=False)
                print(rew_n)
                # if (rew_n[2] > 0) or (rew_n[0] > 0) or (rew_n[1] > 0):
                #     pdb.set_trace()
                if arglist.display:
                    continue
            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)
            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        round(time.time() - t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, "
                          "agent episode reward: {}, time: {}".format(
                              train_step, len(episode_rewards),
                              np.mean(episode_rewards[-arglist.save_rate:]),
                              [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards],
                              round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))
            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                break
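# The exploration trick above occasionally swaps the argmax slot of the policy
# output with another slot in the same 5-action lane. Isolated as a helper for
# clarity; the swap logic matches the inline version above, under the
# assumption that actions come in groups of five.
import random
import numpy as np

def mutate_action(t):
    """Occasionally swap the argmax entry of action vector t with another slot."""
    d = int(np.argmax(t))
    if d % 5 == 4:  # every fifth slot gets its own, rarer swap rule
        rt = random.randint(0, 20)
        if rt < 4:
            t[d], t[d - rt - 1] = t[d - rt - 1], t[d]
    else:
        rt = random.randint(0, 80)
        if rt < 4:
            j = d // 5 * 5 + rt
            t[d], t[j] = t[j], t[d]
    return t

# Usage in the action loop: action_n.append(mutate_action(agent.action(obs)))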
def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))
        print("number of adversaries is: ", num_adversaries)
        # Initialize
        U.initialize()
        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            print("path is: ", arglist.load_dir)
            print("restoring checkpoints")
            # Added for selective training.
            # TODO: generalize this to other environments as well.
            if arglist.scenario == "simple_tag":
                print("inside simple tag")
                if not arglist.train_adversaries:
                    # adversaries occupy scopes agent_0 .. agent_{num_adversaries-1},
                    # so the first good agent lives in scope agent_{num_adversaries}
                    print("loading only the good (positive-reward) agent")
                    print("number of adversaries are: ", num_adversaries)
                    saver = tf.train.Saver(var_list=tf.get_collection(
                        tf.GraphKeys.GLOBAL_VARIABLES,
                        scope="agent_" + str(num_adversaries)))
                    print("var list is: ",
                          tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                            scope="agent_" + str(num_adversaries)))
                if not arglist.train_positive_agent:
                    print("only loading adversaries")
                    var_list = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                                 scope="agent_0")
                    print("var list is: ", var_list)
                    for l in range(1, arglist.num_adversaries):
                        var_list += tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                                      scope="agent_" + str(l))
                    saver = tf.train.Saver(var_list=var_list)
                U.load_state(arglist.load_dir, saver=saver)
            else:
                U.load_state(arglist.load_dir)
        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        if arglist.restore:
            final_ep_rewards = list(np.load(arglist.plots_dir + arglist.exp_name +
                                            '_episode_rewards.npy'))
            final_ep_ag_rewards = list(np.load(arglist.plots_dir + arglist.exp_name +
                                               '_agent_rewards.npy'))
            final_ep_ag_rewards = [list(a) for a in final_ep_ag_rewards]
        else:
            final_ep_rewards = []  # sum of rewards for training curve
            # final_ep_ag_rewards = []  # agent rewards for training curve
            final_ep_ag_rewards = [[0.0] for _ in range(env.n)]  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()
        print("number of agents in the environment are: ", env.n)
        episode_avg_rewards = [0.0]
        agent_avg_rewards = [[0.0] for _ in range(env.n)]
        print('Starting iterations...')
        while True:
            # get action
            action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i],
                                 done_n[i], terminal)
            obs_n = new_obs_n
            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew
            if done or terminal:
                obs_n = env.reset()
                episode_step = 0
                # this should perhaps be done later
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])
            # increment global step counter
            train_step += 1
            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue
            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue
            # update only the selected trainers, if not in display or benchmark
            # mode; indices below num_adversaries are adversaries, the rest are
            # good (positive-reward) agents
            loss = None
            # for agent in trainers:
            #     agent.preupdate()
            # for agent in trainers:
            #     loss = agent.update(trainers, train_step)
            for m in range(0, len(trainers)):
                agent = trainers[m]
                if not arglist.train_adversaries and m >= num_adversaries:
                    # updating good agents only
                    agent.preupdate()
                if not arglist.train_positive_agent and m < num_adversaries:
                    # updating adversaries only
                    agent.preupdate()
                if arglist.train_positive_agent and arglist.train_adversaries:
                    # updating both
                    agent.preupdate()
            for m in range(0, len(trainers)):
                agent = trainers[m]
                if not arglist.train_adversaries and m >= num_adversaries:
                    loss = agent.update(trainers, train_step)
                if not arglist.train_positive_agent and m < num_adversaries:
                    loss = agent.update(trainers, train_step)
                if arglist.train_positive_agent and arglist.train_adversaries:
                    loss = agent.update(trainers, train_step)
            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        round(time.time() - t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, "
                          "agent episode reward: {}, time: {}".format(
                              train_step, len(episode_rewards),
                              np.mean(episode_rewards[-arglist.save_rate:]),
                              [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards],
                              round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:-1]))
                # for rew in agent_rewards:
                #     final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))
                for j in range(len(agent_rewards)):
                    rew = agent_rewards[j]
                    final_ep_ag_rewards[j].append(np.mean(rew[-arglist.save_rate:-1]))
            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                agent_rewards = np.array(final_ep_ag_rewards)
                episode_rewards = np.array(final_ep_rewards)
                np.save(arglist.plots_dir + arglist.exp_name + '_agent_rewards.npy',
                        agent_rewards)
                np.save(arglist.plots_dir + arglist.exp_name + '_episode_rewards.npy',
                        episode_rewards)
                fig, ax = plt.subplots()
                for k in range(len(agent_rewards)):
                    ax.plot(agent_rewards[k], label="agent_" + str(k))
                ax.plot(episode_rewards, label="total")
                ax.legend()
                plt.savefig(arglist.plots_dir + arglist.exp_name + '_plot.png')
                plt.show()
                break
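# The selective restore above builds a Saver over per-agent variable scopes by
# string concatenation. The same idea as a reusable helper; it assumes the
# variable scopes are named agent_0 .. agent_{n-1}, as in the loading code
# above.
import tensorflow as tf

def make_scoped_saver(agent_indices):
    """Return a Saver restricted to the given agents' variable scopes."""
    var_list = []
    for idx in agent_indices:
        var_list += tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                      scope='agent_' + str(idx))
    return tf.train.Saver(var_list=var_list)

# e.g. restore only the adversaries:
#   saver = make_scoped_saver(range(num_adversaries))
#   U.load_state(arglist.load_dir, saver=saver)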
def on_event(self, f):

    @inlineCallbacks
    def set_wheel(self, robot_wheels):
        yield self.call(u'aiwc.set_speed', args.key, robot_wheels)
        return

    # initiate empty frame
    received_frame = Frame()

    if 'time' in f:
        received_frame.time = f['time']
    if 'score' in f:
        received_frame.score = f['score']
    if 'reset_reason' in f:
        received_frame.reset_reason = f['reset_reason']
    if 'coordinates' in f:
        received_frame.coordinates = f['coordinates']
    if 'EOF' in f:
        self.end_of_frame = f['EOF']

    if self.end_of_frame:
        # Robot and ball coordinates are read as, e.g. (ROBOT_ID can be 0..4):
        #   received_frame.coordinates[MY_TEAM][ROBOT_ID][X]
        # and likewise [Y], [TH], [ACTIVE], [TOUCH], with the same fields for
        # OP_TEAM, and [X]/[Y] for BALL.
        self.get_coord(received_frame)

        # Next state, reward, reset
        new_obs_n = []
        rew_n = []
        done_n = []
        for i in range(self.number_of_robots):
            next_state = self.pre_processing(i)
            # stack position and velocity (difference to the previous state)
            new_obs_n.append(np.append(next_state,
                                       next_state - self.obs_n[i][:-self.state_dim]))
            rew_n.append(self.get_reward(received_frame.reset_reason, i))
            done_n.append(received_frame.reset_reason != NONE)
        done = all(done_n)
        if done:
            self.printConsole("reset reason: " + str(received_frame.reset_reason))

        # store experience for every active robot in the shared trainer
        for i in range(self.number_of_robots):
            if not self.cur_my_posture[i][ACTIVE]:
                self.printConsole('robot ' + str(i) + ' is not active')
                continue
            self.trainers[0].experience(self.obs_n[i], self.action_n[i], rew_n[i],
                                        new_obs_n[i], done_n[i], False)
        self.obs_n = new_obs_n
        self.reward_sum += rew_n

        # increment global step counter
        self.train_step += 1

        # update all trainers
        loss = None
        for agent in self.trainers:
            agent.preupdate()
        for agent in self.trainers:
            loss = agent.update(self.trainers, self.train_step)

        # get action: every robot queries the single shared policy
        self.action_n = [self.trainers[0].action(obs) for obs in self.obs_n]
        for i in range(self.number_of_robots):
            self.wheels[2 * i] = self.max_linear_velocity * (
                self.action_n[i][1] - self.action_n[i][2]
                + self.action_n[i][3] - self.action_n[i][4])
            self.wheels[2 * i + 1] = self.max_linear_velocity * (
                self.action_n[i][1] - self.action_n[i][2]
                - self.action_n[i][3] + self.action_n[i][4])
        self.printConsole('step: ' + str(self.train_step))
        self.pre_ball = self.cur_ball
        set_wheel(self, self.wheels.tolist())

        # checkpoint periodically
        if (self.train_step % self.save_every_steps) == 0:
            U.save_state(self.arglist.save_dir, saver=self.saver)

        # plot the statistics every stats_steps (about 5 minutes)
        if (self.train_step % self.stats_steps) == 0:
            self.printConsole("add data to tensorboard")
            stats = [sum(self.reward_sum)] \
                + [self.reward_sum[i] for i in range(len(self.reward_sum))] \
                + [self.score_sum]
            for i in range(len(stats)):
                U.get_session().run(self.update_ops[i],
                                    feed_dict={self.summary_placeholders[i]: float(stats[i])})
            summary_str = U.get_session().run(self.summary_op)
            self.summary_writer.add_summary(summary_str, self.inner_step)
            self.reward_sum = np.zeros(len(self.reward_sum))
            self.score_sum = 0
            self.inner_step += 1

        if received_frame.reset_reason == GAME_END:
            # (virtual finish() in random_walk.cpp) save your data
            with open(args.datapath + '/result.txt', 'w') as output:
                # output.write('yourvariables')
                output.close()
            # unsubscribe; reset or leave
            yield self.sub.unsubscribe()
            try:
                yield self.leave()
            except Exception as e:
                self.printConsole("Error: {}".format(e))

        self.end_of_frame = False
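The five-term action-to-wheel mapping above is easy to misread, so here is a minimal, self-contained sketch of the same arithmetic. The helper name and the velocity constant are illustrative, not from the original agent; it assumes the 5-component action layout used above (component 0 unused, 1/2 forward/backward, 3/4 turning).

import numpy as np

MAX_LINEAR_VELOCITY = 1.8  # assumed value, for illustration only

def action_to_wheels(action):
    """Map one robot's 5-component action to (left, right) wheel speeds."""
    forward = action[1] - action[2]  # net forward/backward drive
    turn = action[3] - action[4]     # net turning component
    left = MAX_LINEAR_VELOCITY * (forward + turn)
    right = MAX_LINEAR_VELOCITY * (forward - turn)
    return left, right

# Example: a pure "component 1" action drives both wheels forward equally.
print(action_to_wheels(np.array([0.0, 1.0, 0.0, 0.0, 0.0])))  # (1.8, 1.8)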
def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)

        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        state_shape_n = [(64,) for i in range(env.n)]
        trainers = get_trainers(env, num_adversaries, obs_shape_n, state_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        episode_begin_num = 0

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)
            fname = './learning_curves/' + arglist.exp_name + '_rewards.pkl'
            final_ep_rewards = pickle.load(open(fname, 'rb'))
            fname = './learning_curves/' + arglist.exp_name + '_agrewards.pkl'
            final_ep_ag_rewards = pickle.load(open(fname, 'rb'))
            # the loaded curves are only used to recover the episode count
            # (one curve point is appended every save_rate episodes);
            # fresh curves are collected below
            episode_begin_num = arglist.save_rate * len(final_ep_rewards)

        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        state_n = [agent.p_init_state(1) for agent in trainers]
        pred_n = [agent.init_pred(1) for agent in trainers]
        episode_step = 0
        train_step = 0
        t_start = time.time()

        print('Starting iterations...')
        while True:
            # get action, next recurrent state, and GRU output for each agent
            temp = [agent.take_action(obs, state, pred)
                    for agent, obs, state, pred in zip(trainers, obs_n, state_n, pred_n)]
            action_n = [x[0] for x in temp]
            new_state_n = [x[1] for x in temp]
            gru_out_n = [x[2] for x in temp]
            new_pred_n = [agent.predict(act[None], gru_out)
                          for agent, act, gru_out in zip(trainers, action_n, gru_out_n)]

            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)

            # collect experience (needs to be modified for recurrent replay)
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i], terminal)
            obs_n = new_obs_n
            state_n = new_state_n
            # pred_n = [x.eval() for x in new_pred_n]
            pred_n = new_pred_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                obs_n = env.reset()
                state_n = [agent.p_init_state(1) for agent in trainers]
                pred_n = [agent.init_pred(1) for agent in trainers]
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.05)
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step, arglist.step_size, arglist.burn_in_step)

            # save model, display training output
            episode_num = len(episode_rewards) + episode_begin_num
            if terminal and (episode_num % arglist.save_rate == 0):
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, episode_num,
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        round(time.time() - t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                        train_step, episode_num,
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards],
                        round(time.time() - t_start, 3)))
                t_start = time.time()

                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(episode_num))
                U.save_state(arglist.save_dir, saver=saver)

            if episode_num > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(episode_num))
                break
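The resume logic above reconstructs the episode counter purely from the length of the saved reward curve. Below is a hedged, standalone sketch of that bookkeeping; the helper name and the missing-file fallback are ours, while the file-name convention and `save_rate` relationship are taken from the code above.

import pickle

def resume_episode_count(exp_name, save_rate, curve_dir='./learning_curves/'):
    """Return how many episodes the saved run already covers (0 if no curve).

    One curve point is appended every `save_rate` episodes, so the episode
    count is save_rate * len(curve).
    """
    try:
        with open(curve_dir + exp_name + '_rewards.pkl', 'rb') as fp:
            final_ep_rewards = pickle.load(fp)
    except FileNotFoundError:
        return 0
    return save_rate * len(final_ep_rewards)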
def train(arglist, PID=None, lock=None):
    start_time = time.time()
    with U.single_threaded_session() as sess:
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)

        # Create agents networks
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        # changed by yuan li
        num_adversaries = copy.deepcopy(env.num_adversaries)
        arglist.num_adversaries = copy.deepcopy(num_adversaries)

        if comm_rank != 0 and comm_rank != 1:
            # actor ranks: run environments, ship transitions to the buffer
            # rank (1), and poll the learner rank (0) for fresh parameters
            req = None
            wait_flag = False
            actors = get_agents(env, num_adversaries, obs_shape_n, arglist)
            U.initialize()

            # load the model
            var_list_n = []
            for actor in actors:
                var_list_n.extend(actor.get_variable_list())
            saver = tf.train.Saver(var_list=var_list_n, max_to_keep=20)
            if arglist.load_dir != "":
                U.load_state(arglist.load_dir, saver)

            episode_rewards, agent_rewards, final_ep_rewards, final_ep_ag_rewards, agent_info = \
                initialize_variables(env)
            obs_n = env.reset()
            step = 0
            episode_step = 0
            sample_number = 0
            t_start = time.time()
            update_time = 0
            print('Starting iterations...')
            invalid_train, red_win, red_leave, green_win, green_leave = 0, 0, 0, 0, 0
            while True:
                if not wait_flag:
                    req = comm.irecv(350000, source=0, tag=11)
                    wait_flag = True
                else:
                    data_recv = req.test()
                    if data_recv[0]:
                        wait_flag = False
                        if data_recv[1] == 'finish':
                            comm.send('finish', dest=1, tag=11)
                            break
                        else:
                            # load the policy variables broadcast by the learner
                            # (only the slots 12..23 of every 24-variable block)
                            update_start = time.time()
                            i = 0
                            j = 0
                            for var in tf.trainable_variables():
                                if 11 < (i % 24) < 24:
                                    var.load(data_recv[1][j], sess)
                                    j += 1
                                i += 1
                            update_end = time.time()
                            update_time += (update_end - update_start)
                            step += 1
                    else:
                        wait_flag = True

                # get action
                action_n = [agent.action(obs) for agent, obs in zip(actors, obs_n)]
                # environment step
                new_obs_n, rew_n, done_n, info_n = env.step(action_n)
                episode_step += 1
                # changed by liyuan
                done = any(done_n)
                terminal = (episode_step >= arglist.max_episode_len)

                # liyuan: compute the average win rate
                if green_leave_screen(env) or adversary_all_die(env) or adversary_leave_screen(env):
                    terminal = True
                if adversary_all_die(env):
                    green_win += 1
                if green_leave_screen(env):
                    invalid_train += 1
                    green_leave += 1
                if adversary_leave_screen(env):
                    red_leave += 1

                # reward shaping for the adversaries
                if episode_step >= arglist.max_episode_len:
                    for i, agent in enumerate(env.agents):
                        if agent.adversary:
                            rew_n[i] -= 50
                if adversary_all_die(env):
                    for i, agent in enumerate(env.agents):
                        if agent.adversary:
                            rew_n[i] -= 100
                if done:
                    red_win = red_win + 1
                    for i, agent in enumerate(env.agents):
                        if agent.adversary:
                            rew_n[i] += 200
                            rew_n[i] += (arglist.max_episode_len - episode_step) / arglist.max_episode_len

                # send the transition to the replay-buffer rank
                data = [obs_n, action_n, rew_n, new_obs_n, done_n]
                comm.send(data, dest=1, tag=11)
                sample_number += 1

                obs_n = new_obs_n
                for i, rew in enumerate(rew_n):
                    episode_rewards[-1] += rew
                    agent_rewards[i][-1] += rew

                if done or terminal:
                    obs_n = env.reset()
                    episode_step = 0
                    episode_rewards.append(0)
                    for a in agent_rewards:
                        a.append(0)
                    agent_info.append([[]])

                # save model, display training output
                if (terminal or done) and (len(episode_rewards) % arglist.save_rate == 0):
                    if red_win >= 0.8 * arglist.save_rate:
                        temp_dir = arglist.save_dir + "_" + str(len(episode_rewards)) \
                            + "_" + str(red_win) + "_{}".format(PID)
                        U.save_state(temp_dir, saver=saver)
                    # print statement depends on whether or not there are adversaries
                    if num_adversaries == 0:
                        print("Rank {}, steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                            comm_rank, sample_number, len(episode_rewards),
                            np.mean(episode_rewards[-arglist.save_rate:]),
                            round(time.time() - t_start, 3)))
                    else:
                        print("Rank {}, steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                            comm_rank, sample_number, len(episode_rewards),
                            np.mean(episode_rewards[-arglist.save_rate:]),
                            [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards],
                            round(time.time() - t_start, 3)))
                        print("Rank {}, red win: {}, green win: {}, red all leave: {}, green all leave: {}".format(
                            comm_rank, red_win, green_win, red_leave, green_leave))
                        middle_time = time.time()
                        print("sample_number:{}, train_step:{}, update_time:{}, total_time:{}".format(
                            sample_number, step, update_time, middle_time - start_time))

                        mydata = []
                        mydata.append(str(len(episode_rewards)))
                        mydata.append(str(np.mean(episode_rewards[-arglist.save_rate:])))
                        mydata.append(str(np.mean(agent_rewards[0][-arglist.save_rate:])))
                        mydata.append(str(np.mean(agent_rewards[1][-arglist.save_rate:])))
                        mydata.append(str(np.mean(agent_rewards[2][-arglist.save_rate:])))
                        mydata.append(str(red_win))
                        mydata.append(str(round(time.time() - t_start, 3)))
                        out = open('1mydata_{}.csv'.format(comm_rank), 'a', newline='')
                        csv_write = csv.writer(out, dialect='excel')
                        csv_write.writerow(mydata)

                        if len(episode_rewards) > 3000:
                            U.save_state(arglist.save_dir, saver=saver)

                    invalid_train, red_win, red_leave, green_win, green_leave = 0, 0, 0, 0, 0
                    t_start = time.time()
                    # Keep track of final episode reward
                    final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                    for rew in agent_rewards:
                        final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

            end_time = time.time()
            print("rank{}_time:{}".format(comm_rank, end_time - start_time))
            print("rank{}_update_time:{}".format(comm_rank, update_time))
            print("rank{}_step:{}".format(comm_rank, step))

        if comm_rank == 1:
            # replay-buffer rank: gather transitions from the three actor ranks
            # and forward sampled batches to the learner (rank 0)
            replay_buffer = ReplayBuffer(1e6)
            wait_flag_1 = False
            wait_flag_2 = False
            wait_flag_3 = False
            req1 = None
            req2 = None
            req3 = None
            sample = 0
            step = 0
            while True:
                if not wait_flag_1 or not wait_flag_2 or not wait_flag_3:
                    if not wait_flag_1:
                        req1 = comm.irecv(source=2, tag=11)
                        wait_flag_1 = True
                    if not wait_flag_2:
                        req2 = comm.irecv(source=3, tag=11)
                        wait_flag_2 = True
                    if not wait_flag_3:
                        req3 = comm.irecv(source=4, tag=11)
                        wait_flag_3 = True
                else:
                    data_recv_1 = req1.test()
                    data_recv_2 = req2.test()
                    data_recv_3 = req3.test()
                    if data_recv_1[0] or data_recv_2[0] or data_recv_3[0]:
                        if data_recv_1[0]:
                            wait_flag_1 = False
                            if data_recv_1[1] == 'finish':
                                break
                            obs_n, action_n, rew_n, new_obs_n, done_n = data_recv_1[1]
                            replay_buffer.add(obs_n, action_n, rew_n, new_obs_n, done_n)
                            sample += 1
                        if data_recv_2[0]:
                            wait_flag_2 = False
                            if data_recv_2[1] == 'finish':
                                break
                            obs_n, action_n, rew_n, new_obs_n, done_n = data_recv_2[1]
                            replay_buffer.add(obs_n, action_n, rew_n, new_obs_n, done_n)
                            sample += 1
                        if data_recv_3[0]:
                            wait_flag_3 = False
                            if data_recv_3[1] == 'finish':
                                break
                            obs_n, action_n, rew_n, new_obs_n, done_n = data_recv_3[1]
                            replay_buffer.add(obs_n, action_n, rew_n, new_obs_n, done_n)
                            sample += 1
                    else:
                        wait_flag_1 = True
                        wait_flag_2 = True
                        wait_flag_3 = True

                    # (a disabled variant of this block also timed how long it
                    # took to receive 100 samples and then send one batch)
                    if (sample // 100 > 0) and \
                            len(replay_buffer) >= arglist.batch_size * arglist.max_episode_len:
                        replay_sample_index = replay_buffer.make_index(arglist.batch_size)
                        send_data = replay_buffer.sample_index(replay_sample_index)
                        # send_data = (obs_n_a, act_n_a, rew_n_a, obs_next_n_a, done_n_a)
                        comm.send(send_data, dest=0, tag=11)
                        sample = 0
                        step += 1

            end_time = time.time()
            print("rank1_time:", end_time - start_time)
            print("rank1_step", step)

        if comm_rank == 0:
            # learner rank: update the agents on batches from rank 1 and
            # broadcast fresh policy parameters to the actor ranks
            extract_time = 0
            step = 0
            learners = get_agents(env, num_adversaries, obs_shape_n, arglist)
            var_list_n = []
            for learner in learners:
                var_list_n.extend(learner.get_variable_list())
            U.initialize()
            # load the model
            saver = tf.train.Saver(var_list=var_list_n, max_to_keep=20)
            if arglist.load_dir != "":
                U.load_state(arglist.load_dir, saver)
            while True:
                if step >= STEP:
                    for i in range(comm_size - 2):
                        comm.send('finish', dest=(i + 2), tag=11)
                    break
                else:
                    start = time.time()
                    data_recv = comm.recv(source=1, tag=11)
                    for i, agent in enumerate(learners):
                        agent.update(learners, data_recv)

                    # extract the policy variables to broadcast to the actors
                    param = []
                    extract_start = time.time()
                    i = 0
                    for var in tf.trainable_variables():
                        if 11 < (i % 24) < 24:
                            param.append(sess.run(var))
                        i += 1
                    extract_end = time.time()
                    extract_time += (extract_end - extract_start)
                    for i in range(comm_size - 2):
                        comm.send(param, dest=(i + 2), tag=11)
                    step += 1
                    end = time.time()

            end_time = time.time()
            print("rank0_time:", end_time - start_time)
            print("rank0_extract_time:", extract_time)
            print("rank0_step:", step)
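For orientation, here is a hedged, single-round sketch of the three-role MPI layout the function above implements (learner / buffer / actors). All message contents are placeholders, and the real code overlaps these phases with non-blocking `irecv`/`test` polling rather than blocking calls; launch with something like `mpirun -n 5 python sketch.py`.

from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()  # needs at least 3 processes

if rank == 0:
    # learner: consume one batch from the buffer rank, then broadcast
    # (placeholder) parameters to every actor rank
    batch = comm.recv(source=1, tag=11)
    for dest in range(2, size):
        comm.send(['params-placeholder'], dest=dest, tag=11)
elif rank == 1:
    # buffer: take one transition from any actor, forward it as a "batch"
    transition = comm.recv(source=MPI.ANY_SOURCE, tag=11)
    comm.send([transition], dest=0, tag=11)
else:
    # actor: emit one placeholder transition, then wait for fresh parameters
    comm.send(('obs', 'act', 'rew', 'next_obs', 'done'), dest=1, tag=11)
    params = comm.recv(source=0, tag=11)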
def train(arglist):
    # random.seed(arglist.random_seed)
    # np.random.seed(arglist.random_seed)
    # tf.set_random_seed(arglist.random_seed)
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()
        savers = [tf.train.Saver(U.scope_vars(trainer.name)) for trainer in trainers]

        # Load previous results, if necessary (one checkpoint per team)
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            [U.load_state(os.path.join(arglist.load_dir, 'team_{}'.format(i)), saver=saver)
             for i, saver in enumerate(savers)]

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        if arglist.trainer == 'tarmac' or arglist.trainer == 'reuse_tarmac' \
                or arglist.trainer == 'ibmac_inter':
            message_n = np.zeros([len(obs_n), 4])
        is_training = True
        t_start = time.time()
        writer = tf.summary.FileWriter("graph", U.get_session().graph)
        writer.close()
        writer = SummaryWriter(arglist.save_dir)

        print('Starting iterations...')
        while True:
            # get action
            if arglist.trainer == 'ibmac' or arglist.trainer == 'reuse_ibmac':
                is_inference = False
                if arglist.display or arglist.restore or arglist.benchmark:
                    is_inference = False
                if len(trainers) == 2:
                    action_n1 = trainers[0].action(obs_n[:num_adversaries], is_inference=is_inference)
                    action_n2 = trainers[1].action(obs_n[num_adversaries:], is_inference=is_inference)
                    action_n = [action[0] for action in action_n1] + [action[0] for action in action_n2]
                else:
                    action_n = trainers[0].action(obs_n, is_inference=is_inference)
                    action_n = [action[0] for action in action_n]
            elif arglist.trainer == 'ibmac_inter':
                if len(trainers) == 2:
                    action_n1, message_action_n1 = trainers[0].action(
                        obs_n[:num_adversaries], message_n[:num_adversaries])
                    action_n2, message_action_n2 = trainers[1].action(
                        obs_n[num_adversaries:], message_n[num_adversaries:])
                    action_n = [action[0] for action in action_n1] + [action[0] for action in action_n2]
                else:
                    action_n, message_action_n = trainers[0].action(obs_n, message_n)
                    action_n = [action[0] for action in action_n]
                    message_n = [message_action[0] for message_action in message_action_n]
            else:
                action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]

            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)

            # collect experience
            if arglist.trainer == 'ibmac':
                if len(trainers) == 2:
                    trainers[0].experience(obs_n[:num_adversaries], action_n[:num_adversaries],
                                           rew_n[:num_adversaries], new_obs_n[:num_adversaries],
                                           done_n[:num_adversaries], terminal)
                    trainers[1].experience(obs_n[num_adversaries:], action_n[num_adversaries:],
                                           rew_n[num_adversaries:], new_obs_n[num_adversaries:],
                                           done_n[num_adversaries:], terminal)
                else:
                    trainers[0].experience(obs_n, action_n, rew_n, new_obs_n, done_n, terminal)
            elif arglist.trainer == 'ibmac_inter':
                if len(trainers) == 2:
                    trainers[0].experience(obs_n[:num_adversaries], message_n[:num_adversaries],
                                           action_n[:num_adversaries], rew_n[:num_adversaries],
                                           new_obs_n[:num_adversaries], done_n[:num_adversaries],
                                           terminal)
                    # fixed: the second team must get its own slice of the
                    # messages (the original passed message_n[:num_adversaries])
                    trainers[1].experience(obs_n[num_adversaries:], message_n[num_adversaries:],
                                           action_n[num_adversaries:], rew_n[num_adversaries:],
                                           new_obs_n[num_adversaries:], done_n[num_adversaries:],
                                           terminal)
                else:
                    trainers[0].experience(obs_n, message_n, action_n, rew_n, new_obs_n,
                                           done_n, terminal)
            else:
                for i, agent in enumerate(trainers):
                    agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for i, agent in enumerate(trainers):
                loss = agent.update(trainers, train_step)
                if loss:
                    if isinstance(agent, IBMACAgentTrainer) or isinstance(agent, ReuseIBMACAgentTrainer):
                        q_loss, p_loss, _, _, _, _, kl_loss = loss
                        writer.add_scalar('agent_{}/loss_kl'.format(i), kl_loss, train_step)
                    else:
                        q_loss, p_loss, _, _, _, _ = loss
                    writer.add_scalar('agent_{}/loss_policy'.format(i), p_loss, train_step)
                    writer.add_scalar('agent_{}/loss_critic'.format(i), q_loss, train_step)

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                [U.save_state(os.path.join(arglist.save_dir, 'team_{}'.format(i)), saver=saver)
                 for i, saver in enumerate(savers)]
                # log per-agent reward curves; the print statement depends on
                # whether or not there are adversaries
                for i in range(len(agent_rewards)):
                    writer.add_scalar('agent_{}/mean_episode_reward'.format(i),
                                      np.mean(agent_rewards[i][-arglist.save_rate:]),
                                      len(episode_rewards))
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        round(time.time() - t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards],
                        round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                break
def train(arglist):
    with U.single_threaded_session():
        if not os.path.isdir(arglist.save_dir):
            os.makedirs(arglist.save_dir)
        if not os.path.isdir(arglist.benchmark_dir):
            os.makedirs(arglist.benchmark_dir)
        if not os.path.isdir(arglist.plots_dir):
            os.makedirs(arglist.plots_dir)

        # tensorboard
        summary_writer = tf.summary.FileWriter("./" + arglist.exp_name + "_graph/",
                                               U.get_session().graph)
        reward_plot = 0.0  # placeholder; overwritten on every update step
        reward_summary = tf.Summary()
        reward_summary.value.add(tag='reward', simple_value=reward_plot)

        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0

        #### USE RVO: agents within this range of a neighbor or obstacle are
        #### steered by the RVO controller; set to 0.28 to enable, keep
        #### negative to disable
        use_rvo_range = -1

        t_start = time.time()
        print('Starting iterations...')
        while True:
            # get action
            action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]
            if use_rvo_range < 0:
                new_obs_n, rew_n, done_n, info_n = env.step(action_n, use_rvo=None)
            else:
                # build the per-agent use_rvo list from proximity checks on the
                # relative positions stored at the tail of each observation
                total_rvo_list = []
                for obs in obs_n:
                    agent_pos = obs[-2 * (env.world.num_agents - 1):]
                    obst_pos = obs[-2 * (env.world.num_agents + env.world.num_obstacles):]
                    agent_rvo_list = []
                    for i in range(0, len(agent_pos), 2):
                        agent_rvo_list.append(
                            np.sqrt(np.sum(np.square(agent_pos[i:i + 2]))) < use_rvo_range)
                    for i in range(0, len(obst_pos), 2):
                        agent_rvo_list.append(
                            np.sqrt(np.sum(np.square(obst_pos[i:i + 2]))) < use_rvo_range)
                    total_rvo_list.append(any(agent_rvo_list))
                # environment step
                new_obs_n, rew_n, done_n, info_n = env.step(action_n, use_rvo=total_rvo_list)

            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)

            # add reward to tensorboard
            reward_summary.value[0].simple_value = np.mean(episode_rewards[-arglist.save_rate:])
            summary_writer.add_summary(reward_summary, len(episode_rewards))

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        round(time.time() - t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards],
                        round(time.time() - t_start, 3)))
                t_start = time.time()

            if terminal:
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) % 1000 == 0:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl' \
                    + str(len(episode_rewards))
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('saved')

            if len(episode_rewards) > arglist.num_episodes:
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                break
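The RVO gate above reduces to a proximity test on the relative (x, y) offsets stored at the end of each observation. Here is a hedged, standalone sketch of that rule; `use_rvo_range` and the layout follow the code above, while the helper name and the example observation are ours.

import numpy as np

def should_use_rvo(obs, num_neighbors, use_rvo_range=0.28):
    """True if any neighbor's relative position lies within use_rvo_range."""
    rel = np.asarray(obs[-2 * num_neighbors:]).reshape(-1, 2)
    return bool(np.any(np.linalg.norm(rel, axis=1) < use_rvo_range))

# Example: one neighbor at distance ~0.14 triggers the RVO controller.
print(should_use_rvo(np.array([0.9, 0.9, 0.1, 0.1]), num_neighbors=2))  # True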
def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        obs_n = env.reset()

        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy))

        # Pretrain the safety layer
        safety_layer = None
        if arglist.use_safety_layer:
            safety_layer = SafetyLayer(env, len(env.world.landmarks) - 1,
                                       mlp_model_safety_layer,
                                       env.observation_space[0].shape,
                                       env.action_space, trainers[0].action)
            # set safety_layer for trainers[0]
            trainers[0].set_safety_layer(safety_layer)
        if arglist.use_mpc_layer:
            safety_layer = MpcLayer(env)
            # set safety_layer for trainers[0]
            trainers[0].set_safety_layer(safety_layer)

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        episode_step = 0
        train_step = 0
        cumulative_constraint_violations = 0
        t_start = time.time()
        data_save = []
        num_done = 0
        # (disabled) optionally restore a pickled world layout from 'env.pkl'
        # (landmarks and agents loaded one by one) and recompute obs_n from it

        print('Starting iterations...')
        while True:
            # get constraint values and count violations
            c_n = env.get_constraint_values()
            is_any_collision = env.is_any_collision()
            if is_any_collision[0]:
                cumulative_constraint_violations += 1

            # get action; action_real is the unfiltered action and if_call
            # flags whether the safety layer was invoked
            action_n = [agent.action_real(obs, c, env)
                        for agent, obs, c in zip(trainers, obs_n, c_n)]
            action_real = [action_n[0][0]]
            if_call = [action_n[0][2]]
            action_n = [action_n[0][1]]
            data_save.append(np.concatenate([obs_n[0], action_n[0], action_n[0]]))

            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n, if_call=if_call)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len) or \
                (env.agents[0].state.p_pos[0] - env.world.landmarks[-1].state.p_pos[0]) > 1.5

            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                if done:
                    num_done += 1
                data_save.append(np.concatenate([obs_n[0], action_n[0], action_n[0]]))
                data_save = np.array(data_save)
                # np.savetxt("data_save.txt", data_save)
                # by default np.savetxt writes in '%.18e' format, space-separated

                # plot x, y, V, theta (columns 1..5 are V, x, y, theta, omega)
                a = data_save
                V = a[:, 1]
                x = a[:, 2]
                y = a[:, 3]
                theta = a[:, 4]
                omega = a[:, 5]
                fig, ax0 = plt.subplots()
                for i, landmark in enumerate(env.world.landmarks[:-1]):
                    circle = mpathes.Circle(landmark.state.p_pos, landmark.size,
                                            facecolor='w', edgecolor='forestgreen',
                                            linestyle='-.')
                    ax0.add_patch(circle)
                for i, landmark in enumerate(env.world.landmarks):
                    r = (landmark.size - 0.09) \
                        if landmark is not env.world.landmarks[-1] else landmark.size
                    circle = mpathes.Circle(landmark.state.p_pos, r, facecolor='forestgreen')
                    ax0.add_patch(circle)
                for i in range(len(x)):
                    circle = mpathes.Circle(np.array([x[i], y[i]]),
                                            env.world.agents[0].size,
                                            facecolor='darkgreen')
                    ax0.add_patch(circle)
                ax0.set_xlim((-1, 40))
                ax0.set_ylim((-10, 10))
                ax0.axis('equal')
                ax0.set_title("x-y")
                ax0.plot([-1, 40], [10, 10], color='forestgreen', linestyle='-.')
                ax0.plot([-1, 40], [-10, -10], color='forestgreen', linestyle='-.')
                plt.show()
                # (disabled) 2x2 diagnostic figure: x-y trajectory, theta,
                # omega, and the commanded vs. real actions

                # reset and continue
                data_save = []
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            # (disabled in this variant) the actual update step:
            # for agent in trainers:
            #     loss = agent.update(trainers, train_step)

            # save model, display training output
            if (done or terminal) and ((len(episode_rewards) - 1) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # the same statistics are printed whether or not there are
                # adversaries in this variant
                print("steps: {}, episodes: {}, mean episode reward: {}, "
                      "num_cumulative_constraints: {}, num_done: {}, time: {}".format(
                          train_step, len(episode_rewards) - 1,
                          np.mean(episode_rewards[-arglist.save_rate:]),
                          cumulative_constraint_violations, num_done,
                          round(time.time() - t_start, 3)))
                # print(trainers[0].safety_layer.num_call)
                t_start = time.time()
                num_done = 0
                cumulative_constraint_violations = 0
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                break
def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark or arglist.plot:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()
        plot_data = []
        print('Starting iterations...')
        while True:
            # get action
            action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i], terminal)
            obs_n = new_obs_n
            plot_d = env.get_plot_data()
            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew
            plot_data.append(plot_d)

            if done or terminal:
                if arglist.plot:
                    if arglist.scenario == "simple_spread" or arglist.scenario == "simple_spread_obstacles":
                        plot_spread(plot_data)
                    if arglist.scenario == "simple_formation" or arglist.scenario == "simple_formation_obstacles":
                        plot_formation(plot_data)
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])
                plot_data = []

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        round(time.time() - t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards],
                        round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

            # saves rewards for plotting training curve later; note this
            # variant dumps the raw per-episode lists, not the averaged curves
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(episode_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(agent_rewards, fp)
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                break
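Since this variant pickles the raw per-episode totals rather than the save_rate-averaged curves, a consumer has to smooth them itself. Below is a hedged sketch of such a reader; the helper name, the smoothing window, and the matplotlib usage are ours, not part of the original script.

import pickle
import numpy as np
import matplotlib.pyplot as plt

def plot_training_curve(pkl_path, window=100):
    """Plot the pickled per-episode rewards with a moving-average overlay."""
    with open(pkl_path, 'rb') as fp:
        rewards = np.asarray(pickle.load(fp), dtype=float)
    plt.plot(rewards, alpha=0.3, label='per-episode')
    if len(rewards) >= window:
        smoothed = np.convolve(rewards, np.ones(window) / window, mode='valid')
        plt.plot(smoothed, label='moving average ({} episodes)'.format(window))
    plt.xlabel('episode')
    plt.ylabel('total reward')
    plt.legend()
    plt.show()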
def train(arglist):
    with U.single_threaded_session():
        # create world
        world = World()
        # Create environment
        env = MultiAgentTorcsEnv(world, 0, world.reset_world, world.reward,
                                 world.observation, done_callback=world.done)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = env.adv  # min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()

        # TODO: call a reset function here instead of restarting TORCS manually
        os.system("pkill torcs")
        os.system("cd ~/vtorcs3 && ./torcs &")  # use the location of the torcs installation on your system
        time.sleep(0.5)
        os.system('sh autostart.sh')
        time.sleep(1)

        obs_n = []
        world.initialize_agents()
        for agent in env.agents:
            obs_n.append(world.observation(agent))
        # obs_n = env.reset()

        episode_step = 0
        train_step = 0
        t_start = time.time()
        episode_count = 0
        epsilon = 1
        EXPLORE = 100000.
        train_indicator = 1
        print('Starting iterations...')
        while True:
            print("Episode number: " + str(episode_count) + " ")
            # get action
            action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n, epsilon, train_indicator)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i], new_obs_n[i], done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                obs_n = env.reset()
                epsilon -= 1.0 / EXPLORE  # anneal exploration once per episode
                episode_step = 0
                episode_rewards.append(0)
                episode_count += 1
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1
            world.step = train_step

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # displaying learned policies is not applicable to the TORCS env
            # if arglist.display:
            #     time.sleep(0.1)
            #     env.render()
            #     continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)
            l2 = "Loss is " + str(loss) + "\n"
            with open("log2.txt", "a") as f:
                f.write(l2)
            print(l2)

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        round(time.time() - t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}".format(
                        train_step, len(episode_rewards),
                        np.mean(episode_rewards[-arglist.save_rate:]),
                        [np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards],
                        round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

            # saves final episode reward for plotting training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                break