def init_variables(self, info):
    # Here you have the information of the game (virtual init() in random_walk.cpp)
    # List: game_time, goal, number_of_robots, penalty_area, codewords,
    #       robot_height, robot_radius, max_linear_velocity, field, team_info,
    #       {rating, name}, axle_length, resolution, ball_radius
    # self.game_time = info['game_time']
    self.field = info['field']
    self.robot_size = 2 * info['robot_radius']
    self.goal = info['goal']
    self.max_linear_velocity = info['max_linear_velocity']
    self.number_of_robots = info['number_of_robots']
    self.end_of_frame = False

    self.cur_my = []
    self.cur_ball = []

    self.state_dim = 2     # relative ball position
    self.history_size = 2  # frame history size
    self.action_dim = 2

    self.arglist = Argument()
    self.state_shape = (self.state_dim * self.history_size, )  # state dimension
    self.act_space = [Discrete(self.action_dim * 2 + 1)]
    self.trainers = MADDPGAgentTrainer('agent_moving', self.mlp_model,
                                       self.state_shape, self.act_space, 0,
                                       self.arglist, local_q_func=False)

    # for tensorboard
    self.summary_placeholders, self.update_ops, self.summary_op = \
        self.setup_summary()
    self.summary_writer = \
        tf.summary.FileWriter('summary/moving_test', U.get_session().graph)

    U.initialize()

    # Load previous results, if necessary
    if self.arglist.load_dir == "":
        self.arglist.load_dir = self.arglist.save_dir
    if self.arglist.restore:
        print('Loading previous state... %s' % self.arglist.load_dir)
        U.load_state(self.arglist.load_dir)

    self.saver = tf.train.Saver(max_to_keep=1100)

    self.state = np.zeros([self.state_dim * self.history_size])  # histories
    self.train_step = 216000
    self.wheels = np.zeros(self.number_of_robots * 2)
    self.action = np.zeros(self.action_dim * 2 + 1)  # not np.zeros(2)
    self.stats_steps = 6000  # for tensorboard
    self.rwd_sum = 0

    self.done = False
    self.control_idx = 0
    return
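# The trainer setup above passes a `mlp_model` network builder to
# MADDPGAgentTrainer. Its body is not shown in this file; below is a minimal
# sketch mirroring the two-hidden-layer MLP from the reference MADDPG
# implementation (the 64-unit layers and tf.contrib usage are assumptions).
import tensorflow as tf
import tensorflow.contrib.layers as layers

def mlp_model(input, num_outputs, scope, reuse=False, num_units=64, rnn_cell=None):
    # fully connected network: two ReLU hidden layers, linear output
    with tf.variable_scope(scope, reuse=reuse):
        out = input
        out = layers.fully_connected(out, num_outputs=num_units, activation_fn=tf.nn.relu)
        out = layers.fully_connected(out, num_outputs=num_units, activation_fn=tf.nn.relu)
        out = layers.fully_connected(out, num_outputs=num_outputs, activation_fn=None)
        return out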
def init_variables(self, info):
    # Here you have the information of the game (virtual init() in random_walk.cpp)
    # List: game_time, goal, number_of_robots, penalty_area, codewords,
    #       robot_height, robot_radius, max_linear_velocity, field, team_info,
    #       {rating, name}, axle_length, resolution, ball_radius
    # self.game_time = info['game_time']
    self.field = info['field']
    self.robot_size = 2 * info['robot_radius']
    self.goal = info['goal']
    self.max_linear_velocity = info['max_linear_velocity']
    self.number_of_robots = info['number_of_robots']
    self.end_of_frame = False

    self.cur_my_posture = []
    self.cur_op_posture = []
    self.cur_ball = []
    self.pre_ball = [0, 0]

    self.state_dim = 2     # relative ball position
    self.history_size = 2  # frame history size
    self.action_dim = 2

    self.arglist = Argument()
    self.obs_shape_n = [(self.state_dim * self.history_size, )
                        for _ in range(1)]  # state dimension
    self.action_space = [spaces.Discrete(self.action_dim * 2 + 1)
                         for _ in range(1)]
    self.trainers = self.get_trainers(1, self.obs_shape_n,
                                      self.action_space, self.arglist)

    # for tensorboard
    self.summary_placeholders, self.update_ops, self.summary_op = \
        self.setup_summary()
    self.summary_writer = tf.summary.FileWriter('summary/aiwc_maddpg',
                                                U.get_session().graph)

    U.initialize()

    # Load previous results, if necessary
    if self.arglist.load_dir == "":
        self.arglist.load_dir = self.arglist.save_dir
    if self.arglist.display or self.arglist.restore or self.arglist.benchmark:
        print('Loading previous state...')
        U.load_state(self.arglist.load_dir)

    self.final_ep_rewards = []  # sum of rewards for training curve
    self.final_ep_ag_rewards = []  # agent rewards for training curve
    self.agent_info = [[[]]]  # placeholder for benchmarking info
    self.saver = tf.train.Saver()

    self.obs_n = [np.zeros([self.state_dim * self.history_size])
                  for _ in range(1)]  # histories
    self.train_step = 0
    self.wheels = np.zeros(self.number_of_robots * 2)
    self.action_n = [np.zeros(self.action_dim * 2 + 1) for _ in range(1)]
    self.save_every_steps = 12000  # save the model every 10 minutes
    self.stats_steps = 6000  # for tensorboard
    # per-robot reward accumulator; must be an array, since the per-robot
    # reward list rew_n is added to it element-wise in on_event
    self.reward_sum = np.zeros(self.number_of_robots)
    self.score_sum = 0
    self.active_flag = [[False for _ in range(5)], [False for _ in range(5)]]
    self.inner_step = 0

    self.done = False
    self.control_idx = 0
    return
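# Both init_variables variants rely on a setup_summary() helper that is not
# shown in this file. A minimal sketch under the TF1 summary API follows; the
# single scalar tag is an illustrative assumption (the variant above would
# register one scalar per robot reward plus the score sum).
import tensorflow as tf

def setup_summary(self):
    total_reward = tf.Variable(0.)
    tf.summary.scalar('Total_Reward/Stats_Steps', total_reward)
    summary_vars = [total_reward]
    summary_placeholders = [tf.placeholder(tf.float32) for _ in summary_vars]
    update_ops = [summary_vars[i].assign(summary_placeholders[i])
                  for i in range(len(summary_vars))]
    summary_op = tf.summary.merge_all()
    return summary_placeholders, update_ops, summary_op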
def train(arglist):
    with U.single_threaded_session():
        if not os.path.isdir(arglist.save_dir):
            os.makedirs(arglist.save_dir)
        if not os.path.isdir(arglist.benchmark_dir):
            os.makedirs(arglist.benchmark_dir)
        if not os.path.isdir(arglist.plots_dir):
            os.makedirs(arglist.plots_dir)

        # tensorboard
        summary_writer = tf.summary.FileWriter(
            "./" + arglist.exp_name + "_graph/", U.get_session().graph)
        reward_plot = None
        reward_summary = tf.Summary()
        reward_summary.value.add(tag='reward', simple_value=reward_plot)

        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0

        # #### USE RVO ####
        use_rvo_range = -1  # to enable RVO, set a positive radius, e.g. 0.28

        t_start = time.time()

        print('Starting iterations...')
        while True:
            # get action
            action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]

            if use_rvo_range < 0:
                new_obs_n, rew_n, done_n, info_n = env.step(action_n, use_rvo=None)
            else:
                # build the per-agent RVO flag list
                total_rvo_list = []
                for obs in obs_n:
                    agent_pos = obs[-2 * (env.world.num_agents - 1):]
                    obst_pos = obs[-2 * (env.world.num_agents + env.world.num_obstacles):]
                    agent_rvo_list = []
                    for i in range(0, len(agent_pos), 2):
                        agent_rvo_list.append(
                            np.sqrt(np.sum(np.square(agent_pos[i:i + 2]))) < use_rvo_range)
                    for i in range(0, len(obst_pos), 2):
                        agent_rvo_list.append(
                            np.sqrt(np.sum(np.square(obst_pos[i:i + 2]))) < use_rvo_range)
                    total_rvo_list.append(any(agent_rvo_list))
                # environment step
                new_obs_n, rew_n, done_n, info_n = env.step(action_n,
                                                            use_rvo=total_rvo_list)

            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i],
                                 new_obs_n[i], done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)

            # add reward to tensorboard
            reward_summary.value[0].simple_value = np.mean(
                episode_rewards[-arglist.save_rate:])
            summary_writer.add_summary(reward_summary, len(episode_rewards))

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}"
                          .format(train_step, len(episode_rewards),
                                  np.mean(episode_rewards[-arglist.save_rate:]),
                                  round(time.time() - t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, "
                          "agent episode reward: {}, time: {}"
                          .format(train_step, len(episode_rewards),
                                  np.mean(episode_rewards[-arglist.save_rate:]),
                                  [np.mean(rew[-arglist.save_rate:])
                                   for rew in agent_rewards],
                                  round(time.time() - t_start, 3)))
                t_start = time.time()

            if terminal:
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

            # saves episode rewards for plotting the training curve later
            if len(episode_rewards) % 1000 == 0:
                rew_file_name = (arglist.plots_dir + arglist.exp_name +
                                 '_rewards.pkl' + str(len(episode_rewards)))
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = arglist.plots_dir + arglist.exp_name + '_agrewards.pkl'
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('saved')

            if len(episode_rewards) > arglist.num_episodes:
                print('...Finished total of {} episodes.'.format(len(episode_rewards)))
                break
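# RVO note: with use_rvo_range set to a positive radius, an agent is handed
# over to RVO control for a step whenever any relative position in the tail
# of its observation (other agents, then obstacles) lies within
# use_rvo_range; otherwise the learned action is applied directly.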
def train(arglist, extra_args=None):
    tf_graph = tf.Graph()
    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1,
                               intra_op_parallelism_threads=1)
    tf_config.gpu_options.allow_growth = True
    with tf.Session(graph=tf_graph, config=tf_config):
        # Create environment
        env = make_env(arglist.scenario, arglist)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        if arglist.num_adversaries is None:
            arglist.num_adversaries = len([
                agent for agent in env.agents
                if (hasattr(agent, "adversary") and agent.adversary)
            ])
        arglist.num_adversaries = min(env.n, arglist.num_adversaries)
        num_adversaries = arglist.num_adversaries
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()
        if os.environ.get("OUTPUT_GRAPH"):
            tf.summary.FileWriter(os.path.join(logger.get_dir(), "tb"),
                                  U.get_session().graph)

        # Load previous results, if necessary
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            U.load_state(arglist.load_dir)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver(max_to_keep=None)
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        t_start = time.time()

        print('Starting iterations...')
        while True:
            # get action
            action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i],
                                 new_obs_n[i], done_n[i], terminal)
            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                if arglist.save_render_images:
                    # stitch this episode's rendered frames into a video,
                    # then delete the frames
                    input_file_name = os.path.join(
                        arglist.render_dir,
                        "image-episode_{}-step_%d.png".format(len(episode_rewards)))
                    output_file_name = os.path.join(
                        arglist.render_dir,
                        "video-episode_{}.mp4".format(len(episode_rewards)))
                    command = "ffmpeg -y -r 10 -i {} {}".format(input_file_name,
                                                                output_file_name)
                    os.system(command)
                    print("Saved render video at {}".format(output_file_name))
                    for episode_step_ in range(episode_step):
                        file_name = os.path.join(
                            arglist.render_dir,
                            "image-episode_{}-step_{}.png".format(
                                len(episode_rewards), episode_step_))
                        if os.path.exists(file_name):
                            os.remove(file_name)
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = os.path.join(arglist.benchmark_dir, 'benchmark.pkl')
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                time.sleep(0.1)
                if arglist.save_render_images:
                    images = env.render(mode="rgb_array")
                    image = images[0]
                    file_name = os.path.join(
                        arglist.render_dir,
                        "image-episode_{}-step_{}.png".format(
                            len(episode_rewards), episode_step))
                    plt.imsave(file_name, image)
                    print("Saved render image at {}".format(file_name))
                else:
                    env.render(mode="human")
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)

            # save model
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(os.path.join(
                    arglist.save_dir,
                    "checkpoint-episode_{}".format(len(episode_rewards))),
                    saver=saver)

            # print training scalars
            if terminal and ((len(episode_rewards) % arglist.print_rate == 0)
                             or (len(episode_rewards) % arglist.save_rate == 0)):
                logger.log("Time: {}".format(
                    datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
                logger.logkv("steps", train_step)
                logger.logkv("episodes", len(episode_rewards))
                logger.logkv("mean_episode_reward",
                             np.mean(episode_rewards[-arglist.save_rate:]))
                # per-agent rewards are only broken out when there are adversaries
                if num_adversaries > 0:
                    for agent_index in range(len(agent_rewards)):
                        logger.logkv(
                            "agent_{}_episode_reward".format(agent_index),
                            np.mean(agent_rewards[agent_index][-arglist.save_rate:]))
                logger.logkv("time", round(time.time() - t_start, 3))
                logger.dumpkvs()
                t_start = time.time()

                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

            # saves final episode rewards for plotting the training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = os.path.join(arglist.plots_dir, 'rewards.pkl')
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = os.path.join(arglist.plots_dir,
                                               'average_rewards.pkl')
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards)))
                break
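# For reference, the episode wrap-up above shells out to ffmpeg roughly like
# this (paths assume render_dir='render' and episode 42):
#   ffmpeg -y -r 10 -i render/image-episode_42-step_%d.png render/video-episode_42.mp4
# -y overwrites an existing video, -r 10 sets the frame rate, and the %d
# pattern consumes the per-step images written by the display branch.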
def on_event(self, f):

    @inlineCallbacks
    def set_wheel(self, robot_wheels):
        yield self.call(u'aiwc.set_speed', args.key, robot_wheels)
        return

    # initiate empty frame
    received_frame = Frame()

    if 'time' in f:
        received_frame.time = f['time']
    if 'score' in f:
        received_frame.score = f['score']
    if 'reset_reason' in f:
        received_frame.reset_reason = f['reset_reason']
    if 'coordinates' in f:
        received_frame.coordinates = f['coordinates']
    if 'EOF' in f:
        self.end_of_frame = f['EOF']

    ##############################################################################
    if self.end_of_frame:
        # Robot and ball coordinates are available as
        # received_frame.coordinates[MY_TEAM|OP_TEAM][ROBOT_ID][X|Y|TH|ACTIVE|TOUCH]
        # and received_frame.coordinates[BALL][X|Y] (ROBOT_ID can be 0,1,2,3,4).
        self.get_coord(received_frame)

        ##############################################################################
        # Next state, Reward, Reset
        new_obs_n = []
        rew_n = []
        done_n = []

        for i in range(self.number_of_robots):
            next_state = self.pre_processing(i)
            # stack position and velocity (difference from the previous frame)
            new_obs_n.append(
                np.append(next_state, next_state - self.obs_n[i][:-self.state_dim]))
            rew_n.append(self.get_reward(received_frame.reset_reason, i))
            done_n.append(received_frame.reset_reason != NONE)

        done = all(done_n)
        if done:
            self.printConsole("reset reason: " + str(received_frame.reset_reason))

        # collect experience: the single shared trainer learns from every active robot
        for i in range(self.number_of_robots):
            if not self.cur_my_posture[i][ACTIVE]:
                self.printConsole('robot ' + str(i) + ' is not active')
                continue
            self.trainers[0].experience(self.obs_n[i], self.action_n[i], rew_n[i],
                                        new_obs_n[i], done_n[i], False)

        self.obs_n = new_obs_n
        self.reward_sum += rew_n

        # increment global step counter
        self.train_step += 1

        # update all trainers
        loss = None
        for agent in self.trainers:
            agent.preupdate()
        for agent in self.trainers:
            loss = agent.update(self.trainers, self.train_step)

        # get action
        self.action_n = [self.trainers[0].action(obs) for obs in self.obs_n]

        for i in range(self.number_of_robots):
            self.wheels[2 * i] = self.max_linear_velocity * (
                self.action_n[i][1] - self.action_n[i][2] +
                self.action_n[i][3] - self.action_n[i][4])
            self.wheels[2 * i + 1] = self.max_linear_velocity * (
                self.action_n[i][1] - self.action_n[i][2] -
                self.action_n[i][3] + self.action_n[i][4])

        self.printConsole('step: ' + str(self.train_step))
        self.pre_ball = self.cur_ball
        set_wheel(self, self.wheels.tolist())

        ##############################################################################
        if (self.train_step % self.save_every_steps) == 0:
            U.save_state(self.arglist.save_dir, saver=self.saver)

        # plot the statistics every 6000 steps (about 5 minutes)
        if (self.train_step % self.stats_steps) == 0:
            self.printConsole("add data to tensorboard")
            stats = ([sum(self.reward_sum)] +
                     [self.reward_sum[i] for i in range(len(self.reward_sum))] +
                     [self.score_sum])
            for i in range(len(stats)):
                U.get_session().run(self.update_ops[i], feed_dict={
                    self.summary_placeholders[i]: float(stats[i])
                })
            summary_str = U.get_session().run(self.summary_op)
            self.summary_writer.add_summary(summary_str, self.inner_step)

            self.reward_sum = np.zeros(len(self.reward_sum))
            self.score_sum = 0
            self.inner_step += 1

        ##############################################################################
        if (received_frame.reset_reason == GAME_END):
            # (virtual finish() in random_walk.cpp)
            # save your data
            with open(args.datapath + '/result.txt', 'w') as output:
                # output.write('yourvariables')
                output.close()
            # unsubscribe; reset or leave
            yield self.sub.unsubscribe()
            try:
                yield self.leave()
            except Exception as e:
                self.printConsole("Error: {}".format(e))

        self.end_of_frame = False
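# The wheel computation above (and its single-robot variant later in this
# file) maps the 5-dimensional action vector onto differential wheel speeds.
# A hypothetical helper making the convention explicit, assuming even wheel
# indices are the left wheels:
def action_to_wheels(action, max_v):
    # action = [no-op, +x, -x, +turn, -turn]; the forward and turn components
    # combine additively on one wheel and subtractively on the other, so a
    # pure turn command spins the robot in place.
    forward = action[1] - action[2]
    turn = action[3] - action[4]
    return max_v * (forward + turn), max_v * (forward - turn)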
def train(arglist):
    # random.seed(arglist.random_seed)
    # np.random.seed(arglist.random_seed)
    # tf.set_random_seed(arglist.random_seed)
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
        num_adversaries = min(env.n, arglist.num_adversaries)
        trainers = get_trainers(env, num_adversaries, obs_shape_n, arglist)
        print('Using good policy {} and adv policy {}'.format(
            arglist.good_policy, arglist.adv_policy))

        # Initialize
        U.initialize()

        savers = [tf.train.Saver(U.scope_vars(trainer.name))
                  for trainer in trainers]

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.display or arglist.restore or arglist.benchmark:
            print('Loading previous state...')
            for i, saver in enumerate(savers):
                U.load_state(os.path.join(arglist.load_dir, 'team_{}'.format(i)),
                             saver=saver)

        episode_rewards = [0.0]  # sum of rewards for all agents
        agent_rewards = [[0.0] for _ in range(env.n)]  # individual agent reward
        final_ep_rewards = []  # sum of rewards for training curve
        final_ep_ag_rewards = []  # agent rewards for training curve
        agent_info = [[[]]]  # placeholder for benchmarking info
        saver = tf.train.Saver()
        obs_n = env.reset()
        episode_step = 0
        train_step = 0
        if arglist.trainer in ('tarmac', 'reuse_tarmac', 'ibmac_inter'):
            message_n = np.zeros([len(obs_n), 4])
        is_training = True
        t_start = time.time()

        writer = tf.summary.FileWriter("graph", U.get_session().graph)
        writer.close()
        writer = SummaryWriter(arglist.save_dir)

        print('Starting iterations...')
        while True:
            # get action
            if arglist.trainer in ('ibmac', 'reuse_ibmac'):
                is_inference = False
                if len(trainers) == 2:
                    action_n1 = trainers[0].action(obs_n[:num_adversaries],
                                                   is_inference=is_inference)
                    action_n2 = trainers[1].action(obs_n[num_adversaries:],
                                                   is_inference=is_inference)
                    action_n = ([action[0] for action in action_n1] +
                                [action[0] for action in action_n2])
                else:
                    action_n = trainers[0].action(obs_n, is_inference=is_inference)
                    action_n = [action[0] for action in action_n]
            elif arglist.trainer == 'ibmac_inter':
                if len(trainers) == 2:
                    action_n1, message_action_n1 = trainers[0].action(
                        obs_n[:num_adversaries], message_n[:num_adversaries])
                    action_n2, message_action_n2 = trainers[1].action(
                        obs_n[num_adversaries:], message_n[num_adversaries:])
                    action_n = ([action[0] for action in action_n1] +
                                [action[0] for action in action_n2])
                else:
                    action_n, message_action_n = trainers[0].action(obs_n, message_n)
                    action_n = [action[0] for action in action_n]
                    message_n = [message_action[0]
                                 for message_action in message_action_n]
            else:
                action_n = [agent.action(obs)
                            for agent, obs in zip(trainers, obs_n)]

            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)

            # collect experience
            if arglist.trainer == 'ibmac':
                if len(trainers) == 2:
                    trainers[0].experience(obs_n[:num_adversaries],
                                           action_n[:num_adversaries],
                                           rew_n[:num_adversaries],
                                           new_obs_n[:num_adversaries],
                                           done_n[:num_adversaries], terminal)
                    trainers[1].experience(obs_n[num_adversaries:],
                                           action_n[num_adversaries:],
                                           rew_n[num_adversaries:],
                                           new_obs_n[num_adversaries:],
                                           done_n[num_adversaries:], terminal)
                else:
                    trainers[0].experience(obs_n, action_n, rew_n,
                                           new_obs_n, done_n, terminal)
            elif arglist.trainer == 'ibmac_inter':
                if len(trainers) == 2:
                    trainers[0].experience(obs_n[:num_adversaries],
                                           message_n[:num_adversaries],
                                           action_n[:num_adversaries],
                                           rew_n[:num_adversaries],
                                           new_obs_n[:num_adversaries],
                                           done_n[:num_adversaries], terminal)
                    trainers[1].experience(obs_n[num_adversaries:],
                                           message_n[num_adversaries:],
                                           action_n[num_adversaries:],
                                           rew_n[num_adversaries:],
                                           new_obs_n[num_adversaries:],
                                           done_n[num_adversaries:], terminal)
                else:
                    trainers[0].experience(obs_n, message_n, action_n, rew_n,
                                           new_obs_n, done_n, terminal)
            else:
                for i, agent in enumerate(trainers):
                    agent.experience(obs_n[i], action_n[i], rew_n[i],
                                     new_obs_n[i], done_n[i], terminal)

            obs_n = new_obs_n

            for i, rew in enumerate(rew_n):
                episode_rewards[-1] += rew
                agent_rewards[i][-1] += rew

            if done or terminal:
                obs_n = env.reset()
                episode_step = 0
                episode_rewards.append(0)
                for a in agent_rewards:
                    a.append(0)
                agent_info.append([[]])

            # increment global step counter
            train_step += 1

            # for benchmarking learned policies
            if arglist.benchmark:
                for i, info in enumerate(info_n):
                    agent_info[-1][i].append(info_n['n'])
                if train_step > arglist.benchmark_iters and (done or terminal):
                    file_name = arglist.benchmark_dir + arglist.exp_name + '.pkl'
                    print('Finished benchmarking, now saving...')
                    with open(file_name, 'wb') as fp:
                        pickle.dump(agent_info[:-1], fp)
                    break
                continue

            # for displaying learned policies
            if arglist.display:
                env.render()
                continue

            # update all trainers, if not in display or benchmark mode
            loss = None
            for agent in trainers:
                agent.preupdate()
            for i, agent in enumerate(trainers):
                loss = agent.update(trainers, train_step)
                if loss:
                    if isinstance(agent, (IBMACAgentTrainer, ReuseIBMACAgentTrainer)):
                        q_loss, p_loss, _, _, _, _, kl_loss = loss
                        writer.add_scalar('agent_{}/loss_kl'.format(i),
                                          kl_loss, train_step)
                    else:
                        q_loss, p_loss, _, _, _, _ = loss
                    writer.add_scalar('agent_{}/loss_policy'.format(i),
                                      p_loss, train_step)
                    writer.add_scalar('agent_{}/loss_critic'.format(i),
                                      q_loss, train_step)

            # save model, display training output
            if terminal and (len(episode_rewards) % arglist.save_rate == 0):
                U.save_state(arglist.save_dir, saver=saver)
                for i, team_saver in enumerate(savers):
                    U.save_state(os.path.join(arglist.save_dir,
                                              'team_{}'.format(i)),
                                 saver=team_saver)
                for i in range(len(agent_rewards)):
                    writer.add_scalar('agent_{}/mean_episode_reward'.format(i),
                                      np.mean(agent_rewards[i][-arglist.save_rate:]),
                                      len(episode_rewards))
                # print statement depends on whether or not there are adversaries
                if num_adversaries == 0:
                    print("steps: {}, episodes: {}, mean episode reward: {}, time: {}"
                          .format(train_step, len(episode_rewards),
                                  np.mean(episode_rewards[-arglist.save_rate:]),
                                  round(time.time() - t_start, 3)))
                else:
                    print("steps: {}, episodes: {}, mean episode reward: {}, "
                          "agent episode reward: {}, time: {}"
                          .format(train_step, len(episode_rewards),
                                  np.mean(episode_rewards[-arglist.save_rate:]),
                                  [np.mean(rew[-arglist.save_rate:])
                                   for rew in agent_rewards],
                                  round(time.time() - t_start, 3)))
                t_start = time.time()
                # Keep track of final episode reward
                final_ep_rewards.append(np.mean(episode_rewards[-arglist.save_rate:]))
                for rew in agent_rewards:
                    final_ep_ag_rewards.append(np.mean(rew[-arglist.save_rate:]))

            # saves final episode rewards for plotting the training curve later
            if len(episode_rewards) > arglist.num_episodes:
                rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                with open(rew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_rewards, fp)
                agrew_file_name = (arglist.plots_dir + arglist.exp_name +
                                   '_agrewards.pkl')
                with open(agrew_file_name, 'wb') as fp:
                    pickle.dump(final_ep_ag_rewards, fp)
                print('...Finished total of {} episodes.'.format(
                    len(episode_rewards)))
                break
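# Note: this variant keeps one tf.train.Saver per trainer, each restricted to
# that trainer's variable scope via U.scope_vars, so the checkpoints under
# save_dir/team_{i} can be saved and restored independently per team.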
def on_event(self, f):

    @inlineCallbacks
    def set_wheel(self, robot_wheels):
        yield self.call(u'aiwc.set_speed', args.key, robot_wheels)
        return

    # initiate empty frame
    received_frame = Frame()

    if 'time' in f:
        received_frame.time = f['time']
    if 'score' in f:
        received_frame.score = f['score']
    if 'reset_reason' in f:
        received_frame.reset_reason = f['reset_reason']
    if 'coordinates' in f:
        received_frame.coordinates = f['coordinates']
    if 'EOF' in f:
        self.end_of_frame = f['EOF']

    ##############################################################################
    if self.end_of_frame:
        # Robot and ball coordinates are available as
        # received_frame.coordinates[MY_TEAM|OP_TEAM][ROBOT_ID][X|Y|TH|ACTIVE|TOUCH]
        # and received_frame.coordinates[BALL][X|Y] (ROBOT_ID can be 0,1,2,3,4).
        self.get_coord(received_frame)

        ##############################################################################
        # Next state, Reward, Reset
        if self.done:
            # switch to the next robot after every reset
            self.control_idx += 1
            self.control_idx %= 5

        # Next state
        next_obs = self.pre_processing(self.control_idx)
        if self.done:
            next_state = np.append(next_obs, next_obs)  # 2-frame position stack
            self.done = False
        else:
            next_state = np.append(next_obs,
                                   self.state[:-self.state_dim])  # 2-frame position stack

        # Reward
        reward = self.get_reward(received_frame.reset_reason, self.control_idx)

        # Reset
        if (received_frame.reset_reason != NONE) and \
                (received_frame.reset_reason is not None):
            self.done = True
            self.printConsole("reset reason: " + str(received_frame.reset_reason))
        else:
            self.done = False

        self.state = next_state

        # get action
        self.action = self.trainers.action(self.state)

        self.wheels = np.zeros(self.number_of_robots * 2)
        self.wheels[2 * self.control_idx] = self.max_linear_velocity * \
            (self.action[1] - self.action[2] + self.action[3] - self.action[4])
        self.wheels[2 * self.control_idx + 1] = self.max_linear_velocity * \
            (self.action[1] - self.action[2] - self.action[3] + self.action[4])

        # Send the non-controlled robots to the side of the field
        for i in range(self.number_of_robots):
            if i == self.control_idx:
                continue
            if (i == 0) or (i == 2):
                x = self.cur_my[i][X]
                y = -1.35
            elif (i == 1) or (i == 3):
                x = self.cur_my[i][X]
                y = 1.35
            else:
                x = -2.1
                y = 0
            self.position(i, x, y)

        # increment global step counter
        # (count only while the controlled robot is active)
        if self.cur_my[self.control_idx][ACTIVE]:
            self.train_step += 1
            self.rwd_sum += reward

        self.printConsole('step: ' + str(self.train_step))
        set_wheel(self, self.wheels.tolist())

        ##############################################################################
        # plot every 6000 steps (about 5 minutes)
        if ((self.train_step % self.stats_steps) == 0) \
                and (self.train_step < 1992001):
            stats = [self.rwd_sum]
            for i in range(len(stats)):
                U.get_session().run(self.update_ops[i], feed_dict={
                    self.summary_placeholders[i]: float(stats[i])
                })
            summary_str = U.get_session().run(self.summary_op)
            self.summary_writer.add_summary(summary_str, self.train_step - 6000)
            self.rwd_sum = 0

            # load the next model checkpoint
            print('Loading %s' % self.train_step)
            U.load_state("./save_model/aiwc_maddpg-%s" % self.train_step)

        ##############################################################################
        if (received_frame.reset_reason == GAME_END):
            # (virtual finish() in random_walk.cpp)
            # save your data
            with open(args.datapath + '/result.txt', 'w') as output:
                # output.write('yourvariables')
                output.close()
            # unsubscribe; reset or leave
            yield self.sub.unsubscribe()
            try:
                yield self.leave()
            except Exception as e:
                self.printConsole("Error: {}".format(e))

        self.end_of_frame = False
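# Evaluation-harness note: every stats_steps control steps the block above
# logs the reward accumulated over the window that just ended (hence the
# train_step - 6000 summary index) and then hot-swaps the next checkpoint
# into the live session with U.load_state, so one long match evaluates a
# whole series of saved models.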
def train(arglist):
    with U.single_threaded_session():
        # Create environment
        env = make_env(arglist.scenario, arglist)
        n = len(env.agents)

        # Create agent trainers
        obs_shape_n = [env.observation_space[i].shape for i in range(n)]
        trainers = []
        for i in range(n):
            trainers.append(
                MADDPGAgentTrainer("agent_%d" % i, mlp_model, obs_shape_n,
                                   env.action_space, i, arglist,
                                   local_q_func=False))
        saver = tf.train.Saver(max_to_keep=None)

        # Initialize
        U.initialize()

        # Load previous results, if necessary
        if arglist.load_dir == "":
            arglist.load_dir = arglist.save_dir
        if arglist.restore:
            print('Loading previous state...')
            saver.restore(U.get_session(), arglist.load_dir)

        rewards = np.zeros((1, n))  # agent reward per step
        obs_n = env.reset()
        episode_number = 0
        episode_step = 0
        train_step = 0
        t_start = time.time()
        # replay-buffer fill fraction; recomputed at the end of every episode
        er_fill_frac_min = 0.

        # stats buffers
        step_info = {
            'dist': np.zeros((arglist.max_episode_len, n, n)),
            'speed': np.zeros((arglist.max_episode_len, n,)),
            'health': np.zeros((arglist.max_episode_len, n,)),
            'fire': np.zeros((arglist.max_episode_len, n,)),
            'bite': np.zeros((arglist.max_episode_len, n, n)),
            'hit': np.zeros((arglist.max_episode_len, n, n))
        }
        episode_info = {
            'dist': np.zeros((arglist.num_episodes, n, n)),
            'speed': np.zeros((arglist.num_episodes, n,)),
            'health': np.zeros((arglist.num_episodes, n,)),
            'fire': np.zeros((arglist.num_episodes, n,)),
            'bite': np.zeros((arglist.num_episodes, n, n)),
            'hit': np.zeros((arglist.num_episodes, n, n))
        }

        print('Starting iterations...')
        while True:
            # get action
            action_n = [agent.action(obs) for agent, obs in zip(trainers, obs_n)]
            # environment step
            new_obs_n, rew_n, done_n, info_n = env.step(action_n)
            # update episode step stats
            for key in step_info:
                step_info[key][episode_step] = info_n[key]
            episode_step += 1
            done = all(done_n)
            terminal = (episode_step >= arglist.max_episode_len)
            # collect experience
            for i, agent in enumerate(trainers):
                agent.experience(obs_n[i], action_n[i], rew_n[i],
                                 new_obs_n[i], done_n[i], terminal)
            obs_n = new_obs_n

            # record reward
            rewards[-1, :] += rew_n

            if done or terminal:
                obs_n = env.reset()
                episode_step = 0
                rewards = np.concatenate((rewards, np.zeros((1, n))))
                # aggregate step_info
                episode_info['dist'][episode_number] = np.mean(step_info['dist'], axis=0)
                episode_info['speed'][episode_number] = np.mean(step_info['speed'], axis=0)
                episode_info['health'][episode_number] = np.min(step_info['health'], axis=0)
                episode_info['fire'][episode_number] = np.sum(step_info['fire'], axis=0)
                episode_info['bite'][episode_number] = np.sum(step_info['bite'], axis=0)
                episode_info['hit'][episode_number] = np.sum(step_info['hit'], axis=0)
                # reset step_info
                for key in step_info:
                    step_info[key][:] = 0.

            # increment global step counter
            train_step += 1

            # for displaying policies while training
            if (arglist.display and (episode_number % arglist.display_rate == 0)
                    and episode_number > 0 and er_fill_frac_min >= 1.0):
                time.sleep(0.1)
                env.render()

            # update all trainers
            loss = None
            for agent in trainers:
                agent.preupdate()
            for agent in trainers:
                loss = agent.update(trainers, train_step)

            # save model, display training output
            if terminal:
                # check replay buffer status
                er_status = np.array([[len(t.replay_buffer), t.max_replay_buffer_len]
                                      for t in trainers])
                er_fill_frac = er_status[:, 0] / er_status[:, 1]
                er_fill_frac_min = er_fill_frac[np.argmin(er_fill_frac)]

                # print progress
                offset = -1 if train_step == 1 else -2
                print("steps: {}\tepisode: {}\treplay: {:.2f}%\treward: {}\ttime: {}"
                      .format(train_step, episode_number, er_fill_frac_min * 100,
                              "\t".join(['[', *["%.2f" % r for r in list(rewards[offset])], ']']),
                              round(time.time() - t_start, 3)))
                t_start = time.time()

                # save state
                if (episode_number % arglist.save_rate == 0) and er_fill_frac_min >= 1.0:
                    print("saving...", end='')
                    # save policy snapshot
                    snapshot_folder = "{}/{}".format(arglist.save_dir, arglist.exp_name)
                    os.makedirs(snapshot_folder, exist_ok=True)
                    saver.save(U.get_session(), snapshot_folder + '/session',
                               global_step=episode_number)
                    # save rewards
                    rew_file_name = arglist.plots_dir + arglist.exp_name + '_rewards.pkl'
                    with open(rew_file_name, 'wb') as fp:
                        pickle.dump(rewards, fp)
                    # save stats
                    for key in episode_info:
                        stats_file_name = "{}{}_{}.pkl".format(
                            arglist.plots_dir, arglist.exp_name, key)
                        with open(stats_file_name, 'wb') as fp:
                            pickle.dump(episode_info[key], fp)
                    print("done")

                episode_number += 1

            # stop once the requested number of episodes has been recorded
            if episode_number == arglist.num_episodes:
                print('...Finished total of {} episodes.'.format(episode_number))
                break
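# Note: rendering and checkpointing above are gated on er_fill_frac_min >= 1.0,
# i.e. they only begin once every agent's replay buffer has reached its
# max_replay_buffer_len and filled completely.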
def init_variables(self, info):
    # Here you have the information of the game (virtual init() in random_walk.cpp)
    # List: game_time, goal, number_of_robots, penalty_area, codewords,
    #       robot_height, robot_radius, max_linear_velocity, field, team_info,
    #       {rating, name}, axle_length, resolution, ball_radius
    # self.game_time = info['game_time']
    self.field = info['field']
    self.robot_size = 2 * info['robot_radius']
    self.goal = info['goal']
    self.max_linear_velocity = info['max_linear_velocity']
    self.number_of_robots = info['number_of_robots']
    self.end_of_frame = False

    ##################################################################
    # team info, 5 robots, (x, y, th, active, touch)
    self.cur_my = [[] for _ in range(self.number_of_robots)]

    self.cur_ball = []  # ball (x, y) position
    self.prev_ball = [0., 0.]  # previous ball (x, y) position

    # distance from each robot to the ball
    self.dist_ball = np.zeros(self.number_of_robots)
    # robot indices, ordered by distance to the ball
    self.idxs = [i for i in range(self.number_of_robots)]

    self.dlck_cnt = 0  # deadlock count
    # how many times the avoid-deadlock behavior was triggered
    self.avoid_dlck_cnt = 0

    self.wheels = np.zeros(self.number_of_robots * 2)

    ##################################################################
    self.state_dim = 2     # relative ball position
    self.history_size = 2  # frame history size
    self.action_dim = 2

    # Histories of the five robots.
    self.state = [np.zeros([self.state_dim * self.history_size])
                  for _ in range(self.number_of_robots)]

    self.arglist = Argument()
    # state dimension
    self.state_shape = (self.state_dim * self.history_size, )
    self.act_space = [Discrete(self.action_dim * 2 + 1)]
    self.trainers = MADDPGAgentTrainer('agent_moving', self.mlp_model,
                                       self.state_shape, self.act_space, 0,
                                       self.arglist, local_q_func=False)

    ##################################################################
    self.load_step_list = np.loadtxt('./test_step_list.txt')
    self.step_idx = 0  # index into self.load_step_list

    # Load previous results.
    if self.arglist.restore:
        self.printConsole('Loading previous state... %d' %
                          self.load_step_list[self.step_idx])
        U.load_state('./save_model/aiwc_maddpg-%d' %
                     self.load_step_list[self.step_idx])

    ##################################################################
    # for tensorboard
    self.summary_placeholders, self.update_ops, self.summary_op = \
        self.setup_summary()
    self.summary_writer = \
        tf.summary.FileWriter('summary/moving_test', U.get_session().graph)

    ##################################################################
    self.test_step = 0
    self.stats_steps = 12000  # for tensorboard, about 10 minutes

    self.scr_my = 0.   # my team score
    self.scr_op = 0.   # opponent team score
    self.scr_sum = 0   # score sum

    self.reset = False

    ##################################################################
    self.cur_time = time.time()  # for timing checks
    return
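# The checkpoint sweep above assumes ./test_step_list.txt holds one saved
# training step per line, readable by np.loadtxt. With checkpoints written
# every 12000 steps (matching the step_idx < 20 bound used in on_event), the
# file could be generated as below; the concrete step values are an
# illustrative assumption:
import numpy as np
np.savetxt('./test_step_list.txt', np.arange(12000, 252000, 12000), fmt='%d')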
def on_event(self, f):

    @inlineCallbacks
    def set_wheel(self, robot_wheels):
        yield self.call(u'aiwc.set_speed', args.key, robot_wheels)
        return

    def avoid_goal_foul(self):
        midfielder(self, self.idxs[0])
        midfielder(self, self.idxs[1])
        self.position(self.idxs[2], 0, 0)
        self.position(self.idxs[3], 0, 0)
        self.position(self.idxs[4], 0, 0)

    def avoid_penalty_foul(self):
        midfielder(self, self.idxs[0])
        midfielder(self, self.idxs[1])
        midfielder(self, self.idxs[2])
        self.position(self.idxs[3], 0, 0)
        self.position(self.idxs[4], 0, 0)

    def avoid_deadlock(self):
        self.position(0, self.cur_ball[X], 0)
        self.position(1, self.cur_ball[X], 0)
        self.position(2, self.cur_ball[X], 0)
        self.position(3, self.cur_ball[X], 0)
        self.position(4, self.cur_ball[X], 0)
        # if the closest robot is somehow away from the ball,
        # or the deadlock has been avoided to some extent
        if (self.dist_ball[self.idxs[0]] > 0.13) or (self.avoid_dlck_cnt > 20):
            offense(self)

    def midfielder(self, robot_id):
        goal_dist = helper.distance(self.cur_my[robot_id][X], self.field[X] / 2,
                                    self.cur_my[robot_id][Y], 0)
        shoot_mul = 1
        dribble_dist = 0.426
        v = 5

        goal_to_ball_unit = helper.unit([self.field[X] / 2 - self.cur_ball[X],
                                         -self.cur_ball[Y]])
        delta = [self.cur_ball[X] - self.cur_my[robot_id][X],
                 self.cur_ball[Y] - self.cur_my[robot_id][Y]]

        if (self.dist_ball[robot_id] < 0.5) and (delta[X] > 0):
            # close to the ball and behind it: push through the ball
            self.position(robot_id,
                          self.cur_ball[X] + v * delta[X],
                          self.cur_ball[Y] + v * delta[Y])
        else:
            # otherwise take a dribbling position behind the ball, on the
            # line from the goal through the ball
            self.position(robot_id,
                          self.cur_ball[X] - dribble_dist * goal_to_ball_unit[X],
                          self.cur_ball[Y] - dribble_dist * goal_to_ball_unit[Y])

    def offense(self):
        midfielder(self, 0)
        midfielder(self, 1)
        midfielder(self, 2)
        midfielder(self, 3)
        midfielder(self, 4)

    def set_formation(self):
        # count how many robots are in the goal area
        goal_area_cnt = self.count_goal_area()
        # count how many robots are in the penalty area
        penalty_area_cnt = self.count_penalty_area()
        self.count_deadlock()

        if goal_area_cnt > 2:
            avoid_goal_foul(self)
            self.printConsole('avoid goal foul')
        elif penalty_area_cnt > 3:
            avoid_penalty_foul(self)
            self.printConsole('avoid penalty foul')
        elif self.dlck_cnt > 15:
            avoid_deadlock(self)
            self.printConsole('avoid deadlock')
            self.avoid_dlck_cnt += 1
        else:
            offense(self)
            self.printConsole('offense')

    # initiate empty frame
    received_frame = Frame()

    if 'time' in f:
        received_frame.time = f['time']
    if 'score' in f:
        received_frame.score = f['score']
    if 'reset_reason' in f:
        received_frame.reset_reason = f['reset_reason']
    if 'coordinates' in f:
        received_frame.coordinates = f['coordinates']
    if 'EOF' in f:
        self.end_of_frame = f['EOF']

    ##############################################################################
    if self.end_of_frame:
        # Robot and ball coordinates are available as
        # received_frame.coordinates[MY_TEAM|OP_TEAM][ROBOT_ID][X|Y|TH|ACTIVE|TOUCH]
        # and received_frame.coordinates[BALL][X|Y] (ROBOT_ID can be 0,1,2,3,4).
        self.get_coord(received_frame)
        self.idxs = self.get_idxs()

        # Reset
        if received_frame.reset_reason == SCORE_MYTEAM:
            self.reset = True
            self.scr_my += 1
            self.scr_sum += 1
            self.printConsole("reset reason: " + str(received_frame.reset_reason))
        elif received_frame.reset_reason == SCORE_OPPONENT:
            self.reset = True
            self.scr_op += 1
            self.scr_sum -= 1
            self.printConsole("reset reason: " + str(received_frame.reset_reason))
        elif (received_frame.reset_reason != NONE) or \
                (received_frame.reset_reason is None):
            self.reset = True
            self.printConsole("reset reason: " + str(received_frame.reset_reason))
        else:
            self.reset = False

        set_formation(self)  # rule-based formation
        set_wheel(self, self.wheels.tolist())
        self.prev_ball = self.cur_ball

        # increment global step counter
        self.test_step += 1
        self.printConsole('step: ' + str(self.test_step))
        if (self.test_step % 1200) == 0:
            self.printConsole('%d seconds' % (time.time() - self.cur_time))
            self.cur_time = time.time()

        ##############################################################################
        # plot every 12000 steps (about 10 minutes)
        if ((self.test_step % self.stats_steps) == 0) and (self.step_idx < 20):
            score_ratio = self.scr_my / self.scr_op if self.scr_op != 0. else 100
            stats = [score_ratio, self.scr_sum]
            for i in range(len(stats)):
                U.get_session().run(self.update_ops[i], feed_dict={
                    self.summary_placeholders[i]: float(stats[i])
                })
            summary_str = U.get_session().run(self.summary_op)
            self.summary_writer.add_summary(summary_str,
                                            self.load_step_list[self.step_idx])

            self.step_idx += 1
            self.scr_my, self.scr_op, self.scr_sum = 0, 0, 0

            # load the next model checkpoint
            print('Loading %s' % self.load_step_list[self.step_idx])
            U.load_state('./save_model/aiwc_maddpg-%d' %
                         self.load_step_list[self.step_idx])

        ##############################################################################
        if (received_frame.reset_reason == GAME_END):
            # (virtual finish() in random_walk.cpp)
            # save your data
            with open(args.datapath + '/result.txt', 'w') as output:
                # output.write('yourvariables')
                output.close()
            # unsubscribe; reset or leave
            yield self.sub.unsubscribe()
            try:
                yield self.leave()
            except Exception as e:
                self.printConsole("Error: {}".format(e))

        self.end_of_frame = False
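# The formation code above calls helper.distance and helper.unit, whose bodies
# are not included in this file. Minimal sketches, with signatures inferred
# from the call sites:
import math

def distance(x1, x2, y1, y2):
    # Euclidean distance between (x1, y1) and (x2, y2)
    return math.sqrt((x1 - x2) ** 2 + (y1 - y2) ** 2)

def unit(vec):
    # normalize a 2D vector [x, y]; the zero vector is returned unchanged
    norm = math.sqrt(vec[0] ** 2 + vec[1] ** 2)
    if norm == 0:
        return [0.0, 0.0]
    return [vec[0] / norm, vec[1] / norm]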