def gen_action(self, agent_list, observation, free_map=None):
    """Action generation method.

    This is a required method that generates a list of actions
    corresponding to the list of units.

    Args:
        agent_list (list): list of all friendly units.
        observation (np.array): 2d map of the partially observable state.
        free_map (np.array): 2d map of the static environment (optional).

    Returns:
        action_out (list): list of integers as actions selected for the team.

    Note:
        The graph is not updated in this session; the method only returns
        actions for the given input.
    """
    if not self._reset_done:
        self.reset_network_weight()

    obs = one_hot_encoder(observation, agent_list, self.input_shape,
                          reverse=not self.is_blue)
    action_prob = self.sess.run(self.action, feed_dict={self.state: obs})  # action probabilities

    # Renormalize each probability row and sample one action per agent.
    action_out = [np.random.choice(5, p=action_prob[x] / sum(action_prob[x]))
                  for x in range(len(agent_list))]

    return action_out
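# Note on the renormalization above: np.random.choice raises
# "probabilities do not sum to 1" when the float32 network output drifts
# slightly from 1.0, so each row is divided by its own sum before sampling.
# A minimal illustration (hypothetical values):
#
#     row = np.array([0.2, 0.2, 0.2, 0.2, 0.2001], dtype=np.float32)
#     np.random.choice(5, p=row / sum(row))  # safe; p=row alone may raise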
def gen_action(self, agent_list, observation, free_map=None):
    """Action generation method.

    This is a required method that generates a list of actions
    corresponding to the list of units.

    Args:
        agent_list (list): list of all friendly units.
        observation (np.array): 2d map of the partially observable state.
        free_map (np.array): 2d map of the static environment (optional).

    Returns:
        action_out (list): list of integers as actions selected for the team.

    Note:
        The graph is not updated in this session; the method only returns
        actions for the given input.
    """
    obs = one_hot_encoder(observation, agent_list, self.input_shape,
                          reverse=not self.is_blue)
    action_prob = self.sess.run(self.action, feed_dict={self.state: obs})  # action probabilities

    # If the policy is deterministic, return the argmax.
    # The flag can be changed with set_deterministic(bool).
    if self.deterministic:
        action_out = np.argmax(action_prob, axis=1).tolist()
    else:
        action_out = [np.random.choice(5, p=action_prob[x] / sum(action_prob[x]))
                      for x in range(len(agent_list))]

    return action_out
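# gen_action's comment references set_deterministic(bool). A minimal sketch,
# assuming the flag is simply the self.deterministic attribute read above;
# the actual setter in the repo may do more.
def set_deterministic(self, flag):
    # True -> argmax action selection; False -> stochastic sampling.
    self.deterministic = flag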
def get_action(self, raw_observation, agent_list, process_ids):
    state = one_hot_encoder(raw_observation, agent_list, VISION_RANGE)
    state_wide = one_hot_encoder_v2(raw_observation, agent_list, 19)

    p = self.sub_policy
    choices = [p[0].get_action(state), p[1].get_action(state_wide)]

    # For each agent, take the action proposed by its assigned sub-policy.
    action_out = [choices[pid][aid] for aid, pid in enumerate(process_ids)]

    return action_out
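# Example of the indexing above (hypothetical values): process_ids selects,
# per agent, which sub-policy's action list to read from.
#
#     choices = [[0, 1, 2, 3],   # actions proposed by sub-policy 0
#                [4, 4, 4, 4]]   # actions proposed by sub-policy 1
#     process_ids = [0, 1, 1, 0]
#     # -> action_out == [0, 4, 4, 3]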
def gen_action(self, agent_list, observation, free_map=None):
    state = one_hot_encoder(observation, agent_list, self.vision_range,
                            reverse=not self.is_blue)
    state_wide = one_hot_encoder_v2(observation, agent_list, 19,
                                    reverse=not self.is_blue)

    p = self.policy
    choices = [p[0].get_action(state), p[1].get_action(state_wide)]
    # choices = [p.get_action(state) for p in self.policy]

    # Assign agents to sub-policies in contiguous blocks: the first
    # fix_policy[0] agents act under sub-policy 0, the next fix_policy[1]
    # under sub-policy 1, and so on.
    action_out = []
    si, ei = 0, 0
    for pid, n in enumerate(self.fix_policy):
        ei += n
        action_out.extend(choices[pid][si:ei])
        si = ei

    return action_out
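# Example of the block assignment above (hypothetical values): with four
# agents and fix_policy = [2, 2], agents 0-1 act under sub-policy 0 and
# agents 2-3 under sub-policy 1.
#
#     choices = [[0, 1, 2, 3], [4, 5, 6, 7]]
#     fix_policy = [2, 2]
#     # -> action_out == [0, 1, 6, 7]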
def work(self, saver, writer):
    global global_rewards, global_ep_rewards, global_episodes, global_length, global_succeed
    total_step = 1
    local_ep = 0
    buffer = Experience_buffer(experience_shape=6, buffer_size=replay_capacity)

    # Epsilon-greedy exploration schedule for action selection.
    epsilon = 1.0
    epsilon_gamma = 0.9999
    epsilon_final = 0.1

    aloss = 0.0  # last training loss (0.0 until the first update)

    with self.sess.as_default(), self.sess.graph.as_default():
        while global_episodes < total_episodes:
            local_ep += 1
            raw_obs = self.env.reset()
            if partial_visible:
                s1 = one_hot_encoder(raw_obs, self.env.get_team_blue, VISION_RANGE)
            else:
                s1 = one_hot_encoder(self.env._env, self.env.get_team_blue, VISION_RANGE)

            # Episode parameters
            ep_r = 0
            prev_r = 0
            is_alive = [True] * num_blue
            episode_buffer = []

            for step in range(max_ep + 1):
                # Re-select the sub-policy every 15 steps.
                if step % 15 == 0:
                    pids = self.Network.run_network(np.expand_dims(s1, axis=0))[0]

                if random.random() < epsilon:
                    # Random exploration: one random action per blue agent.
                    # Epsilon decays only when an exploratory step is taken.
                    a = random.choices(range(action_size), k=num_blue)
                    epsilon = max(epsilon_final, epsilon * epsilon_gamma)
                else:
                    a = self.get_action(raw_obs, self.env.get_team_blue, pids)

                s0 = s1
                raw_obs, rc, d, info = self.env.step(a)
                if partial_visible:
                    s1 = one_hot_encoder(raw_obs, self.env.get_team_blue, VISION_RANGE)
                else:
                    s1 = one_hot_encoder(self.env._env, self.env.get_team_blue, VISION_RANGE)
                is_alive = info['blue_alive'][-1]

                # Reward shaping: per-step reward delta with a small time
                # penalty, and a large penalty if the episode times out.
                r = (rc - prev_r - 0.01)
                if step == max_ep and not d:
                    r = -100
                    rc = -100
                    d = True
                r /= 100.0
                ep_r += r

                # Push transitions for agents that were alive before the step.
                if step > 0:
                    was_alive = info['blue_alive'][-2]
                else:
                    was_alive = [True] * num_blue
                for idx in range(num_blue):
                    if was_alive[idx]:
                        episode_buffer.append([s0, a, r, s1, d, is_alive * 1])

                # Iteration
                prev_r = rc
                total_step += 1

                if d:
                    buffer.add(episode_buffer)
                    if local_ep % update_frequency == 0 and local_ep > 0:
                        batch = buffer.pop(size=batch_size, shuffle=True)
                        aloss = self.train(batch)
                        # buffer.flush()
                    break

            global_ep_rewards.append(ep_r)
            global_rewards.append(rc)
            global_length.append(step)
            global_succeed.append(self.env.blue_win)
            global_episodes += 1
            self.sess.run(global_step_next)
            progbar.update(global_episodes)

            if global_episodes % save_stat_frequency == 0 and global_episodes != 0:
                summary = tf.Summary()
                summary.value.add(tag='Records/mean_reward', simple_value=global_rewards())
                summary.value.add(tag='Records/mean_length', simple_value=global_length())
                summary.value.add(tag='Records/mean_succeed', simple_value=global_succeed())
                summary.value.add(tag='Records/mean_episode_reward', simple_value=global_ep_rewards())
                summary.value.add(tag='summary/loss', simple_value=aloss)
                writer.add_summary(summary, global_episodes)
                writer.flush()

            if global_episodes % save_network_frequency == 0:
                saver.save(self.sess, MODEL_PATH + '/ctf_policy.ckpt',
                           global_step=global_episodes)
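# The stat trackers above (global_rewards, global_length, global_succeed,
# global_ep_rewards) are both appended to and *called* to get the running
# mean. A minimal sketch of such a tracker, assuming a fixed-size window;
# the repo's actual implementation may differ.
from collections import deque

class MovingAverage:
    """Callable running mean over the most recent `size` values."""

    def __init__(self, size=100):
        self.window = deque(maxlen=size)

    def append(self, value):
        self.window.append(value)

    def __call__(self):
        # Mean of the current window; 0.0 while empty.
        return sum(self.window) / len(self.window) if self.window else 0.0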