def __init__(self, env): logger.info("SchedNet trainer is created") self._env = env self._eval = Evaluation() self._agent_profile = self._env.get_agent_profile() self._n_predator = self._agent_profile['predator']['n_agent'] self._n_prey = self._agent_profile['prey']['n_agent'] # State and obs additionally include history information self._state_dim = self._env.get_info( )[0]['state'].shape[0] + self._n_predator self._obs_dim = obs_dim = self._agent_profile['predator']['obs_dim'][ 0] + 1 # Predator agent self._predator_agent = PredatorAgent( n_agent=self._agent_profile['predator']['n_agent'], action_dim=self._agent_profile['predator']['act_dim'], state_dim=self._state_dim, obs_dim=self._obs_dim) # Prey agent (randomly moving) self._prey_agent = [] for _ in range(self._n_prey): self._prey_agent.append(RandomAgent(5)) self.epsilon = 0.5 # Init value for epsilon if FLAGS.gui: # Enable GUI self.canvas = canvas.Canvas(self._n_predator, 1, FLAGS.map_size) self.canvas.setup()
def __init__(self, env): logger.info("Centralized DQN Trainer is created") self._env = env self._eval = Evaluation() self._n_predator = FLAGS.n_predator self._n_prey = FLAGS.n_prey self.action_dim = self._env.call_action_dim() self.state_dim = self._env.call_state_dim() self._agent = Agent(self.action_dim, self.state_dim) self.epsilon = 1.0
def __init__(self, n_agent, action_dim, state_dim, obs_dim, name=""):
    logger.info("Predator Agent is created")

    self._n_agent = n_agent
    self._state_dim = state_dim
    self._action_dim_per_unit = action_dim
    self._obs_dim_per_unit = obs_dim
    self._obs_dim = self._obs_dim_per_unit * self._n_agent

    self._name = name
    self.update_cnt = 0

    # Make Networks
    tf.reset_default_graph()
    my_graph = tf.Graph()

    with my_graph.as_default():
        self.sess = tf.Session(graph=my_graph,
                               config=tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True)))

        self.action_selector = ActionSelectorNetwork(self.sess, self._n_agent,
                                                     self._obs_dim_per_unit,
                                                     self._action_dim_per_unit, self._name)
        self.weight_generator = WeightGeneratorNetwork(self.sess, self._n_agent, self._obs_dim)
        self.critic = CriticNetwork(self.sess, self._n_agent, self._state_dim, self._name)

        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()

        if FLAGS.load_nn:
            if FLAGS.nn_file == "":
                logger.error("No file for loading Neural Network parameter")
                exit()
            self.saver.restore(self.sess, FLAGS.nn_file)

    self.replay_buffer = ReplayBuffer()
    self._eval = Evaluation()
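# Minimal sketch (not part of the original file) of the session-isolation pattern used
# above: the agent builds its networks inside a private tf.Graph so that graph resets
# elsewhere cannot clobber its variables, and GPU memory is allocated lazily via
# allow_growth. The variable name below is illustrative only.
import tensorflow as tf

def make_isolated_session():
    graph = tf.Graph()
    with graph.as_default():
        # Stand-in "network": a single trainable variable, assumed for this sketch only.
        dummy_weight = tf.get_variable("dummy_weight", shape=[4, 4])
        sess = tf.Session(graph=graph,
                          config=tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True)))
        sess.run(tf.global_variables_initializer())
        saver = tf.train.Saver()
    return graph, sess, saver

# A checkpoint restored with saver.restore(sess, path) must come from a graph with the
# same variable names, which is why the loader above exits early when FLAGS.nn_file is empty.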
def __init__(self, env): logger.info("Centralized DQN Trainer is created") self._env = env self._eval = Evaluation() self._n_predator = FLAGS.n_predator self._n_prey = FLAGS.n_prey self._agent_profile = self._env.get_agent_profile() self._agent_precedence = self._env.agent_precedence self._agent = Agent(self._agent_profile["predator"]["act_dim"], self._agent_profile["predator"]["obs_dim"][0]) self._prey_agent = AcAgent(5) self.epsilon = 1.0 if FLAGS.load_nn: self.epsilon = epsilon_min if FLAGS.gui: self.canvas = canvas.Canvas(self._n_predator, self._n_prey, FLAGS.map_size) self.canvas.setup()
def __init__(self, action_dim, obs_dim, name=""):
    logger.info("Centralized DQN Agent")

    self._obs_dim = obs_dim
    self._n_player = FLAGS.n_predator
    self._action_dim = action_dim * self._n_player
    self._action_dim_single = action_dim
    self._state_dim = obs_dim

    self._name = name
    self.update_cnt = 0
    self.target_update_period = 3000

    self.df = FLAGS.df
    self.lr = FLAGS.lr

    # Make Q-network
    tf.reset_default_graph()
    my_graph = tf.Graph()

    with my_graph.as_default():
        self.sess = tf.Session(graph=my_graph,
                               config=tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True)))
        self.q_network = DQNetwork(self.sess, self._state_dim, self._action_dim_single, self._n_player)
        self.sess.run(tf.global_variables_initializer())
        self.saver = tf.train.Saver()
        if FLAGS.load_nn:
            print "LOAD!"
            self.saver.restore(self.sess, FLAGS.nn_file)

    self.replay_buffer = ReplayBuffer()
    self._eval = Evaluation()
    self.q_prev = None

    self.ims = []
    plt.clf()
    self.fig = plt.figure()
    self.axes = plt.gca()
    plt.xticks(list(range(0, 25, 5)))
    plt.yticks(list(range(0, 25, 5)))
    self.axes.tick_params(axis='both', labelsize=15)
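# Sketch (an assumption, not taken from DQNetwork): target_update_period = 3000 above
# follows the usual DQN schedule of copying the online network into a frozen target
# network every N training updates. The helper below only illustrates that counter
# logic; sync_fn is a hypothetical callable that would run the copy ops.
def maybe_update_target(update_cnt, target_update_period, sync_fn):
    """Increment the update counter and call sync_fn() every target_update_period steps."""
    update_cnt += 1
    if update_cnt % target_update_period == 0:
        sync_fn()  # e.g. assign ops copying online-network weights into the target network
    return update_cnt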
class Trainer(object):

    def __init__(self, env):
        logger.info("Centralized DQN Trainer is created")

        self._env = env
        self._eval = Evaluation()
        self._n_predator = FLAGS.n_predator
        self._n_prey = FLAGS.n_prey

        self._agent_profile = self._env.get_agent_profile()
        self._agent_precedence = self._env.agent_precedence

        self._agent = Agent(self._agent_profile["predator"]["act_dim"],
                            self._agent_profile["predator"]["obs_dim"][0])
        self._prey_agent = AcAgent(5)

        self.epsilon = 1.0
        if FLAGS.load_nn:
            self.epsilon = epsilon_min

        if FLAGS.gui:
            self.canvas = canvas.Canvas(self._n_predator, self._n_prey, FLAGS.map_size)
            self.canvas.setup()

    def learn(self):
        step = 0
        episode = 0
        print_flag = False
        count = 1

        while step < training_step:
            episode += 1
            ep_step = 0
            obs = self._env.reset()
            state = self._env.get_full_encoding()[:, :, 2]
            total_reward = 0
            total_reward_pos = 0
            total_reward_neg = 0
            self.random_action_generator()

            while True:
                step += 1
                ep_step += 1
                action = self.get_action(obs, step, state)
                obs_n, reward, done, info = self._env.step(action)
                state_n = self._env.get_full_encoding()[:, :, 2]
                done_single = sum(done) > 0
                self.train_agents(state, action, reward, state_n, done_single)

                obs = obs_n
                state = state_n
                total_reward += np.sum(reward)
                if np.sum(reward) >= 0:
                    total_reward_pos += np.sum(reward)
                else:
                    total_reward_neg += np.sum(reward)

                if is_episode_done(done, step) or ep_step >= FLAGS.max_step:
                    # print step, ep_step, total_reward
                    if print_flag and episode % FLAGS.eval_step == 1:
                        print "[train_ep %d]" % (episode), "\treward", total_reward_pos, total_reward_neg
                    break

            if episode % FLAGS.eval_step == 0:
                self.test(episode)

        self._eval.summarize()

    def random_action_generator(self):
        rand_unit = np.random.uniform(size=(FLAGS.n_predator, 5))
        self.rand = rand_unit / np.sum(rand_unit, axis=1, keepdims=True)

    def get_action(self, obs, step, state, train=True):
        act_n = []
        if train == True:
            self.epsilon = max(self.epsilon - epsilon_dec, epsilon_min)

        # Action of predator
        action_list = self._agent.act(state)
        for i in range(self._n_predator):
            if train and (step < FLAGS.m_size * FLAGS.pre_train_step or np.random.rand() < self.epsilon):
                action = np.random.choice(5)
                act_n.append(action)
            else:
                act_n.append(action_list[i])

        # Action of prey
        for i in range(FLAGS.n_prey):
            act_n.append(self._prey_agent.act(state, i))
        # act_n[1] = 2

        return np.array(act_n, dtype=np.int32)

    def train_agents(self, state, action, reward, state_n, done):
        self._agent.train(state, action, reward, state_n, done)

    def test(self, curr_ep=None):
        step = 0
        episode = 0
        test_flag = FLAGS.kt
        sum_reward = 0
        sum_reward_pos = 0
        sum_reward_neg = 0

        while step < testing_step:
            episode += 1
            obs = self._env.reset()
            state = self._env.get_full_encoding()[:, :, 2]
            if test_flag:
                print "\nInit\n", state
            total_reward = 0
            total_reward_pos = 0
            total_reward_neg = 0
            ep_step = 0

            while True:
                step += 1
                ep_step += 1
                action = self.get_action(obs, step, state, False)
                obs_n, reward, done, info = self._env.step(action)
                state_n = self._env.get_full_encoding()[:, :, 2]
                state_next = state_to_index(state_n)

                if FLAGS.gui:
                    self.canvas.draw(state_next, done,
                                     "Score:" + str(total_reward) + ", Step:" + str(ep_step))

                if test_flag:
                    aa = raw_input('>')
                    if aa == 'c':
                        test_flag = False
                    print action
                    print state_n
                    print reward

                obs = obs_n
                state = state_n
                r = np.sum(reward)
                # if r == 0.1:
                #     r = r * (-1.) * FLAGS.penalty
                total_reward += r  # * (FLAGS.df ** (ep_step-1))
                if r > 0:
                    total_reward_pos += r
                else:
                    total_reward_neg -= r

                if is_episode_done(done, step, "test") or ep_step >= FLAGS.max_step:
                    if FLAGS.gui:
                        self.canvas.draw(state_next, done, "Hello",
                                         "Score:" + str(total_reward) + ", Step:" + str(ep_step))
                    break

            sum_reward += total_reward
            sum_reward_pos += total_reward_pos
            sum_reward_neg += total_reward_neg

        if FLAGS.scenario == "pursuit":
            print "Test result: Average steps to capture: ", curr_ep, float(step) / episode
            self._eval.update_value("training result: ", float(step) / episode, curr_ep)
        elif FLAGS.scenario == "endless" or FLAGS.scenario == "endless2" or FLAGS.scenario == "endless3":
            print "Average reward:", FLAGS.penalty, curr_ep, sum_reward / episode, sum_reward_pos / episode, sum_reward_neg / episode
            self._eval.update_value("training result: ", sum_reward / episode, curr_ep)
            self._agent.logging(sum_reward / episode, curr_ep * 100)
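# Sketch of the exploration rule used by get_action above: act randomly while the
# replay buffer is still being pre-filled (step < m_size * pre_train_step) or with
# probability epsilon, and follow the greedy Q-network action otherwise; epsilon
# decays linearly down to a floor. The q_values argument and the default constants
# here are illustrative assumptions, not values from the original flags.
import numpy as np

def epsilon_greedy(q_values, step, epsilon, n_actions=5,
                   pre_train_steps=1000, epsilon_dec=1e-5, epsilon_min=0.1):
    epsilon = max(epsilon - epsilon_dec, epsilon_min)   # linear decay with a floor
    if step < pre_train_steps or np.random.rand() < epsilon:
        action = np.random.choice(n_actions)            # explore
    else:
        action = int(np.argmax(q_values))               # exploit
    return action, epsilon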
class Trainer(object):

    def __init__(self, env):
        logger.info("SchedNet trainer is created")

        self._env = env
        self._eval = Evaluation()
        self._agent_profile = self._env.get_agent_profile()
        self._n_predator = self._agent_profile['predator']['n_agent']
        self._n_prey = self._agent_profile['prey']['n_agent']

        # State and obs additionally include history information
        self._state_dim = self._env.get_info()[0]['state'].shape[0] + self._n_predator
        self._obs_dim = obs_dim = self._agent_profile['predator']['obs_dim'][0] + 1

        # Predator agent
        self._predator_agent = PredatorAgent(n_agent=self._agent_profile['predator']['n_agent'],
                                             action_dim=self._agent_profile['predator']['act_dim'],
                                             state_dim=self._state_dim,
                                             obs_dim=self._obs_dim)
        # Prey agent (randomly moving)
        self._prey_agent = []
        for _ in range(self._n_prey):
            self._prey_agent.append(RandomAgent(5))

        self.epsilon = 0.5  # Init value for epsilon

        if FLAGS.gui:  # Enable GUI
            self.canvas = canvas.Canvas(self._n_predator, 1, FLAGS.map_size)
            self.canvas.setup()

    def learn(self):
        global_step = 0
        episode_num = 0
        print_flag = True

        while global_step < training_step:
            episode_num += 1
            step_in_ep = 0

            obs_n = self._env.reset()
            info_n = self._env.get_info()
            h_schedule_n = np.zeros(self._n_predator)  # schedule history
            obs_n, state, _ = self.get_obs_state_with_schedule(obs_n, info_n, h_schedule_n, init=True)

            total_reward = 0
            done = False

            while not done:
                global_step += 1
                step_in_ep += 1

                schedule_n, priority = self.get_schedule(obs_n, global_step, FLAGS.sched)
                action_n = self.get_action(obs_n, schedule_n, global_step)
                obs_n_without_schedule, reward_n, done_n, info_n = self._env.step(action_n)
                obs_n_next, state_next, h_schedule_n = self.get_obs_state_with_schedule(
                    obs_n_without_schedule, info_n, h_schedule_n, schedule_n)

                if FLAGS.gui:
                    self.canvas.draw(state_next * FLAGS.map_size, [0] * self._n_predator, "Train")

                done_single = sum(done_n) > 0
                self.train_agents(state, obs_n, action_n, reward_n, state_next,
                                  obs_n_next, schedule_n, priority, done_single)

                obs_n = obs_n_next
                state = state_next
                total_reward += np.sum(reward_n)

                if is_episode_done(done_n, global_step):
                    if FLAGS.gui:
                        self.canvas.draw(state_next * FLAGS.map_size, [0] * self._n_predator, "Train", True)
                    if print_flag:
                        print("[train_ep %d]" % (episode_num), "\tstep:", global_step,
                              "\tstep_per_ep:", step_in_ep, "\treward", total_reward)
                    done = True

                if FLAGS.eval_on_train and global_step % FLAGS.eval_step == 0:
                    self.test(global_step)
                    break

        self._predator_agent.save_nn(global_step)
        self._eval.summarize()

    def get_action(self, obs_n, schedule_n, global_step, train=True):
        act_n = [0] * len(obs_n)
        self.epsilon = max(self.epsilon - epsilon_dec, epsilon_min)

        # Action of predator
        if train and (global_step < FLAGS.m_size * FLAGS.pre_train_step
                      or np.random.rand() < self.epsilon):  # with prob. epsilon
            # Exploration
            predator_action = self._predator_agent.explore()
        else:
            # Exploitation
            predator_obs = [obs_n[i] for i in self._agent_profile['predator']['idx']]
            predator_action = self._predator_agent.act(predator_obs, schedule_n)

        for i, idx in enumerate(self._agent_profile['predator']['idx']):
            act_n[idx] = predator_action[i]

        # Action of prey
        for i, idx in enumerate(self._agent_profile['prey']['idx']):
            act_n[idx] = self._prey_agent[i].act(None)

        return np.array(act_n, dtype=np.int32)

    def get_schedule(self, obs_n, global_step, type, train=True):
        predator_obs = [obs_n[i] for i in self._agent_profile['predator']['idx']]

        if train and (global_step < FLAGS.m_size * FLAGS.pre_train_step
                      or np.random.rand() < self.epsilon):
            # Exploration: Schedule k random agent
            priority = np.random.rand(self._n_predator)
            i = np.argsort(-priority)[:FLAGS.s_num]
            ret = np.full(self._n_predator, 0.0)
            ret[i] = 1.0
            return ret, priority
        else:
            # Exploitation
            return self._predator_agent.schedule(predator_obs)

    def train_agents(self, state, obs_n, action_n, reward_n, state_next,
                     obs_n_next, schedule_n, priority, done):
        predator_obs = [obs_n[i] for i in self._agent_profile['predator']['idx']]
        predator_action = [action_n[i] for i in self._agent_profile['predator']['idx']]
        predator_reward = [reward_n[i] for i in self._agent_profile['predator']['idx']]
        predator_obs_next = [obs_n_next[i] for i in self._agent_profile['predator']['idx']]
        self._predator_agent.train(state, predator_obs, predator_action, predator_reward,
                                   state_next, predator_obs_next, schedule_n, priority, done)

    def get_h_obs_state(self, obs_n, state, h_schedule):
        obs_n_h = np.concatenate((obs_n[0:self._n_predator],
                                  h_schedule.reshape((self._n_predator, 1))), axis=1)
        obs_final = list()
        for i in range(self._n_predator):
            obs_final.append(obs_n_h[i])
        for i in range(self._n_prey):
            obs_final.append(obs_n[self._n_predator + i])

        obs_n = np.array(obs_final)
        state = np.concatenate((state, h_schedule), axis=-1)

        return obs_n, state

    def get_obs_state_with_schedule(self, obs_n_ws, info_n, h_schedule_n, schedule_n=None, init=False):
        if not init:
            h_schedule_n = self.update_h_schedule(h_schedule_n, schedule_n)

        obs_n_h = np.concatenate((obs_n_ws[0:self._n_predator],
                                  h_schedule_n.reshape((self._n_predator, 1))), axis=1)
        obs_final = list()
        for i in range(self._n_predator):
            obs_final.append(obs_n_h[i])
        for i in range(self._n_prey):
            obs_final.append(obs_n_ws[self._n_predator + i])

        obs_n = np.array(obs_final)
        state = np.concatenate((info_n[0]['state'], h_schedule_n), axis=-1)

        return obs_n, state, h_schedule_n

    def update_h_schedule(self, h_schedule, schedule_n):
        ret = h_schedule * 0.5 + schedule_n * 0.5
        return ret

    def print_obs(self, obs):
        for i in range(FLAGS.n_predator):
            print(obs[i])
        print("")

    def check_obs(self, obs):
        check_list = []
        for i in range(FLAGS.n_predator):
            check_list.append(obs[i][2])
        return np.array(check_list)

    def test(self, curr_ep=None):
        global_step = 0
        episode_num = 0
        total_reward = 0
        obs_cnt = np.zeros(self._n_predator)

        while global_step < testing_step:
            episode_num += 1
            step_in_ep = 0

            obs_n = self._env.reset()
            info_n = self._env.get_info()
            h_schedule_n = np.zeros(self._n_predator)
            obs_n, state, _ = self.get_obs_state_with_schedule(obs_n, info_n, h_schedule_n, init=True)

            while True:
                global_step += 1
                step_in_ep += 1

                schedule_n, priority = self.get_schedule(obs_n, global_step, FLAGS.sched)
                action_n = self.get_action(obs_n, schedule_n, global_step, False)
                obs_n_without_schedule, reward_n, done_n, info_n = self._env.step(action_n)
                obs_n_next, state_next, h_schedule_n = self.get_obs_state_with_schedule(
                    obs_n_without_schedule, info_n, h_schedule_n, schedule_n)
                obs_cnt += self.check_obs(obs_n_next)

                if FLAGS.gui:
                    self.canvas.draw(state_next * FLAGS.map_size, [0] * self._n_predator, "Test")

                obs_n = obs_n_next
                state = state_next
                total_reward += np.sum(reward_n)

                if is_episode_done(done_n, global_step, "test") or step_in_ep > FLAGS.max_step:
                    if FLAGS.gui:
                        self.canvas.draw(state_next * FLAGS.map_size, [0] * self._n_predator, "Test", True)
                    break

        print("Test result: Average steps to capture: ", curr_ep,
              float(global_step) / episode_num, "\t",
              float(total_reward) / episode_num, obs_cnt / episode_num)
        self._eval.update_value("test_result", float(global_step) / episode_num, curr_ep)
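# Sketch of the two scheduling utilities the SchedNet trainer relies on: turning a
# real-valued priority vector into a binary schedule that selects the top FLAGS.s_num
# agents (as in get_schedule's exploration branch), and the moving average over past
# schedules kept by update_h_schedule. Function names are illustrative; only the
# arithmetic mirrors the code above.
import numpy as np

def top_k_schedule(priority, k):
    """Return a 0/1 vector scheduling the k agents with the highest priority."""
    schedule = np.zeros_like(priority)
    schedule[np.argsort(-priority)[:k]] = 1.0
    return schedule

def update_schedule_history(h_schedule, schedule):
    """Blend the new schedule into the running history (weights 0.5 / 0.5)."""
    return 0.5 * h_schedule + 0.5 * schedule

# Example: with priority [0.2, 0.9, 0.4] and k=1, only agent 1 is scheduled, and the
# history appended to each observation decays older schedules geometrically.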
class Trainer(object):

    def __init__(self, env):
        logger.info("Centralized DQN Trainer is created")

        self._env = env
        self._eval = Evaluation()
        self._n_predator = FLAGS.n_predator
        self._n_prey = FLAGS.n_prey

        self.action_dim = self._env.call_action_dim()
        self.state_dim = self._env.call_state_dim()

        self._agent = Agent(self.action_dim, self.state_dim)
        self.epsilon = 1.0

    def learn(self):
        step = 0
        episode = 0
        print_flag = False
        array = np.zeros([FLAGS.training_step / FLAGS.eval_step, 4])

        while step < training_step:
            episode += 1
            ep_step = 0
            obs = self._env.reset()
            state = obs
            total_reward = 0

            while True:
                step += 1
                ep_step += 1
                action = self.get_action(obs, step, state)
                obs_n, reward, done, info = self._env.step(action)
                state_n = obs_n
                done_single = sum(done) > 0
                if ep_step >= FLAGS.max_step:
                    done_single = True
                self.train_agents(state, action, reward, state_n, done_single)

                obs = obs_n
                state = state_n
                total_reward += np.sum(reward) * (FLAGS.df ** (ep_step - 1))
                # if step % 100 == 0:
                #     print step, self._agent.q()

                if is_episode_done(done, step) or ep_step >= FLAGS.max_step:
                    if print_flag:
                        print "[train_ep %d]" % (episode), "\tstep:", step, "\tep_step:", ep_step, "\treward", total_reward
                    break

            if episode % FLAGS.eval_step == 0:
                self.test(episode)

        self._eval.summarize()

    def get_action(self, obs, step, state, train=True):
        act_n = []
        self.epsilon = max(self.epsilon - epsilon_dec, epsilon_min)

        # Action of predator
        action_list = self._agent.act(state)
        for i in range(self._n_predator):
            if train and (step < FLAGS.m_size * FLAGS.pre_train_step
                          or np.random.rand() < self.epsilon):  # with prob. epsilon
                action = np.random.choice(self.action_dim)
                act_n.append(action)
            else:
                act_n.append(action_list[i])

        return np.array(act_n, dtype=np.int32)

    def train_agents(self, state, action, reward, state_n, done):
        self._agent.train(state, action, reward, state_n, done)

    def test(self, curr_ep=None):
        step = 0
        episode = 0
        test_flag = FLAGS.kt
        sum_reward = 0

        while step < testing_step:
            episode += 1
            obs = self._env.reset()
            state = obs
            if test_flag:
                print "\nInit\n", state
            total_reward = 0
            ep_step = 0

            while True:
                step += 1
                ep_step += 1
                action = self.get_action(obs, step, state, False)
                obs_n, reward, done, info = self._env.step(action)
                state_n = obs_n

                if test_flag:
                    aa = raw_input('>')
                    if aa == 'c':
                        test_flag = False
                    print action
                    print state_n
                    print reward

                obs = obs_n
                state = state_n
                total_reward += np.sum(reward) * (FLAGS.df ** (ep_step - 1))

                if is_episode_done(done, step, "test") or ep_step >= FLAGS.max_step:
                    break

            sum_reward += total_reward

        print "Algorithm ", FLAGS.algorithm, ",Average reward: ", curr_ep, sum_reward / episode
        self._eval.update_value("test_result", sum_reward / episode, curr_ep)
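# Sketch of the return bookkeeping used in learn/test above: per-step rewards are
# accumulated with a discount of FLAGS.df per step, i.e. the episode return is
# sum over t of df**(t-1) * r_t with t starting at 1. The reward list in the example
# is made up purely for illustration.
import numpy as np

def discounted_return(rewards, df):
    """Return sum_t df**t * rewards[t], matching df**(ep_step - 1) with 1-based steps."""
    return sum((df ** t) * r for t, r in enumerate(rewards))

# e.g. discounted_return([1.0, 0.0, 1.0], df=0.9) == 1.0 + 0.0 + 0.81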