import numpy as np

# DQN, ReplayBuffer and the hyperparameters used below (sup_len, rb_capacity,
# mrb_capacity, minibatch_size, target_update, discount_factor, mlr) are
# assumed to be provided by the surrounding module/config.


class Agent(object):
    """Independent-DQN baseline; incentivize_multi() passes rewards through."""

    def __init__(self, obs_space, act_space, sess, n_agents, name):
        self.act_space = act_space
        self.n_agents = n_agents
        self.dqn = DQN(sess, obs_space, sup_len, act_space, n_agents, name)
        self.rb = ReplayBuffer(capacity=rb_capacity)
        self.train_cnt = 0
        self.sns_q = None

    def act_multi(self, obs, random):
        # `random` is a per-agent 0/1 mask: 1 -> random action, 0 -> greedy.
        q_values = self.dqn.get_q_values([obs])[0]
        r_action = np.random.randint(self.act_space, size=len(obs))
        action_n = ((random + 1) % 2) * q_values.argmax(axis=1) + random * r_action
        return action_n

    def incentivize_multi(self, info):
        # Baseline: no peer evaluation, the environment reward is returned as-is.
        state, action, reward, next_state, done = info
        return reward

    def add_to_memory(self, exp):
        self.rb.add_to_memory(exp)

    def sync_target(self):
        self.dqn.training_target_qnet()

    def train(self, use_rx):
        data = self.rb.sample_from_memory(minibatch_size)
        state = np.asarray([x[0] for x in data])
        action = np.asarray([x[1] for x in data])
        base_reward = np.asarray([x[2] for x in data])
        next_state = np.asarray([x[3] for x in data])
        done = np.asarray([x[4] for x in data])
        not_done = (done + 1) % 2
        if use_rx:
            # Add the incentives received from peers to the base reward.
            rx_inc = np.asarray([x[5] for x in data])
            reward = base_reward + rx_inc
        else:
            reward = base_reward
        td_error, _ = self.dqn.training_qnet(state, action, reward, not_done, next_state)
        self.train_cnt += 1
        if self.train_cnt % target_update == 0:
            self.dqn.training_target_qnet()
        return td_error
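
# Usage sketch (illustrative; eps and the dummy shapes are assumptions, not
# taken from the source): the `random` argument of act_multi() is a per-agent
# 0/1 mask, which lets a caller implement per-agent epsilon-greedy exploration.
def example_epsilon_greedy_mask(n_agents=4, act_space=5, eps=0.1):
    mask = (np.random.rand(n_agents) < eps).astype(int)    # 1 -> explore
    q_values = np.random.rand(n_agents, act_space)         # dummy Q-values
    r_action = np.random.randint(act_space, size=n_agents)
    # Same mixing arithmetic as act_multi() above:
    return ((mask + 1) % 2) * q_values.argmax(axis=1) + mask * r_action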

class Agent(object):
    """DQN agent whose critic additionally conditions on a global state."""

    def __init__(self, global_state_space, obs_space, act_space, n_agents, sess, name):
        self.act_space = act_space
        self.obs_space = obs_space
        self.n_agents = n_agents
        self.dqn = DQN(sess, global_state_space, obs_space, act_space, n_agents, name)
        self.rb = ReplayBuffer(capacity=rb_capacity)
        self.train_cnt = 0

    def act_multi(self, obs, random):
        if random.all():
            # Every agent explores: skip the Q-network forward pass entirely.
            return np.random.randint(self.act_space, size=len(obs))
        q_values = self.dqn.get_q_values([obs])[0]
        r_action = np.random.randint(self.act_space, size=len(obs))
        action_n = ((random + 1) % 2) * q_values.argmax(axis=1) + random * r_action
        return action_n

    def add_to_memory(self, exp):
        self.rb.add_to_memory(exp)

    def sync_target(self):
        self.dqn.training_target_qnet()

    def train(self):
        data = self.rb.sample_from_memory(minibatch_size)
        state = np.asarray([x[0] for x in data])
        action = np.asarray([x[1] for x in data])
        reward = np.asarray([x[2] for x in data])
        next_state = np.asarray([x[3] for x in data])
        done = np.asarray([x[4] for x in data])
        global_state = np.asarray([x[5] for x in data])
        next_global_state = np.asarray([x[6] for x in data])
        not_done = (done + 1) % 2
        td_error, _ = self.dqn.training_qnet(global_state, state, action, reward,
                                             not_done, next_global_state, next_state)
        self.train_cnt += 1
        if self.train_cnt % target_update == 0:
            self.dqn.training_target_qnet()
        return td_error
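
# Transition-layout sketch (a hypothetical helper, not from the source):
# train() above unpacks x[0]..x[6], so add_to_memory() is expected to receive
# transitions packed in exactly this index order.
def make_experience(obs, action, reward, next_obs, done,
                    global_state, next_global_state):
    """Pack a transition in the index order that train() unpacks."""
    return (obs, action, reward, next_obs, done, global_state, next_global_state)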

class Agent(object):
    """Peer-evaluation agent: the TD error of its peer-evaluation DQN is
    returned by incentivize_multi() and sent to other agents as an incentive."""

    def __init__(self, obs_space, act_space, sess, n_agents, name):
        self.obs_space = obs_space
        self.act_space = act_space
        self.n_agents = n_agents
        self.pe_dqn = DQN(sess, obs_space, act_space, n_agents, name)
        self.rb = ReplayBuffer(capacity=rb_capacity)
        self.train_cnt = 0
        self.sns_q = None

    def reset(self):
        self.sns_q = None

    def act_multi(self, obs, random):
        if self.sns_q is None:
            q_values = self.pe_dqn.get_aq_values([obs])[0]
        else:
            # Reuse the next-state Q-values cached by incentivize_multi()
            # to avoid a second forward pass.
            q_values = self.sns_q
        r_action = np.random.randint(self.act_space, size=len(obs))
        action_n = ((random + 1) % 2) * q_values.argmax(axis=1) + random * r_action
        return action_n

    def incentivize_multi(self, info):
        state, action, reward, next_state, done = info
        done = done.all()
        # One joint forward pass: action-Q values (next-state ones cached for
        # act_multi) plus the target/peer Q-values used for the TD evaluation.
        [[_, self.sns_q], [ls_q, lns_q]] = self.pe_dqn.get_aq_tmq_values([state, next_state])
        s_q = ls_q[range(self.n_agents), action]
        ns_q = discount_factor * lns_q.max(axis=1) * (not done) + reward
        td = ns_q - s_q
        if done:
            self.sns_q = None
        return td

    def add_to_memory(self, exp):
        self.rb.add_to_memory(exp)

    def sync_target(self):
        self.pe_dqn.training_target_qnet()

    def train(self, use_rx):
        # Train the action DQN with recent data.
        data = self.rb.sample_from_memory(minibatch_size)
        state = np.asarray([x[0] for x in data])
        action = np.asarray([x[1] for x in data])
        base_reward = np.asarray([x[2] for x in data])
        next_state = np.asarray([x[3] for x in data])
        done = np.asarray([x[4] for x in data])
        not_done = (done + 1) % 2
        if use_rx:
            rx_inc = np.asarray([x[5] for x in data])
            reward = base_reward + rx_inc
        else:
            rx_inc = np.zeros(1)
            reward = base_reward
        td_error, _ = self.pe_dqn.training_a_qnet(state, action, reward, not_done, next_state)
        self.train_cnt += 1
        peer_update = False
        if self.train_cnt % target_update == 0:
            self.pe_dqn.training_target_qnet()
            self.pe_dqn.training_peer_qnet()
            peer_update = True
        return td_error, peer_update, np.abs(rx_inc).mean()
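
# Sketch of how the TD evaluations returned by incentivize_multi() could be
# turned into the rx_inc term consumed by train(use_rx=True). The aggregation
# rule below is a hypothetical example, not taken from the source:
def exchange_incentives(td_per_agent):
    """Each agent receives the mean TD evaluation of the *other* agents."""
    td = np.asarray(td_per_agent, dtype=float)
    return (td.sum() - td) / max(len(td) - 1, 1)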

class Agent(object):
    """PED-DQN agent with a separate mission DQN and two replay buffers: a
    recent buffer for the action DQN, a longer one for the mission DQN."""

    def __init__(self, obs_space, act_space, sess, n_agents, name):
        self.act_space = act_space
        self.n_agents = n_agents
        self.ped_dqn = DQN(sess, obs_space, sup_len, act_space, n_agents, name)
        self.action_rb = ReplayBuffer(capacity=rb_capacity)
        self.mission_rb = ReplayBuffer(capacity=mrb_capacity)
        self.train_cnt = 0
        self.mission_train_cnt = 0
        self.sns_q = None

    def reset(self):
        self.sns_q = None

    def act_multi(self, obs, random):
        if self.sns_q is None:
            q_values = self.ped_dqn.get_aq_values([obs])[0]
        else:
            # Reuse the next-state Q-values cached by incentivize_multi().
            q_values = self.sns_q
        r_action = np.random.randint(self.act_space, size=len(obs))
        action_n = ((random + 1) % 2) * q_values.argmax(axis=1) + random * r_action
        return action_n

    def incentivize_multi(self, info):
        state, action, reward, next_state, done = info
        done = done.all()
        [[_, self.sns_q], [ls_q, lns_q]] = self.ped_dqn.get_aq_pmq_values([state, next_state])
        s_q = ls_q[range(self.n_agents), action]
        ns_q = discount_factor * lns_q.max(axis=1) * (not done) + reward
        td = ns_q - s_q
        if done:
            self.sns_q = None
        return td

    def add_to_memory(self, exp):
        self.action_rb.add_to_memory(exp)
        # The mission buffer stores only (s, a, r, s', done), no incentives.
        self.mission_rb.add_to_memory(exp[:5])

    def sync_target(self):
        self.ped_dqn.training_target_qnet()

    def train_mission_dqn(self):
        # Train the mission DQN with recent and old data, on base rewards only.
        data = self.mission_rb.sample_from_memory(minibatch_size)
        state = np.asarray([x[0] for x in data])
        action = np.asarray([x[1] for x in data])
        base_reward = np.asarray([x[2] for x in data])
        next_state = np.asarray([x[3] for x in data])
        done = np.asarray([x[4] for x in data])
        not_done = (done + 1) % 2
        mtd, _ = self.ped_dqn.training_m_qnet(state, action, base_reward,
                                              not_done, next_state, mlr)
        return mtd

    def train(self, use_rx):
        data = self.action_rb.sample_from_memory(minibatch_size)
        state = np.asarray([x[0] for x in data])
        action = np.asarray([x[1] for x in data])
        base_reward = np.asarray([x[2] for x in data])
        next_state = np.asarray([x[3] for x in data])
        done = np.asarray([x[4] for x in data])
        not_done = (done + 1) % 2
        if use_rx:
            rx_inc = np.asarray([x[5] for x in data])
            reward = base_reward + rx_inc
        else:
            reward = base_reward
        td_error, _ = self.ped_dqn.training_a_qnet(state, action, reward, not_done, next_state)
        self.train_cnt += 1
        if self.train_cnt % target_update == 0:
            self.ped_dqn.training_target_qnet()
            self.ped_dqn.training_peer_qnet()
        return td_error
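
# End-to-end interaction sketch, assuming a Gym-like multi-agent `env` whose
# step() returns per-agent vectors, and the hypothetical exchange_incentives()
# helper above; the every-step training cadence is also an assumption.
def run_episode(env, agent, n_agents, eps=0.1):
    obs = env.reset()
    done = np.zeros(n_agents, dtype=int)
    while not done.all():
        mask = (np.random.rand(n_agents) < eps).astype(int)
        action = agent.act_multi(obs, mask)
        next_obs, reward, done, _ = env.step(action)
        td = agent.incentivize_multi((obs, action, reward, next_obs, done))
        rx_inc = exchange_incentives(td)      # peers' evaluations received
        agent.add_to_memory((obs, action, reward, next_obs, done, rx_inc))
        agent.train(use_rx=True)              # action DQN on shaped reward
        agent.train_mission_dqn()             # mission DQN on base reward
        obs = next_obs
    agent.reset()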