def train(self, sess, buf, epsilon):
    """Training step for the policy and value networks."""
    batch_size = len(buf.obs)

    # Update value network
    feed = {self.obs: buf.obs_next}
    v_target_next, v_next = sess.run([self.v_target, self.v],
                                     feed_dict=feed)
    v_target_next = np.reshape(v_target_next, [batch_size])
    v_next = np.reshape(v_next, [batch_size])

    feed = {self.obs: buf.obs,
            self.v_target_next: v_target_next,
            self.reward: buf.reward}
    _, v = sess.run([self.v_op, self.v], feed_dict=feed)
    v = np.reshape(v, [batch_size])

    # Update policy network
    actions_1hot = util.process_actions(buf.action, self.l_action)
    feed = {self.obs: buf.obs,
            self.action_taken: actions_1hot,
            self.r_sampled: buf.r_sampled,
            self.reward: buf.reward,
            self.epsilon: epsilon}
    feed[self.v_next_ph] = v_next
    feed[self.v_ph] = v
    feed[self.action_others] = util.get_action_others_1hot_batch(
        buf.action_all, self.agent_id, self.l_action_for_r)
    _ = sess.run(self.policy_op, feed_dict=feed)

    # Update target network
    sess.run(self.list_update_v_ops)

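# Illustrative sketch: the placeholders fed above suggest the graph regresses
# V(s) toward a bootstrapped target from the target network and uses a one-step
# advantage for the policy gradient. The helper below is hypothetical (not part
# of this repo), and `gamma` is an assumed discount factor:
import numpy as np

def td_target_and_advantage_sketch(reward, v, v_next, v_target_next, gamma=0.99):
    """Mirrors the quantities presumably consumed by v_op and policy_op."""
    reward = np.asarray(reward, dtype=np.float32)
    target = reward + gamma * np.asarray(v_target_next)              # critic regression target
    advantage = reward + gamma * np.asarray(v_next) - np.asarray(v)  # actor advantage
    return target, advantage
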
def train_reward(self, sess, list_buf, list_buf_new, epsilon):
    """Training step for incentive function.

    Args:
        sess: TF session
        list_buf: list of all agents' experience buffers
        list_buf_new: list of all agents' buffers of new experiences,
            collected after the policy updates
        epsilon: float exploration lower bound
    """
    buf_self = list_buf[self.agent_id]
    buf_self_new = list_buf_new[self.agent_id]
    n_steps = len(buf_self.obs)
    ones = np.ones(n_steps)

    feed = {}
    feed[self.epsilon] = epsilon
    for idx, agent_id_opp in enumerate(self.list_agent_id_opp):
        buf_other = list_buf[agent_id_opp]
        actions_other_1hot = util.process_actions(buf_other.action,
                                                  self.l_action)
        feed[self.list_obs_opp[idx]] = buf_other.obs
        feed[self.list_action_taken_opp[idx]] = actions_other_1hot
        feed[self.list_r_ext_opp[idx]] = buf_other.reward
        feed[self.list_ones[idx]] = ones

        buf_other_new = list_buf_new[agent_id_opp]
        actions_other_1hot_new = util.process_actions(buf_other_new.action,
                                                      self.l_action)
        feed[self.list_opp_policy_new[idx].obs] = buf_other_new.obs
        feed[self.list_opp_policy_new[idx].action_taken] = actions_other_1hot_new

    n_steps = len(buf_self_new.obs)
    total_reward = buf_self_new.reward
    returns_new = util.process_rewards(total_reward, self.gamma)

    feed[self.obs] = buf_self.obs
    feed[self.action_others] = util.get_action_others_1hot_batch(
        buf_self.action_all, self.agent_id, self.l_action)
    feed[self.ones] = ones
    feed[self.returns] = returns_new

    if self.separate_cost_optimizer:
        _ = sess.run([self.reward_op, self.cost_op], feed_dict=feed)
    else:
        _ = sess.run(self.reward_op, feed_dict=feed)

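# Illustrative sketch: `util.process_rewards` is assumed to compute discounted
# returns G_t = sum_{k >= t} gamma^(k - t) * r_k over the trajectory. The
# function below is a hypothetical stand-in, not the repo's helper:
import numpy as np

def process_rewards_sketch(rewards, gamma):
    returns = np.zeros(len(rewards), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running  # accumulate discounted future reward
        returns[t] = running
    return returns
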
def train(self, sess, buf, epsilon):
    """Training step for own policy using only extrinsic rewards."""
    n_steps = len(buf.obs)
    actions_1hot = util.process_actions(buf.action, self.l_action)
    ones = np.ones(n_steps)
    feed = {self.obs: buf.obs,
            self.action_taken: actions_1hot,
            self.r_ext: buf.reward,
            self.ones: ones,
            self.epsilon: epsilon}
    _ = sess.run(self.policy_op, feed_dict=feed)

def train_opp_model(self, sess, list_buf, epsilon):
    """Fits opponent models to the other agents' observed behavior.

    Args:
        sess: TF session
        list_buf: list of all agents' experience buffers
        epsilon: float exploration lower bound
    """
    for idx, agent_id_opp in enumerate(self.list_agent_id_opp):
        buf = list_buf[agent_id_opp]
        feed = {}
        feed[self.list_obs_opp[idx]] = buf.obs
        feed[self.list_action_taken_opp[idx]] = util.process_actions(
            buf.action, self.l_action)
        _ = sess.run(self.list_opp_op[idx], feed_dict=feed)

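# Illustrative sketch: `util.process_actions` is assumed to convert a sequence
# of integer action indices into a [n_steps, l_action] one-hot matrix. The
# function below is a hypothetical stand-in, not the repo's helper:
import numpy as np

def process_actions_sketch(actions, l_action):
    actions = np.asarray(actions, dtype=np.int64)
    one_hot = np.zeros((len(actions), l_action), dtype=np.float32)
    one_hot[np.arange(len(actions)), actions] = 1.0  # one row per time step
    return one_hot
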
def update(self, sess, buf, epsilon):
    """Training step for the prime policy, starting from a copy of the main policy."""
    sess.run(self.list_copy_main_to_prime_ops)

    n_steps = len(buf.obs)
    actions_1hot = util.process_actions(buf.action, self.l_action)
    ones = np.ones(n_steps)
    feed = {self.obs: buf.obs,
            self.action_taken: actions_1hot,
            self.r_ext: buf.reward,
            self.ones: ones,
            self.epsilon: epsilon}
    feed[self.r_from_others] = buf.r_from_others
    if self.include_cost_in_chain_rule:
        feed[self.r_given] = buf.r_given

    _ = sess.run(self.policy_op_prime, feed_dict=feed)

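# Illustrative sketch: `list_copy_main_to_prime_ops` is assumed to be a list of
# tf.assign ops that overwrite the "prime" copy of the policy variables with the
# main-network variables before the prime update. A TF1-style sketch of building
# such a list; the variable scopes and pairing-by-order are assumptions:
import tensorflow as tf

def build_copy_ops_sketch(main_scope, prime_scope):
    main_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, main_scope)
    prime_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, prime_scope)
    # Pair variables by position; assumes both scopes define identical networks.
    return [tf.assign(w_prime, w_main)
            for w_main, w_prime in zip(main_vars, prime_vars)]
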
def update(self, sess, buf, epsilon): """Training step for own policy. Args: sess: TF session buf: Buffer object epsilon: float exploration lower bound """ n_steps = len(buf.obs) actions_1hot = util.process_actions(buf.action, self.l_action) ones = np.ones(n_steps) feed = {self.obs: buf.obs, self.action_taken: actions_1hot, self.r_ext: buf.reward, self.ones: ones, self.epsilon: epsilon} feed[self.r_from_others] = buf.r_from_others _ = sess.run(self.policy_op, feed_dict=feed)
def train(self, sess, buf, epsilon): """On-policy training step. Args: sess: TF session buf: Buffer object epsilon: float """ n_steps = len(buf.obs) actions_1hot = util.process_actions(buf.action, self.l_action) ones = np.ones(n_steps) feed = {self.obs: buf.obs, self.action_taken: actions_1hot, self.r_sampled: buf.r_sampled, self.r_ext: buf.reward, self.ones: ones, self.epsilon: epsilon} feed[self.action_others] = util.get_action_others_1hot_batch( buf.action_all, self.agent_id, self.l_action_for_r) _ = sess.run(self.policy_op, feed_dict=feed)
def update(self, sess, buf, epsilon):
    """Training step for the value network and the prime policy."""
    sess.run(self.list_copy_main_to_prime_ops)

    batch_size = len(buf.obs)

    # Update value network
    feed = {self.obs: buf.obs_next}
    v_target_next, v_next = sess.run([self.v_target, self.v],
                                     feed_dict=feed)
    v_target_next = np.reshape(v_target_next, [batch_size])
    v_next = np.reshape(v_next, [batch_size])

    n_steps = len(buf.obs)
    if self.include_cost_in_chain_rule:
        total_reward = [buf.reward[idx] + buf.r_from_others[idx]
                        - buf.r_given[idx] for idx in range(n_steps)]
    else:
        total_reward = [buf.reward[idx] + buf.r_from_others[idx]
                        for idx in range(n_steps)]

    feed = {self.obs: buf.obs,
            self.v_target_next: v_target_next,
            self.total_reward: total_reward}
    _, v = sess.run([self.v_op, self.v], feed_dict=feed)
    v = np.reshape(v, [batch_size])

    # Update prime policy network
    actions_1hot = util.process_actions(buf.action, self.l_action)
    feed = {self.obs: buf.obs,
            self.action_taken: actions_1hot,
            self.r_ext: buf.reward,
            self.epsilon: epsilon}
    feed[self.r_from_others] = buf.r_from_others
    if self.include_cost_in_chain_rule:
        feed[self.r_given] = buf.r_given
    feed[self.v_next_ph] = v_next
    feed[self.v_ph] = v

    _ = sess.run(self.policy_op_prime, feed_dict=feed)

    # Update target network
    sess.run(self.list_update_v_ops)

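# Illustrative sketch: an equivalent vectorized form of the total_reward
# computation above, i.e. extrinsic reward plus incentives received, minus
# incentives given out when the cost enters the chain rule. The helper name is
# hypothetical and assumes the buffer fields are array-like:
import numpy as np

def total_reward_sketch(reward, r_from_others, r_given=None):
    total = np.asarray(reward, dtype=np.float32) + np.asarray(r_from_others, dtype=np.float32)
    if r_given is not None:  # include_cost_in_chain_rule case
        total -= np.asarray(r_given, dtype=np.float32)
    return total
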
def train(self, sess, buf, epsilon):
    """Training step for the actor and critic networks."""
    batch_size = len(buf.obs)

    # Update value network
    feed = {self.obs: buf.obs_next}
    if self.obs_image_vec:
        feed[self.obs_v] = buf.obs_v_next
    v_target_next, v_next = sess.run([self.v_target, self.v],
                                     feed_dict=feed)
    v_target_next = np.reshape(v_target_next, [batch_size])
    v_next = np.reshape(v_next, [batch_size])

    feed = {self.obs: buf.obs,
            self.v_target_next: v_target_next,
            self.reward: buf.reward}
    if self.obs_image_vec:
        feed[self.obs_v] = buf.obs_v
    _, v = sess.run([self.v_op, self.v], feed_dict=feed)
    v = np.reshape(v, [batch_size])

    # Update policy network
    actions_1hot = util.process_actions(buf.action, self.l_action)
    feed = {self.obs: buf.obs,
            self.action_taken: actions_1hot,
            self.reward: buf.reward,
            self.epsilon: epsilon}
    feed[self.v_next_ph] = v_next
    feed[self.v_ph] = v
    if self.obs_image_vec:
        feed[self.obs_v] = buf.obs_v

    _ = sess.run(self.policy_op, feed_dict=feed)

    # Update target network
    sess.run(self.list_update_v_ops)

def train_reward(self, sess, list_buf, list_buf_new, epsilon,
                 reg_coeff=1e-3, summarize=False, writer=None):
    """Training step for the incentive function.

    Args:
        sess: TF session
        list_buf: list of all agents' experience buffers
        list_buf_new: list of all agents' buffers of new experiences,
            collected after the policy updates
        epsilon: float exploration lower bound
        reg_coeff: float coefficient of the cost regularizer (fed only when
            the cost is not handled by the chain rule or a separate optimizer)
        summarize: bool
        writer: TF summary writer
    """
    buf_self = list_buf[self.agent_id]
    buf_self_new = list_buf_new[self.agent_id]
    n_steps = len(buf_self.obs)
    ones = np.ones(n_steps)

    feed = {}
    for agent in self.list_of_agents:
        other_id = agent.agent_id
        if other_id == self.agent_id:
            continue
        buf_other = list_buf[other_id]
        v_next = np.reshape(sess.run(
            agent.v, feed_dict={agent.obs: buf_other.obs_next}), [n_steps])
        v = np.reshape(sess.run(
            agent.v, feed_dict={agent.obs: buf_other.obs}), [n_steps])
        actions_other_1hot = util.process_actions(buf_other.action,
                                                  self.l_action)
        feed[agent.obs] = buf_other.obs
        feed[agent.action_taken] = actions_other_1hot
        feed[agent.r_ext] = buf_other.reward
        feed[agent.epsilon] = epsilon
        feed[agent.v_next_ph] = v_next
        feed[agent.v_ph] = v
        # This is needed for the case N > 2. From agent i's perspective,
        # another agent j will receive rewards from a third agent k, so to
        # compute j's policy update we need agent k's observation of all
        # other agents' actions (from agent k's perspective). In general we
        # therefore feed action_others from all agents' perspectives.
        feed[agent.action_others] = util.get_action_others_1hot_batch(
            buf_other.action_all, other_id, agent.l_action_for_r)

        buf_other_new = list_buf_new[other_id]
        actions_other_1hot_new = util.process_actions(buf_other_new.action,
                                                      self.l_action)
        other_policy_new = self.list_policy_new[other_id]
        feed[other_policy_new.obs] = buf_other_new.obs
        feed[other_policy_new.action_taken] = actions_other_1hot_new

    if self.include_cost_in_chain_rule:
        # Needed to compute the chain rule.
        # These are for the update from \theta to \hat{\theta}.
        action_self_1hot = util.process_actions(buf_self.action,
                                                self.l_action)
        feed[self.action_taken] = action_self_1hot
        feed[self.r_ext] = buf_self.reward
        feed[self.epsilon] = epsilon
        v_next = np.reshape(sess.run(
            self.v, feed_dict={self.obs: buf_self.obs_next}), [n_steps])
        v = np.reshape(sess.run(
            self.v, feed_dict={self.obs: buf_self.obs}), [n_steps])
        feed[self.v_next_ph] = v_next
        feed[self.v_ph] = v

        # These are needed for the factor
        # \nabla_{\hat{\theta}^j} J^i(\hat{\tau}, \hat{\theta}) when i == j.
        action_self_1hot_new = util.process_actions(buf_self_new.action,
                                                    self.l_action)
        self_policy_new = self.list_policy_new[self.agent_id]
        feed[self_policy_new.obs] = buf_self_new.obs
        feed[self_policy_new.action_taken] = action_self_1hot_new

    feed[self.obs] = buf_self.obs
    feed[self.action_others] = util.get_action_others_1hot_batch(
        buf_self.action_all, self.agent_id, self.l_action_for_r)
    feed[self.ones] = ones

    n_steps = len(buf_self_new.obs)
    v_new = np.reshape(sess.run(
        self.v, feed_dict={self.obs: buf_self_new.obs}), [n_steps])
    v_next_new = np.reshape(sess.run(
        self.v, feed_dict={self.obs: buf_self_new.obs_next}), [n_steps])
    if self.include_cost_in_chain_rule:
        total_reward = [buf_self_new.reward[idx] + buf_self_new.r_from_others[idx]
                        - buf_self_new.r_given[idx] for idx in range(n_steps)]
    else:
        total_reward = buf_self_new.reward
    feed[self.v_td_error] = total_reward + self.gamma*v_next_new - v_new

    if not (self.include_cost_in_chain_rule or self.separate_cost_optimizer):
        feed[self.reg_coeff] = reg_coeff

    if self.separate_cost_optimizer:
        _ = sess.run([self.reward_op, self.cost_op], feed_dict=feed)
    else:
        _ = sess.run(self.reward_op, feed_dict=feed)

    sess.run(self.list_update_v_ops)

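# Illustrative sketch of how these steps could be sequenced by a driver script.
# The driver loop, `run_episode`, and `list_agents` below are assumptions (the
# actual script is not shown here); the ordering is inferred from the method
# signatures and the `list_buf_new` docstring above:
def training_iteration_sketch(sess, list_agents, run_episode, epsilon):
    list_buf = run_episode(sess, list_agents)                  # rollout with current policies
    for agent in list_agents:
        agent.update(sess, list_buf[agent.agent_id], epsilon)  # policy update (theta -> theta_hat)
    list_buf_new = run_episode(sess, list_agents)              # rollout, assumed to use updated policies
    for agent in list_agents:
        agent.train_reward(sess, list_buf, list_buf_new, epsilon)  # incentive-function update
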