Example #1
    def train(self, sess, buf, epsilon):

        batch_size = len(buf.obs)
        # Update value network
        feed = {self.obs: buf.obs_next}
        v_target_next, v_next = sess.run([self.v_target, self.v],
                                         feed_dict=feed)
        v_target_next = np.reshape(v_target_next, [batch_size])
        v_next = np.reshape(v_next, [batch_size])
        feed = {self.obs: buf.obs,
                self.v_target_next: v_target_next,
                self.reward: buf.reward}
        _, v = sess.run([self.v_op, self.v], feed_dict=feed)
        v = np.reshape(v, [batch_size])

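        # Update policy network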
        actions_1hot = util.process_actions(buf.action, self.l_action)
        feed = {self.obs: buf.obs,
                self.action_taken: actions_1hot,
                self.r_sampled: buf.r_sampled,
                self.reward: buf.reward,
                self.epsilon: epsilon}
        feed[self.v_next_ph] = v_next
        feed[self.v_ph] = v
        feed[self.action_others] = util.get_action_others_1hot_batch(
            buf.action_all, self.agent_id, self.l_action_for_r)
        _ = sess.run(self.policy_op, feed_dict=feed)

        # Update target network
        sess.run(self.list_update_v_ops)
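
The helper util.process_actions is used in every example but its body is not shown. From its call sites (a buffer of integer actions in, a one-hot matrix with l_action columns out), a minimal sketch could look like the following; the actual implementation in the source repository may differ.

import numpy as np

def process_actions(actions, l_action):
    # Convert a sequence of integer action indices into an
    # [n_steps, l_action] one-hot matrix.
    n_steps = len(actions)
    actions_1hot = np.zeros([n_steps, l_action], dtype=np.float32)
    actions_1hot[np.arange(n_steps), actions] = 1
    return actions_1hot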
Example #2
    def train_reward(self, sess, list_buf, list_buf_new, epsilon):
        """Training step for incentive function.

        Args:
            sess: TF session
            list_buf: list of all agents' experience buffers
            list_buf_new: list of all agents' buffers of new experiences, 
                          after policy updates
            epsilon: float exploration lower bound
        """
        buf_self = list_buf[self.agent_id]
        buf_self_new = list_buf_new[self.agent_id]

        n_steps = len(buf_self.obs)
        ones = np.ones(n_steps)

        feed = {}
        feed[self.epsilon] = epsilon
        for idx, agent_id_opp in enumerate(self.list_agent_id_opp):
            buf_other = list_buf[agent_id_opp]
            actions_other_1hot = util.process_actions(buf_other.action, self.l_action)
            feed[self.list_obs_opp[idx]] = buf_other.obs
            feed[self.list_action_taken_opp[idx]] = actions_other_1hot
            feed[self.list_r_ext_opp[idx]] = buf_other.reward
            feed[self.list_ones[idx]] = ones

            buf_other_new = list_buf_new[agent_id_opp]
            actions_other_1hot_new = util.process_actions(buf_other_new.action,
                                                          self.l_action)
            feed[self.list_opp_policy_new[idx].obs] = buf_other_new.obs
            feed[self.list_opp_policy_new[idx].action_taken] = actions_other_1hot_new

        n_steps = len(buf_self_new.obs)
        total_reward = buf_self_new.reward
        returns_new = util.process_rewards(total_reward, self.gamma)
        feed[self.obs] = buf_self.obs
        feed[self.action_others] = util.get_action_others_1hot_batch(
            buf_self.action_all, self.agent_id, self.l_action)
        feed[self.ones] = ones
        feed[self.returns] = returns_new

        if self.separate_cost_optimizer:
            _ = sess.run([self.reward_op, self.cost_op], feed_dict=feed)
        else:
            _ = sess.run(self.reward_op, feed_dict=feed)
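
util.process_rewards, which produces the values fed into self.returns, is also not part of the snippet. Given a reward sequence and a discount factor gamma, it plausibly computes the discounted return-to-go at each step; a sketch under that assumption:

import numpy as np

def process_rewards(rewards, gamma):
    # Discounted return-to-go: G_t = sum_{k >= t} gamma**(k - t) * r_k.
    returns = np.zeros(len(rewards))
    g = 0.0
    for t in reversed(range(len(rewards))):
        g = rewards[t] + gamma * g
        returns[t] = g
    return returns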
Example #3
    def train(self, sess, buf, epsilon):
        """On-policy training step.

        Args:
            sess: TF session
            buf: Buffer object
            epsilon: float
        """
        n_steps = len(buf.obs)
        actions_1hot = util.process_actions(buf.action, self.l_action)
        ones = np.ones(n_steps)
        feed = {self.obs: buf.obs,
                self.action_taken: actions_1hot,
                self.r_sampled: buf.r_sampled,
                self.r_ext: buf.reward,
                self.ones: ones,
                self.epsilon: epsilon}
        feed[self.action_others] = util.get_action_others_1hot_batch(
            buf.action_all, self.agent_id, self.l_action_for_r)

        _ = sess.run(self.policy_op, feed_dict=feed)
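
Every example also feeds util.get_action_others_1hot_batch into self.action_others. Judging by its arguments (per-step actions of all agents, the calling agent's id, and an action-space size), it presumably one-hot encodes the other agents' actions and concatenates them into one row per step. The helper name comes from the snippets, but the body below is only an illustrative guess.

import numpy as np

def get_action_others_1hot_batch(action_all, agent_id, l_action):
    # action_all: per-step list of every agent's integer action.
    actions = np.array(action_all)                    # [n_steps, n_agents]
    others = np.delete(actions, agent_id, axis=1)     # drop own column
    n_steps, n_others = others.shape
    onehot = np.zeros([n_steps, n_others, l_action], dtype=np.float32)
    rows = np.repeat(np.arange(n_steps), n_others)
    cols = np.tile(np.arange(n_others), n_steps)
    onehot[rows, cols, others.flatten()] = 1
    # One row per step: concatenation of the other agents' one-hot actions.
    return onehot.reshape(n_steps, n_others * l_action)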
Example #4
    def train_reward(self, sess, list_buf, list_buf_new, epsilon,
                     reg_coeff=1e-3, summarize=False, writer=None):
        buf_self = list_buf[self.agent_id]
        buf_self_new = list_buf_new[self.agent_id]
        n_steps = len(buf_self.obs)
        ones = np.ones(n_steps)
        feed = {}

        for agent in self.list_of_agents:
            other_id = agent.agent_id
            if other_id == self.agent_id:
                continue
            buf_other = list_buf[other_id]

            v_next = np.reshape(sess.run(
                agent.v, feed_dict={agent.obs: buf_other.obs_next}), [n_steps])
            v = np.reshape(sess.run(
                agent.v, feed_dict={agent.obs: buf_other.obs}), [n_steps])

            actions_other_1hot = util.process_actions(buf_other.action, self.l_action)
            feed[agent.obs] = buf_other.obs
            feed[agent.action_taken] = actions_other_1hot
            feed[agent.r_ext] = buf_other.reward
            feed[agent.epsilon] = epsilon
            feed[agent.v_next_ph] = v_next
            feed[agent.v_ph] = v

            # This is needed for the case N > 2. From agent i's perspective,
            # another agent j receives reward from a third agent k, so computing
            # j's policy update requires agent k's observation of all other
            # agents' actions (from agent k's perspective). In general, we
            # therefore feed action_others from every agent's perspective.
            feed[agent.action_others] = util.get_action_others_1hot_batch(
                buf_other.action_all, other_id, agent.l_action_for_r)

            buf_other_new = list_buf_new[other_id]
            actions_other_1hot_new = util.process_actions(buf_other_new.action,
                                                          self.l_action)
            other_policy_new = self.list_policy_new[other_id]
            feed[other_policy_new.obs] = buf_other_new.obs
            feed[other_policy_new.action_taken] = actions_other_1hot_new

        if self.include_cost_in_chain_rule:
            # Needed to compute the chain rule.
            # These are for the update from \theta to \hat{\theta}.
            action_self_1hot = util.process_actions(buf_self.action, self.l_action)
            feed[self.action_taken] = action_self_1hot
            feed[self.r_ext] = buf_self.reward
            feed[self.epsilon] = epsilon
            v_next = np.reshape(sess.run(
                self.v, feed_dict={self.obs: buf_self.obs_next}), [n_steps])
            v = np.reshape(sess.run(
                self.v, feed_dict={self.obs: buf_self.obs}), [n_steps])
            feed[self.v_next_ph] = v_next
            feed[self.v_ph] = v
            # These are needed for the factor
            # \nabla_{\hat{\theta}^j} J^i(\hat{\tau}, \hat{\theta}) when i == j
            action_self_1hot_new = util.process_actions(buf_self_new.action,
                                                        self.l_action)
            self_policy_new = self.list_policy_new[self.agent_id]
            feed[self_policy_new.obs] = buf_self_new.obs
            feed[self_policy_new.action_taken] = action_self_1hot_new

        feed[self.obs] = buf_self.obs
        feed[self.action_others] = util.get_action_others_1hot_batch(
            buf_self.action_all, self.agent_id, self.l_action_for_r)
        feed[self.ones] = ones

        n_steps = len(buf_self_new.obs)
        v_new = np.reshape(sess.run(
            self.v, feed_dict={self.obs: buf_self_new.obs}), [n_steps])
        v_next_new = np.reshape(sess.run(
            self.v, feed_dict={self.obs: buf_self_new.obs_next}), [n_steps])

        if self.include_cost_in_chain_rule:
            total_reward = [buf_self_new.reward[idx] + buf_self_new.r_from_others[idx]
                            - buf_self_new.r_given[idx] for idx in range(n_steps)]
        else:
            total_reward = buf_self_new.reward

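        # TD(0) error on the new trajectory: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)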
        feed[self.v_td_error] = total_reward + self.gamma*v_next_new - v_new

        if not (self.include_cost_in_chain_rule or self.separate_cost_optimizer):
            feed[self.reg_coeff] = reg_coeff

        if self.separate_cost_optimizer:
            _ = sess.run([self.reward_op, self.cost_op], feed_dict=feed)
        else:
            _ = sess.run(self.reward_op, feed_dict=feed)

        sess.run(self.list_update_v_ops)
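
Taken together, the examples imply an alternating schedule: each agent first takes a policy step on a batch of experience, then fresh experience is collected under the updated policies and used for the incentive-function step (this is what the list_buf / list_buf_new pair in train_reward refers to). The driver below is only a sketch of that schedule; run_episode, env, list_agents, and the epsilon schedule parameters are assumed names, not part of the snippets.

def alternating_training(sess, env, list_agents, run_episode, n_episodes,
                         epsilon_start, epsilon_end, epsilon_step):
    # Assumed driver loop: policy update on current data, then an
    # incentive-function update on data gathered with the new policies.
    epsilon = epsilon_start
    for _ in range(n_episodes):
        # Trajectories collected under the current policies.
        list_buf = run_episode(sess, env, list_agents, epsilon)
        for agent in list_agents:
            agent.train(sess, list_buf[agent.agent_id], epsilon)
        # Fresh trajectories under the updated policies, as train_reward expects.
        list_buf_new = run_episode(sess, env, list_agents, epsilon)
        for agent in list_agents:
            agent.train_reward(sess, list_buf, list_buf_new, epsilon)
        epsilon = max(epsilon_end, epsilon - epsilon_step)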