Example #1
# Imports assumed from the surrounding MADDPG-style module; q_train and p_train
# are defined earlier in the same file, and the maddpg helpers follow the
# openai/maddpg package layout. The snippet uses the TF1 API.
import numpy as np
import tensorflow as tf

import maddpg.common.tf_util as U
from maddpg import AgentTrainer
from maddpg.trainer.replay_buffer import ReplayBuffer


class MATD3AgentTrainer(AgentTrainer):
    def __init__(self,
                 name,
                 model,
                 obs_shape_n,
                 act_space_n,
                 agent_index,
                 args,
                 local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        # One observation placeholder per agent: the centralized critics take
        # every agent's observations and actions as input.
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(
                U.BatchInput(obs_shape_n[i],
                             name="observation" + str(i)).get())

        # Create all the functions necessary to train the model: two critics
        # (TD3-style twin Q networks) so the target can take the minimum of
        # their estimates, plus the policy.
        self.q_train1, self.q_update1, self.q_debug1 = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            agent_idx=agent_index,
            q_function_idx=1,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)
        self.q_train2, self.q_update2, self.q_debug2 = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            agent_idx=agent_index,
            q_func=model,
            q_function_idx=2,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)

        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            agent_idx=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.min_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None
        # Dump the current graph so it can be inspected in TensorBoard.
        writer = tf.summary.FileWriter("logdirMaddpg", tf.get_default_graph())
        writer.flush()
        writer.close()

    def action(self, obs):
        return self.act(obs[None])[0]

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))

    def preupdate(self):
        self.replay_sample_index = None

    @property
    def q_debug(self):
        return self.q_debug1

    def update(self, agents, train_step):
        if len(self.replay_buffer) < self.min_replay_buffer_len:
            # Replay buffer is not large enough yet.
            return

        if train_step % self.args.update_rate != 0:
            # Only update every update_rate steps.
            return

        self.replay_sample_index = self.replay_buffer.generate_sample_indices(
            self.args.batch_size)
        # collect replay sample from all agents
        obs_n = []
        obs_next_n = []
        act_n = []
        index = self.replay_sample_index
        for i in range(self.n):
            obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index)
            obs_n.append(obs)
            obs_next_n.append(obs_next)
            act_n.append(act)
        obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index)

        # Train the twin critics: build the TD target from each agent's target policy.
        target_act_next_n = [
            agents[i].p_debug['target_act'](obs_next_n[i])
            for i in range(self.n)
        ]
        if self.args.use_critic_noise:
            # TD3 target policy smoothing: perturb every agent's target action
            # with clipped Gaussian noise before evaluating the target critics.
            for agent_idx in range(self.n):
                noise = np.random.normal(
                    0,
                    self.args.critic_action_noise_stddev,
                    size=target_act_next_n[agent_idx].shape)
                clipped_noise = np.clip(noise, -self.args.action_noise_clip,
                                        self.args.action_noise_clip)
                target_act_next_n[agent_idx] = (target_act_next_n[agent_idx] +
                                                clipped_noise).tolist()
        elif self.args.use_critic_noise_self:
            # Smooth only this agent's own target action.
            noise = np.random.normal(
                0,
                self.args.critic_action_noise_stddev,
                size=target_act_next_n[self.agent_index].shape)
            clipped_noise = np.clip(noise, -self.args.action_noise_clip,
                                    self.args.action_noise_clip)
            target_act_next_n[self.agent_index] = (
                target_act_next_n[self.agent_index] + clipped_noise).tolist()
        target_q_next1 = self.q_debug1['target_q_values'](*(obs_next_n +
                                                            target_act_next_n))
        target_q_next2 = self.q_debug2['target_q_values'](*(obs_next_n +
                                                            target_act_next_n))
        # Clipped double-Q: take the element-wise minimum of the twin target estimates.
        target_q_next = np.min([target_q_next1, target_q_next2], axis=0)
        if self.args.critic_zero_if_done:
            # Zero the bootstrap term for terminal transitions.
            target_q_next[done.astype(bool)] = 0

        target_q = rew + self.args.gamma * target_q_next
        # Both critics regress onto the same target; note that q_loss ends up
        # holding only the second critic's loss.
        q_loss = self.q_train1(*(obs_n + act_n + [target_q]))
        q_loss = self.q_train2(*(obs_n + act_n + [target_q]))

        # Train the policy with delayed updates (TD3): the actor and all target
        # networks are updated only every policy_update_rate critic updates.
        if train_step % (self.args.update_rate *
                         self.args.policy_update_rate) == 0:
            p_loss = self.p_train(*(obs_n + act_n))
            self.p_update()
            self.q_update1()
            self.q_update2()

        # print('Agent' + str(self.agent_index)  + ' Qloss = ' + str(q_loss) + ' Ploss = ' + str(p_loss))
        # print('Replay buffer size:' + str(len(self.replay_buffer)))

        return [
            q_loss,
            np.mean(target_q),
            np.mean(rew),
            np.mean(target_q_next),
            np.std(target_q)
        ]
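
The MATD3 trainer above adds TD3's three ingredients to MADDPG: twin critics combined with an element-wise minimum (clipped double-Q), clipped Gaussian noise on the target actions (target policy smoothing), and delayed policy/target updates. The standalone NumPy sketch below mirrors the target computation in update(); the shapes, hyperparameter values and the two stand-in critic functions are hypothetical and only illustrate the arithmetic.

import numpy as np

# Hypothetical batch shape and hyperparameters standing in for the args fields.
batch_size, act_dim = 4, 2
gamma, noise_std, noise_clip = 0.95, 0.2, 0.5

rew = np.random.rand(batch_size)                        # sampled rewards
target_act_next = np.random.uniform(-1, 1, (batch_size, act_dim))

# Target policy smoothing: clipped Gaussian noise on the target action.
noise = np.clip(np.random.normal(0, noise_std, target_act_next.shape),
                -noise_clip, noise_clip)
smoothed_act = target_act_next + noise

# Stand-ins for the two target critics (q_debug1 / q_debug2 above).
def target_q1(act):
    return np.sum(act, axis=1)

def target_q2(act):
    return np.prod(act, axis=1)

# Clipped double-Q: element-wise minimum of the twin target estimates.
target_q_next = np.minimum(target_q1(smoothed_act), target_q2(smoothed_act))
target_q = rew + gamma * target_q_next                  # one-step TD target
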
Example #2
class MADDPGAgentTrainer(AgentTrainer):
    def __init__(self,
                 name,
                 model,
                 obs_shape_n,
                 act_space_n,
                 agent_index,
                 args,
                 local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(
                U.BatchInput(obs_shape_n[i],
                             name="observation" + str(i)).get())

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.min_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None
        writer = tf.summary.FileWriter("logdirMaddpg", tf.get_default_graph())
        writer.flush()
        writer.close()

    def action(self, obs):
        return self.act(obs[None])[0]

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))

    def preupdate(self):
        self.replay_sample_index = None

    def update(self, agents, t):
        if len(self.replay_buffer) < self.min_replay_buffer_len:
            # Replay buffer is not large enough yet.
            return
        if t % self.args.update_rate != 0:
            # Only update every update_rate steps (100 by default).
            return

        self.replay_sample_index = self.replay_buffer.generate_sample_indices(
            self.args.batch_size)
        # collect replay sample from all agents
        obs_n = []
        obs_next_n = []
        act_n = []
        index = self.replay_sample_index
        for i in range(self.n):
            obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index)
            obs_n.append(obs)
            obs_next_n.append(obs_next)
            act_n.append(act)
        obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index)

        # Train the critic: compute target actions from each agent's target policy.
        target_act_next_n = [
            agents[i].p_debug['target_act'](obs_next_n[i])
            for i in range(self.n)
        ]

        target_q_next = self.q_debug['target_q_values'](*(obs_next_n +
                                                          target_act_next_n))
        if self.args.critic_zero_if_done:
            # Zero the bootstrap term for terminal transitions.
            target_q_next[done.astype(bool)] = 0

        target_q = rew + self.args.gamma * target_q_next
        q_loss = self.q_train(*(obs_n + act_n + [target_q]))
        # print('Action gradient = ')

        # train p network
        p_loss = self.p_train(*(obs_n + act_n))

        self.p_update()
        self.q_update()
        #embed()

        return [
            q_loss, p_loss,
            np.mean(target_q),
            np.mean(rew),
            np.mean(target_q_next),
            np.std(target_q)
        ]
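
Both examples rely on a ReplayBuffer exposing add, __len__, generate_sample_indices and sample_index. The project's own implementation is not shown here; the minimal sketch below is an assumption about that interface, reconstructed from how the trainers call it, and is only meant to make the update() logic easier to follow.

import random

import numpy as np


class SimpleReplayBuffer:
    """Minimal ring buffer matching the interface used by the trainers above."""

    def __init__(self, size):
        self._maxsize = int(size)
        self._storage = []
        self._next_idx = 0

    def __len__(self):
        return len(self._storage)

    def add(self, obs, act, rew, new_obs, done):
        data = (obs, act, rew, new_obs, done)
        if self._next_idx >= len(self._storage):
            self._storage.append(data)
        else:
            # Overwrite the oldest transition once the buffer is full.
            self._storage[self._next_idx] = data
        self._next_idx = (self._next_idx + 1) % self._maxsize

    def generate_sample_indices(self, batch_size):
        # Uniform sampling; the trainers reuse the same indices for every agent
        # so that each batch refers to the same joint transitions.
        return [random.randrange(len(self._storage)) for _ in range(batch_size)]

    def sample_index(self, indices):
        obs, act, rew, new_obs, done = zip(*(self._storage[i] for i in indices))
        return (np.array(obs), np.array(act), np.array(rew),
                np.array(new_obs), np.array(done))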