Example #1
    def __init__(self,
                 name,
                 model,
                 lstm_model,
                 obs_shape_n,
                 act_space_n,
                 agent_index,
                 args,
                 local_q_func=False):
        self.args = args
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index

        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(
                U.BatchInput(obs_shape_n[i],
                             name="observation" + str(i)).get())

        self.local_q_func = local_q_func
        self.act, self.p_debug = p_act(scope=self.name,
                                       make_obs_ph_n=obs_ph_n,
                                       act_space_n=act_space_n,
                                       p_index=self.agent_index,
                                       p_func=model,
                                       lstm_model=lstm_model,
                                       num_units=self.args.num_units,
                                       use_lstm=False,
                                       reuse=False)
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(args.buffer_size)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None
Example #2
    def __init__(self,
                 name,
                 agents_number,
                 agent_index,
                 actors,
                 act_space_n,
                 args,
                 common_obs_shape,
                 sep_obs_shape,
                 model,
                 lstm_model,
                 cnn_model,
                 cnn_scope=None,
                 lstm_scope=None,
                 reuse=False,
                 local_q_func=False,
                 session=None):
        self.actors = actors
        self.name = name
        self.n = agents_number
        self.agent_index = agent_index
        self.args = args
        self.history_length = args.history_length

        common_obs_shape = [args.history_length] + list(common_obs_shape)
        common_obs_ph = U.BatchInput(common_obs_shape,
                                     name="common_observation").get()
        sep_obs_shape = [args.history_length] + list(sep_obs_shape[1:])
        sep_obs_ph_n = [
            U.BatchInput(sep_obs_shape,
                         name="common_observation" + str(i)).get()
            for i in range(self.n)
        ]

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_common_obs_ph=common_obs_ph,
            make_sep_obs_ph_n=sep_obs_ph_n,
            act_space_n=act_space_n,
            cnn_model=cnn_model,
            cnn_scope=cnn_scope,
            q_index=agent_index,
            q_func=model,
            lstm_model=lstm_model,
            lstm_scope=lstm_scope,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            args=self.args,
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units,
            reuse=False,
            use_lstm=self.args.use_lstm,
            session=session)

        self.replay_buffer = ReplayBuffer(args.buffer_size,
                                          args.history_length)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None
Example #3
    def __init__(self,
                 name,
                 agents_number,
                 act_space_n,
                 agent_index,
                 args,
                 common_obs_shape,
                 sep_obs_shape,
                 model,
                 lstm_model,
                 cnn_model,
                 lstm_scope=None,
                 cnn_scope=None,
                 reuse=False,
                 session=None,
                 local_q_func=False):
        self.args = args
        self.name = name
        self.n = agents_number
        self.agent_index = agent_index
        self.local_q_func = local_q_func

        sep_obs_shape = [args.history_length] + list(sep_obs_shape[1:])
        common_obs_shape = [args.history_length] + list(common_obs_shape)

        common_obs_ph = U.BatchInput(common_obs_shape,
                                     name="common_observation").get()
        sep_obs_ph_n = [
            U.BatchInput(sep_obs_shape,
                         name="common_observation" + str(i)).get()
            for i in range(self.n)
        ]

        self.act, self.p_debug = p_act(
            make_common_obs_ph=common_obs_ph,
            make_sep_obs_ph_n=sep_obs_ph_n,
            act_space_n=act_space_n,
            p_index=self.agent_index,
            p_func=model,
            lstm_model=lstm_model,
            cnn_model=cnn_model,
            lstm_scope=lstm_scope,
            cnn_scope=cnn_scope,
            use_lstm=self.args.use_lstm,
            use_cnn=self.args.use_cnn,
            reuse=reuse,
            session=session,
            scope=self.name,
            num_units=self.args.num_units,
        )
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(args.buffer_size,
                                          args.history_length)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None
Example #4
    def __init__(self,
                 env_name,
                 name,
                 model,
                 obs_shape_n,
                 act_space_n,
                 agent_index,
                 args,
                 local_q_func=False):
        self.env_name = env_name
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(
                U.BatchInput(obs_shape_n[i],
                             name="observation" + str(i)).get())

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.env_name + self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.env_name + self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_scope="common_" + self.name,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(args.buffer_size)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None
Example #5
    def __init__(self,
                 name,
                 model,
                 lstm_model,
                 obs_shape_n,
                 act_space_n,
                 agent_index,
                 actors,
                 args,
                 local_q_func=False,
                 session=None,
                 lstm_scope=None):
        self.actors = actors
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        self.history_length = args.history_length

        obs_ph_n = []
        for i in range(self.n):
            obs_shape = [args.history_length] + list(obs_shape_n[i])
            obs_ph_n.append(
                U.BatchInput(obs_shape, name="observation" + str(i)).get())

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            lstm_model=lstm_model,
            lstm_scope=lstm_scope,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            args=self.args,
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units,
            reuse=False,
            use_lstm=self.args.use_lstm,
            session=session)

        self.replay_buffer = ReplayBuffer(args.buffer_size,
                                          args.history_length)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None
Example #6
class MADDPGAgentTrainer(AgentTrainer):
    def __init__(self,
                 name,
                 model,
                 lstm_model,
                 obs_shape_n,
                 act_space_n,
                 agent_index,
                 args,
                 local_q_func=False,
                 reuse=False,
                 session=None):
        self.args = args
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index

        obs_ph_n = []
        for i in range(self.n):
            obs_shape = [args.history_length] + list(obs_shape_n[i])
            obs_ph_n.append(
                U.BatchInput(obs_shape, name="observation" + str(i)).get())

        self.local_q_func = local_q_func
        self.act, self.p_debug = p_act(scope=self.name,
                                       make_obs_ph_n=obs_ph_n,
                                       act_space_n=act_space_n,
                                       p_index=self.agent_index,
                                       p_func=model,
                                       lstm_model=lstm_model,
                                       num_units=self.args.num_units,
                                       use_lstm=self.args.use_lstm,
                                       reuse=reuse,
                                       session=session)
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(args.buffer_size,
                                          args.history_length)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None

    def action(self, obs):
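        # obs is assumed to be a queue.Queue holding the last history_length
        # observations; .queue exposes them as a deque that np.array can stack.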
        obs = np.array(obs.queue)
        return self.act(obs[None])[0]

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))
Example #7
    def __init__(self,
                 name,
                 model,
                 lstm_model,
                 obs_shape_n,
                 act_space_n,
                 agent_index,
                 actor_env,
                 args,
                 local_q_func=False,
                 session=None):
        self.args = args
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index

        obs_ph_n = []
        for i in range(self.n):
            obs_shape = [args.history_length] + list(obs_shape_n[i])
            # obs_shape.append()
            obs_ph_n.append(
                U.BatchInput(obs_shape, name="observation" + str(i)).get())

        optimizer = tf.train.AdamOptimizer(learning_rate=self.args.lr)

        self.p_train, self.p_update = p_train(scope=self.name,
                                              p_scope=actor_env,
                                              make_obs_ph_n=obs_ph_n,
                                              act_space_n=act_space_n,
                                              p_index=self.agent_index,
                                              p_func=model,
                                              q_func=model,
                                              lstm_model=lstm_model,
                                              optimizer=optimizer,
                                              grad_norm_clipping=0.5,
                                              local_q_func=local_q_func,
                                              num_units=self.args.num_units,
                                              reuse=True,
                                              use_lstm=self.args.use_lstm,
                                              session=session,
                                              args=args)

        # Create experience buffer
        self.replay_buffer = ReplayBuffer(args.buffer_size,
                                          args.history_length)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None
Example #8
class MADDPGAgentTrainer(AgentTrainer):
    def __init__(self,
                 env_name,
                 name,
                 model,
                 obs_shape_n,
                 act_space_n,
                 agent_index,
                 args,
                 local_q_func=False):
        self.env_name = env_name
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(
                U.BatchInput(obs_shape_n[i],
                             name="observation" + str(i)).get())

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.env_name + self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.env_name + self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_scope="common_" + self.name,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(args.buffer_size)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None

    def action(self, obs):
        return self.act(obs[None])[0]

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))

    def preupdate(self):
        self.replay_sample_index = None

    def update(self, agents, t):
        if len(self.replay_buffer) < self.max_replay_buffer_len:
            # replay buffer is not large enough yet
            return
        if t % 100 != 0:  # only update every 100 steps
            return

        self.replay_sample_index = self.replay_buffer.make_index(
            self.args.batch_size)
        # collect replay sample from all agents
        obs_n = []
        obs_next_n = []
        act_n = []
        index = self.replay_sample_index
        for i in range(self.n):
            # buffer
            obs, act, rew, obs_next, done = agents[
                i].replay_buffer.sample_index(index)
            obs_n.append(obs)
            obs_next_n.append(obs_next)
            act_n.append(act)
        obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index)

        # train q network
        num_sample = 1
        target_q = 0.0
        for j in range(num_sample):
            target_act_next_n = [
                agents[i].p_debug['target_act'](obs_next_n[i])
                for i in range(self.n)
            ]
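            # Bellman target: y = r + gamma * (1 - done) * Q'(o', a'),
            # built from every agent's target policy action a'.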
            target_q_next = self.q_debug['target_q_values'](
                *(obs_next_n + target_act_next_n))
            target_q += rew + self.args.gamma * (1.0 - done) * target_q_next
        target_q /= num_sample
        q_loss = self.q_train(*(obs_n + act_n + [target_q]))

        # train p network
        p_loss = self.p_train(*(obs_n + act_n))

        self.p_update()
        self.q_update()
        return [
            q_loss, p_loss,
            np.mean(target_q),
            np.mean(rew),
            np.mean(target_q_next),
            np.std(target_q)
        ]
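A minimal numpy sketch of the critic target computed in update() above: the MADDPG Bellman backup y = r + gamma * (1 - done) * Q'(o', a'), with a' taken from each agent's target policy. The batch values and the stubbed target-critic output below are made up for illustration and are not part of the original code.

import numpy as np

# Hypothetical batch of 3 transitions (values invented for illustration).
rew = np.array([1.0, 0.0, -1.0])            # rewards r
done = np.array([0.0, 0.0, 1.0])            # termination flags
target_q_next = np.array([2.0, 3.0, 4.0])   # stubbed Q'(o', a') from a target critic
gamma = 0.95                                # discount factor (args.gamma above)

num_sample = 1
target_q = 0.0
for _ in range(num_sample):
    target_q += rew + gamma * (1.0 - done) * target_q_next
target_q /= num_sample
print(target_q)  # expected per-transition targets: 2.9, 2.85, -1.0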
Example #9
class MADDPGAgentTrainer(AgentTrainer):
    def __init__(self,
                 name,
                 model,
                 lstm_model,
                 obs_shape_n,
                 act_space_n,
                 agent_index,
                 actors,
                 args,
                 local_q_func=False,
                 session=None,
                 lstm_scope=None):
        self.actors = actors
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        self.history_length = args.history_length

        obs_ph_n = []
        for i in range(self.n):
            obs_shape = [args.history_length] + list(obs_shape_n[i])
            obs_ph_n.append(
                U.BatchInput(obs_shape, name="observation" + str(i)).get())

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            lstm_model=lstm_model,
            lstm_scope=lstm_scope,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            args=self.args,
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units,
            reuse=False,
            use_lstm=self.args.use_lstm,
            session=session)

        self.replay_buffer = ReplayBuffer(args.buffer_size,
                                          args.history_length)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))

    def preupdate(self):
        self.replay_sample_index = None

    def update(self, agents, t):
        # Train the critic.
        if len(self.replay_buffer) < self.max_replay_buffer_len:
            # replay buffer is not large enough yet
            return
        if t % 100 != 0:  # only update every 100 steps
            return
        self.replay_sample_index = self.replay_buffer.make_index(
            self.args.batch_size)
        # collect replay sample from all agents
        obs_n = []
        obs_next_n = []
        act_n = []
        index = self.replay_sample_index
        for i in range(self.n):
            # buffer
            obs, act, rew, obs_next, done = agents[
                i].replay_buffer.sample_index(index)
            obs_n.append(obs)
            obs_next_n.append(obs_next)
            act_n.append(act)
        obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index)

        # train q network
        num_sample = 1
        target_q = 0.0
        for _ in range(num_sample):
            # Each actor's target policy acts on its own agent's next observation.
            target_act_next_n = [
                self.actors[j].p_debug['target_act'](obs_next_n[j])
                for j in range(self.n)
            ]
            target_q_next = self.q_debug['target_q_values'](
                *(obs_next_n + target_act_next_n))
            target_q += rew + self.args.gamma * (1.0 - done) * target_q_next
        target_q /= num_sample

        q_loss = self.q_train(*(obs_n + act_n + [target_q]))

        self.q_update()
        # print("step: ", t, "q_loss: ", q_loss)
        return [q_loss, np.mean(target_q), np.mean(rew), np.std(target_q)]
Example #10
class MADDPGAgentTrainer(AgentTrainer):
    def __init__(self,
                 name,
                 agents_number,
                 agent_index,
                 actors,
                 act_space_n,
                 args,
                 common_obs_shape,
                 sep_obs_shape,
                 model,
                 lstm_model,
                 cnn_model,
                 cnn_scope=None,
                 lstm_scope=None,
                 reuse=False,
                 local_q_func=False,
                 session=None):
        self.actors = actors
        self.name = name
        self.n = agents_number
        self.agent_index = agent_index
        self.args = args
        self.history_length = args.history_length

        common_obs_shape = [args.history_length] + list(common_obs_shape)
        common_obs_ph = U.BatchInput(common_obs_shape,
                                     name="common_observation").get()
        sep_obs_shape = [args.history_length] + list(sep_obs_shape[1:])
        sep_obs_ph_n = [
            U.BatchInput(sep_obs_shape,
                         name="common_observation" + str(i)).get()
            for i in range(self.n)
        ]

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_common_obs_ph=common_obs_ph,
            make_sep_obs_ph_n=sep_obs_ph_n,
            act_space_n=act_space_n,
            cnn_model=cnn_model,
            cnn_scope=cnn_scope,
            q_index=agent_index,
            q_func=model,
            lstm_model=lstm_model,
            lstm_scope=lstm_scope,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            args=self.args,
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units,
            reuse=False,
            use_lstm=self.args.use_lstm,
            session=session)

        self.replay_buffer = ReplayBuffer(args.buffer_size,
                                          args.history_length)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, done)

    def preupdate(self):
        self.replay_sample_index = None

    def update(self, agents, t, agent_index):
        # Train the critic.
        if len(self.replay_buffer) < self.max_replay_buffer_len:
            # replay buffer is not large enough yet
            return
        if t % 100 != 0:  # only update every 100 steps
            return
        self.replay_sample_index = agents[0].replay_buffer.make_index(
            self.args.batch_size, agent_index)
        # collect replay sample from all agents
        index = self.replay_sample_index
        (common_obs_n, sep_obs_n), act_n, rew_n, (common_obs_next_n, sep_obs_next_n), done_n = \
            agents[0].replay_buffer.sample_index(index)
        act = act_n[:, agent_index]
        rew = rew_n[:, agent_index]
        done = done_n[:, agent_index]
        # obs, obs_next = sep_obs_n[:, agent_index], sep_obs_next_n[:, agent_index]
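        # Split the jointly stored batch into per-agent lists: axis 2 of
        # sep_obs_n / sep_obs_next_n indexes agents, axis 1 of act_n does.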
        sep_obs_n_list, sep_obs_next_n_list, act_n_list = [], [], []
        for i in range(self.n):
            sep_obs_n_list.append(sep_obs_n[:, :, i])
            sep_obs_next_n_list.append(sep_obs_next_n[:, :, i])
            act_n_list.append(act_n[:, i])
        # train q network
        num_sample = 1
        target_q = 0.0
        for i in range(num_sample):
            # obs_next_n[j] is the j-th agent's observation.
            # The inputs to p_debug should look like [[?, 4, 30, 30, 3], [?, 4, 4]].
            target_act_next_n = [
                self.actors[j].p_debug['target_act'](common_obs_next_n,
                                                     sep_obs_next_n[:, :, j])
                for j in range(self.n)
            ]
            target_q_next = self.q_debug['target_q_values'](
                *([common_obs_next_n] + sep_obs_next_n_list +
                  target_act_next_n))
            target_q += rew + self.args.gamma * (1.0 - done) * target_q_next
        target_q /= num_sample

        q_loss = self.q_train(*([common_obs_n] + sep_obs_n_list + act_n_list +
                                [target_q]))

        self.q_update()
        # print("step: ", t, "q_loss: ", q_loss)33
        return [q_loss, np.mean(target_q), np.mean(rew), np.std(target_q)]
Example #11
class MADDPGAgentTrainer(AgentTrainer):
    def __init__(self,
                 name,
                 model,
                 lstm_model,
                 obs_shape_n,
                 act_space_n,
                 agent_index,
                 actor_env,
                 args,
                 local_q_func=False,
                 session=None):
        self.args = args
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index

        obs_ph_n = []
        for i in range(self.n):
            obs_shape = [args.history_length] + list(obs_shape_n[i])
            # obs_shape.append()
            obs_ph_n.append(
                U.BatchInput(obs_shape, name="observation" + str(i)).get())

        optimizer = tf.train.AdamOptimizer(learning_rate=self.args.lr)

        self.p_train, self.p_update = p_train(scope=self.name,
                                              p_scope=actor_env,
                                              make_obs_ph_n=obs_ph_n,
                                              act_space_n=act_space_n,
                                              p_index=self.agent_index,
                                              p_func=model,
                                              q_func=model,
                                              lstm_model=lstm_model,
                                              optimizer=optimizer,
                                              grad_norm_clipping=0.5,
                                              local_q_func=local_q_func,
                                              num_units=self.args.num_units,
                                              reuse=True,
                                              use_lstm=self.args.use_lstm,
                                              session=session,
                                              args=args)

        # Create experience buffer
        self.replay_buffer = ReplayBuffer(args.buffer_size,
                                          args.history_length)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))

    def preupdate(self):
        self.replay_sample_index = None

    def update(self, agents, critics, t, index):
        # This trains the actor.
        # if len(self.replay_buffer) < self.max_replay_buffer_len:
        if len(critics[0].replay_buffer) < self.max_replay_buffer_len:
            return
        if t % 100 != 0:  # only update every 100 steps
            return
        # print("actor  update")
        self.replay_sample_index = critics[index].replay_sample_index
        # collect replay sample from all agents
        obs_n = []
        obs_next_n = []
        act_n = []
        index = self.replay_sample_index
        for i in range(self.n):
            # buffer
            obs, act, rew, obs_next, done = critics[
                i].replay_buffer.sample_index(index)
            obs_n.append(obs)
            obs_next_n.append(obs_next)
            act_n.append(act)

        # train p network
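        # obs_n and act_n hold every agent's batch so the critic used inside
        # p_train (centralized unless local_q_func is set) can be evaluated.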
        p_loss = self.p_train(*(obs_n + act_n))

        self.p_update()
        # print("step: ", t, "p_loss: ", p_loss)
        return [p_loss]
Example #12
class MADDPGAgentTrainer(AgentTrainer):
    def __init__(self,
                 name,
                 agents_number,
                 act_space_n,
                 agent_index,
                 actor_scope,
                 args,
                 common_obs_shape,
                 sep_obs_shape,
                 model,
                 lstm_model,
                 cnn_model,
                 cnn_scope=None,
                 lstm_scope=None,
                 reuse=False,
                 local_q_func=False,
                 session=None):
        self.args = args
        self.name = name
        self.n = agents_number
        self.agent_index = agent_index
        optimizer = tf.train.AdamOptimizer(learning_rate=self.args.lr)

        sep_obs_shape = [args.history_length] + list(sep_obs_shape[1:])
        common_obs_shape = [args.history_length] + list(common_obs_shape)
        common_obs_ph = U.BatchInput(common_obs_shape,
                                     name="common_observation").get()
        sep_obs_ph_n = [
            U.BatchInput(sep_obs_shape,
                         name="common_observation" + str(i)).get()
            for i in range(self.n)
        ]

        self.p_train, self.p_update = p_train(scope=self.name,
                                              p_scope=actor_scope,
                                              make_common_obs_ph=common_obs_ph,
                                              make_sep_obs_ph_n=sep_obs_ph_n,
                                              act_space_n=act_space_n,
                                              p_index=self.agent_index,
                                              p_func=model,
                                              q_func=model,
                                              cnn_model=cnn_model,
                                              cnn_scope=cnn_scope,
                                              lstm_model=lstm_model,
                                              lstm_scope=lstm_scope,
                                              optimizer=optimizer,
                                              grad_norm_clipping=0.5,
                                              local_q_func=local_q_func,
                                              num_units=self.args.num_units,
                                              reuse=True,
                                              use_lstm=self.args.use_lstm,
                                              session=session,
                                              args=args)

        # Create experience buffer
        self.replay_buffer = ReplayBuffer(args.buffer_size,
                                          args.history_length)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))

    def preupdate(self):
        self.replay_sample_index = None

    def update(self, agents, critics, t, agent_index):
        # This trains the actor.
        # if len(self.replay_buffer) < self.max_replay_buffer_len:
        if len(critics[0].replay_buffer) < self.max_replay_buffer_len:
            return
        if t % 100 != 0:  # only update every 100 steps
            return
        self.replay_sample_index = critics[0].replay_buffer.make_index(
            self.args.batch_size, agent_index)
        # collect replay sample from all agents

        index = self.replay_sample_index

        (common_obs_n, sep_obs_n), act_n, rew_n, (common_obs_next_n, sep_obs_next_n), done_n = \
            critics[0].replay_buffer.sample_index(index)
        act = act_n[:, agent_index]
        rew = rew_n[:, agent_index]
        done = done_n[:, agent_index]
        # obs, obs_next = sep_obs_n[:, agent_index], sep_obs_next_n[:, agent_index]
        sep_obs_n_list, sep_obs_next_n_list, act_n_list = [], [], []
        for i in range(self.n):
            sep_obs_n_list.append(sep_obs_n[:, :, i])
            sep_obs_next_n_list.append(sep_obs_next_n[:, :, i])
            act_n_list.append(act_n[:, i])

        # train p network
        p_loss = self.p_train(*([common_obs_n] + sep_obs_n_list + act_n_list))

        self.p_update()
        # print("step: ", t, "p_loss: ", p_loss)
        return [p_loss]
Example #13
class MADDPGAgentTrainer(AgentTrainer):
    def __init__(self,
                 name,
                 agents_number,
                 act_space_n,
                 agent_index,
                 args,
                 common_obs_shape,
                 sep_obs_shape,
                 model,
                 lstm_model,
                 cnn_model,
                 lstm_scope=None,
                 cnn_scope=None,
                 reuse=False,
                 session=None,
                 local_q_func=False):
        self.args = args
        self.name = name
        self.n = agents_number
        self.agent_index = agent_index
        self.local_q_func = local_q_func

        sep_obs_shape = [args.history_length] + list(sep_obs_shape[1:])
        common_obs_shape = [args.history_length] + list(common_obs_shape)

        common_obs_ph = U.BatchInput(common_obs_shape,
                                     name="common_observation").get()
        sep_obs_ph_n = [
            U.BatchInput(sep_obs_shape,
                         name="common_observation" + str(i)).get()
            for i in range(self.n)
        ]

        self.act, self.p_debug = p_act(
            make_common_obs_ph=common_obs_ph,
            make_sep_obs_ph_n=sep_obs_ph_n,
            act_space_n=act_space_n,
            p_index=self.agent_index,
            p_func=model,
            lstm_model=lstm_model,
            cnn_model=cnn_model,
            lstm_scope=lstm_scope,
            cnn_scope=cnn_scope,
            use_lstm=self.args.use_lstm,
            use_cnn=self.args.use_cnn,
            reuse=reuse,
            session=session,
            scope=self.name,
            num_units=self.args.num_units,
        )
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(args.buffer_size,
                                          args.history_length)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None

    def action(self, common_obs, sep_obs):
        # Add a batch dimension before feeding the placeholders; [0] drops it again.
        return self.act(common_obs[None], sep_obs[None])[0]

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))