class SheldonPolicy(Policy):
    def __init__(self, env, landmark_id, args):
        super(SheldonPolicy, self).__init__()
        self.env = env
        self.landmark_id = landmark_id
        # dummy replay buffer for collecting experiences
        self.replay_buffer = ReplayBuffer(
            args.num_episodes * args.max_episode_len
            if args.benchmark and args.save_replay else 1e6)

    def action(self, obs):
        # relative position of the assigned landmark in the observation
        delta_pos = obs[(4 + self.landmark_id * 2):(4 + self.landmark_id * 2 + 2)]
        # move toward the assigned landmark based on its relative position
        if self.env.discrete_action_input:  # not tested!
            u = 0
            horizontal = abs(delta_pos[0]) > abs(delta_pos[1])
            if horizontal and delta_pos[0] < 0:
                u = 1  # LEFT
            if horizontal and delta_pos[0] > 0:
                u = 2  # RIGHT
            if not horizontal and delta_pos[1] < 0:
                u = 3  # UP
            if not horizontal and delta_pos[1] > 0:
                u = 4  # DOWN
        else:
            u = np.zeros(5)  # 5-d because of no-move action
            if delta_pos[0] > 0:
                u[1] += delta_pos[0]  # RIGHT
            if delta_pos[0] < 0:
                u[2] += -delta_pos[0]  # LEFT
            if delta_pos[1] > 0:
                u[3] += delta_pos[1]  # UP
            if delta_pos[1] < 0:
                u[4] += -delta_pos[1]  # DOWN
        # print(delta_pos, u)
        # return np.concatenate([u, np.zeros(self.env.world.dim_c)])
        return u

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))
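# A hedged illustration of the observation layout SheldonPolicy.action() assumes:
# an MPE-style vector of [velocity (2), position (2), landmark 0 relative position (2),
# landmark 1 relative position (2), ...], so landmark k's offset starts at index 4 + 2*k.
# The concrete numbers below are made up for the example.
import numpy as np

obs = np.array([0.0, 0.0,    # agent velocity
                0.1, -0.2,   # agent position
                0.5, 0.3,    # landmark 0 relative position
                -0.4, 0.7])  # landmark 1 relative position
landmark_id = 1
delta_pos = obs[(4 + landmark_id * 2):(4 + landmark_id * 2 + 2)]
print(delta_pos)  # [-0.4  0.7]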
class MADDPG():
    def __init__(self, obs_shape_n, act_info_n, agent_index, args, local_q_func=False):
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        self.grad_norm_clipping = 0.5

        # Networks
        self.device = args.device
        self.vf = Critic(
            obs_shape_n=obs_shape_n,
            act_info_n=act_info_n,
            num_units=args.num_units,
            q_index=agent_index,
            local_q_func=local_q_func,
        ).to(self.device)
        act_dim, self.pdtype = act_info_n[agent_index]
        self.pi = MLP(obs_shape_n[agent_index], act_dim,
                      num_units=args.num_units).to(self.device)

        # Initialize
        init_params(self.vf)
        init_params(self.pi)

        # Target Networks
        self.pi_targ = deepcopy(self.pi)
        for p in self.pi_targ.parameters():
            p.requires_grad = False
        self.vf_targ = deepcopy(self.vf)
        for p in self.vf_targ.parameters():
            p.requires_grad = False

        # Optimizer
        self.pi_optim = Adam(self.pi.parameters(), lr=args.lr)
        self.vf_optim = Adam(self.vf.parameters(), lr=args.lr)

        # Create Replay Buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None

    @torch.no_grad()
    def action(self, x):
        return self.pdtype(
            self.pi(torch.FloatTensor(x).to(self.device)).cpu()).sample().numpy()

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))

    def preupdate(self):
        self.replay_sample_index = None

    def update(self, agents, t):
        if len(self.replay_buffer) < self.max_replay_buffer_len:
            return
        if not (t % 100 == 0):  # only update every 100 steps
            return

        self.replay_sample_index = self.replay_buffer.make_index(self.args.batch_size)
        # collect replay sample from all agents
        obs_n = []
        obs_next_n = []
        act_n = []
        index = self.replay_sample_index
        for i in range(self.n):
            obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index)
            obs_n.append(torch.FloatTensor(obs).to(self.device))
            obs_next_n.append(torch.FloatTensor(obs_next).to(self.device))
            act_n.append(torch.FloatTensor(act).to(self.device))
        obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index)

        # Create tensors
        rew = torch.FloatTensor(rew).to(self.device)
        done = torch.FloatTensor(done).to(self.device)

        # Calculate q loss
        num_sample = 1
        target_q = 0.0
        with torch.no_grad():
            for _ in range(num_sample):
                target_act_next_n = [
                    self.pdtype(agents[i].pi_targ(obs_next_n[i])).sample()
                    for i in range(self.n)
                ]
                target_q_next = self.vf_targ(obs_next_n, target_act_next_n).squeeze(-1)
                target_q += rew + self.args.gamma * (1.0 - done) * target_q_next
            target_q /= num_sample
        q = self.vf(obs_n, act_n).squeeze(-1)
        vf_loss = torch.mean(torch.square(q - target_q))

        # optimization step
        self.vf_optim.zero_grad(set_to_none=True)
        vf_loss.backward()
        nn.utils.clip_grad_norm_(self.vf.parameters(), self.grad_norm_clipping)
        self.vf_optim.step()

        # Calculate policy loss
        for p in self.vf.parameters():
            p.requires_grad = False
        piflat = self.pi(obs_n[self.agent_index])
        p_reg = torch.mean(torch.square(piflat))
        act_input_n = copy(act_n)
        act_input_n[self.agent_index] = self.pdtype(piflat).sample()
        pg_loss = -self.vf(obs_n, act_input_n).mean()
        pi_loss = pg_loss + p_reg * 1e-3

        self.pi_optim.zero_grad(set_to_none=True)
        pi_loss.backward()
        nn.utils.clip_grad_norm_(self.pi.parameters(), self.grad_norm_clipping)
        self.pi_optim.step()
        for p in self.vf.parameters():
            p.requires_grad = True

        make_update_exp(self.pi, self.pi_targ)
        make_update_exp(self.vf, self.vf_targ)

        return [pi_loss.item(), vf_loss.item()]
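# The update() above ends with make_update_exp(), which is not defined in this section.
# A minimal sketch of the assumed behaviour, a Polyak (soft) update of the target network
# toward the online network; the mixing rate `polyak` below is an assumption.
import torch

def make_update_exp(source: torch.nn.Module, target: torch.nn.Module, polyak: float = 1e-2):
    # target <- (1 - polyak) * target + polyak * source, applied parameter-wise
    with torch.no_grad():
        for p_src, p_targ in zip(source.parameters(), target.parameters()):
            p_targ.mul_(1.0 - polyak)
            p_targ.add_(polyak * p_src)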
class MADDPGAgentTrainer(AgentTrainer):
    def __init__(self, name, learning_rate, obs_shape_n, act_space_n,
                 agent_index, args, local_q_func=False):
        self.name = name
        self.learning_rate = learning_rate
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.obs_size = obs_shape_n[agent_index]
        self.joint_obs_size = np.sum(obs_shape_n)
        self.act_size = act_space_n[agent_index].n
        self.act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]
        self.joint_act_size = 0
        for i_act in act_space_n:
            self.joint_act_size += i_act.n
        self.args = args

        self.actor = Actor(self.obs_size, self.act_size)
        self.actor_target = Actor(self.obs_size, self.act_size)
        self.critic = self.build_critic()
        self.critic_target = self.build_critic()
        update_target(self.actor, self.actor_target, 0)
        update_target(self.critic, self.critic_target, 0)
        # self.actor, self.critic = self.build_model()
        # self.actor_target, self.critic_target = self.build_model()
        self.actor_optimizer = self.build_actor_optimizer()

        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None

        gpu = -1
        self.device = "/gpu:{}".format(gpu) if gpu >= 0 else "/cpu:0"

    def build_model(self):
        """ actor (policy) neural network """
        inp = Input(self.obs_size)
        x = Dense(64, activation='relu')(inp)
        x = Dense(64, activation='relu')(x)
        actor_out = Dense(self.act_size)(x)
        actor = Model(inp, actor_out)
        # Note: "actor" is not compiled because we want to customize the training process

        """ critic (value) neural network """
        inp = Input((self.joint_obs_size + self.joint_act_size, ))
        x = Dense(64, activation='relu')(inp)
        x = Dense(64, activation='relu')(x)
        critic_out = Dense(1, activation='linear')(x)
        critic = Model(inp, critic_out)
        critic.compile(loss="mse",
                       optimizer=Adam(lr=self.learning_rate, clipnorm=0.5))
        return actor, critic

    def build_critic(self):
        """ critic (value) neural network """
        inp = Input((self.joint_obs_size + self.joint_act_size, ))
        x = Dense(64, activation='relu')(inp)
        x = Dense(64, activation='relu')(x)
        critic_out = Dense(1, activation='linear')(x)
        critic = Model(inp, critic_out)
        critic.compile(loss="mse",
                       optimizer=Adam(lr=self.learning_rate, clipnorm=0.5))
        return critic

    def build_actor_optimizer(self):
        return Adam(learning_rate=self.learning_rate, clipnorm=0.5)

    def action(self, obs):
        # a = self.sample_action(obs[None])
        # print(obs[None].shape)
        # a = self._get_action_body(tf.constant(obs[None], dtype='float32'))
        # a = self.actor.predict_on_batch(tf.constant(obs[None], dtype='float32'))
        # print(a)
        a = self.actor.action(tf.constant(obs[None], dtype='float32'))
        # a = self.actor(self.actor.dist(obs[None]))
        return a[0]

    def sample_action(self, obs):
        logits = self.actor.predict(obs, batch_size=len(obs))
        u = np.random.uniform(size=logits.shape)
        # Gumbel-softmax sampling (numpy analogue of _get_action_body below)
        gumbel = logits - np.log(-np.log(u))
        a = np.exp(gumbel) / np.sum(np.exp(gumbel), axis=-1, keepdims=True)
        return a

    """
    @tf.function
    def _get_action_body(self, obs_tensor):
        with tf.device(self.device):
            logits = self.actor(obs_tensor)
            act_pd = self.act_pdtype_n[self.agent_index].pdfromflat(logits)
            a = act_pd.sample()
        return a
    """

    @tf.function
    def _get_action_body(self, obs_tensor):
        logits = self.actor(obs_tensor)
        u = tf.random.uniform(tf.shape(logits))
        a = tf.nn.softmax(logits - tf.math.log(-tf.math.log(u)), axis=-1)
        return a

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))

    def preupdate(self):
        self.replay_sample_index = None

    def update(self, agents, t):
        if len(self.replay_buffer) < self.max_replay_buffer_len:
            # replay buffer is not large enough
            return
        if not t % 100 == 0:  # only update every 100 steps
            return

        self.replay_sample_index = self.replay_buffer.make_index(self.args.batch_size)
        # collect replay sample from all agents
        obs_n = []
        obs_next_n = []
        act_n = []
        index = self.replay_sample_index
        for i in range(self.n):
            obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index)
            obs_n.append(obs)
            obs_next_n.append(obs_next)
            act_n.append(act)
        obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index)
        rew = np.expand_dims(rew, axis=-1)
        done = np.expand_dims(done, axis=-1)

        # train q network
        num_sample = 1
        target_q = 0.0
        """
        next_logits = self.actor_target.predict(obs_next)
        next_act_pd = self.act_pdtype_n[self.agent_index].pdfromflat(next_logits)
        new_next_act = next_act_pd.sample()
        """
        # train critic
        for i in range(num_sample):
            target_act_next_n = []
            for j in range(self.n):
                new_next_act = agents[j].actor_target.predict_many(obs_next_n[j])
                target_act_next_n.append(new_next_act)
                # TODO: mode
            # target_act_next_n[self.agent_index] = new_next_act
            next_state_action_n = np.concatenate((obs_next_n, target_act_next_n), axis=-1)
            next_state_action_attached = np.concatenate(next_state_action_n, axis=0)
            target_q_next = self.critic_target.predict(next_state_action_attached)
            target_q += rew + self.args.gamma * (1.0 - done) * target_q_next
        target_q /= num_sample

        state_action_n = np.concatenate((obs_n, act_n), axis=-1)
        state_action_attached = np.concatenate(state_action_n, axis=0)
        hist = self.critic.fit(state_action_attached, target_q, epochs=1, verbose=0)
        # q_loss = self.critic.train_on_batch(state_action_attached, target_q)
        q_loss = hist.history['loss'][0]

        obs_tensor = tf.constant(obs, dtype=tf.float32)
        obs_n_tensor = tf.constant(obs_n, dtype=tf.float32)
        act_n_tensor = tf.constant(act_n, dtype=tf.float32)
        # obs_tensor = tf.Variable(obs, dtype=tf.float32)
        # obs_n_tensor = tf.Variable(np.array(obs_n), dtype=tf.float32)
        # act_n_tensor = tf.Variable(np.array(act_n), dtype=tf.float32)

        # train actor network
        # p_loss = self.update_actor(obs, obs_n, act_n)
        p_loss = self.update_actor(obs_tensor, obs_n_tensor, act_n_tensor)
        """
        logits = self.actor.predict(obs)
        act_pd = self.act_pdtype_n[self.agent_index].pdfromflat(logits)
        new_act = act_pd.mode()
        act_n[self.agent_index] = new_act
        grads = self.critic.gradients(obs_n, act_n)
        np.concatenate(act_n, )
        state_action_n = np.concatenate((obs_n, act_n), axis=-1)
        state_action_attached = np.concatenate(state_action_n, axis=-1)
        hist = self.actor.fit(obs, state_action_attached, epochs=1, verbose=0)
        p_loss = hist.history['loss'][0]
        """

        update_target(self.actor, self.actor_target)
        update_target(self.critic, self.critic_target)

        return [
            q_loss, p_loss,
            np.mean(target_q),
            np.mean(rew),
            np.mean(target_q_next),
            np.std(target_q)
        ]

    @tf.function
    def update_actor(self, obs, obs_n, act_n):
        with tf.GradientTape() as tape:
            logits = self.actor(obs)
            # new_act = self.act_pdtype_n[self.agent_index].pdfromflat(logits).mode()
            new_act = self.actor.dist(logits)
            new_act = tf.expand_dims(new_act, axis=0)
            new_act_head = act_n[:self.agent_index]
            new_act_tail = act_n[self.agent_index + 1:]
            new_act_n = tf.concat((new_act_head, new_act, new_act_tail), axis=0)
            state_action_n = tf.concat((obs_n, new_act_n), axis=-1)
            state_action_attached = tf.squeeze(state_action_n)
            q_val = self.critic(state_action_attached)[:, 0]
            p_loss = -tf.reduce_mean(q_val)
            reg_loss = tf.reduce_mean(tf.square(logits))
            total_loss = p_loss + reg_loss * 1e-3
        actor_grad = tape.gradient(total_loss, self.actor.trainable_weights)
        self.actor_optimizer.apply_gradients(zip(actor_grad, self.actor.trainable_weights))
        return total_loss

    def _actor_loss(self, actions_and_values, logits):
        # A trick to input actions and advantages through the same API.
        actions, values = tf.split(actions_and_values, 2, axis=-1)
        # Sparse categorical CE loss obj that supports sample_weight arg on `call()`.
        # `from_logits` argument ensures transformation into normalized probabilities.
        # weighted_sparse_ce = kls.SparseCategoricalCrossentropy(from_logits=True)
        # Policy loss is defined by policy gradients, weighted by advantages.
        # Note: we only calculate the loss on the actions we've actually taken.
        # actions = tf.cast(actions, tf.int32)
        # policy_loss = weighted_sparse_ce(actions, logits, sample_weight=values)
        # Entropy loss can be calculated as cross-entropy over itself.
        # probs = tf.nn.softmax(logits)
        # entropy_loss = kls.categorical_crossentropy(probs, probs)
        print(values)
        print(tf.shape(logits))
        policy_loss = -tf.reduce_mean(values)
        logits_loss = tf.reduce_mean(logits)
        # We want to minimize the policy loss and maximize the entropy loss.
        # Here signs are flipped because the optimizer minimizes.
        return policy_loss + 1e-3 * logits_loss

    def _actor_loss(self, values, logits):
        p_loss = -tf.reduce_mean(values)
        reg_loss = tf.reduce_mean(tf.square(logits))
        total_loss = p_loss + reg_loss * 1e-3
        return total_loss

    def load_models(self, path, version_name):
        file_name = 'a' + str(self.agent_index) + 'A' + version_name
        self.actor.load_weights(path + file_name)
        file_name = 'a' + str(self.agent_index) + 'C' + version_name
        self.critic.load_weights(path + file_name)
        file_name = 'a' + str(self.agent_index) + 'AT' + version_name
        self.actor_target.load_weights(path + file_name)
        file_name = 'a' + str(self.agent_index) + 'CT' + version_name
        self.critic_target.load_weights(path + file_name)

    def save_models(self, path, version_name):
        file_name = 'a' + str(self.agent_index) + 'A' + version_name
        self.actor.save_weights(path + file_name)
        file_name = 'a' + str(self.agent_index) + 'C' + version_name
        self.critic.save_weights(path + file_name)
        file_name = 'a' + str(self.agent_index) + 'AT' + version_name
        self.actor_target.save_weights(path + file_name)
        file_name = 'a' + str(self.agent_index) + 'CT' + version_name
        self.critic_target.save_weights(path + file_name)
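# The Keras trainer above calls an update_target() helper that is not shown. A minimal
# sketch of the assumed semantics: target <- tau * target + (1 - tau) * model, so that
# update_target(model, target, 0) performs the hard copy used at construction time.
# The default tau below is an assumption, not taken from the source.
def update_target(model, target_model, tau=0.99):
    weights = model.get_weights()
    target_weights = target_model.get_weights()
    mixed = [tau * tw + (1.0 - tau) * w for w, tw in zip(weights, target_weights)]
    target_model.set_weights(mixed)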
class MADDPGAgentTrainerIndepLearner(AgentTrainer):
    def __init__(self, name, model, obs_shape_n, act_space_n, agent_index,
                 args, local_q_func=False, u_estimation=False):
        print('in here')
        self.name = name
        self.n = 1  # len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        obs_ph_n.append(
            U.BatchInput(obs_shape_n[agent_index], name="observation0").get())
        self.u_estimation = u_estimation

        # Create all the functions necessary to train the model
        l = q_train(scope=self.name,
                    make_obs_ph_n=obs_ph_n,
                    act_space_n=act_space_n,
                    q_index=agent_index,
                    q_func=model,
                    u_func=model,
                    optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
                    grad_norm_clipping=0.5,
                    local_q_func=local_q_func,
                    num_units=args.num_units,
                    u_estimation=self.u_estimation)
        if self.u_estimation:
            self.q_train, self.q_update, self.u_update, self.q_debug = l
        else:
            self.q_train, self.q_update, self.q_debug = l
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)

        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None

    def action(self, obs):
        return self.act(obs[None])[0]

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))

    def preupdate(self):
        self.replay_sample_index = None

    def update(self, agents, t):
        if len(self.replay_buffer) < self.max_replay_buffer_len:
            # replay buffer is not large enough
            return
        if not t % 100 == 0:  # only update every 100 steps
            return

        self.replay_sample_index = self.replay_buffer.make_index(self.args.batch_size)
        # collect replay sample (independent learner: only this agent's buffer)
        index = self.replay_sample_index
        obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index)
        obs_n = [obs]  # + [np.zeros_like(obs)] * (self.n - 1)
        obs_next_n = [obs_next]  # + [np.zeros_like(obs_next)] * (self.n - 1)
        act_n = [act]  # + [np.zeros_like(act)] * (self.n - 1)

        # train q network
        num_sample = 1
        target_q = 0.0
        target_u = 0.0
        for i in range(num_sample):
            t_act = self.p_debug['target_act'](obs_next_n[0])
            target_act_next_n = [t_act]  # + [np.zeros_like(t_act)] * (self.n - 1)
            # print('target_act_next_n ', np.asarray(target_act_next_n).shape)
            # print('obs_next_n', len(obs_next_n), obs_next_n[0].shape)
            target_q_next = self.q_debug['target_q_values'](
                *(obs_next_n + target_act_next_n))
            if self.u_estimation:
                target_u_next = self.q_debug['target_u_values'](
                    *(obs_next_n + target_act_next_n))
                target_u += math.pow(self.args.gamma, 2.0) * (1.0 - done) * target_u_next
            target_q += self.args.gamma * (1.0 - done) * target_q_next
        target_q /= num_sample

        if self.u_estimation:
            q_loss, u_loss = self.q_train(*(obs_n + act_n + [target_q] + [target_u] + [rew]))
        else:
            q_loss = self.q_train(*(obs_n + act_n + [target_q] + [rew]))
        var_rew = np.array(self.q_debug['var'](*(obs_n + act_n + [target_q] + [rew]))).mean()

        # train p network
        p_loss = self.p_train(*(obs_n + act_n))

        self.p_update()
        self.q_update()
        if self.u_estimation:
            self.u_update()

        return [
            np.asarray(q_loss).mean(),
            np.asarray(p_loss).mean(),
            np.mean(target_q),
            np.mean(rew),
            var_rew,
            np.mean(target_q_next),
            np.std(target_q)
        ]
class MADDPGAgentTrainer(AgentTrainer):
    def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args,
                 actor_lr=None, critic_lr=None, gamma=None, num_units=None,
                 rb_size=None, batch_size=None, max_episode_len=None,
                 clip_norm=0.5, local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args

        # training parameters
        self.actor_lr = actor_lr if actor_lr else args.lr
        self.critic_lr = critic_lr if critic_lr else args.lr
        self.gamma = gamma if gamma else args.gamma
        self.num_units = num_units if num_units else args.num_units
        self.rb_size = rb_size if rb_size else args.rb_size
        self.batch_size = batch_size if batch_size else args.batch_size
        self.max_episode_len = max_episode_len if max_episode_len else args.max_episode_len
        self.clip_norm = clip_norm

        # TODO: remove after testing
        import models.config as Config
        assert actor_lr == Config.maddpg_train_args['actor_lr']
        assert critic_lr == Config.maddpg_train_args['critic_lr']
        assert gamma == Config.maddpg_train_args['gamma']
        assert num_units == Config.maddpg_train_args['num_hidden']
        assert rb_size == Config.maddpg_train_args['rb_size']
        assert batch_size == Config.maddpg_train_args['batch_size']
        assert max_episode_len == Config.maddpg_train_args['nb_rollout_steps']
        assert clip_norm == Config.maddpg_train_args['clip_norm']

        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(
                U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=self.critic_lr),
            grad_norm_clipping=self.clip_norm,
            local_q_func=local_q_func,
            num_units=self.num_units)
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=self.actor_lr),
            grad_norm_clipping=self.clip_norm,
            local_q_func=local_q_func,
            num_units=self.num_units)

        # Create experience buffer
        self.replay_buffer = ReplayBuffer(self.rb_size)
        self.max_replay_buffer_len = self.batch_size * self.max_episode_len
        self.replay_sample_index = None
        self.loss_names = [
            'q_loss', 'p_loss', 'mean_target_q', 'mean_rew',
            'mean_target_q_next', 'std_target_q'
        ]

    def action(self, obs):
        return self.act(obs[None])[0]

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))

    def preupdate(self):
        self.replay_sample_index = None

    def update(self, agents, t):
        if len(self.replay_buffer) < self.max_replay_buffer_len:
            # replay buffer is not large enough
            return
        if not t % 100 == 0:  # only update every 100 steps
            return

        self.replay_sample_index = self.replay_buffer.make_index(self.batch_size)
        # collect replay sample from all agents
        obs_n = []
        obs_next_n = []
        act_n = []
        index = self.replay_sample_index
        for i in range(self.n):
            obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index)
            obs_n.append(obs)
            obs_next_n.append(obs_next)
            act_n.append(act)
        obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index)

        # train q network
        num_sample = 1
        act_space = act.shape[-1]
        target_q = 0.0
        for i in range(num_sample):
            target_act_next_n = [
                agents[i].p_debug['target_act'](obs_next_n[i])
                for i in range(self.n)
            ]

            # flatten multi agent actions and observations
            act_serial_vals = self.q_debug['act_serial_values'](*(target_act_next_n))
            obs_serial_vals = self.q_debug['obs_serial_values'](*(obs_next_n))
            assert len(act_serial_vals) == self.batch_size
            assert len(obs_serial_vals) == self.batch_size

            # compute L2 normalized partial derivatives of target Q function wrt actions
            # NOTE: this is done one sample at a time to prevent tf.gradient
            # from summing over all target q values
            grad_norm_value = [
                self.q_debug['grad_norm_value'](*([[obs_serial_vals[j]]] +
                                                  [[act_serial_vals[j]]]))
                for j in range(self.batch_size)
            ]
            assert len(grad_norm_value) == self.batch_size

            # scale the raw gradients by alpha
            # TODO: set alpha during init or compute as function of policy or loss
            perturb = np.array(grad_norm_value) * 0.01

            # update leader actions using gradients
            for b in range(self.batch_size):
                # find all the leaders wrt current agent (agent_index)
                leading_agents = [
                    [1.0] * act_space
                    if obs_next_n[k][b][2] > obs_next_n[self.agent_index][b][2]
                    else [0.0] * act_space for k in range(self.n)
                ]
                # filter perturbations to only apply for leading agents
                # scale by L2 norm of original actions to prevent the perturb
                # from overwhelming the action
                epsilon = perturb[b].flatten() * np.array(leading_agents).flatten() * \
                    np.linalg.norm(act_serial_vals[b], 2)
                act_serial_vals[b] += epsilon

            # target_q_next = self.q_debug['target_q_values'](*(obs_next_n + target_act_next_n))
            target_q_next = self.q_debug['target_q_values'](
                *([obs_serial_vals] + [act_serial_vals]))
            target_q += rew + self.gamma * (1.0 - done) * target_q_next
        target_q /= num_sample
        q_loss = self.q_train(*(obs_n + act_n + [target_q]))

        # get current actions and observations flattened
        act_serial_vals = self.q_debug['act_serial_values'](*(act_n))
        obs_serial_vals = self.q_debug['obs_serial_values'](*(obs_n))

        # compute L2 normalized partial derivatives of Q function wrt actions
        grad_norm_value = [
            self.p_debug['grad_norm_value'](*([[obs_serial_vals[j]]] +
                                              [[act_serial_vals[j]]]))
            for j in range(self.batch_size)
        ]
        assert len(grad_norm_value) == self.batch_size

        # scale the raw gradients by alpha
        perturb = np.array(grad_norm_value) * 0.01

        # update leader actions using these perturbations
        for b in range(self.batch_size):
            # find all the leaders wrt current agent (agent_index)
            leading_agents = [
                [1.0] * act_space
                if obs_next_n[k][b][2] > obs_next_n[self.agent_index][b][2]
                else [0.0] * act_space for k in range(self.n)
            ]
            # filter perturbations to only apply for leading agents
            epsilon = perturb[b].flatten() * np.array(leading_agents).flatten() * \
                np.linalg.norm(act_serial_vals[b], 2)
            epsilon_n = [
                epsilon[k * act_space:(k * act_space) + act_space]
                for k in range(self.n)
            ]
            # update each agent action for current batch sample "b"
            for k in range(self.n):
                act_n[k][b] += epsilon_n[k]

        # train p network
        p_loss = self.p_train(*(obs_n + act_n))

        self.p_update()
        self.q_update()

        return [
            q_loss, p_loss,
            np.mean(target_q),
            np.mean(rew),
            np.mean(target_q_next),
            np.std(target_q)
        ]
class MADDPGAgentTrainer(AgentTrainer):
    def __init__(self, name, model, obs_shape_n, act_space_n, agent_index,
                 args, safety_layer=None, local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        self.safety_layer = safety_layer
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(
                U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)

        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None

    def action(self, obs, c=None, env=None):
        action = self.act(obs[None])[0]
        if_call = False
        return action, if_call

    def action_real(self, obs, c=None, env=None):
        # get action from DDPG
        action = self.act(obs[None])[0]
        action_real = action
        if_call = False
        dist = np.sqrt(
            np.sum(
                np.square(env.agents[0].state.p_pos -
                          env.world.landmarks[-1].state.p_pos)))
        # call the safety layer
        if self.safety_layer and c is not None and env is not None and dist > 1.5:
            # check for collisions over the next N rollout steps
            collision_flag = False
            env_future = copy.deepcopy(env)
            obs_future = copy.deepcopy(obs)
            trajectory = np.zeros([4, self.safety_layer.UAV_config.N + 1])
            trajectory[0, 0] = obs_future[2]
            trajectory[1, 0] = obs_future[3]
            trajectory[2, 0] = obs_future[4]
            trajectory[3, 0] = obs_future[5]
            for i in range(self.safety_layer.UAV_config.N):
                action_future = [self.act(obs_future[None])[0]]
                # environment step
                new_obs_n, rew_n, done_n, info_n = env_future.step(action_future)
                is_any_collision = []
                for agent in env_future.agents:
                    temp = False
                    for _, landmark in enumerate(env_future.world.landmarks[0:-1]):
                        dist = np.sqrt(np.sum(np.square(agent.state.p_pos - landmark.state.p_pos))) \
                            - (agent.size + landmark.size)
                        if dist <= 0:
                            temp = True
                    is_any_collision.append(temp)
                if is_any_collision[0]:
                    collision_flag = True
                done_future = all(done_n)
                if done_future:
                    break
                obs_future = new_obs_n[0]
                trajectory[0, i + 1] = obs_future[2]
                trajectory[1, i + 1] = obs_future[3]
                trajectory[2, i + 1] = obs_future[4]
                trajectory[3, i + 1] = obs_future[5]
            if not collision_flag:
                return action_real, action, if_call
            action, if_call = self.safety_layer.get_safe_action(obs, action, trajectory)
        return action_real, action, if_call

    def set_safety_layer(self, safety_layer):
        self.safety_layer = safety_layer

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))

    def preupdate(self):
        self.replay_sample_index = None

    def update(self, agents, t):
        if len(self.replay_buffer) < self.max_replay_buffer_len:
            # replay buffer is not large enough
            return
        if not t % 100 == 0:  # only update every 100 steps
            return

        self.replay_sample_index = self.replay_buffer.make_index(self.args.batch_size)
        # collect replay sample from all agents
        obs_n = []
        obs_next_n = []
        act_n = []
        index = self.replay_sample_index
        for i in range(self.n):
            obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index)
            obs_n.append(obs)
            obs_next_n.append(obs_next)
            act_n.append(act)
        obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index)

        # train q network
        num_sample = 1
        target_q = 0.0
        for i in range(num_sample):
            target_act_next_n = [
                agents[i].p_debug['target_act'](obs_next_n[i])
                for i in range(self.n)
            ]
            target_q_next = self.q_debug['target_q_values'](
                *(obs_next_n + target_act_next_n))
            target_q += rew + self.args.gamma * (1.0 - done) * target_q_next
        target_q /= num_sample
        q_loss = self.q_train(*(obs_n + act_n + [target_q]))

        # train p network
        p_loss = self.p_train(*(obs_n + act_n))

        self.p_update()
        self.q_update()

        return [
            q_loss, p_loss,
            np.mean(target_q),
            np.mean(rew),
            np.mean(target_q_next),
            np.std(target_q)
        ]
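# Every trainer in this section relies on the same ReplayBuffer interface
# (__len__, add, make_index, sample_index). A minimal sketch written from how the
# buffer is called here; the original implementation is not shown, so details such
# as the circular overwrite are assumptions.
import random
import numpy as np

class ReplayBuffer:
    def __init__(self, size):
        self._maxsize = int(size)
        self._storage = []
        self._next_idx = 0

    def __len__(self):
        return len(self._storage)

    def add(self, obs, act, rew, new_obs, done):
        data = (obs, act, rew, new_obs, done)
        if self._next_idx >= len(self._storage):
            self._storage.append(data)
        else:
            self._storage[self._next_idx] = data  # overwrite the oldest entry
        self._next_idx = (self._next_idx + 1) % self._maxsize

    def make_index(self, batch_size):
        # random indices; sharing them across agents keeps joint samples aligned
        return [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)]

    def sample_index(self, idxes):
        obs, act, rew, obs_next, done = [], [], [], [], []
        for i in idxes:
            o, a, r, o2, d = self._storage[i]
            obs.append(o)
            act.append(a)
            rew.append(r)
            obs_next.append(o2)
            done.append(d)
        return (np.array(obs), np.array(act), np.array(rew),
                np.array(obs_next), np.array(done))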
class MADDPGAgentTrainer(AgentTrainer):
    def __init__(self, env, name, model, CNN_model, obs_shape_n, obs_map_shape_n,
                 act_space_n, agent_index, args, local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        obs_map_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())
            obs_map_ph_n.append(U.BatchInput(obs_map_shape_n[i], name="observation_map" + str(i)).get())

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            shared_CNN=CNN_model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units,
            make_obs_map_ph_n=obs_map_ph_n
        )
        self.act, self.p_train, self.vf_t, self.p_update, self.vf_u, self.p_debug = p_train(
            scope=self.name,
            env=env,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            vf_func=model,
            shana=GMMPolicy,
            q_func=model,
            shared_CNN=CNN_model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units,
            make_obs_map_ph_n=obs_map_ph_n
        )

        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None
        self.batch_size = args.batch_size

    def action(self, obs):
        return self.act([obs[0]], [obs[1]])[0]

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))

    def preupdate(self):
        self.replay_sample_index = None

    def update(self, agents, t):
        if len(self.replay_buffer) < self.max_replay_buffer_len:
            # replay buffer is not large enough
            return
        if not t % 100 == 0:  # only update every 100 steps
            return

        self.replay_sample_index = self.replay_buffer.make_index(self.args.batch_size)
        # collect replay sample from all agents
        # obs_n = []
        # obs_next_n = []
        # act_n = []
        # index = self.replay_sample_index
        # for i in range(self.n):
        #     obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index)
        #     obs_n.append(obs)
        #     obs_next_n.append(obs_next)
        #     act_n.append(act)
        # obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index)
        obs_n = []
        obs_map_n = []
        obs_next_map = []
        obs_next_n = []
        act_n = []
        index = self.replay_sample_index
        for i in range(self.n):
            obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index)
            # pdb.set_trace()
            obs_n.append(obs[:, 0].tolist())
            obs_next_n.append(obs_next[:, 0].tolist())
            obs_map_n.append(obs[:, 1].tolist())
            obs_next_map.append(obs_next[:, 1].tolist())
            act_n.append(act)

        # train q network
        num_sample = 1
        target_q = 0.0
        for i in range(num_sample):
            # current_target_act_n = [agents[i].p_debug['target_act'](obs_n[i]) for i in range(self.n)]
            current_target_act_n = [
                np.array([
                    np.reshape(
                        np.array(agents[i].p_debug['target_act']([obs_n[i][j]], [obs_map_n[i][j]])), -1)
                    for j in range(self.batch_size)
                ]) for i in range(self.n)
            ]
            target_vf_next = self.q_debug['target_vf_values'](*(obs_next_n + obs_next_map))
            target_q += rew + self.args.gamma * (1.0 - done) * target_vf_next
        target_q /= num_sample
        # pdb.set_trace()
        q_loss = self.q_train(*(obs_n + obs_map_n + act_n + [target_q]))

        # train p network
        p_loss = self.p_train(*(obs_n + act_n + obs_map_n))
        vf_loss = self.vf_t(*(obs_n + current_target_act_n + obs_map_n))

        self.p_update()
        self.q_update()
        self.vf_u()

        return [
            q_loss, p_loss,
            np.mean(target_q),
            np.mean(rew),
            np.mean(target_vf_next),
            np.std(target_q)
        ]
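# A hedged illustration of the (vector, map) observation pairs the CNN-based trainer
# above assumes are stored in its replay buffer: each stored observation behaves like a
# length-2 object array, so a sampled batch supports obs[:, 0] for the flat vectors and
# obs[:, 1] for the maps. Shapes below are made up for the example.
import numpy as np

obs_vec = np.zeros(16)            # flat observation vector (illustrative size)
obs_map = np.zeros((32, 32, 3))   # local map observation (illustrative shape)
stored = np.empty(2, dtype=object)
stored[0], stored[1] = obs_vec, obs_map

batch = np.stack([stored, stored])              # shape (batch=2, 2), dtype=object
print(batch[:, 0].shape, batch[:, 1][0].shape)  # (2,) and (32, 32, 3)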
class MADDPGAgentTrainerIndepLearner(AgentTrainer):
    def __init__(self, name, model, obs_shape_n, act_space_n, agent_index,
                 args, agent_type, local_q_func=False):
        self.name = name
        self.n = 1
        self.agent_index = agent_index
        self.args = args
        self.u_estimation = args.u_estimation
        self.constrained = args.constrained
        self.constraint_type = args.constraint_type
        self.agent_type = agent_type
        if self.agent_type == "good":
            cvar_alpha = args.cvar_alpha_good_agent
        elif self.agent_type == "adversary":
            cvar_alpha = args.cvar_alpha_adv_agent
        obs_ph_n = []
        obs_ph_n.append(
            U.BatchInput(obs_shape_n[agent_index], name="observation0").get())

        # Create all the functions necessary to train the model
        self.q_train, self.q_train2, self.q_train3, self.q_update, self.u_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            u_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr_critic),
            optimizer_lamda=tf.train.AdamOptimizer(learning_rate=args.lr_lamda),
            exp_var_alpha=args.exp_var_alpha,
            cvar_alpha=cvar_alpha,
            cvar_beta=args.cvar_beta,
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units,
            u_estimation=self.u_estimation,
            constrained=self.constrained,
            constraint_type=self.constraint_type,
            agent_type=self.agent_type)
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr_actor),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)

        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None

    def action(self, obs):
        return self.act(obs[None])[0]

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))

    def preupdate(self):
        self.replay_sample_index = None

    def update(self, agents, t, frozen=False):
        if len(self.replay_buffer) < self.max_replay_buffer_len:
            # replay buffer is not large enough
            return
        if not t % 100 == 0:  # only update every 100 steps
            return

        self.replay_sample_index = self.replay_buffer.make_index(self.args.batch_size)
        # collect replay sample (independent learner: only this agent's buffer)
        obs_n = []
        obs_next_n = []
        act_n = []
        index = self.replay_sample_index
        obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index)
        obs_n.append(obs)
        obs_next_n.append(obs_next)
        act_n.append(act)

        # train q network
        num_sample = 1
        target_q = 0.0
        if self.u_estimation:
            target_u = 0.0
        for i in range(num_sample):
            target_act_next_n = [
                self.p_debug['target_act'](obs_next_n[0])
            ]  # WHY IS THIS ON AGENT[0]'s target_act ????????
            target_q_next = self.q_debug['target_q_values'](
                *(obs_next_n + target_act_next_n))
            target_q = self.args.gamma * (1.0 - done) * target_q_next
            if self.u_estimation:
                target_u_next = self.q_debug['target_u_values'](
                    *(obs_next_n + target_act_next_n))
                target_u = math.pow(self.args.gamma, 2.0) * (1.0 - done) * target_u_next
            # rew += (rew - self.lamda_constraint*(var_rew - self.args.alpha))
        target_q /= num_sample
        if self.u_estimation:
            target_u /= num_sample

        if not frozen:
            if self.u_estimation:
                q_loss, u_loss = self.q_train(*(obs_n + act_n + [target_q] + [target_u] + [rew]))
                if self.constrained:
                    q_loss2 = self.q_train2(*(obs_n + act_n + [target_q] + [target_u] + [rew]))
            else:
                q_loss = self.q_train(*(obs_n + act_n + [target_q] + [rew]))
                if self.constrained:
                    q_loss2 = self.q_train2(*(obs_n + act_n + [target_q] + [rew]))

            # train p network
            p_loss = self.p_train(*(obs_n + act_n))

            self.p_update()
            self.q_update()
            if self.u_estimation:
                self.u_update()

            # NOTE: update_v_constraint_only is assumed to be defined elsewhere
            # (e.g. a module-level flag); it is not set in this class.
            if update_v_constraint_only and not self.constrained:
                v_constraint_loss = self.q_train3(*(obs_n + act_n + [target_q] + [rew]))
            else:
                v_constraint_loss = 0.0

        if self.constrained:
            lamda_constraint = np.array(self.q_debug['lamda_constraint'].eval()).mean()
            if lamda_constraint <= 0:
                print("Value of Lamda violated", lamda_constraint)
        else:
            lamda_constraint = 0.0

        if self.constraint_type == "CVAR":
            v_constraint = np.array(self.q_debug['v_constraint'].eval()).mean()
        else:
            v_constraint = 0.0

        if self.u_estimation:
            var_rew = np.array(self.q_debug['var'](
                *(obs_n + act_n + [target_q] + [target_u] + [rew]))).mean()
        else:
            var_rew = np.array(self.q_debug['var'](
                *(obs_n + act_n + [target_q] + [rew]))).mean()

        if self.constrained and self.constraint_type == "CVAR":
            cvar = np.array(self.q_debug['cvar'](*(obs_n + act_n + [target_q] + [rew]))).mean()
        else:
            cvar = 0.0

        if not frozen:
            q_loss_mean = np.asarray(q_loss).mean()
            if self.u_estimation:
                u_loss_mean = np.asarray(u_loss).mean()
            else:
                u_loss_mean = 0.0
            p_loss_mean = np.asarray(p_loss).mean()
            if self.constrained:
                q_loss2_mean = np.asarray(q_loss2).mean()
            else:
                q_loss2_mean = 0.0
        else:
            q_loss_mean = 0.0
            u_loss_mean = 0.0
            p_loss_mean = 0.0
            q_loss2_mean = 0.0

        q_values = np.asarray(self.q_debug['q_values'](*(obs_n + act_n)))
        # print('q_values', q_values.shape)
        mean_q_values = np.mean(q_values)
        std_q_values = np.std(q_values)

        return [
            q_loss_mean, u_loss_mean, q_loss2_mean, p_loss_mean,
            np.mean(rew), var_rew, cvar, lamda_constraint, v_constraint,
            mean_q_values, std_q_values
        ]
class MADDPGAgentTrainer(AgentTrainer):
    def __init__(self, name, model, obs_shape_n, act_space_n, agent_index,
                 args, agent_type="good", local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(
                U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())

        if agent_type == "good":
            self.mic = float(args.good_mic)
        else:
            self.mic = float(args.adv_mic)
        print("MIC for ", agent_type, " agent is ", self.mic)
        self.agent_type = agent_type

        # maintain a multivariate Gaussian over actions for each agent
        self.multivariate_mean = None
        self.multivariate_cov = None
        self.margian_aprox_lr = 1e-2
        self.action_history = []

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            mut_inf_coef=self.mic,
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            mut_inf_coef=self.mic,
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)

        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None

    def sleep_regimen(self):
        return self.args.sleep_regimen

    def agent_mic(self):
        return self.mic

    def action(self, obs):
        action = self.act(obs[None])[0]
        if len(self.replay_buffer) > self.max_replay_buffer_len:
            # don't add random warm-up actions to the action history
            self.action_history.append(action)
        if self.mic > 0 and len(self.action_history) >= 100:
            actions = np.stack(self.action_history)
            act_mu = actions.mean(axis=0)
            act_std = actions.std(axis=0)
            if self.multivariate_mean is None:
                self.multivariate_mean = act_mu
            else:
                previous_mean = self.multivariate_mean
                self.multivariate_mean = (
                    (1 - self.margian_aprox_lr) * self.multivariate_mean) + (
                        self.margian_aprox_lr * act_mu)
            if self.multivariate_cov is None:
                self.multivariate_cov = np.diag(act_std)
            else:
                cov = (self.margian_aprox_lr * np.diag(act_std) +
                       (1 - self.margian_aprox_lr) * self.multivariate_cov)
                mom_1 = (self.margian_aprox_lr * np.square(np.diag(act_mu))) + (
                    (1 - self.margian_aprox_lr) * np.square(np.diag(previous_mean)))
                mom_2 = np.square((self.margian_aprox_lr * np.diag(act_mu)) +
                                  (1 - self.margian_aprox_lr) * np.diag(previous_mean))
                self.multivariate_cov = cov + mom_1 - mom_2
        if len(self.action_history) > 100:
            self.action_history.pop(0)
        return action

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))

    def preupdate(self):
        self.replay_sample_index = None

    def update(self, agents, t, sleeping=False):
        if len(self.replay_buffer) < self.max_replay_buffer_len:
            # replay buffer is not large enough
            return
        if not t % 100 == 0:  # only update every 100 steps
            return

        self.replay_sample_index = self.replay_buffer.make_index(self.args.batch_size)
        obs_n = []
        obs_next_n = []
        act_n = []
        index = self.replay_sample_index
        for i in range(self.n):
            obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index)
            obs_n.append(obs)
            obs_next_n.append(obs_next)
            act_n.append(act)
        obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index)

        mir_penalty = 0
        if self.mic > 0 and (not self.args.sleep_regimen or
                             (self.args.sleep_regimen and sleeping)):
            # If the sleep regimen is on, only use mic when sleeping
            try:
                multivar = multivariate_normal(self.multivariate_mean, self.multivariate_cov)
                logp_phi = multivar.logpdf(act)
                logp_phi = logp_phi.reshape(self.args.batch_size, )
                p_phi = multivar.pdf(act)
                p_phi = p_phi.reshape(self.args.batch_size, )

                action_mean = np.mean(act, axis=0)
                action_std = np.std(act, axis=0)
                action_cov = np.diag(action_std)
                policy_multivar = multivariate_normal(action_mean, action_cov)
                logp_pi = policy_multivar.logpdf(act)
                logp_pi = logp_pi.reshape(self.args.batch_size, )
                p_pi = policy_multivar.pdf(act)
                p_pi = p_pi.reshape(self.args.batch_size, )

                phi_entropy = -1 * np.sum(logp_phi * p_phi)
                pi_entropy = -1 * np.sum(logp_pi * p_pi)
                mir_penalty = self.mic * (phi_entropy - pi_entropy)
            except:
                mir_penalty = 0
            print(mir_penalty)

        num_sample = 1
        target_q = 0.0
        for i in range(num_sample):
            target_act_next_n = [
                agents[i].p_debug['target_act'](obs_next_n[i])
                for i in range(self.n)
            ]
            target_q_next = self.q_debug['target_q_values'](
                *(obs_next_n + target_act_next_n))
            target_q += (rew - mir_penalty) + self.args.gamma * (1.0 - done) * target_q_next
        target_q /= num_sample
        # print(target_q)
        # assert (False)  # leftover debugging halt, disabled so the update can run
        q_loss = self.q_train(*(obs_n + act_n + [target_q]))

        # train p network
        p_loss = self.p_train(*(obs_n + act_n))

        self.p_update()
        self.q_update()

        return [
            q_loss, p_loss,
            np.mean(target_q),
            np.mean(rew),
            np.mean(target_q_next),
            np.std(target_q)
        ]
class MADDPGAgentTrainer(AgentTrainer):
    def __init__(self, name, model, obs_shape_n, act_space_n, agent_index,
                 args, local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        obs_ph_mems = []
        for i in range(args.num_groups):
            # assumes agents have same observation shape
            obs_ph_mems.append(
                U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_mems,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units,
            num_groups=args.num_groups)
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_mems,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units,
            num_groups=args.num_groups)

        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None

    def action(self, obs):
        return self.act(obs[None])[0]

    def experience(self, obs, act, rew, new_obs, done, terminal, emergency_score, group_members):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done),
                               emergency_score, group_members)

    def preupdate(self):
        self.replay_sample_index = None

    def update(self, agents, t):
        if len(self.replay_buffer) < self.max_replay_buffer_len:
            # replay buffer is not large enough
            return
        if not t % 100 == 0:  # only update every 100 steps
            return

        self.replay_sample_index = self.replay_buffer.make_index(self.args.batch_size)
        # collect replay sample from all agents
        obs_n = []
        obs_next_n = []
        act_n = []
        emerg_n = []
        mems_n = []
        index = self.replay_sample_index
        obs, act, rew, obs_next, done, emerg, mems = self.replay_buffer.sample_index(index)
        for i in range(self.n):
            obs_i, act_i, _, obs_next_i, _, emerg_i, mems_i = agents[i].replay_buffer.sample_index(index)
            obs_n.append(obs_i)
            obs_next_n.append(obs_next_i)
            act_n.append(act_i)
            emerg_n.append(emerg_i)
            mems_n.append(mems_i)

        # train q network
        num_sample = 1
        target_q = 0.0
        for i in range(num_sample):
            target_act_next_n = [
                agents[i].p_debug['target_act'](obs_next_n[i])
                for i in range(self.n)
            ]  # 9*1024*(act_size)
            obs_next_mems = []
            target_act_next_mems = []
            for b, group_members in enumerate(mems):
                # potential performance optimization here
                curr_obs_next_mems = [obs_next_n[self.agent_index][b]]
                curr_act_next_mems = [target_act_next_n[self.agent_index][b]]  # 3*(obs_size)
                for i in group_members:
                    if i == self.agent_index:
                        continue
                    curr_obs_next_mems.append(obs_next_n[i][b])
                    curr_act_next_mems.append(target_act_next_n[i][b])
                obs_next_mems.append(curr_obs_next_mems)
                target_act_next_mems.append(curr_act_next_mems)
            target_act_next_mems = np.swapaxes(target_act_next_mems, 0, 1)
            obs_next_mems = np.swapaxes(obs_next_mems, 0, 1)
            input_list = list(obs_next_mems) + list(target_act_next_mems)
            target_q_next = self.q_debug['target_q_values'](*input_list)
            target_q += rew + self.args.gamma * (1.0 - done) * target_q_next
            """
            target_act_next_mems = []
            for b, group_members in enumerate(mems):
                # potential performance optimization here
                target_act_next_mems.append([target_act_next_n[i][b] for i in group_members])
            """
        """
        ### OLD CODE ###
        target_act_next_n = [agents[i].p_debug['target_act'](obs_next_n[i]) for i in range(self.n)]
        target_q_next = self.q_debug['target_q_values'](*(obs_next_n + target_act_next_n))
        target_q += rew + self.args.gamma * (1.0 - done) * target_q_next
        """
        target_q /= num_sample

        act_mems = []
        obs_mems = []
        for b, group_members in enumerate(mems):
            # potential performance optimization here
            curr_obs_mems = [obs_n[i][b] for i in group_members]
            curr_act_mems = [act_n[i][b] for i in group_members]
            obs_mems.append(curr_obs_mems)
            act_mems.append(curr_act_mems)
        act_mems = np.swapaxes(act_mems, 0, 1)
        obs_mems = np.swapaxes(obs_mems, 0, 1)

        input_list = list(obs_mems) + list(act_mems) + [target_q]
        q_loss = self.q_train(*input_list)

        # train p network
        input_list = list(obs_mems) + list(act_mems)
        p_loss = self.p_train(*input_list)

        self.p_update()
        self.q_update()

        return [
            q_loss, p_loss,
            np.mean(target_q),
            np.mean(rew),
            np.mean(target_q_next),
            np.std(target_q)
        ]
class MADDPGAgentTrainer():
    """Train MADDPG Agent.

    The vast majority of the modifications to this class (as well as other
    parts of this file) are drawn from
    https://github.com/sunshineclt/maddpg/blob/master/maddpg/trainer/maddpg.py.
    """

    def __init__(self, name, model_value, model_policy, obs_shape_n, act_space_n,
                 agent_index, args, hparams, summary_writer=None,
                 local_q_func=False, rngseed=None):
        self.name = name
        self.rngseed = rngseed
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        self.hparams = hparams
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())

        # Create all the functions necessary to train the model
        # train critic
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model_value,
            optimizer=tf.train.AdamOptimizer(learning_rate=hparams['learning_rate']),
            grad_norm_clipping=hparams['grad_norm_clipping'],
            local_q_func=local_q_func,
            num_units=args.num_units
        )
        # train policy
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model_policy,
            q_func=model_value,
            optimizer=tf.train.AdamOptimizer(learning_rate=hparams['learning_rate']),
            grad_norm_clipping=hparams['grad_norm_clipping'],
            local_q_func=local_q_func,
            num_units=args.num_units
        )

        # Create experience buffer
        self.replay_buffer = ReplayBuffer(hparams['replay_buffer_len'], self.rngseed)
        try:
            if hparams['test_saving']:
                self.max_replay_buffer_len = 100
            else:
                self.max_replay_buffer_len = hparams['batch_size'] * args.max_episode_len
        except KeyError:
            self.max_replay_buffer_len = hparams['batch_size'] * args.max_episode_len
        self.replay_sample_index = None
        self.summary_writer = summary_writer

    def action(self, obs):
        # return self.act(obs[None])[0]
        theac = self.act(obs[None])[0]
        # print("p", self.p_debug["p_values"](obs[None])[0])
        # print("act", self.act(obs[None])[0])
        if any(np.isnan(theac)):
            print('NaN action in MADDPGAgentTrainer')
            pdb.set_trace()
            print('NaN action in MADDPGAgentTrainer')
        return theac

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))

    def preupdate(self):
        self.replay_sample_index = None

    def set_memory_index(self, replay_sample_index):
        self.replay_sample_index = replay_sample_index

    def get_memory_index(self, batch_size):
        return self.replay_buffer.make_index(batch_size)

    def get_replay_data(self):
        return self.replay_buffer.sample_index(self.replay_sample_index)

    def get_target_act(self, obs):
        return self.p_debug['target_act'](obs[self.agent_index])

    def update(self, agents, t, episodenum, savestuff=False):
        """Pull from replay buffer and update policy and critic."""
        # replay buffer is not large enough
        if len(self.replay_buffer) < self.max_replay_buffer_len:
            return False, []
        if not t % 100 == 0:  # only update every 100 steps
            return False, []

        self.replay_sample_index = \
            self.replay_buffer.make_index(self.hparams['batch_size'])
        # collect replay sample from all agents
        obs_n = []
        obs_next_n = []
        act_n = []
        index = self.replay_sample_index
        for i in range(self.n):
            obs, act, rew, obs_next, done = \
                agents[i].replay_buffer.sample_index(index)
            obs_n.append(obs)
            obs_next_n.append(obs_next)
            act_n.append(act)
        obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index)

        # train Q-function network
        num_sample = 1
        target_q = 0.0
        for i in range(num_sample):
            target_act_next_n = \
                [agents[i].p_debug['target_act'](obs_next_n[i]) for i in range(self.n)]
            target_q_next = self.q_debug['target_q_values'](*(obs_next_n + target_act_next_n))
            target_q += rew + self.hparams['gamma'] * (1.0 - done) * target_q_next
        target_q /= float(num_sample)
        q_loss, q_loss_summary = self.q_train(*(obs_n + act_n + [target_q]))
        if q_loss > 10000000:
            print('Huge Q loss! Seed was {}'.format(self.rngseed))
            pdb.set_trace()
            print('Huge Q loss! Seed was {}'.format(self.rngseed))

        # train policy network
        p_loss, p_summary = self.p_train(*(obs_n + act_n))
        if p_loss > 10000000:
            print('Huge policy loss! Seed was {}'.format(self.rngseed))
            pdb.set_trace()
            print('Huge policy loss! Seed was {}'.format(self.rngseed))

        if self.summary_writer is not None and savestuff:
            self.summary_writer.add_summary(p_summary, global_step=episodenum)
            self.summary_writer.add_summary(q_loss_summary, global_step=episodenum)

        self.p_update()  # update policy
        self.q_update()  # update critic

        return True, [q_loss, p_loss, np.mean(target_q), np.mean(rew),
                      np.mean(target_q_next), np.std(target_q)]
class MADDPGAgentTrainer(AgentTrainer):
    """ Agent Trainer using MADDPG Algorithm """

    def __init__(self, name, model, obs_shape_n, act_space_n, agent_index,
                 args, role="", local_q_func=False):
        """
        Args:
            name (str): Name of the agent
            model (function): MLP Neural Network model for the agent.
            obs_shape_n (tf.placeholder): Placeholder for the observation space of all agents
            act_space_n (list): A list of the action spaces for all agents
            agent_index (int): Agent index number
            args (argparse.Namespace): Parsed commandline arguments object
            role (str): Role of the agent, i.e. adversary
            local_q_func (boolean): Flag for using local q function
        """
        # super(MADDPGAgentTrainer, self).__init__()
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args

        # Set up observation space placeholder
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(
                tf_util.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)

        # Create experience buffer
        self.replay_buffer = ReplayBuffer(int(1e6))
        self.max_replay_buffer_len = 30  # args.batch_size * args.max_episode_len TODO: Change back
        self.replay_sample_index = None

    def action(self, obs):
        """ Retrieves the action for an agent from the P network given the observations

        Args:
            obs (np.array): Observations of the world for an agent

        Returns:
            Action for an agent
        """
        return self.act(obs[None])[0]

    def experience(self, obs, act, rew, new_obs, done, terminal):
        """ Store a transition in the replay buffer.

        Args:
            obs (np.array): Observations of the world for an agent
            act (list): Action for an agent
            rew (float): Reward for an agent
            new_obs (np.array): New observations of the world for an agent
            done (bool): Done flag for an agent
            terminal (boolean): Flag for whether the final episode step has been reached.
        """
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))

    def preupdate(self):
        """ Reset replay_sample_index to None. """
        self.replay_sample_index = None

    def update(self, agents, steps):
        """ Update agent networks

        Args:
            agents (list): List of MADDPGAgentTrainer objects
            steps (int): Current training step

        Returns:
            (list) Training loss for the agents
                [q_loss, p_loss, mean_target_q, mean_reward, mean_target_q_next, std_target_q]
        """
        # Replay buffer is not large enough
        if len(self.replay_buffer) < self.max_replay_buffer_len:
            return

        # Only update every 100 steps
        if not steps % 100 == 0:
            return

        # Collect replay sample from all agents
        obs_n = []
        obs_next_n = []
        act_n = []
        self.replay_sample_index = self.replay_buffer.make_index(self.args.batch_size)
        self_index = self.replay_sample_index
        for i in range(self.n):
            index = agents[i].replay_buffer.make_index(self.args.batch_size)
            obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index)
            obs_n.append(obs)
            obs_next_n.append(obs_next)
            act_n.append(act)
        obs, act, rew, obs_next, done = self.replay_buffer.sample_index(self_index)

        # Train Q Network
        num_sample = 1
        target_q = 0.0
        target_q_next = 0.0
        for i in range(num_sample):
            target_act_next_n = [
                agents[i].p_debug['target_act'](obs_next_n[i])
                for i in range(self.n)
            ]
            target_q_next = self.q_debug['target_q_values'](
                *(obs_next_n + target_act_next_n))
            target_q += rew + self.args.gamma * (1.0 - done) * target_q_next
        target_q /= num_sample
        q_loss = self.q_train(*(obs_n + act_n + [target_q]))

        # Train P Network
        p_loss = self.p_train(*(obs_n + act_n))

        self.p_update()
        self.q_update()

        return [
            q_loss, p_loss,
            np.mean(target_q),
            np.mean(rew),
            np.mean(target_q_next),
            np.std(target_q)
        ]
class MADDPGAgentTrainer(AgentTrainer):
    def __init__(self, name, model, obs_shape_n, act_space_n, agent_index,
                 args, local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(
                U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units,
            # reuse=tf.compat.v1.AUTO_REUSE,
        )
        self.act, self.p_train, self.p_update, self.p_debug, num_actions = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units,
            # reuse=tf.compat.v1.AUTO_REUSE,
        )

        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6, args.batch_size, num_actions,
                                          obs_ph_n[0].shape[1])
        # self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.max_replay_buffer_len = args.batch_size
        # I mean this is how it should be. This is what we're actually doing...
        self.replay_sample_index = None

    def action(self, obs):
        return self.act(obs[None])[0]

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))

    def preupdate(self):
        self.replay_sample_index = None

    def update(self, agents, t, replay_index=None):
        single_nn = replay_index is not None
        if single_nn:
            assert self.agent_index == 0
        else:
            replay_index = self.agent_index
        if len(self.replay_buffer) < self.max_replay_buffer_len:
            # replay buffer is not large enough
            return
        if not t % 100 == 0:  # only update every 100 steps
            return

        self.replay_sample_index = self.replay_buffer.make_index(self.args.batch_size)
        # collect replay sample from all agents
        # This is silly. We only need to do this once per step, not for each agent.
        # That is true also when we have multiple nn.
        obs_n = []
        obs_next_n = []
        act_n = []
        index = self.replay_sample_index
        for i in range(self.n):
            j = (i + replay_index) % self.n if single_nn else i
            obs, act, rew, obs_next, done = agents[j].replay_buffer.sample_index(index)
            obs_n.append(obs)
            obs_next_n.append(obs_next)
            act_n.append(act)
        obs, act, rew, obs_next, done = agents[replay_index].replay_buffer.sample_index(index)

        # train q network. I don't understand how this matters. Where do we use the q-network????
        # we have a separate q-network in p_train; are they connected because they share
        # the same name in tf??
        num_sample = 1
        target_q = 0.0
        for i in range(num_sample):
            if single_nn:
                target_act_next_n = [
                    self.p_debug['target_act'](obs_next_n[i])
                    for i in range(self.n)
                ]
            else:
                target_act_next_n = [
                    agents[i].p_debug['target_act'](obs_next_n[i])
                    for i in range(self.n)
                ]
            target_q_next = self.q_debug['target_q_values'](
                *(obs_next_n + target_act_next_n))
            target_q += rew + self.args.gamma * (1.0 - done) * target_q_next
        target_q /= num_sample
        q_loss = self.q_train(*(obs_n + act_n + [target_q]))

        # train p network
        p_loss = self.p_train(*(obs_n + act_n))

        self.p_update()
        self.q_update()

        return [
            q_loss, p_loss,
            np.mean(target_q),
            np.mean(rew),
            np.mean(target_q_next),
            np.std(target_q)
        ]
class IBMACAgentTrainer(AgentTrainer): def __init__(self, name, before_com_model, channel, after_com_model, critic_mlp_model, obs_shape_n, act_space_n, args, local_q_func=False): self.name = name self.n = len(obs_shape_n) self.args = args obs_ph_n = [] for i in range(self.n): obs_ph_n.append( U.BatchInput(obs_shape_n[i], name="observation_" + str(i)).get()) # Create all the functions necessary to train the model self.q_train, self.q_update, self.q_debug = q_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, q_func=critic_mlp_model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units, ) self.act, self.p_train, self.p_update, self.p_debug = p_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, before_com_func=before_com_model, channel=channel, after_com_func=after_com_model, q_func=critic_mlp_model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units, beta=args.beta, ibmac_com=args.ibmac_com, ) # Create experience buffer self.replay_buffer = ReplayBuffer(1e6) # self.max_replay_buffer_len = 50 * args.max_episode_len self.max_replay_buffer_len = args.batch_size * args.max_episode_len self.replay_sample_index = None self.message_1_for_record = [] def action(self, obs_n, is_norm_training=False, is_inference=False): obs = [obs[None] for obs in obs_n] message_n = self.p_debug['check_message_n']( *(list(obs) + [is_norm_training, is_inference])) self.message_1_for_record.append(message_n[0]) if len(self.message_1_for_record) % 2500 == 0: # print(np.var(self.message_1_for_record, axis=0)) # print(0.5 * np.log(2 * np.pi * np.mean(np.var(self.message_1_for_record, axis=0))) + 0.5) self.message_1_for_record = [] return self.act(*(list(obs) + [is_norm_training, is_inference])) def experience(self, obs, act, rew, new_obs, done, terminal): # Store transition in the replay buffer. 
self.replay_buffer.add(obs, act, rew, new_obs, [float(d) for d in done]) def preupdate(self): self.replay_sample_index = None def update(self, agents, t): if len( self.replay_buffer ) < self.max_replay_buffer_len: # replay buffer is not large enough return if not t % 100 == 0: # only update every 100 steps return is_norm_training = True is_inference = False self.replay_sample_index = self.replay_buffer.make_index( self.args.batch_size) # collect replay sample from all agents obs_n = [] obs_next_n = [] act_n = [] index = self.replay_sample_index samples = self.replay_buffer.sample_index(index) obs_n, act_n, rew_n, obs_next_n, done_n = [ np.swapaxes(item, 0, 1) for item in samples ] # for i in range(self.n): # obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index) # obs_n.append(obs) # obs_next_n.append(obs_next) # act_n.append(act) # obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index) # train q network num_sample = 1 target_q = 0.0 # print(len(obs_next_n)) for i in range(num_sample): target_act_next_n = self.p_debug['target_act']( *(list(obs_next_n) + [is_norm_training, is_inference])) target_q_next_n = self.q_debug['target_q_values']( *(list(obs_next_n) + list(target_act_next_n) + [is_norm_training, is_inference])) target_q_n = [ rew + self.args.gamma * (1.0 - done) * target_q_next for rew, done, target_q_next in zip(rew_n, done_n, target_q_next_n) ] target_q_n = [target_q / num_sample for target_q in target_q_n] q_loss = self.q_train(*(list(obs_n) + list(act_n) + target_q_n + [is_norm_training, is_inference])) # train p network p_loss = self.p_train(*(list(obs_n) + list(act_n) + [is_norm_training, is_inference])) self.p_update() self.q_update() # p_values = self.p_debug['p_values'](*(list(obs_n))) kl_loss = self.p_debug['kl_loss'](*(list(obs_n) + list(act_n) + [is_norm_training, is_inference])) # print('kl_loss', self.p_debug['kl_loss'](*(list(obs_n) + list(act_n)))) # if t % 5000 == 0: # print('p_values', p_values[0][0]) # print('check_value', self.p_debug['p_values'](*(list(obs_n)))[0][0]) # print('check_mu', self.p_debug['check_mu'](*(list(obs_n)))[0][0]) # print('check_log', self.p_debug['check_log'](*(list(obs_n)))[0][0]) # print('kl_loss', kl_loss) # message_n = self.p_debug['check_message_n'](*(list(obs_n)+[is_norm_training, is_inference])) # hiddens_n = self.p_debug['check_hiddens_n'](*list(obs_n)) # print("message_n", message_n[0][0]) # for message in message_n: # print("mean, var", np.mean(message, axis=0), np.var(message,axis=0)) # print("hiddens_n", hiddens_n[0][0]) # entropy = self.p_debug['check_entropy'](*list(obs_n)) # print("entropy",np.mean(entropy, (1,2))) return [ q_loss, p_loss, np.mean(target_q), np.mean(rew_n), np.mean(target_q_next_n), np.std(target_q), kl_loss ]
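# The commented-out diagnostic in IBMACAgentTrainer.action() above estimates the differential
# entropy of the recorded messages under a Gaussian fit: 0.5 * log(2 * pi * var) + 0.5.
# A small helper computing the same quantity, assuming messages are stacked along axis 0
# (a sketch, not part of the original trainer).
import numpy as np

def gaussian_message_entropy(messages):
    """Differential entropy of a Gaussian with the empirical variance of the messages."""
    var = np.mean(np.var(np.asarray(messages), axis=0))
    return 0.5 * np.log(2.0 * np.pi * var) + 0.5

print(gaussian_message_entropy(np.random.randn(1000, 4)))  # close to 0.5*log(2*pi*e) ~= 1.42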
class COMAAgentTrainer(AgentTrainer):
    def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, action_number, args):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())
        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            num_units=args.num_units,
            num_outputs=action_number)
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=False,
            num_units=args.num_units,
            num_outputs=action_number)
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None

    def get_inputs(self):
        pass

    def action(self, obs):
        return self.act(obs[None])[0]

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))

    def preupdate(self):
        self.replay_sample_index = None

    def update(self, agents, t):
        if len(self.replay_buffer) < self.max_replay_buffer_len:  # replay buffer is not large enough
            return
        if not t % 100 == 0:  # only update every 100 steps
            return
        self.replay_sample_index = self.replay_buffer.make_index(self.args.batch_size)
        # collect replay sample from all agents
        obs_n = []
        obs_next_n = []
        act_n = []
        index = self.replay_sample_index
        # each agent draws a replay batch with the shared indices
        for i in range(self.n):
            obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index)
            obs_n.append(obs)
            obs_next_n.append(obs_next)
            act_picked = [softmax_act.tolist().index(max(softmax_act)) for softmax_act in act]
            act_n.append(act_picked)
        # replay batch of the current trainer
        obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index)
        # train q network
        # one-step lookahead, i.e. a one-step TD target
        num_sample = 1
        target_q = 0.0
        for i in range(num_sample):
            # next-step actions of all agents; each agent decides from its own local next-step observation
            target_act_next_n = [
                agents[i].p_debug['target_act'](obs_next_n[i]) for i in range(self.n)
            ]
            act_picked = []
            for i in range(self.n):
                act_picked += [
                    softmax_act.tolist().index(max(softmax_act))
                    for softmax_act in target_act_next_n[i]
                ]
            # use the target network to obtain the target q values
            target_q_next = self.q_debug['target_q_values'](*(obs_next_n + act_picked))
            # the Q network outputs one value per candidate action of the current agent;
            # the loss needs the Q of the action that was actually taken
            target_q_picked_next = [
                q[act] for act, q in zip(act_picked[self.agent_index], target_q_next)
            ]
            target_q += rew + self.args.gamma * (1.0 - done) * target_q_picked_next
        target_q /= num_sample
        q_loss = self.q_train(*(obs_n + act_n + [target_q]))
        # train p network
        p_loss = self.p_train(*(obs_n + act_n))
        self.p_update()
        self.q_update()
        return [q_loss, p_loss, np.mean(target_q), np.mean(rew), np.mean(target_q_next), np.std(target_q)]
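# The softmax_act.tolist().index(max(softmax_act)) idiom used in the COMA update above is a
# per-sample argmax over the policy's softmax output. A vectorized equivalent (a sketch, not
# taken from the source):
import numpy as np

def pick_actions(softmax_batch):
    """Return the index of the highest-probability action for each sample in the batch."""
    return np.argmax(np.asarray(softmax_batch), axis=-1).tolist()

print(pick_actions([[0.1, 0.7, 0.2], [0.5, 0.3, 0.2]]))  # [1, 0]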
class MADDPGApproxAgentTrainer(AgentTrainer): def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args, use_approx_policy = True, sync_replay = True, local_q_func=False, update_gap=100): self.use_approx_policy = use_approx_policy self.name = name self.n = len(obs_shape_n) self.agent_index = agent_index args.num_units = 64 self.sync_replay = sync_replay self.counter = 0 self.args = args self.update_gap = update_gap obs_ph_n = [] for i in range(self.n): obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get()) # Create all the functions necessary to train the model self.q_train, self.q_update, self.q_sync, self.q_debug = q_train( scope=self.name, make_obs_ph_n=obs_ph_n, # [lambda name: U.BatchInput(obs_shape, name=name) for obs_shape in obs_shape_n], act_space_n=act_space_n, q_index=agent_index, q_func=model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units ) self.act, self.p_train, self.p_update, self.p_sync, self.p_debug = p_train( scope=self.name, make_obs_ph_n=obs_ph_n, # [lambda name: U.BatchInput(obs_shape, name=name) for obs_shape in obs_shape_n], act_space_n=act_space_n, p_index=agent_index, p_func=model, q_func=model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units ) self.approx_act, self.approx_p_train, self.approx_p_update, self.approx_p_sync, self.approx_p_debug = [],[],[],[],[] for i in range(self.n): if i == self.agent_index: t_act, t_p_train, t_p_update, t_p_sync, t_p_debug = self.act, self.p_train, self.p_update, self.p_sync, self.p_debug else: t_act, t_p_train, t_p_update, t_p_sync, t_p_debug = p_approx_train( scope=self.name+'approx_p_%d'%i, make_obs_ph_n=obs_ph_n, # [lambda name: U.BatchInput(obs_shape, name=name) for obs_shape in obs_shape_n], act_space_n=act_space_n, p_index=i, p_func=model, q_func=model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units ) self.approx_act.append(t_act) self.approx_p_train.append(t_p_train) self.approx_p_update.append(t_p_update) self.approx_p_sync.append(t_p_sync) self.approx_p_debug.append(t_p_debug) # Create experience buffer self.replay_buffer = ReplayBuffer(int(1e6)) self.max_replay_buffer_len = args.batch_size * args.max_episode_len self.replay_sample_index = None def action(self, obs): return self.act(obs[None])[0] # return self.p_debug['target_act'](obs[None])[0] def experience(self, obs, act, rew, new_obs, done, terminal): # Store transition in the replay buffer. 
self.replay_buffer.add(obs, act, rew, new_obs, float(done)) def sync_target_nets(self): for i in range(self.n): self.approx_p_sync[i]() self.q_sync() def preupdate(self): self.replay_sample_index = None self.counter += 1 def update(self, agents): # replay buffer is not large enough if len(self.replay_buffer) < self.max_replay_buffer_len: return None if not self.counter % self.update_gap == 0: return None # agree on a replay samples across all agents # as in https://arxiv.org/abs/1703.06182 if self.sync_replay: if agents[0].replay_sample_index is None: agents[0].replay_sample_index = agents[0].replay_buffer.make_index(agents[0].args.batch_size) self.replay_sample_index = agents[0].replay_sample_index else: self.replay_sample_index = self.replay_buffer.make_index(self.args.batch_size) # collect replay sample from all agents obs_n = [] obs_next_n = [] act_n = [] index = self.replay_sample_index for i in range(self.n): obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index) obs_n.append(obs) obs_next_n.append(obs_next) act_n.append(act) obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index) # evaluate kl divergence between approximate policy and the target policy target_logits_n = [agents[i].p_debug['target_p_values'](obs_n[i]) for i in range(self.n)] kl_loss = 0.0 for i in range(self.n): if i == self.agent_index: continue kl_loss += self.approx_p_debug[i]['kl_loss'](obs_n[i], target_logits_n[i]) # collect latest samples for approximate policy latest_obs_n = [] latest_act_n = [] latest_index = self.replay_buffer.make_latest_index(self.update_gap) for i in range(self.n): # TODO: now we approximate the *true policy*, but what we want is actually the target_policy! # Shall we approximate the target net instead??? t_obs, t_act, _, _, _ = agents[i].replay_buffer.sample_index(latest_index) #t_act = agents[i].p_debug['target_act'](t_obs) latest_obs_n.append(t_obs) latest_act_n.append(t_act) # train approximate p network for i in range(self.n): if i == self.agent_index: continue self.approx_p_train[i](*(latest_obs_n + latest_act_n)) self.approx_p_update[i]() # train q network if self.use_approx_policy: target_act_next_n = [self.approx_p_debug[i]['target_act'](obs_next_n[i]) for i in range(self.n)] else: # use true policy target_act_next_n = [agents[i].p_debug['target_act'](obs_next_n[i]) for i in range(self.n)] target_q_next = self.q_debug['target_q_values'](*(obs_next_n + target_act_next_n)) target_q = rew + self.args.gamma * (1.0 - done) * target_q_next q_loss = self.q_train(*(obs_n + act_n + [target_q])) # train p network p_loss = self.p_train(*(obs_n + act_n)) self.p_update() self.q_update() return [q_loss, p_loss, kl_loss]
class MADDPGAgentTrainer(AgentTrainer):
    # model is the neural-network function used to build both the actor and the critic
    def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args, local_q_func=False):  # local_q_func: whether to train with plain (local) DDPG instead
        self.name = name
        self.n = len(obs_shape_n)  # total number of agents
        self.agent_index = agent_index  # index of this agent
        self.args = args  # training hyper-parameters parsed from the command line
        obs_ph_n = []
        for i in range(self.n):
            # batch placeholders for every agent's observations, one per agent,
            # sized according to each agent's observation shape
            obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())
        # Create all the functions necessary to train the model
        # critic training op, target-critic update op, and a debug dict exposing the
        # q values and target-q values (already wrapped as session-backed functions)
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units
        )
        # action function, policy training op, target-policy update op, and a debug dict
        # exposing the policy values and the target policy's sampled actions
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None

    def action(self, obs):  # choose an action
        return self.act(obs[None])[0]  # obs[None] adds a batch dimension; return the first (only) action of the batch

    def experience(self, obs, act, rew, new_obs, done, terminal):  # collect experience for this agent
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))

    def preupdate(self):
        self.replay_sample_index = None  # clear the cached batch of sample indices

    def update(self, agents, t):  # train this agent from replayed experience; only runs when t is a multiple of 100
        if len(self.replay_buffer) < self.max_replay_buffer_len:  # replay buffer is not large enough
            return
        if not t % 100 == 0:  # only update every 100 steps
            return
        self.replay_sample_index = self.replay_buffer.make_index(self.args.batch_size)  # a batch-sized array of randomly generated indices
        # collect replay sample from all agents
        obs_n = []
        obs_next_n = []
        act_n = []
        index = self.replay_sample_index  # the shared index array
        for i in range(self.n):  # sample from every agent's replay buffer
            obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index)  # one batch of agent i's experience
            obs_n.append(obs)  # obs_n collects one batch per agent; likewise for obs_next_n and act_n
            obs_next_n.append(obs_next)
            act_n.append(act)
        obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index)  # sample this agent's own experience with the same indices
        # obs_n now holds n entries, each a batch_size-sized set of experiences for one agent

        # train q network
        num_sample = 1
        target_q = 0.0
        for i in range(num_sample):
            # p_debug is a dict whose values are callable functions
            target_act_next_n = [
                agents[i].p_debug['target_act'](obs_next_n[i]) for i in range(self.n)
            ]  # next actions of all agents as predicted by their target policies; each agent's own target policy is used here, so there is no separate opponent model
            target_q_next = self.q_debug['target_q_values'](*(obs_next_n + target_act_next_n))  # target critic's Q value for the next joint observation and joint action
            target_q += rew + self.args.gamma * (1.0 - done) * target_q_next  # the TD target
        target_q /= num_sample
        q_loss = self.q_train(*(obs_n + act_n + [target_q]))  # train the critic on the sampled joint experience
        # train p network
        p_loss = self.p_train(*(obs_n + act_n))  # all observations and actions are fed because the critic guides the policy update; acting itself only uses the policy network
        self.p_update()  # soft-update the target policy after each training step (which itself runs every 100 env steps)
        self.q_update()  # likewise for the target critic
        return [
            q_loss, p_loss,
            np.mean(target_q),
            np.mean(rew),
            np.mean(target_q_next),
            np.std(target_q)
        ]
class MADDPGAgentTrainer(): def __init__(self, name, model_value, model_policy, obs_shape_n, act_space_n, agent_index, args, board_writer, local_q_func=False): self.name = name self.n = len(obs_shape_n) self.agent_index = agent_index self.args = args obs_ph_n = [] for i in range(self.n): obs_ph_n.append( U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get()) # Create all the functions necessary to train the model self.q_train, self.q_update, self.q_debug = q_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, q_index=agent_index, q_func=model_value, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=5, local_q_func=local_q_func, num_units=args.num_units) self.act, self.p_train, self.p_update, self.p_debug = p_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, p_index=agent_index, p_func=model_policy, q_func=model_value, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=5, local_q_func=local_q_func, num_units=args.num_units) # Create experience buffer self.replay_buffer = ReplayBuffer(1e6) self.max_replay_buffer_len = args.batch_size_q * args.max_episode_len self.replay_sample_index = None self.board_writer = board_writer def action(self, obs): # print("p", self.p_debug["p_values"](obs[None])[0]) # print("act", self.act(obs[None])[0]) return self.act(obs[None])[0] def experience(self, obs, act, rew, new_obs, done, terminal): # Store transition in the replay buffer. self.replay_buffer.add(obs, act, rew, new_obs, float(done)) def preupdate(self): self.replay_sample_index = None def set_memory_index(self, replay_sample_index): self.replay_sample_index = replay_sample_index def get_memory_index(self, batch_size): return self.replay_buffer.make_index(batch_size) def get_replay_data(self): return self.replay_buffer.sample_index(self.replay_sample_index) def get_target_act(self, obs): return self.p_debug['target_act'](obs[self.agent_index]) def update_q(self, t, obs_n, act_n, obs_next_n, target_act_next_n): obs, act, rew, obs_next, done = self.replay_buffer.sample_index( self.replay_sample_index) # train q network target_q = 0.0 target_q_next = self.q_debug['target_q_values'](*(obs_next_n + target_act_next_n)) target_q += rew + self.args.gamma * (1.0 - done) * target_q_next q_loss, q_loss_summary = self.q_train(*(obs_n + act_n + [target_q])) self.board_writer.add_summary(q_loss_summary, global_step=t) self.q_update() def update_p(self, t, obs_n, target_act_next_n): # train p network p_loss, p_summary = self.p_train(*(obs_n + target_act_next_n)) self.board_writer.add_summary(p_summary, global_step=t) self.p_update()
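# Unlike the other trainers, the variant above splits the update into set_memory_index /
# get_replay_data / get_target_act / update_q / update_p, so an outer loop has to coordinate
# the agents. A hypothetical driver showing how those calls could fit together (the function
# name, argument order, and overall structure are my assumptions, not from the source):
def joint_update(trainers, batch_size, t):
    """Drive one joint update across all trainers using a shared batch of indices."""
    index = trainers[0].get_memory_index(batch_size)
    for tr in trainers:
        tr.set_memory_index(index)
    obs_n, act_n, obs_next_n = [], [], []
    for tr in trainers:
        obs, act, rew, obs_next, done = tr.get_replay_data()  # rew/done are re-read inside update_q
        obs_n.append(obs)
        act_n.append(act)
        obs_next_n.append(obs_next)
    target_act_next_n = [tr.get_target_act(obs_next_n) for tr in trainers]
    for tr in trainers:
        tr.update_q(t, obs_n, act_n, obs_next_n, target_act_next_n)
        tr.update_p(t, obs_n, target_act_next_n)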
class MATD3AgentTrainer(AgentTrainer): def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args, local_q_func=False): self.name = name self.n = len(obs_shape_n) self.agent_index = agent_index self.args = args obs_ph_n = [] for i in range(self.n): obs_ph_n.append( U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get()) # Create all the functions necessary to train the model self.q_train1, self.q_update1, self.q_debug1 = q_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, agent_idx=agent_index, q_function_idx=1, q_func=model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units) self.q_train2, self.q_update2, self.q_debug2 = q_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, agent_idx=agent_index, q_func=model, q_function_idx=2, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units) self.act, self.p_train, self.p_update, self.p_debug = p_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, agent_idx=agent_index, p_func=model, q_func=model, #MLPmodel() optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units) # Create experience buffer self.replay_buffer = ReplayBuffer(1e6) self.min_replay_buffer_len = args.batch_size * args.max_episode_len self.replay_sample_index = None a = tf.summary.FileWriter("logdirMaddpg", tf.get_default_graph()) a.flush() a.close() def action(self, obs): return self.act(obs[None])[0] def experience(self, obs, act, rew, new_obs, done, terminal): # Store transition in the replay buffer. self.replay_buffer.add(obs, act, rew, new_obs, float(done)) def preupdate(self): self.replay_sample_index = None @property def q_debug(self): return self.q_debug1 def update(self, agents, train_step): if len( self.replay_buffer ) < self.min_replay_buffer_len: # replay buffer is not large enough return if not train_step % self.args.update_rate == 0: return self.replay_sample_index = self.replay_buffer.generate_sample_indices( self.args.batch_size) # collect replay sample from all agents obs_n = [] obs_next_n = [] act_n = [] index = self.replay_sample_index for i in range(self.n): obs, act, rew, obs_next, done = agents[ i].replay_buffer.sample_index(index) obs_n.append(obs) obs_next_n.append(obs_next) act_n.append(act) obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index) # train q network target_act_next_n = [ agents[i].p_debug['target_act'](obs_next_n[i]) for i in range(self.n) ] if self.args.use_critic_noise: for agent_idx in range(self.n): noise = np.random.normal( 0, self.args.critic_action_noise_stddev, size=target_act_next_n[agent_idx].shape) clipped_noise = np.clip(noise, -self.args.action_noise_clip, self.args.action_noise_clip) target_act_next_n[agent_idx] = (target_act_next_n[agent_idx] + clipped_noise).tolist() elif self.args.use_critic_noise_self: noise = np.random.normal( 0, self.args.critic_action_noise_stddev, size=target_act_next_n[self.agent_index].shape) clipped_noise = np.clip(noise, -self.args.action_noise_clip, self.args.action_noise_clip) target_act_next_n[self.agent_index] = target_act_next_n[ self.agent_index] + clipped_noise target_act_next_n = target_act_next_n.tolist() else: target_act_next_n = target_act_next_n target_q_next1 = self.q_debug1['target_q_values'](*(obs_next_n + target_act_next_n)) target_q_next2 = 
self.q_debug2['target_q_values'](*(obs_next_n + target_act_next_n))
        target_q_next = np.min([target_q_next1, target_q_next2], 0)  # clipped double-Q: element-wise minimum of the two target critics
        if self.args.critic_zero_if_done:
            done_cond = (done == 1.0)
            target_q_next[done_cond] = 0
        target_q = rew + self.args.gamma * target_q_next
        q_loss = self.q_train1(*(obs_n + act_n + [target_q]))
        q_loss = self.q_train2(*(obs_n + act_n + [target_q]))  # note: only the second critic's loss is kept and returned
        # train p network (delayed policy and target updates)
        if train_step % (self.args.update_rate * self.args.policy_update_rate) == 0:
            p_loss = self.p_train(*(obs_n + act_n))
            self.p_update()
            self.q_update1()
            self.q_update2()
        # print('Agent' + str(self.agent_index) + ' Qloss = ' + str(q_loss) + ' Ploss = ' + str(p_loss))
        # print('Replay buffer size:' + str(len(self.replay_buffer)))
        return [
            q_loss,
            np.mean(target_q),
            np.mean(rew),
            np.mean(target_q_next),
            np.std(target_q)
        ]
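# The MATD3 update above forms its target from the element-wise minimum of the two target
# critics (clipped double-Q) and optionally zeroes the bootstrap on terminal transitions.
# A framework-neutral sketch of that target; the 0.95 discount default is an assumption.
import numpy as np

def td3_target(rew, done, q1_next, q2_next, gamma=0.95):
    """Clipped double-Q target: y = r + gamma * min(Q1', Q2'), with no bootstrap when done."""
    q_next = np.minimum(q1_next, q2_next)
    q_next = np.where(done, 0.0, q_next)
    return rew + gamma * q_next

print(td3_target(np.array([1.0]), np.array([0.0]), np.array([2.0]), np.array([1.5])))  # [2.425]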
class MADDPGAgentTrainer(AgentTrainer): def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args, local_q_func=False): self.name = name # name of the agent self.n = len(obs_shape_n) # number of agents self.agent_index = agent_index # Index of the specific agent self.args = args # Settings of hyper-parameters obs_ph_n = [] for i in range(self.n): obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation"+str(i)).get()) # Creates a placeholder for a batch of tensors of a given shape and dtype. # [Create all the functions necessary to train the model] # train: U.function(inputs=obs_ph_n + act_ph_n + [target_ph], outputs=loss, updates=[optimize_expr]) # update_target_q: make_update_exp(q_func_vars, target_q_func_vars) # q_values: U.function(obs_ph_n + act_ph_n, q) # target_q_values: U.function(obs_ph_n + act_ph_n, target_q) self.q_train, self.q_update, self.q_debug = q_train( scope=self.name, # String: "agent_1" or "agent_2" or ... make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, # action_space. q_index=agent_index, # Index of the specific agent. q_func=model, # Defined model. optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), # 优化方法 --- 自适应矩估计 --- Adam法 --- 学习率设定 grad_norm_clipping=0.5, # 梯度剪切 --- 防止梯度爆炸 --- 梯度超过该值,直接设定为该值 local_q_func=local_q_func, num_units=args.num_units # Hidden layers 隐藏节点数 ) # act: U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample) # train: U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr]) # update_target_p: make_update_exp(p_func_vars, target_p_func_vars) # p_values: U.function([obs_ph_n[p_index]], p) # target_act: U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample) self.act, self.p_train, self.p_update, self.p_debug = p_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, p_index=agent_index, p_func=model, q_func=model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units ) # Create experience buffer self.replay_buffer = ReplayBuffer(1e6) self.max_replay_buffer_len = args.batch_size * args.max_episode_len self.replay_sample_index = None def action(self, obs): return self.act(obs[None])[0] def experience(self, obs, act, rew, new_obs, done, terminal): # Store transition in the replay buffer. self.replay_buffer.add(obs, act, rew, new_obs, float(done)) def preupdate(self): self.replay_sample_index = None # Input: agents --> all the trainers # t --> increment global step counter # Output: loss --> [loss of q_train, # loss of p_train, # mean of target_q, # mean of reward, # mean of next target_q, # std of target_q] def update(self, agents, t): if len(self.replay_buffer) < self.max_replay_buffer_len: # replay buffer is not large enough return if not t % 100 == 0: # only update every 100 steps return # Random sample from the replay buffer (Experience replay mechanism) self.replay_sample_index = self.replay_buffer.make_index(self.args.batch_size) # Random sample from the replay_buffer --- return the sample index. # collect replay sample from all agents obs_n = [] # Clearly, 'n' indicates the number of the total agents. (Clear the past memory.) obs_next_n = [] act_n = [] index = self.replay_sample_index for i in range(self.n): # Fetch the [all agents'] information. obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index) # Fetch the observation, action, rewerds, next observation, done from the buffer. 
            obs_n.append(obs)
            obs_next_n.append(obs_next)
            act_n.append(act)
        # obs_n, obs_next_n, act_n now each hold one batch per agent
        obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index)  # Fetch this agent's own batch.

        # train q network [Critic network]
        num_sample = 1
        target_q = 0.0
        for i in range(num_sample):
            # The critic takes every agent's observation and action (plus the target value) as input and outputs a scalar value.
            target_act_next_n = [agents[i].p_debug['target_act'](obs_next_n[i]) for i in range(self.n)]  # next-step action of each agent from its target policy, given its next observation
            target_q_next = self.q_debug['target_q_values'](*(obs_next_n + target_act_next_n))  # target critic value for the next joint observation and action
            target_q += rew + self.args.gamma * (1.0 - done) * target_q_next  # rewards + gamma * target_q_next * (1 - done); done masks the bootstrap on terminal transitions
        target_q /= num_sample  # average over the samples (num_sample is 1 here)
        q_loss = self.q_train(*(obs_n + act_n + [target_q]))  # critic training step

        # train p network [Actor network]
        p_loss = self.p_train(*(obs_n + act_n))  # obs_n + act_n is list concatenation; the critic guides the policy update, while acting only uses the policy network
        self.p_update()  # p network: soft target update (make_update_exp)
        self.q_update()  # q network: soft target update (make_update_exp)
        return [q_loss, p_loss, np.mean(target_q), np.mean(rew), np.mean(target_q_next), np.std(target_q)]
class MADDPGAgentTrainer(AgentTrainer): def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args, local_q_func=False): self.name = name self.n = len(obs_shape_n) self.agent_index = agent_index self.args = args obs_ph_n = [] for i in range(self.n): obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation"+str(i)).get()) # Create all the functions necessary to train the model self.q_train, self.q_update, self.q_debug = q_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, q_index=agent_index, q_func=model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units ) self.act, self.p_train, self.p_update, self.p_debug = p_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, p_index=agent_index, p_func=model, q_func=model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units ) # Create experience buffer self.replay_buffer = ReplayBuffer(1e6) self.max_replay_buffer_len = args.batch_size * args.max_episode_len self.replay_sample_index = None def action(self, obs): return self.act(obs[None])[0] def experience(self, obs, act, rew, new_obs, done, terminal): # Store transition in the replay buffer. self.replay_buffer.add(obs, act, rew, new_obs, float(done)) def preupdate(self): self.replay_sample_index = None def update(self, agents, t): if len(self.replay_buffer) < self.max_replay_buffer_len: # replay buffer is not large enough return if not t % 100 == 0: # only update every 100 steps return self.replay_sample_index = self.replay_buffer.make_index(self.args.batch_size) # collect replay sample from all agents obs_n = [] obs_next_n = [] act_n = [] index = self.replay_sample_index for i in range(self.n): obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index) obs_n.append(obs) obs_next_n.append(obs_next) act_n.append(act) obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index) # train q network num_sample = 1 target_q = 0.0 for i in range(num_sample): target_act_next_n = [agents[i].p_debug['target_act'](obs_next_n[i]) for i in range(self.n)] target_q_next = self.q_debug['target_q_values'](*(obs_next_n + target_act_next_n)) target_q += rew + self.args.gamma * (1.0 - done) * target_q_next target_q /= num_sample q_loss = self.q_train(*(obs_n + act_n + [target_q])) # train p network p_loss = self.p_train(*(obs_n + act_n)) self.p_update() self.q_update() return [q_loss, p_loss, np.mean(target_q), np.mean(rew), np.mean(target_q_next), np.std(target_q)]
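# Every trainer in this collection bootstraps with the same one-step target,
# y = r + gamma * (1 - done) * Q'(o', a'). A tiny numeric check of that formula
# (the values below are made up for illustration):
import numpy as np

rew = np.array([1.0, 0.5])
done = np.array([0.0, 1.0])
target_q_next = np.array([2.0, 3.0])
gamma = 0.95
target_q = rew + gamma * (1.0 - done) * target_q_next
print(target_q)  # [2.9  0.5] -- the terminal transition does not bootstrap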
class MADDPGAgentTrainer(AgentTrainer): def __init__(self, name, p_model, q_model, obs_shape_n, act_space_n, num_adversaries, args, local_q_func=False): self.name = name self.n = len(obs_shape_n) self.args = args self.neighbor_n = 2 self.num_adversaries = num_adversaries adj_n = [] obs_ph_n = [] agent_n = [] for i in range(self.n): obs_ph_n.append( U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get()) adj_n.append( U.BatchInput([ self.neighbor_n, num_adversaries if i < num_adversaries else (self.n - num_adversaries) ], name="adjacency" + str(i)).get()) # Create all the functions necessary to train the model self.q_train, self.q_update, self.q_values, self.target_q_values = q_train( name=self.name, scope=self.name, make_obs_ph_n=obs_ph_n, adj_n=adj_n, act_space_n=act_space_n, num_adversaries=num_adversaries, neighbor_n=self.neighbor_n, q_func=q_model, agent_n=self.n, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units) self.act, self.p_train, self.p_update, self.p_values, self.target_act = p_train( name=self.name, scope=self.name, make_obs_ph_n=obs_ph_n, adj_n=adj_n, act_space_n=act_space_n, neighbor_n=self.neighbor_n, p_index=agent_n, p_func=p_model, q_func=q_model, num_adversaries=self.num_adversaries, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units, ) # Create experience buffer self.replay_buffer = ReplayBuffer(1e6) self.max_replay_buffer_len = args.batch_size * args.max_episode_len self.replay_sample_index = None def action(self, obs): for _ in range(len(obs)): obs[_] = obs[_][None] return self.act(*obs) def experience(self, obs, act, rew, new_obs, done, adj, new_adj, terminal): # Store transition in the replay buffer. 
done_int = [float(x) for x in done] self.replay_buffer.add(obs, act, rew, new_obs, done_int, adj, new_adj) def pre_update(self): self.replay_sample_index = None def update(self, agents, t): if len( self.replay_buffer ) < self.max_replay_buffer_len: # replay buffer is not large enough return if not t % 100 == 0: # only update every 100 steps return # collect replay sample from all agents self.replay_sample_index = self.replay_buffer.make_index( self.args.batch_size) index = self.replay_sample_index obs_n = [] obs_next_n = [] act_n = [] adj_n = [] adj_next_n = [] for i in range(len(agents)): obs_record, act_record, rew_record, obs_next_record, done_record, adj_record, adj_next_record = \ agents[i].replay_buffer.sample_index(index) obs_n.append(obs_record) obs_next_n.append(obs_next_record) act_n.append(act_record) adj_n.append(adj_record) adj_next_n.append(adj_next_record) obs, act, rew, obs_next, done, adj, adj_next = self.replay_buffer.sample_index( index) target_act_next_n = [] target_q_next_input_obs = [] target_q_next_input_act = [] q_input_obs_n = [] q_input_act_n = [] p_input_adj_n = [] for _, agent in enumerate(agents): # traverse every species q_input_obs = [] q_input_act = [] p_input_adj = [] target_act_next_input_obs = [] target_act_next_input_adj = [] for j in range( obs_n[_].shape[1]): # traverse every agent in each species _obs = [] _act = [] _adj = [] _obs_next = [] _adj_next = [] for i in range(self.args.batch_size): # traverse each instance _obs.append(obs_n[_][i][j]) _act.append(act_n[_][i][j]) _adj.append(adj_n[_][i][j]) _obs_next.append(obs_next_n[_][i][j]) _adj_next.append(adj_next_n[_][i][j]) q_input_obs.append(np.array(_obs)) q_input_act.append(np.array(_act)) p_input_adj.append(np.array(_adj)) target_act_next_input_obs.append(np.array(_obs_next)) target_act_next_input_adj.append(np.array(_adj_next)) vec = matlib.repmat([1, 0], self.args.batch_size, 1) vec = np.expand_dims(vec, axis=1) target_act_next_input = target_act_next_input_obs + target_act_next_input_adj + [ vec ] temp = agent.target_act(*target_act_next_input) target_act_next_n.append(temp) target_q_next_input_obs.extend(target_act_next_input_obs) target_q_next_input_act.extend(temp) q_input_obs_n.extend(q_input_obs) q_input_act_n.extend(q_input_act) p_input_adj_n.extend(p_input_adj) target_q = 0.0 target_q_next = self.target_q_values(*(target_q_next_input_obs + target_q_next_input_act)) #rew = np.sum(rew, 1) / 4 # used to be (1 - done) but actually what's 'done' is not defined in "simple-world-comm" scenario, # thus should be considered again how to define "done" for species target_q_next = np.transpose(np.array(target_q_next)) target_q += rew + self.args.gamma * target_q_next target_q_list = [ target_q.transpose()[i] for i in range(np.shape(target_q)[1]) ] # train the critic network # q_train_input = q_input_obs_n + q_input_act_n + [target_q] q_loss = [ self.q_train[i](*(q_input_obs_n + q_input_act_n + [target_q_list[i]])) for i in range(len(self.q_train)) ] # train the policy network p_loss = self.p_train(*(q_input_obs_n + q_input_act_n + p_input_adj_n + [vec])) self.p_update() self.q_update() return [ q_loss, p_loss, np.mean(target_q), np.mean(rew), np.mean(target_q_next), np.std(target_q) ]
def train(arglist, PID=None, lock=None): start_time = time.time() # global replay_buffer with U.single_threaded_session() as sess: # Create environment env = make_env(arglist.scenario, arglist, arglist.benchmark) # Create agents networks obs_shape_n = [env.observation_space[i].shape for i in range(env.n)] ####changed by yuan li num_adversaries = copy.deepcopy(env.num_adversaries) arglist.num_adversaries = copy.deepcopy(num_adversaries) if comm_rank != 0 and comm_rank != 1: req = None wait_flag = False actors = get_agents(env, num_adversaries, obs_shape_n, arglist) U.initialize() #var_list = [var for var in tf.trainable_variables()] #加载模型 var_list_n = [] for actor in actors: var_list_n.extend(actor.get_variable_list()) saver = tf.train.Saver(var_list=var_list_n, max_to_keep=20) if arglist.load_dir != "": U.load_state(arglist.load_dir, saver) episode_rewards, agent_rewards, final_ep_rewards, final_ep_ag_rewards, agent_info = initialize_variables( env) obs_n = env.reset() step = 0 episode_step = 0 sample_number = 0 t_start = time.time() updata_time = 0 print('Starting iterations...') invalid_train, red_win, red_leave, green_win, green_leave = 0, 0, 0, 0, 0 while True: if not wait_flag: #req = comm.irecv(350000, source=(comm_rank - 1 + comm_size) % comm_size, tag=11) req = comm.irecv(350000, source=0, tag=11) wait_flag = True else: data_recv = req.test() if data_recv[0]: wait_flag = False if data_recv[1] == 'finish': #finish = True comm.send('finish', dest=1, tag=11) break else: update_start = time.time() i = 0 j = 0 for var in tf.trainable_variables(): if 11 < (i % 24) < 24: var.load(data_recv[1][j], sess) j += 1 i += 1 #for var in var_list: # var.load(data_recv[1][i], sess) # i += 1 #print("111111111111111111111111,load param") #for i, actor in enumerate(actors): # actor.load_weights(data_recv[1][i], sess) update_end = time.time() #print("step:{}, rank0_update_end_time:{}".format(step, update_end)) updata_time += (update_end - update_start) step += 1 else: wait_flag = True # get action action_n = [ agent.action(obs) for agent, obs in zip(actors, obs_n) ] # environment step new_obs_n, rew_n, done_n, info_n = env.step(action_n) episode_step += 1 # changed by liyuan done = any(done_n) terminal = (episode_step >= arglist.max_episode_len) ###liyuan: compute the arverage win rate if green_leave_screen(env) or adversary_all_die( env) or adversary_leave_screen(env): terminal = True if adversary_all_die(env): green_win += 1 if green_leave_screen(env): invalid_train += 1 green_leave += 1 if adversary_leave_screen(env): red_leave += 1 if episode_step >= arglist.max_episode_len: for i, agent in enumerate(env.agents): if agent.adversary: rew_n[i] -= 50 if adversary_all_die(env): for i, agent in enumerate(env.agents): if agent.adversary: rew_n[i] -= 100 if done: red_win = red_win + 1 for i, agent in enumerate(env.agents): if agent.adversary: rew_n[i] += 200 rew_n[i] += ( arglist.max_episode_len - episode_step) / arglist.max_episode_len #send data data = [obs_n, action_n, rew_n, new_obs_n, done_n] comm.send(data, dest=1, tag=11) sample_number += 1 #replay_buffer.add(obs_n, action_n, rew_n, new_obs_n, done_n) obs_n = new_obs_n for i, rew in enumerate(rew_n): episode_rewards[-1] += rew agent_rewards[i][-1] += rew if done or terminal: obs_n = env.reset() episode_step = 0 episode_rewards.append(0) for a in agent_rewards: a.append(0) agent_info.append([[]]) # save model, display training output if (terminal or done) and (len(episode_rewards) % arglist.save_rate == 0): if red_win >= 0.8 * arglist.save_rate: 
temp_dir = arglist.save_dir + "_" + str( len(episode_rewards)) + "_" + str( red_win) + "_{}".format(PID) U.save_state(temp_dir, saver=saver) # print statement depends on whether or not there are adversaries if num_adversaries == 0: print( "Rank {}, steps: {}, episodes: {}, mean episode reward: {}, time: {}" .format( comm_rank, sample_number, len(episode_rewards), np.mean(episode_rewards[-arglist. save_rate:]), round(time.time() - t_start, 3))) else: print( "Rank {}, steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}" .format( comm_rank, sample_number, len(episode_rewards), np.mean(episode_rewards[-arglist. save_rate:]), [ np.mean(rew[-arglist.save_rate:]) for rew in agent_rewards ], round(time.time() - t_start, 3))) print( "Rank {}, red win: {}, green win: {}, red all leave: {}, green all leave: {}" .format(comm_rank, red_win, green_win, red_leave, green_leave)) middle_time = time.time() print( "sample_number:{}, train_step:{}, update_time:{}, total_time:{}" .format(sample_number, step, updata_time, middle_time - start_time)) mydata = [] mydata.append(str(len(episode_rewards))) mydata.append( str( np.mean(episode_rewards[-arglist. save_rate:]))) mydata.append( str( np.mean(agent_rewards[0] [-arglist.save_rate:]))) mydata.append( str( np.mean(agent_rewards[1] [-arglist.save_rate:]))) mydata.append( str( np.mean(agent_rewards[2] [-arglist.save_rate:]))) mydata.append(str(red_win)) mydata.append( str(round(time.time() - t_start, 3))) out = open('1mydata_{}.csv'.format(comm_rank), 'a', newline='') csv_write = csv.writer(out, dialect='excel') csv_write.writerow(mydata) if len(episode_rewards) > 3000: U.save_state(arglist.save_dir, saver=saver) invalid_train, red_win, red_leave, green_win, green_leave = 0, 0, 0, 0, 0 t_start = time.time() # Keep track of final episode reward final_ep_rewards.append( np.mean(episode_rewards[-arglist.save_rate:])) for rew in agent_rewards: final_ep_ag_rewards.append( np.mean(rew[-arglist.save_rate:])) end_time = time.time() print("rank{}_time:{}".format(comm_rank, end_time - start_time)) print("rank{}_update_time:{}".format(comm_rank, updata_time)) print("rank{}_step:{}".format(comm_rank, step)) if comm_rank == 1: replay_buffer = ReplayBuffer(1e6) wait_flag_1 = False wait_flag_2 = False wait_flag_3 = False req1 = None req2 = None req3 = None sample = 0 step = 0 req_list = [] while True: if not wait_flag_1 or not wait_flag_2 or not wait_flag_3: if not wait_flag_1: req1 = comm.irecv(source=2, tag=11) wait_flag_1 = True if not wait_flag_2: req2 = comm.irecv(source=3, tag=11) wait_flag_2 = True if not wait_flag_3: req3 = comm.irecv(source=4, tag=11) wait_flag_3 = True else: data_recv_1 = req1.test() data_recv_2 = req2.test() data_recv_3 = req3.test() if data_recv_1[0] or data_recv_2[0] or data_recv_3[0]: if data_recv_1[0]: wait_flag_1 = False if data_recv_1[1] == 'finish': break else: obs_n, action_n, rew_n, new_obs_n, done_n = data_recv_1[ 1] replay_buffer.add(obs_n, action_n, rew_n, new_obs_n, done_n) sample += 1 if data_recv_2[0]: wait_flag_2 = False if data_recv_2[1] == 'finish': break else: obs_n, action_n, rew_n, new_obs_n, done_n = data_recv_2[ 1] replay_buffer.add(obs_n, action_n, rew_n, new_obs_n, done_n) sample += 1 if data_recv_3[0]: wait_flag_3 = False if data_recv_3[1] == 'finish': break else: obs_n, action_n, rew_n, new_obs_n, done_n = data_recv_3[ 1] replay_buffer.add(obs_n, action_n, rew_n, new_obs_n, done_n) sample += 1 ''' #计算接收100个样本然后发送样本用的时间 if (sample % 100 == 0) and len(replay_buffer) >= arglist.batch_size * 
arglist.max_episode_len: start = time.time() replay_sample_index = replay_buffer.make_index(arglist.batch_size) send_data = replay_buffer.sample_index(replay_sample_index) #send_data = (obs_n_a, act_n_a, rew_n_a, obs_next_n_a, done_n_a) comm.send(send_data, dest=(comm_rank + 1) % comm_size, tag=11) sample = 0 step += 1 end = time.time() print("rank1 send sample time:", end-start) ''' else: wait_flag_1 = True wait_flag_2 = True wait_flag_3 = True if (sample // 100 > 0) and len( replay_buffer ) >= arglist.batch_size * arglist.max_episode_len: replay_sample_index = replay_buffer.make_index( arglist.batch_size) send_data = replay_buffer.sample_index( replay_sample_index) #send_data = (obs_n_a, act_n_a, rew_n_a, obs_next_n_a, done_n_a) comm.send(send_data, dest=0, tag=11) sample = 0 step += 1 end_time = time.time() print("rank1_time:", end_time - start_time) print("rank1_step", step) if comm_rank == 0: extract_time = 0 step = 0 learners = get_agents(env, num_adversaries, obs_shape_n, arglist) var_list_n = [] for learner in learners: var_list_n.extend(learner.get_variable_list()) U.initialize() #var_list = [var for var in tf.trainable_variables()] # 加载模型 saver = tf.train.Saver(var_list=var_list_n, max_to_keep=20) if arglist.load_dir != "": U.load_state(arglist.load_dir, saver) while True: if step >= STEP: for i in range(comm_size - 2): comm.send('finish', dest=(i + 2), tag=11) break else: start = time.time() data_recv = comm.recv(source=1, tag=11) for i, agent in enumerate(learners): agent.update(learners, data_recv) #dict_list = [] param = [] extract_start = time.time() i = 0 for var in tf.trainable_variables(): if 11 < (i % 24) < 24: param.append(sess.run(var)) i += 1 #print("2222222222222222 load weights") #for var in var_list: # param.append(sess.run(var)) extract_end = time.time() extract_time += (extract_end - extract_start) for i in range(comm_size - 2): comm.send(param, dest=(i + 2), tag=11) #print("222222222222222222222222,send param") step += 1 end = time.time() #print("rank2 train time:{}, extract_time:{}".format(end - start, extract_end - extract_start)) end_time = time.time() print("rank0_time:", end_time - start_time) print("rank0_extract_time:", extract_time) print("rank0_step:", step)
class MADDPGAgentTrainerFull(AgentTrainer): def __init__(self, name, p_policy, p_predict, q_model, obs_shape_n, act_space_n, state_shape_n, agent_index, args, local_q_func=False): self.name = name self.n = len(obs_shape_n) self.agent_index = agent_index self.args = args self.obs_shape = obs_shape_n[agent_index] self.state_shape = state_shape_n[agent_index] self.p_predict = p_predict obs_ph_n = [] obs_next_n = [] obs_pred_n = [] state_ph_n = [] for i in range(self.n): obs_ph_n.append( U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get()) obs_next_n.append( U.BatchInput(obs_shape_n[i], name="next_obs" + str(i)).get()) obs_pred_n.append( U.BatchInput(obs_shape_n[i], name="pred_obs" + str(i)).get()) state_ph_n.append( U.BatchInput(state_shape_n[i], name="state" + str(i)).get()) # Create all the functions necessary to train the critic net # q_train is used for optimize Q net according to the loss in this batch # q_update is used to update the parameter of target net θ'i = τθi + (1 − τ)θ'i self.q_train, self.q_update, self.q_debug = q_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, q_index=agent_index, q_func=q_model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units) # step return the action and new_state given the obs and state # p_train is used to optimize p Net # p_update is used to update target p net as θ'i = τθi + (1 − τ)θ'i self.step, self.predict, self.p_train, self.p_update, self.p_debug = p_train_recurrent( scope=self.name, make_obs_ph_n=obs_ph_n, make_state_ph_n=state_ph_n, act_space_n=act_space_n, make_obs_next_n=obs_next_n, make_obs_pred_n=obs_pred_n, p_index=agent_index, p_policy=p_policy, p_predict=p_predict, q_func=q_model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units, reuse=tf.AUTO_REUSE) # Create experience buffer self.replay_buffer = ReplayBuffer(1e6) self.max_replay_buffer_len = args.batch_size * args.max_episode_len self.replay_sample_index = None ''' def predict(self, act_input, gru_out): with tf.variable_scope(self.name, reuse=tf.AUTO_REUSE): obs_pred = self.p_predict(act_input[None], gru_out, int(self.obs_shape[0]), scope="p_predict", num_units=self.args.num_units) return obs_pred def target_predict(self, act_input, gru_out): with tf.variable_scope(self.name, reuse=None): obs_pred = self.p_predict(act_input, gru_out, int(self.obs_shape[0]), scope="target_p_predict", num_units=self.args.num_units) return obs_pred ''' # return the zero state of GRU def p_init_state(self, batch_size): return np.zeros([batch_size, self.state_shape[0]]) def init_pred(self, batch_size): return np.zeros([batch_size, self.obs_shape[0]]) # given the obs and current state, return the action and new state def take_action(self, obs, state, pred): act, new_state, gru_out = self.step(obs[None], state, pred) act = act[0] return act, new_state, gru_out def experience(self, obs, act, rew, new_obs, done, terminal): # Store transition in the replay buffer. 
self.replay_buffer.add(obs, act, rew, new_obs, float(done)) def preupdate(self): self.replay_sample_index = None def update(self, agents, t, step_size=16, burn_in_step=8): if len( self.replay_buffer ) < self.max_replay_buffer_len: # replay buffer is not large enough return if not t % 100 == 0: # only update every 100 steps return # sample experience self.replay_sample_index = self.replay_buffer.make_index( self.args.batch_size) # collect replay sample from all agents obs_seq_n = [] obs_next_seq_n = [] act_seq_n = [] finish_index = self.replay_sample_index for i in range(self.n): obs_seq, act_seq, rew_seq, obs_next_seq, done_seq = agents[ i].replay_buffer.sequence_sample_index(finish_index, step_size) obs_seq_n.append(obs_seq) obs_next_seq_n.append(obs_next_seq) act_seq_n.append(act_seq) obs_seq, act_seq, rew_seq, obs_next_seq, done_seq = self.replay_buffer.sequence_sample_index( finish_index, step_size) state_n = [ agents[i].p_init_state(self.args.batch_size) for i in range(self.n) ] pred_n = [ agents[i].init_pred(self.args.batch_size) for i in range(self.n) ] target_state_n = [ agents[i].p_init_state(self.args.batch_size) for i in range(self.n) ] target_pred_n = [ agents[i].init_pred(self.args.batch_size) for i in range(self.n) ] act_n = [x[0] for x in act_seq_n] temp = [ agents[i].p_debug['target_step'](obs_seq_n[i][0], target_state_n[i], target_pred_n[i]) for i in range(self.n) ] target_state_n = [x[1] for x in temp] target_gru_out_n = [x[2] for x in temp] target_pred_n = [ agents[i].p_debug['target_predict'](act_n[i], target_gru_out_n[i]) for i in range(self.n) ] # burn in stage, don't update the net for step in range(burn_in_step): act_n = [x[step] for x in act_seq_n] act_next_n = [x[step + 1] for x in act_seq_n] # target agent step temp = [ agents[i].p_debug['target_step'](obs_next_seq_n[i][step], target_state_n[i], target_pred_n[i]) for i in range(self.n) ] target_state_n = [x[1] for x in temp] target_gru_out_n = [x[2] for x in temp] target_pred_n = [ agents[i].p_debug['target_predict'](act_next_n[i], target_gru_out_n[i]) for i in range(self.n) ] # agents step temp = [ agents[i].step(obs_seq_n[i][step], state_n[i], pred_n[i]) for i in range(self.n) ] state_n = [x[1] for x in temp] gru_out_n = [x[2] for x in temp] pred_n = [ agents[i].predict(act_n[i], gru_out_n[i]) for i in range(self.n) ] q_loss = 0 p_loss = 0 # update the agents for step in range(burn_in_step, step_size): obs_n = [x[step] for x in obs_seq_n] act_n = [x[step] for x in act_seq_n] if step < (step_size - 1): act_next_n = [x[step + 1] for x in act_seq_n] obs_next_n = [x[step] for x in obs_next_seq_n] # target agents step, get the action in the next step temp = [ agents[i].p_debug['target_step'](obs_next_seq_n[i][step], target_state_n[i], target_pred_n[i]) for i in range(self.n) ] target_act_n = [x[0] for x in temp] target_state_n = [x[1] for x in temp] target_gru_out_n = [x[2] for x in temp] if step < (step_size - 1): target_pred_n = [ agents[i].p_debug['target_predict'](act_next_n[i], target_gru_out_n[i]) for i in range(self.n) ] # infer y from target action target_q_next = self.q_debug['target_q_values'](*(obs_next_n + target_act_n)) target_q = rew_seq[step] + self.args.gamma * ( 1.0 - done_seq[step]) * target_q_next q_loss += self.q_train(*(obs_n + act_n + [target_q])) p_loss += self.p_train(*(obs_n + state_n + act_n + obs_next_n + pred_n)) # agents step state_n = [x[1] for x in temp] gru_out_n = [x[2] for x in temp] pred_n = [ agents[i].predict(act_n[i], gru_out_n[i]) for i in range(self.n) ] # update the target 
net self.p_update() self.q_update() return [ q_loss, p_loss, np.mean(target_q), np.mean(rew_seq[step]), np.mean(target_q_next), np.std(target_q) ]
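# The recurrent trainer above uses a burn-in: the first burn_in_step steps of each sampled
# sequence only advance the GRU state and the observation predictions, and the q/p losses are
# accumulated over the remaining steps. A minimal sketch of that split (hypothetical helper,
# not from the source):
def split_burn_in(seq, burn_in_step=8):
    """Split a sampled trajectory into a burn-in prefix and a training suffix."""
    return seq[:burn_in_step], seq[burn_in_step:]

warmup, train_part = split_burn_in(list(range(16)), burn_in_step=8)
print(warmup)      # [0, ..., 7]  -> only warms up the recurrent state
print(train_part)  # [8, ..., 15] -> contributes to q_loss / p_loss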
class MADDPGAgentTrainerCCM(AgentTrainer): """ Agent Trainer using MADDPG Algorithm and CCM """ def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args, role="", local_q_func=False): """ Args: name (str): Name of the agent model (function): MLP Neural Network model for the agent. obs_shape_n (tf.placeholder): Placeholder for the observation space of all agents act_space_n (list): A list of the action spaces for all agents agent_index (int): Agent index number args (argparse.Namespace): Parsed commandline arguments object role (str): Role of the agent i.e. adversary local_q_func (boolean): Flag for using local q function """ super(MADDPGAgentTrainerCCM, self).__init__() self.name = name self.role = role self.n = len(obs_shape_n) self.agent_index = agent_index self.args = args obs_ph_n = [] act_history_ph_n = [] obs_history_ph_n = [] hist = self.args.training_history obs_history_n = [(hist * x[0], ) for x in obs_shape_n] act_history_n = [(hist * act.n, ) for act in act_space_n] # act_history_n = [Discrete(act.n*(3-1)) for act in act_space_n] # for act_space in act_space_n: # act_space.n = act_space.n*3 # if act_history_n[0].n != 15: # print("Line 158") for i in range(self.n): obs_ph_n.append( tf_util.BatchInput(obs_shape_n[i], name="observation" + str(i)).get()) obs_history_ph_n.append( tf_util.BatchInput(obs_history_n[i], name="observationhistory" + str(i)).get()) act_history_ph_n.append( tf_util.BatchInput(act_history_n[i], name="actionhistory" + str(i)).get()) # obs_ph_n = [tf.concat(3*[x],1,name="observation{}".format(i)) for i,x in enumerate(obs_ph_n)] # Create all the functions necessary to train the model self.q_train, self.q_update, self.q_debug = q_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, make_obs_history_n=obs_history_ph_n, make_act_history_n=act_history_ph_n, q_index=agent_index, q_func=model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units) self.act, self.p_train, self.p_update, self.p_debug = p_train( scope=self.name, make_obs_ph_n=obs_ph_n, act_space_n=act_space_n, make_obs_history_n=obs_history_ph_n, make_act_history_n=act_history_ph_n, p_index=agent_index, p_func=model, q_func=model, optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), grad_norm_clipping=0.5, local_q_func=local_q_func, num_units=args.num_units) # Create experience buffer self.replay_buffer = ReplayBuffer(1e6) self.max_replay_buffer_len = 4 * args.batch_size * args.max_episode_len self.replay_sample_index = None def action(self, obs): """ Retrieves action for agent from the P network given the observations Args: obs (np.array): Observations of the world for an agent Returns: Action for an agent """ hist = self.args.training_history if len(self.replay_buffer) > (hist + 1): _, _, _, _, _, obs_h, _, _, _, _ = self.replay_buffer.sample_index( [len(self.replay_buffer)], hist) if len(obs_h) > 0: obs_h = obs_h[0] # obs = np.concatenate((obs,ob[0]),0) else: obs_h = np.array((hist) * list(obs)) return self.act(obs[None], obs_h[None])[0] def experience(self, obs, act, rew, new_obs, done, terminal): """ Store transition in the replay buffer. Args: obs (np.array): Observations of the world for an agent act (list): Action for an agent rew (float): Reward for an agent new_obs (np.array): New observations of the world for an agent done (): Done for an agent terminal (boolean): Flag for whether the final episode has been reached. 
""" self.replay_buffer.add(obs, act, rew, new_obs, float(done)) def preupdate(self): """ Reset replay_sample_index to None. """ self.replay_sample_index = None def update(self, agents, steps): """ Update agent networks Args: agents (list): List of MADDPGAgentTrainer objects steps (int): Current training step Returns: (list) Training loss for the agents [q_loss, p_loss, mean_target_q, mean_reward, mean_target_q_next, std_target_q] """ # Replay buffer is not large enough # if len(self.replay_buffer) < self.max_replay_buffer_len: if len(self.replay_buffer) < 12500: return # Only update every 100 steps if not steps % 100 == 0: return self.replay_sample_index = self.replay_buffer.make_index( self.args.batch_size) hist = self.args.training_history # ************************************************************************************************ ccm_loss = np.array([0.0]) ccm_lambda = np.array([self.args.ccm_lambda]) ccm_switch = np.array([0.0]) # ************************************************************************************************ # Collect replay sample from all agents obs_n = [] obs_h_n = [] obs_next_n = [] obs_next_h_n = [] act_n = [] act_h_n = [] index = self.replay_sample_index for i in range(self.n): obs, act, rew, obs_next, done, obs_h, act_h, rew_h, obs_next_h, done_h = agents[i].\ replay_buffer.sample_index(index, history=hist) obs_n.append(obs) obs_h_n.append(obs_h) obs_next_n.append(obs_next) obs_next_h_n.append(obs_next_h) act_n.append(act) act_h_n.append(act_h) _, _, rew, _, done, _, _, rew_h, _, done_h = self.replay_buffer.sample_index( index, history=0) obs_h_n = [[list() for _ in range(len(obs_n[0]))] if len(x) == 0 else x for x in obs_h_n] obs_next_h_n = [ [list() for _ in range(len(obs_next_n[0]))] if len(x) == 0 else x for x in obs_next_h_n ] act_h_n = [[list() for _ in range(len(act_n[0]))] if len(x) == 0 else x for x in act_h_n] # rew = rew.T[0] # done = done.T[0] # train q network # print(*([x + act_n[i][j] for i,xx in enumerate(obs_n) for j,x in enumerate(xx)])) num_sample = 1 target_q = 0.0 target_q_next = 0.0 for i in range(num_sample): target_act_next_n = [ agents[i].p_debug['target_act'](obs_next_n[i], obs_next_h_n[i]) for i in range(self.n) ] target_q_next = self.q_debug['target_q_values']( *(obs_next_n + obs_next_h_n + target_act_next_n + act_h_n)) # TODO: Possible error point target_q += rew + self.args.gamma * (1.0 - done) * target_q_next target_q /= num_sample # TODO: Possible error point q_loss = self.q_train(*(obs_n + obs_h_n + act_n + act_h_n + [target_q])) # Train P network # p_loss = self.p_train(*(obs_n + act_n)) p_loss = self.p_train(*(obs_n + obs_h_n + act_n + act_h_n + [ccm_loss] + [ccm_lambda] + [ccm_switch])) self.p_update() self.q_update() return [ q_loss, p_loss, np.mean(target_q), np.mean(rew), np.mean(target_q_next), np.std(target_q) ] def ccm_update(self, agents, steps): """ CCM Update agent networks Args: agents (list): List of MADDPGAgentTrainer objects steps (int): Current training step Returns: (list) Training loss for the agents [q_loss, p_loss, mean_target_q, mean_reward, mean_target_q_next, std_target_q] """ # Replay buffer is not large enough # if len(self.replay_buffer) < self.max_replay_buffer_len: if len(self.replay_buffer) < 12500: # print("{}/{}".format(len(self.replay_buffer),self.max_replay_buffer_len)) return # Only update every 4 episodes if not steps % (4 * self.args.max_episode_len) == 0: return # Only CCM update for adversaries if not self.role == "adversary": return # batch_ep_size = int(round(self.args.batch_size / 
    def ccm_update(self, agents, steps):
        """ CCM update of agent networks.

        Args:
            agents (list): List of MADDPGAgentTrainer objects
            steps (int): Current training step

        Returns:
            (list) Training loss for the agents
                [q_loss, p_loss, mean_target_q, mean_reward, mean_target_q_next, std_target_q]
        """
        # Replay buffer is not large enough
        # if len(self.replay_buffer) < self.max_replay_buffer_len:
        if len(self.replay_buffer) < 12500:
            # print("{}/{}".format(len(self.replay_buffer),self.max_replay_buffer_len))
            return

        # Only update every 4 episodes
        if not steps % (4 * self.args.max_episode_len) == 0:
            return

        # Only CCM update for adversaries
        if not self.role == "adversary":
            return

        # batch_ep_size = int(round(self.args.batch_size / self.args.max_episode_len))
        batch_ep_size = self.args.ccm_pool
        self.replay_sample_index, self.ccm_episode_index = self.replay_buffer.\
            make_episode_index(batch_ep_size,
                               self.args.max_episode_len,
                               shuffle=not self.args.ccm_on_policy)
        hist = self.args.training_history

        # Collect replay sample from all agents
        obs_n = []
        obs_h_n = []
        obs_next_n = []
        obs_next_h_n = []
        act_n = []
        act_h_n = []
        ccm_act_n = []
        index = self.replay_sample_index
        for i in range(self.n):
            obs, act, rew, obs_next, done, obs_h, act_h, rew_h, obs_next_h, done_h = agents[i].\
                replay_buffer.sample_index(index, history=hist)
            obs_n.append(obs)
            obs_h_n.append(obs_h)
            obs_next_n.append(obs_next)
            obs_next_h_n.append(obs_next_h)
            act_n.append(act)
            act_h_n.append(act_h)

            ccm_act = []
            for ep in self.ccm_episode_index:
                _, act, _, _, _, _, _, _, _, _ = agents[
                    i].replay_buffer.sample_index(ep)
                act = np.array(act)
                ccm_act.append(act[:, 1] - act[:, 2])
            ccm_act_n.append(np.array(ccm_act))

        # print("Action CCM: {}".format(ccm.get_score(ccm_act_n[1],ccm_act_n[2],Emax=5,tau=1)))
        # print("Action CCM: {}".format(ccm_act_n))

        ccm_loss = np.array([0.0])
        ccm_lambda = np.array([self.args.ccm_lambda])
        ccm_switch = np.array([1.0])
        if self.agent_index != 1:
            t_start = time.time()
            # ccm_scores = [ccm.get_score(ccm_act_n[agent_index], ccm_act_n[i], e_max=5, tau=None)
            #               for i in range(len(ccm_act_n)) if i != agent_index]
            if self.args.specific_leader_ccm is None and self.args.specific_agent_ccm is None:
                ccm_scores = [
                    ccm.get_score(ccm_act_n[self.agent_index],
                                  ccm_act_n[i],
                                  e_max=5,
                                  tau=1) for i in range(self.n)
                    if i != self.agent_index and agents[i].role == "adversary"
                ]
            elif self.args.specific_agent_ccm is None:
                if self.agent_index == self.args.specific_leader_ccm:
                    ccm_scores = [
                        ccm.get_score(ccm_act_n[i],
                                      ccm_act_n[self.agent_index],
                                      e_max=5,
                                      tau=1) for i in range(self.n)
                        if i != self.agent_index and agents[i].role == "adversary"
                    ]
                else:
                    ccm_scores = [
                        ccm.get_score(ccm_act_n[self.agent_index],
                                      ccm_act_n[i],
                                      e_max=5,
                                      tau=1) for i in range(self.n)
                        if i == self.args.specific_leader_ccm
                    ]
            else:
                ccm_scores = [
                    ccm.get_score(ccm_act_n[self.agent_index],
                                  ccm_act_n[self.args.specific_leader_ccm],
                                  e_max=5,
                                  tau=1) for i in range(self.n)
                    if i == self.args.specific_leader_ccm
                ]
            # ccm_loss = [1*(x[0]-(x[1]-0.01)) for x in ccm_scores]
            ccm_loss = [x[0] - np.exp(x[1] - 0.01) for x in ccm_scores]
            ccm_loss = np.array([np.mean(ccm_loss)])
            # print("CCM Loop Time at Trial {}: {}".format(steps,time.time() - t_start))

        # Original implementation
        # obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index)
        # Modified
        _, _, rew, _, done, _, _, rew_h, _, done_h = self.replay_buffer.sample_index(
            index, history=0)
        obs_h_n = [[list() for _ in range(len(obs_n[0]))] if len(x) == 0 else x
                   for x in obs_h_n]
        obs_next_h_n = [
            [list() for _ in range(len(obs_next_n[0]))] if len(x) == 0 else x
            for x in obs_next_h_n
        ]
        act_h_n = [[list() for _ in range(len(act_n[0]))] if len(x) == 0 else x
                   for x in act_h_n]

        num_sample = 1
        target_q = 0.0
        target_q_next = 0.0
        for i in range(num_sample):
            target_act_next_n = [
                agents[i].p_debug['target_act'](obs_next_n[i], obs_next_h_n[i])
                for i in range(self.n)
            ]
            target_q_next = self.q_debug['target_q_values'](
                *(obs_next_n + obs_next_h_n + target_act_next_n + act_h_n))
            # TODO: Possible error point
            target_q += rew + self.args.gamma * (1.0 - done) * target_q_next
        target_q /= num_sample

        # TODO: Possible error point
        q_loss = self.q_train(*(obs_n + obs_h_n + act_n + act_h_n + [target_q]))

        # Train P network
        # p_loss = self.p_train(*(obs_n + act_n))
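        # NOTE (editorial): here ccm_switch is 1.0, whereas update() passes 0.0,
        # so this p_train call presumably applies the CCM penalty weighted by
        # ccm_lambda while the regular update leaves it disabled.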
        p_loss = self.p_train(*(obs_n + obs_h_n + act_n + act_h_n +
                                [ccm_loss] + [ccm_lambda] + [ccm_switch]))

        self.p_update()
        # self.q_update()

        return [
            q_loss, p_loss,
            np.mean(target_q),
            np.mean(rew),
            np.mean(target_q_next),
            np.std(target_q)
        ]
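# --- Illustrative sketch (not part of the original code) ---------------------
# ccm_update() pairs this adversary's per-episode action signal with the other
# adversaries' signals via ccm.get_score() and collapses the two returned scores
# into a scalar loss, mean(score_xy - exp(score_yx - 0.01)). The helper below
# mirrors the default pairing branch in isolation; it assumes numpy is imported
# as np (as in this module) and that get_score(x, y, e_max, tau) returns a
# (score_xy, score_yx) pair as used above. The function name is hypothetical.
def ccm_pairing_loss_sketch(ccm_act_n, agent_index, roles, get_score):
    # Score this agent's signal against every other adversary's signal.
    scores = [
        get_score(ccm_act_n[agent_index], ccm_act_n[i], e_max=5, tau=1)
        for i in range(len(ccm_act_n))
        if i != agent_index and roles[i] == "adversary"
    ]
    # Same reduction as in ccm_update(): forward score minus exponentiated
    # (offset) reverse score, averaged over the paired agents.
    losses = [x[0] - np.exp(x[1] - 0.01) for x in scores]
    return np.array([np.mean(losses)])
# ------------------------------------------------------------------------------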
class I3MADDPGAgentTrainer(AgentTrainer):
    def __init__(self, name, model, obs_shape_n, act_space_n, act_traj_shape_n,
                 intent_shape, agent_index, args, local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        act_traj_ph_n = []
        intent_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())
            act_traj_ph_n.append(U.BatchInput(act_traj_shape_n[i], name="action_trajectory" + str(i)).get())
            intent_ph_n.append(U.BatchInput(intent_shape[i], name="intent" + str(i)).get())
        self.act_size = act_space_n[0].n

        self.get_intent, self.i_train, self.i_update, self.i_debug = i_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            intent_ph_n=intent_ph_n,
            act_space_n=act_space_n,
            make_act_traj_ph_n=act_traj_ph_n,
            make_intent_ph_n=intent_ph_n,
            i_func=model,
            i_index=agent_index,
            output_size=(self.n - 1) * self.act_size,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            num_units=args.num_units,
            reuse=False
        )
        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            make_intent_ph_n=intent_ph_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units
        )
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            make_intent_ph_n=intent_ph_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units
        )
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None

    def intent(self, obs, act_traj):
        # print(np.array(act_traj).shape)
        # print(np.array(obs).shape)
        intent = self.get_intent(*([[obs]] + [[act_traj]]))[0]
        return intent

    def onpolicy_train_i(self, obs, act_traj, true_act):
        # print(np.array(act_traj).shape)
        # For each agent, collect the true actions of all other agents
        true_actions = []
        for i in range(len(true_act)):
            true_actions.append([])
            for j in range(len(true_act)):
                if j != i:
                    true_actions[i].append(true_act[j])
        obs = [o for o in np.reshape(obs, (len(obs), 1, -1))]
        act_traj = [a for a in np.reshape(act_traj, (len(obs), 1, len(act_traj[0]),
                                                     len(act_traj[0][0]), len(act_traj[0][0][0])))]
        true_act = [t for t in np.reshape(true_actions, (len(obs), 1, -1))]
        i_loss = self.i_train(*(obs + act_traj + true_act))
        self.i_update()
        return i_loss

    def action(self, obs, intent):
        return self.act(*([[obs]] + [[intent]]))[0]
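    # NOTE (editorial): get_intent and act are compiled TF functions that expect
    # batched placeholder inputs, so intent() and action() wrap a single observation
    # (and trajectory/intent) as a batch of one via [[obs]] and unwrap the result
    # with [0].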
    def experience(self, obs, act, rew, new_obs, act_traj, intent,
                   act_traj_next, intent_next, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, act_traj, intent,
                               act_traj_next, intent_next, float(done))

    def preupdate(self):
        self.replay_sample_index = None

    def update(self, agents, t):
        if len(self.replay_buffer) < self.max_replay_buffer_len:  # replay buffer is not large enough
            return
        if not t % 100 == 0:  # only update every 100 steps
            return

        self.replay_sample_index = self.replay_buffer.make_index(self.args.batch_size)
        # Collect replay sample from all agents
        obs_n = []
        obs_next_n = []
        act_n = []
        act_traj_n = []
        intent_n = []
        act_traj_next_n = []
        intent_next_n = []
        index = self.replay_sample_index
        # Zero placeholders used for agents that do not store trajectories/intents
        intent_temp = np.zeros((len(self.replay_sample_index), (self.n - 1) * self.act_size))
        act_traj_temp = np.zeros((len(self.replay_sample_index), (self.n - 1),
                                  self.args.timestep, self.act_size))
        if self.args.good_i3 == 1 and self.args.adv_i3 == 1:
            for i in range(self.n):
                obs, act, rew, obs_next, act_traj, intent, act_traj_next, intent_next, done = \
                    agents[i].replay_buffer.sample_index(index)
                obs_n.append(obs)
                obs_next_n.append(obs_next)
                act_n.append(act)
                act_traj_n.append(act_traj)
                intent_n.append(intent)
                act_traj_next_n.append(act_traj_next)
                intent_next_n.append(intent_next)
        elif self.args.good_i3 == 1 and self.args.adv_i3 == 0:
            for i in range(self.n):
                if i < self.args.num_adversaries:
                    obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index)
                    obs_n.append(obs)
                    obs_next_n.append(obs_next)
                    act_n.append(act)
                    act_traj_n.append(act_traj_temp)
                    intent_n.append(intent_temp)
                    act_traj_next_n.append(act_traj_temp)
                    intent_next_n.append(intent_temp)
                else:
                    obs, act, rew, obs_next, act_traj, intent, act_traj_next, intent_next, done = \
                        agents[i].replay_buffer.sample_index(index)
                    obs_n.append(obs)
                    obs_next_n.append(obs_next)
                    act_n.append(act)
                    act_traj_n.append(act_traj)
                    intent_n.append(intent)
                    act_traj_next_n.append(act_traj_next)
                    intent_next_n.append(intent_next)
        elif self.args.good_i3 == 0 and self.args.adv_i3 == 1:
            for i in range(self.n):
                if i < self.args.num_adversaries:
                    obs, act, rew, obs_next, act_traj, intent, act_traj_next, intent_next, done = \
                        agents[i].replay_buffer.sample_index(index)
                    obs_n.append(obs)
                    obs_next_n.append(obs_next)
                    act_n.append(act)
                    act_traj_n.append(act_traj)
                    intent_n.append(intent)
                    act_traj_next_n.append(act_traj_next)
                    intent_next_n.append(intent_next)
                else:
                    obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index)
                    obs_n.append(obs)
                    obs_next_n.append(obs_next)
                    act_n.append(act)
                    act_traj_n.append(act_traj_temp)
                    intent_n.append(intent_temp)
                    act_traj_next_n.append(act_traj_temp)
                    intent_next_n.append(intent_temp)
        else:
            for i in range(self.n):
                obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index)
                obs_n.append(obs)
                obs_next_n.append(obs_next)
                act_n.append(act)
                act_traj_n.append(act_traj_temp)
                intent_n.append(intent_temp)
                act_traj_next_n.append(act_traj_temp)
                intent_next_n.append(intent_temp)

        obs, act, rew, obs_next, act_traj, intent, act_traj_next, intent_next, done = \
            self.replay_buffer.sample_index(index)

        num_sample = 1
        target_q = 0.0
        target_act_next_n = []
        if self.args.good_i3 == 1 and self.args.adv_i3 == 1:
            target_act_next_n = [
                agents[i].p_debug['target_act'](*([obs_next_n[i]] + [intent_next_n[i]]))
                for i in range(self.n)
            ]
        elif self.args.good_i3 == 1 and self.args.adv_i3 == 0:
            for i in range(self.n):
                if i >= self.args.num_adversaries:
                    target_act_next_n.append(
                        agents[i].p_debug['target_act'](*([obs_next_n[i]] + [intent_next_n[i]])))
                else:
                    target_act_next_n.append(agents[i].p_debug['target_act'](obs_next_n[i]))
        elif self.args.good_i3 == 0 and self.args.adv_i3 == 1:
            for i in range(self.n):
                if i < self.args.num_adversaries:
                    target_act_next_n.append(
                        agents[i].p_debug['target_act'](*([obs_next_n[i]] + [intent_next_n[i]])))
                else:
                    target_act_next_n.append(agents[i].p_debug['target_act'](obs_next_n[i]))
        else:
            target_act_next_n = [agents[i].p_debug['target_act'](obs_next_n[i])
                                 for i in range(self.n)]

        target_q_next = self.q_debug['target_q_values'](
            *(obs_next_n + target_act_next_n + intent_next_n))
        target_q += rew + self.args.gamma * (1.0 - done) * target_q_next
        target_q /= num_sample

        q_loss = self.q_train(*(obs_n + act_n + intent_n + [target_q]))
        p_loss = self.p_train(*(obs_n + act_n + intent_n))

        self.p_update()
        self.q_update()

        i_loss = 0
        if self.args.onpolicy_i == 0:
            # Build each agent's "true actions" target from the last step of the
            # other agents' next action trajectories
            true_actions = []
            for i in range(len(act_traj_next_n)):
                true_actions.append([])
                agent = act_traj_next_n[i]
                for j in range(len(agent)):
                    true_actions[i].append([])
                    for k in range(len(agent[j])):
                        a = deepcopy(agent[j][k][-1])
                        true_actions[i][j] = np.concatenate((true_actions[i][j], a), axis=0)
            i_loss = self.i_train(*(obs_n + act_traj_n + true_actions))
            self.i_update()

        return [q_loss, p_loss, i_loss,
                np.mean(target_q),
                np.mean(rew),
                np.mean(target_q_next),
                np.std(target_q)]
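# --- Illustrative sketch (not part of the original code) ---------------------
# Rough shape of the per-step rollout loop this trainer expects: each agent first
# infers an intent from the other agents' recent action trajectories, acts on
# (obs, intent), and the transition plus both trajectory/intent snapshots is
# stored for the centralized update. All names here (env, trainers, act_traj_n,
# act_traj_next_n, the env.step return convention) are hypothetical assumptions,
# not taken from this module.
def i3_rollout_step_sketch(env, trainers, obs_n, act_traj_n, act_traj_next_n):
    # Infer intents from current observations and action trajectories
    intent_n = [tr.intent(obs, traj) for tr, obs, traj in zip(trainers, obs_n, act_traj_n)]
    # Act conditioned on observation and inferred intent
    act_n = [tr.action(obs, it) for tr, obs, it in zip(trainers, obs_n, intent_n)]
    new_obs_n, rew_n, done_n, _ = env.step(act_n)
    # Infer next-step intents from the new observations and updated trajectories
    intent_next_n = [tr.intent(obs, traj)
                     for tr, obs, traj in zip(trainers, new_obs_n, act_traj_next_n)]
    # Store the full I3 transition for each agent
    for i, tr in enumerate(trainers):
        tr.experience(obs_n[i], act_n[i], rew_n[i], new_obs_n[i],
                      act_traj_n[i], intent_n[i],
                      act_traj_next_n[i], intent_next_n[i],
                      done_n[i], terminal=False)
    return new_obs_n, rew_n, done_n
# ------------------------------------------------------------------------------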