Example #1
class SheldonPolicy(Policy):
    def __init__(self, env, landmark_id, args):
        super(SheldonPolicy, self).__init__()
        self.env = env
        self.landmark_id = landmark_id
        # dummy replay buffer for collecting experiences
        self.replay_buffer = ReplayBuffer(
            args.num_episodes * args.max_episode_len
            if args.benchmark and args.save_replay else 1e6)

    def action(self, obs):
        idx = 4 + self.landmark_id * 2
        delta_pos = obs[idx:idx + 2]
        # act greedily toward the assigned landmark using the relative position taken from the observation
        if self.env.discrete_action_input:
            # not tested!
            u = 0
            horizontal = abs(delta_pos[0]) > abs(delta_pos[1])
            if horizontal and delta_pos[0] < 0: u = 1  # LEFT
            if horizontal and delta_pos[0] > 0: u = 2  # RIGHT
            if not horizontal and delta_pos[1] < 0: u = 3  # UP
            if not horizontal and delta_pos[1] > 0: u = 4  # DOWN
        else:
            u = np.zeros(5)  # 5-d because of no-move action
            if delta_pos[0] > 0: u[1] += delta_pos[0]  # RIGHT
            if delta_pos[0] < 0: u[2] += -delta_pos[0]  # LEFT
            if delta_pos[1] > 0: u[3] += delta_pos[1]  # UP
            if delta_pos[1] < 0: u[4] += -delta_pos[1]  # DOWN
        #print(delta_pos, u)
        #return np.concatenate([u, np.zeros(self.env.world.dim_c)])
        return u

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))
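For reference, the continuous branch above reduces to a small pure function: split the landmark-relative offset by sign into the four directional force slots, with slot 0 reserved for the no-move action. A minimal standalone sketch, using numpy only and an illustrative function name:

import numpy as np

def force_toward_landmark(delta_pos):
    # Map a landmark-relative offset (dx, dy) to the 5-d force vector used above:
    # index 0 is the no-move action, 1/2 are +x/-x, 3/4 are +y/-y.
    u = np.zeros(5)
    dx, dy = delta_pos
    if dx > 0: u[1] = dx       # RIGHT
    if dx < 0: u[2] = -dx      # LEFT
    if dy > 0: u[3] = dy       # UP
    if dy < 0: u[4] = -dy      # DOWN
    return u

# force_toward_landmark([0.3, -0.5]) -> array([0. , 0.3, 0. , 0. , 0.5])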
Example #2
    def __init__(self,
                 name,
                 model,
                 obs_shape_n,
                 act_space_n,
                 agent_index,
                 args,
                 local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(
                U.BatchInput(obs_shape_n[i],
                             name="observation" + str(i)).get())

        # Create all the functions necessary to train the model
        self.q_train1, self.q_update1, self.q_debug1 = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            agent_idx=agent_index,
            q_function_idx=1,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)
        self.q_train2, self.q_update2, self.q_debug2 = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            agent_idx=agent_index,
            q_func=model,
            q_function_idx=2,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)

        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            agent_idx=agent_index,
            p_func=model,
            q_func=model,  #MLPmodel()
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.min_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None
        writer = tf.summary.FileWriter("logdirMaddpg", tf.get_default_graph())
        writer.flush()
        writer.close()
Example #3
 def __init__(self,
              name,
              model,
              obs_shape_n,
              act_space_n,
              agent_index,
              args,
              agent_type,
              local_q_func=False):
     self.name = name
     self.n = 1
     self.agent_index = agent_index
     self.args = args
     self.u_estimation = args.u_estimation
     self.constrained = args.constrained
     self.constraint_type = args.constraint_type
     self.agent_type = agent_type
     if self.agent_type == "good":
         cvar_alpha = args.cvar_alpha_good_agent
     elif self.agent_type == "adversary":
         cvar_alpha = args.cvar_alpha_adv_agent
     obs_ph_n = []
     obs_ph_n.append(
         U.BatchInput(obs_shape_n[agent_index], name="observation0").get())
     # Create all the functions necessary to train the model
     self.q_train, self.q_train2, self.q_train3, self.q_update, self.u_update, self.q_debug = q_train(
         scope=self.name,
         make_obs_ph_n=obs_ph_n,
         act_space_n=act_space_n,
         q_index=agent_index,
         q_func=model,
         u_func=model,
         optimizer=tf.train.AdamOptimizer(learning_rate=args.lr_critic),
         optimizer_lamda=tf.train.AdamOptimizer(
             learning_rate=args.lr_lamda),
         exp_var_alpha=args.exp_var_alpha,
         cvar_alpha=cvar_alpha,
         cvar_beta=args.cvar_beta,
         grad_norm_clipping=0.5,
         local_q_func=local_q_func,
         num_units=args.num_units,
         u_estimation=self.u_estimation,
         constrained=self.constrained,
         constraint_type=self.constraint_type,
         agent_type=self.agent_type)
     self.act, self.p_train, self.p_update, self.p_debug = p_train(
         scope=self.name,
         make_obs_ph_n=obs_ph_n,
         act_space_n=act_space_n,
         p_index=agent_index,
         p_func=model,
         q_func=model,
         optimizer=tf.train.AdamOptimizer(learning_rate=args.lr_actor),
         grad_norm_clipping=0.5,
         local_q_func=local_q_func,
         num_units=args.num_units)
     # Create experience buffer
     self.replay_buffer = ReplayBuffer(1e6)
     self.max_replay_buffer_len = args.batch_size * args.max_episode_len
     self.replay_sample_index = None
Example #4
    def __init__(self,
                 name,
                 model,
                 obs_shape_n,
                 act_space_n,
                 agent_index,
                 args,
                 local_q_func="maddpg"):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        self.local_q_func = local_q_func
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(
                U.BatchInput(obs_shape_n[i],
                             name="observation" + str(i)).get())

        if local_q_func == "ddpg" or local_q_func == "maddpg":

            # Create all the functions necessary to train the model
            self.q_train, self.q_update, self.q_debug = q_train(
                scope=self.name,
                make_obs_ph_n=obs_ph_n,
                act_space_n=act_space_n,
                q_index=agent_index,
                q_func=model,
                optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
                grad_norm_clipping=0.5,
                local_q_func=local_q_func,
                num_units=args.num_units)
            self.act, self.p_train, self.p_update, self.p_debug = p_train(
                scope=self.name,
                make_obs_ph_n=obs_ph_n,
                act_space_n=act_space_n,
                p_index=agent_index,
                p_func=model,
                q_func=model,
                optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
                grad_norm_clipping=0.5,
                local_q_func=local_q_func,
                num_units=args.num_units)
        if local_q_func == "dqn":
            self.act, self.p_train, self.p_update, self.p_debug = dqn_train(
                scope=self.name,
                make_obs_ph_n=obs_ph_n,
                act_space_n=act_space_n,
                p_index=agent_index,
                p_func=model,
                q_func=model,
                optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
                grad_norm_clipping=0.5,
                local_q_func=local_q_func,
                num_units=args.num_units)
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None
Example #5
 def __init__(self, env, landmark_id, args):
     super(SheldonPolicy, self).__init__()
     self.env = env
     self.landmark_id = landmark_id
     # dummy replay buffer for collecting experiences
     self.replay_buffer = ReplayBuffer(
         args.num_episodes * args.max_episode_len
         if args.benchmark and args.save_replay else 1e6)
Example #6
 def __init__(self, name, model, obs_shape_n, act_space_n, act_traj_shape_n,intent_shape,  agent_index, args, local_q_func=False):
     self.name = name
     self.n = len(obs_shape_n)
     self.agent_index = agent_index
     self.args = args
     obs_ph_n = []
     act_traj_ph_n = []
     intent_ph_n = []
     for i in range(self.n):
         obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation"+str(i)).get())
         act_traj_ph_n.append(U.BatchInput(act_traj_shape_n[i], name = "action_trajectory"+str(i)).get())
         intent_ph_n.append(U.BatchInput(intent_shape[i], name = "intent"+str(i)).get())
     self.act_size = act_space_n[0].n
     self.get_intent, self.i_train, self.i_update, self.i_debug = i_train(
         scope=self.name,
         make_obs_ph_n=obs_ph_n,
         intent_ph_n = intent_ph_n,
         act_space_n = act_space_n,
         make_act_traj_ph_n = act_traj_ph_n,
         make_intent_ph_n  =intent_ph_n,
         i_func = model,
         i_index = agent_index,
         output_size = (self.n-1) * self.act_size,
         optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
         grad_norm_clipping=0.5,
         num_units=args.num_units,
         reuse = False
         ) 
     # Create all the functions necessary to train the model
     self.q_train, self.q_update, self.q_debug = q_train(
         scope=self.name,
         make_obs_ph_n=obs_ph_n,
         act_space_n=act_space_n,
         make_intent_ph_n = intent_ph_n,
         q_index=agent_index,
         q_func=model,
         optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
         grad_norm_clipping=0.5,
         local_q_func=local_q_func,
         num_units=args.num_units
     )
     self.act, self.p_train, self.p_update, self.p_debug = p_train(
         scope=self.name,
         make_obs_ph_n=obs_ph_n,
         act_space_n=act_space_n,
         make_intent_ph_n = intent_ph_n,
         p_index=agent_index,
         p_func=model,
         q_func=model,
         optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
         grad_norm_clipping=0.5,
         local_q_func=local_q_func,
         num_units=args.num_units
     )
     # Create experience buffer
     self.replay_buffer = ReplayBuffer(1e6)
     self.max_replay_buffer_len = args.batch_size * args.max_episode_len
     self.replay_sample_index = None
Example #7
    def __init__(self,
                 name,
                 model,
                 obs_shape_n,
                 act_space_n,
                 agent_index,
                 args,
                 local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(
                U.BatchInput(obs_shape_n[i],
                             name="observation" + str(i)).get())

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units,
            # reuse = tf.compat.v1.AUTO_REUSE,
        )
        self.act, self.p_train, self.p_update, self.p_debug, num_actions = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units,
            # reuse = tf.compat.v1.AUTO_REUSE,
        )

        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6, args.batch_size, num_actions,
                                          obs_ph_n[0].shape[1])
        #self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.max_replay_buffer_len = args.batch_size  # start updating as soon as a full batch can be sampled

        self.replay_sample_index = None
Example #8
    def __init__(self,
                 name,
                 before_com_model,
                 channel,
                 after_com_model,
                 critic_mlp_model,
                 obs_shape_n,
                 act_space_n,
                 args,
                 local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)
        self.args = args
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(
                U.BatchInput(obs_shape_n[i],
                             name="observation_" + str(i)).get())

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_func=critic_mlp_model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units,
        )
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            before_com_func=before_com_model,
            channel=channel,
            after_com_func=after_com_model,
            q_func=critic_mlp_model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units,
            beta=args.beta,
            ibmac_com=args.ibmac_com,
        )
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        # self.max_replay_buffer_len = 50 * args.max_episode_len
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None

        self.message_1_for_record = []
Example #9
    def __init__(self,
                 name,
                 model,
                 obs_shape_n,
                 act_space_n,
                 agent_index,
                 args,
                 local_q_func=False,
                 u_estimation=False):
        print('in here')
        self.name = name
        self.n = 1  #len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        obs_ph_n.append(
            U.BatchInput(obs_shape_n[agent_index], name="observation0").get())
        self.u_estimation = u_estimation

        # Create all the functions necessary to train the model
        l = q_train(scope=self.name,
                    make_obs_ph_n=obs_ph_n,
                    act_space_n=act_space_n,
                    q_index=agent_index,
                    q_func=model,
                    u_func=model,
                    optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
                    grad_norm_clipping=0.5,
                    local_q_func=local_q_func,
                    num_units=args.num_units,
                    u_estimation=self.u_estimation)

        if self.u_estimation:
            self.q_train, self.q_update, self.u_update, self.q_debug = l
        else:
            self.q_train, self.q_update, self.q_debug = l

        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None
Example #10
    def __init__(self, n_agents, name, model, state_shape, obs_shape_n, act_space_n, agent_index, args, local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        state_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation"+str(i), lstm=args.actor_lstm or args.critic_lstm).get())
            state_ph_n.append(U.BatchInput(state_shape, name="state" + str(i)).get())

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            n_agents=n_agents,
            scope=self.name,
            make_state_ph_n=state_ph_n,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr, epsilon=args.optimizer_epsilon),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units,
            discrete_action=args.discrete_action,
            target_update_tau=args.target_update_tau,
            use_global_state=args.use_global_state,
            share_weights=args.share_weights
        )
        self.act, self.act_test, self.p_train, self.p_update, self.p_debug = p_train(
            n_agents = n_agents,
            scope=self.name,
            make_state_ph_n=state_ph_n,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr, epsilon=args.optimizer_epsilon),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units,
            discrete_action=args.discrete_action,
            target_update_tau=args.target_update_tau,
            use_global_state=args.use_global_state,
            share_weights=args.share_weights
        )

        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None
Example #11
    def __init__(self, name, model_value, model_policy, obs_shape_n,
                 act_space_n, agent_index, args, hparams,
                 summary_writer=None, local_q_func=False, rngseed=None):
        self.name = name
        self.rngseed = rngseed
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        self.hparams = hparams
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(U.BatchInput(
                obs_shape_n[i], name="observation" + str(i)).get())

        # Create all the functions necessary to train the model

        # train critic
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model_value,
            optimizer=tf.train.AdamOptimizer(learning_rate=hparams['learning_rate']),
            grad_norm_clipping=hparams['grad_norm_clipping'],
            local_q_func=local_q_func,
            num_units=args.num_units
        )

        # train policy
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model_policy,
            q_func=model_value,
            optimizer=tf.train.AdamOptimizer(learning_rate=hparams['learning_rate']),
            grad_norm_clipping=hparams['grad_norm_clipping'],
            local_q_func=local_q_func,
            num_units=args.num_units
        )
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(hparams['replay_buffer_len'], self.rngseed)
        if hparams.get('test_saving', False):
            self.max_replay_buffer_len = 100
        else:
            self.max_replay_buffer_len = hparams['batch_size'] * args.max_episode_len
        self.replay_sample_index = None
        self.summary_writer = summary_writer
Example #12
    def __init__(self,
                 name,
                 model,
                 obs_shape_n,
                 act_space_n,
                 agent_index,
                 args,
                 local_q_func=False,
                 reuse=False):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(
                U.BatchInput(obs_shape_n[i],
                             name="observation" + str(i)).get())

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units,
            reuse=reuse)
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units,
            reuse=reuse,
            deterministic=args.benchmark and args.deterministic)
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(
            args.num_episodes * args.max_episode_len
            if args.benchmark and args.save_replay else 1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None
Example #13
    def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        self.counter = 0

        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation"+str(i)+"_ag"+str(agent_index)).get())

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,#[lambda name: U.BatchInput(obs_shape, name=name) for obs_shape in obs_shape_n],
            act_space_n=act_space_n,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5
        )
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,#[lambda name: U.BatchInput(obs_shape, name=name) for obs_shape in obs_shape_n],
            act_space_n=act_space_n,
            p_index=0,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5
        )
        # Create experience buffer
        self.replay_buffer = [ReplayBuffer(1e6) for i in range(self.n)]
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
Example #14
    def __init__(self,
                 name,
                 model,
                 obs_shape_n,
                 act_space_n,
                 agent_index,
                 args,
                 local_q_func=False):  # whether to train with DDPG instead of MADDPG
        self.name = name
        self.n = len(obs_shape_n)  # total number of agents
        self.agent_index = agent_index  # index of the current agent
        self.args = args  # training arguments passed in from the command line
        obs_ph_n = []
        for i in range(self.n):  # placeholders that receive a batch of environment data: collect every agent's observations,
            # creating batch placeholders of different sizes according to each agent's observation shape
            obs_ph_n.append(
                U.BatchInput(obs_shape_n[i],
                             name="observation" + str(i)).get())

        # Create all the functions necessary to train the model
        # training op, target-network update op, and a dict exposing the Q values and target-Q values (already wrapped as session-backed functions)
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units
        )  # the p_train call below returns act, the policy training op, the target-policy update op, and a dict with p values and the target policy's output actions
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None
Example #15
    def __init__(self,
                 name,
                 model,
                 obs_shape_n,
                 act_space_n,
                 agent_index,
                 args,
                 local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        obs_ph_mems = []
        for i in range(args.num_groups):
            # assumes agents have same observation shape
            obs_ph_mems.append(
                U.BatchInput(obs_shape_n[i],
                             name="observation" + str(i)).get())

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_mems,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units,
            num_groups=args.num_groups)
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_mems,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units,
            num_groups=args.num_groups)
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None
Example #16
    def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args, local_q_func=False):
        self.name = name               # name of the agent
        self.n = len(obs_shape_n)      # number of agents
        self.agent_index = agent_index # Index of the specific agent
        self.args = args               # Settings of hyper-parameters
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation"+str(i)).get()) # Creates a placeholder for a batch of tensors of a given shape and dtype.

        # [Create all the functions necessary to train the model]
        # train:             U.function(inputs=obs_ph_n + act_ph_n + [target_ph], outputs=loss, updates=[optimize_expr])
        # update_target_q:   make_update_exp(q_func_vars, target_q_func_vars)
        # q_values:          U.function(obs_ph_n + act_ph_n, q)
        # target_q_values:   U.function(obs_ph_n + act_ph_n, target_q)
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,                                             # String: "agent_1" or "agent_2" or ...
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,                                     # action_space.
            q_index=agent_index,                                         # Index of the specific agent.
            q_func=model,                                                # Defined model.
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),     # Optimizer: Adam (adaptive moment estimation) with the configured learning rate
            grad_norm_clipping=0.5,                                      # Gradient clipping to prevent exploding gradients; norms above this value are clipped to it
            local_q_func=local_q_func,
            num_units=args.num_units                                     # Number of hidden units per layer
        )
        
        # act:                U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)
        # train:              U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr])
        # update_target_p:    make_update_exp(p_func_vars, target_p_func_vars)
        # p_values:           U.function([obs_ph_n[p_index]], p)
        # target_act:         U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample)
        self.act, self.p_train, self.p_update, self.p_debug = p_train(   
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units
        )
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None
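The commented signatures above correspond to the usual way these trainers are driven from a training loop: query action() per agent, push each transition with experience(), then call preupdate() and update() across all trainers. A minimal sketch, assuming a multi-agent env with a list-valued reset()/step() interface (env, trainers, total_steps and max_episode_len are illustrative names, not taken from this example):

obs_n = env.reset()
for step in range(total_steps):
    # one action per agent from its policy network
    act_n = [trainer.action(obs) for trainer, obs in zip(trainers, obs_n)]
    new_obs_n, rew_n, done_n, _ = env.step(act_n)
    terminal = ((step + 1) % max_episode_len == 0)  # assumed episode-length cap
    for i, trainer in enumerate(trainers):
        trainer.experience(obs_n[i], act_n[i], rew_n[i], new_obs_n[i], done_n[i], terminal)
    obs_n = new_obs_n
    for trainer in trainers:
        trainer.preupdate()
    for trainer in trainers:
        trainer.update(trainers, step)  # returns early until the replay buffer is warm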
Example #17
    def __init__(self,
                 name,
                 model,
                 obs_shape_n,
                 act_space_n,
                 agent_index,
                 args,
                 local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)  # 16
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(
                U.BatchInput(obs_shape_n[i],
                             name="observation" + str(i)).get())
            #obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation"+str(i), dtype=tf.uint8).get()) #should we specify uint8 instead of default float?

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,  # multi-layer perceptron
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,  # maddpg or ddpg
            num_units=args.num_units)
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None
Example #18
 def __init__(self, env, name, model, CNN_model, obs_shape_n, obs_map_shape_n,act_space_n, agent_index, args, local_q_func=False):
     self.name = name
     self.n = len(obs_shape_n)
     self.agent_index = agent_index
     self.args = args
     obs_ph_n = []
     obs_map_ph_n=[]
     for i in range(self.n):
         obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation"+str(i)).get())
         obs_map_ph_n.append(U.BatchInput(obs_map_shape_n[i], name="observation_map"+str(i)).get())
     # Create all the functions necessary to train the model
     self.q_train, self.q_update, self.q_debug = q_train(
         scope=self.name,
         make_obs_ph_n=obs_ph_n,
         act_space_n=act_space_n,
         q_index=agent_index,
         q_func=model,
         shared_CNN=CNN_model,
         optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
         grad_norm_clipping=0.5,
         local_q_func=local_q_func,
         num_units=args.num_units,
         make_obs_map_ph_n=obs_map_ph_n
     )
     self.act, self.p_train, self.vf_t, self.p_update, self.vf_u, self.p_debug = p_train(
         scope=self.name,
         env = env,
         make_obs_ph_n=obs_ph_n,
         act_space_n=act_space_n,
         p_index=agent_index,
         vf_func=model,
         shana = GMMPolicy,
         q_func=model,
         shared_CNN=CNN_model,
         optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
         grad_norm_clipping=0.5,
         local_q_func=local_q_func,
         num_units=args.num_units,
         make_obs_map_ph_n=obs_map_ph_n
     )
     # Create experience buffer
     self.replay_buffer = ReplayBuffer(1e6)
     self.max_replay_buffer_len = args.batch_size * args.max_episode_len
     self.replay_sample_index = None
     self.batch_size=args.batch_size
Example #19
    def __init__(self, name, critic_model, policy_model, obs_shape_n, act_space_n, agent_index, args, local_q_func=False):
        self.name = name
        self.n = 4
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation"+str(i)).get())




        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope              =  self.name,
            make_obs_ph_n      =  obs_ph_n,
            act_space_n        =  act_space_n,
            q_index            =  agent_index,
            q_func             =  critic_model,
            optimizer          =  tf.train.AdamOptimizer(learning_rate=args['lr']),
            grad_norm_clipping =  0.5,
            local_q_func       =  local_q_func,
            num_units          =  args['num_units']
        )
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope              = self.name,
            make_obs_ph_n      = obs_ph_n,
            act_space_n        = act_space_n,
            p_index            = agent_index,
            p_func             = policy_model,
            q_func             = critic_model,
            optimizer          = tf.train.AdamOptimizer(learning_rate=args['lr']),
            grad_norm_clipping = 0.5,
            local_q_func       = local_q_func,
            num_units          = args['num_units']
        )




        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args['batch_size'] * args['max_episode_len']
        self.replay_sample_index = None
Example #20
    def __init__(self,
                 name,
                 learning_rate,
                 obs_shape_n,
                 act_space_n,
                 agent_index,
                 args,
                 local_q_func=False):
        self.name = name
        self.learning_rate = learning_rate
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.obs_size = obs_shape_n[agent_index]
        self.joint_obs_size = np.sum(obs_shape_n)
        self.act_size = act_space_n[agent_index].n
        self.act_pdtype_n = [
            make_pdtype(act_space) for act_space in act_space_n
        ]

        self.joint_act_size = 0
        for i_act in act_space_n:
            self.joint_act_size += i_act.n
        self.args = args
        self.actor = Actor(self.obs_size, self.act_size)
        self.actor_target = Actor(self.obs_size, self.act_size)
        self.critic = self.build_critic()
        self.critic_target = self.build_critic()
        update_target(self.actor, self.actor_target, 0)
        update_target(self.critic, self.critic_target, 0)
        #self.actor, self.critic = self.build_model()
        #self.actor_target, self.critic_target = self.build_model()
        self.actor_optimizer = self.build_actor_optimizer()

        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None

        gpu = -1

        self.device = "/gpu:{}".format(gpu) if gpu >= 0 else "/cpu:0"
Example #21
    def __init__(self, obs_shape_n, act_info_n, agent_index, args, local_q_func=False):
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        self.grad_norm_clipping = 0.5
        # Networks
        self.device = args.device

        self.vf = Critic(
            obs_shape_n=obs_shape_n,
            act_info_n=act_info_n,
            num_units=args.num_units,
            q_index=agent_index,
            local_q_func=local_q_func,
        ).to(self.device)

        act_dim, self.pdtype = act_info_n[agent_index]
        self.pi = MLP(obs_shape_n[agent_index],
                      act_dim,
                      num_units=args.num_units).to(self.device)

        # Initialize
        init_params(self.vf)
        init_params(self.pi)

        # Target Networks
        self.pi_targ = deepcopy(self.pi)
        for p in self.pi_targ.parameters():
            p.requires_grad = False
        self.vf_targ = deepcopy(self.vf)
        for p in self.vf_targ.parameters():
            p.requires_grad = False

        # Optimizer
        self.pi_optim = Adam(self.pi.parameters(), lr=args.lr)
        self.vf_optim = Adam(self.vf.parameters(), lr=args.lr)

        # Create Replay Buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None
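In this PyTorch variant the frozen pi_targ/vf_targ copies are normally pulled toward the online networks with a Polyak (soft) update after each gradient step. A hedged sketch of that step, assuming a tau coefficient that does not appear in the example itself:

import torch

def soft_update(net, targ_net, tau=0.01):
    # Polyak-average the target parameters toward the online network in place.
    # tau is an assumed hyperparameter, not part of the example above.
    with torch.no_grad():
        for p, p_targ in zip(net.parameters(), targ_net.parameters()):
            p_targ.data.mul_(1.0 - tau)
            p_targ.data.add_(tau * p.data)

# e.g. after each optimizer step:
# soft_update(self.pi, self.pi_targ)
# soft_update(self.vf, self.vf_targ)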
Example #22
    def __init__(self, name, model, state_shape, act_space_n, agent_index, args, local_q_func=False):
        self.name = name
        self.n = 1
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        obs_ph_n.append(U.BatchInput(state_shape, name="observation"+str(0)).get())

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units
        )
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units
        )
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.min_buffer_size = args.min_buffer_size
        self.replay_sample_index = None
Example #23
 def __init__(self, pos_x, pos_y, workcap=40, sense_r=1, global_view=True):
     self.pos = [pos_x, pos_y]
     self.old_pos = self.pos
     self.workcap = workcap
     self.worktime = 0
     # TODO: Establish relation between sense_r and sense_p
     self.sense_r = sense_r
     self.sense_p = [(1, 0), (-1, 0), (0, 1), (0, -1)]  # down, up, right, left
     self.global_view = global_view
     self.local_view = None
     self.global_view = None
     self.island = True  # island=False means hovering; island=True means landed and charging
     self.isCharging = False
     self.actions = [(1, 0), (-1, 0), (0, 1), (0, -1)]
     self.staytime = 0
     self.experience = ReplayBuffer(1e6)
Example #24
    def __init__(self,
                 name,
                 model,
                 obs_shape_n,
                 act_space_n,
                 agent_index,
                 args,
                 role="",
                 local_q_func=False):
        """
        Args:
            name (str): Name of the agent
            model (function): MLP Neural Network model for the agent.
            obs_shape_n (tf.placeholder): Placeholder for the observation space of all agents
            act_space_n (list): A list of the action spaces for all agents
            agent_index (int): Agent index number
            args (argparse.Namespace): Parsed commandline arguments object
            role (str): Role of the agent i.e. adversary
            local_q_func (boolean): Flag for using local q function
        """
        super(MADDPGAgentTrainerCCM, self).__init__()

        self.name = name
        self.role = role
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        act_history_ph_n = []
        obs_history_ph_n = []

        hist = self.args.training_history

        obs_history_n = [(hist * x[0], ) for x in obs_shape_n]
        act_history_n = [(hist * act.n, ) for act in act_space_n]

        # act_history_n = [Discrete(act.n*(3-1)) for act in act_space_n]
        #        for act_space in act_space_n:
        #            act_space.n = act_space.n*3
        #        if act_history_n[0].n != 15:
        #            print("Line 158")

        for i in range(self.n):
            obs_ph_n.append(
                tf_util.BatchInput(obs_shape_n[i],
                                   name="observation" + str(i)).get())
            obs_history_ph_n.append(
                tf_util.BatchInput(obs_history_n[i],
                                   name="observationhistory" + str(i)).get())
            act_history_ph_n.append(
                tf_util.BatchInput(act_history_n[i],
                                   name="actionhistory" + str(i)).get())

        # obs_ph_n = [tf.concat(3*[x],1,name="observation{}".format(i)) for i,x in enumerate(obs_ph_n)]

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            make_obs_history_n=obs_history_ph_n,
            make_act_history_n=act_history_ph_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            make_obs_history_n=obs_history_ph_n,
            make_act_history_n=act_history_ph_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = 4 * args.batch_size * args.max_episode_len
        self.replay_sample_index = None
Example #25
class MADDPGAgentTrainerCCM(AgentTrainer):
    """
    Agent Trainer using MADDPG Algorithm and CCM
    """
    def __init__(self,
                 name,
                 model,
                 obs_shape_n,
                 act_space_n,
                 agent_index,
                 args,
                 role="",
                 local_q_func=False):
        """
        Args:
            name (str): Name of the agent
            model (function): MLP Neural Network model for the agent.
            obs_shape_n (tf.placeholder): Placeholder for the observation space of all agents
            act_space_n (list): A list of the action spaces for all agents
            agent_index (int): Agent index number
            args (argparse.Namespace): Parsed commandline arguments object
            role (str): Role of the agent i.e. adversary
            local_q_func (boolean): Flag for using local q function
        """
        super(MADDPGAgentTrainerCCM, self).__init__()

        self.name = name
        self.role = role
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        act_history_ph_n = []
        obs_history_ph_n = []

        hist = self.args.training_history

        obs_history_n = [(hist * x[0], ) for x in obs_shape_n]
        act_history_n = [(hist * act.n, ) for act in act_space_n]

        # act_history_n = [Discrete(act.n*(3-1)) for act in act_space_n]
        #        for act_space in act_space_n:
        #            act_space.n = act_space.n*3
        #        if act_history_n[0].n != 15:
        #            print("Line 158")

        for i in range(self.n):
            obs_ph_n.append(
                tf_util.BatchInput(obs_shape_n[i],
                                   name="observation" + str(i)).get())
            obs_history_ph_n.append(
                tf_util.BatchInput(obs_history_n[i],
                                   name="observationhistory" + str(i)).get())
            act_history_ph_n.append(
                tf_util.BatchInput(act_history_n[i],
                                   name="actionhistory" + str(i)).get())

        # obs_ph_n = [tf.concat(3*[x],1,name="observation{}".format(i)) for i,x in enumerate(obs_ph_n)]

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            make_obs_history_n=obs_history_ph_n,
            make_act_history_n=act_history_ph_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            make_obs_history_n=obs_history_ph_n,
            make_act_history_n=act_history_ph_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = 4 * args.batch_size * args.max_episode_len
        self.replay_sample_index = None

    def action(self, obs):
        """
        Retrieves action for agent from the P network given the observations

        Args:
            obs (np.array): Observations of the world for an agent

        Returns:
            Action for an agent
        """
        hist = self.args.training_history
        if len(self.replay_buffer) > (hist + 1):
            _, _, _, _, _, obs_h, _, _, _, _ = self.replay_buffer.sample_index(
                [len(self.replay_buffer)], hist)
            if len(obs_h) > 0:
                obs_h = obs_h[0]
            # obs = np.concatenate((obs,ob[0]),0)
        else:
            obs_h = np.array((hist) * list(obs))

        return self.act(obs[None], obs_h[None])[0]

    def experience(self, obs, act, rew, new_obs, done, terminal):
        """
        Store transition in the replay buffer.

        Args:
            obs (np.array): Observations of the world for an agent
            act (list): Action for an agent
            rew (float): Reward for an agent
            new_obs (np.array): New observations of the world for an agent
            done (bool): Done flag for the agent
            terminal (boolean): Flag for whether the episode's final step has been reached.
        """
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))

    def preupdate(self):
        """
        Reset replay_sample_index to None.
        """
        self.replay_sample_index = None

    def update(self, agents, steps):
        """
        Update agent networks

        Args:
            agents (list): List of MADDPGAgentTrainer objects
            steps (int): Current training step

        Returns:
            (list) Training loss for the agents
                   [q_loss, p_loss, mean_target_q, mean_reward, mean_target_q_next, std_target_q]
        """
        # Replay buffer is not large enough
        # if len(self.replay_buffer) < self.max_replay_buffer_len:
        if len(self.replay_buffer) < 12500:
            return

        # Only update every 100 steps
        if not steps % 100 == 0:
            return

        self.replay_sample_index = self.replay_buffer.make_index(
            self.args.batch_size)
        hist = self.args.training_history

        # ************************************************************************************************

        ccm_loss = np.array([0.0])
        ccm_lambda = np.array([self.args.ccm_lambda])
        ccm_switch = np.array([0.0])

        # ************************************************************************************************

        # Collect replay sample from all agents
        obs_n = []
        obs_h_n = []
        obs_next_n = []
        obs_next_h_n = []
        act_n = []
        act_h_n = []
        index = self.replay_sample_index
        for i in range(self.n):
            obs, act, rew, obs_next, done, obs_h, act_h, rew_h, obs_next_h, done_h = agents[i].\
                replay_buffer.sample_index(index, history=hist)
            obs_n.append(obs)
            obs_h_n.append(obs_h)
            obs_next_n.append(obs_next)
            obs_next_h_n.append(obs_next_h)
            act_n.append(act)
            act_h_n.append(act_h)
        _, _, rew, _, done, _, _, rew_h, _, done_h = self.replay_buffer.sample_index(
            index, history=0)

        obs_h_n = [[list() for _ in range(len(obs_n[0]))] if len(x) == 0 else x
                   for x in obs_h_n]
        obs_next_h_n = [
            [list() for _ in range(len(obs_next_n[0]))] if len(x) == 0 else x
            for x in obs_next_h_n
        ]
        act_h_n = [[list() for _ in range(len(act_n[0]))] if len(x) == 0 else x
                   for x in act_h_n]

        # rew = rew.T[0]
        # done = done.T[0]
        # train q network
        # print(*([x + act_n[i][j] for i,xx in enumerate(obs_n) for j,x in enumerate(xx)]))

        num_sample = 1
        target_q = 0.0
        target_q_next = 0.0
        for i in range(num_sample):
            target_act_next_n = [
                agents[i].p_debug['target_act'](obs_next_n[i], obs_next_h_n[i])
                for i in range(self.n)
            ]
            target_q_next = self.q_debug['target_q_values'](
                *(obs_next_n + obs_next_h_n + target_act_next_n + act_h_n))

            # TODO: Possible error point
            target_q += rew + self.args.gamma * (1.0 - done) * target_q_next

        target_q /= num_sample

        # TODO: Possible error point
        q_loss = self.q_train(*(obs_n + obs_h_n + act_n + act_h_n +
                                [target_q]))

        # Train P network
        # p_loss = self.p_train(*(obs_n + act_n))
        p_loss = self.p_train(*(obs_n + obs_h_n + act_n + act_h_n +
                                [ccm_loss] + [ccm_lambda] + [ccm_switch]))

        self.p_update()
        self.q_update()

        return [
            q_loss, p_loss,
            np.mean(target_q),
            np.mean(rew),
            np.mean(target_q_next),
            np.std(target_q)
        ]

    def ccm_update(self, agents, steps):
        """
        CCM Update agent networks

        Args:
            agents (list): List of MADDPGAgentTrainer objects
            steps (int): Current training step

        Returns:
            (list) Training loss for the agents
                   [q_loss, p_loss, mean_target_q, mean_reward, mean_target_q_next, std_target_q]
        """
        # Replay buffer is not large enough
        # if len(self.replay_buffer) < self.max_replay_buffer_len:
        if len(self.replay_buffer) < 12500:
            # print("{}/{}".format(len(self.replay_buffer),self.max_replay_buffer_len))
            return

        # Only update every 4 episodes
        if not steps % (4 * self.args.max_episode_len) == 0:
            return

        # Only CCM update for adversaries
        if not self.role == "adversary":
            return

        # batch_ep_size = int(round(self.args.batch_size / self.args.max_episode_len))
        batch_ep_size = self.args.ccm_pool
        self.replay_sample_index, self.ccm_episode_index = self.replay_buffer.\
            make_episode_index(batch_ep_size, self.args.max_episode_len, shuffle=not self.args.ccm_on_policy)
        hist = self.args.training_history

        # Collect replay sample from all agents
        obs_n = []
        obs_h_n = []
        obs_next_n = []
        obs_next_h_n = []
        act_n = []
        act_h_n = []
        ccm_act_n = []

        index = self.replay_sample_index
        for i in range(self.n):
            obs, act, rew, obs_next, done, obs_h, act_h, rew_h, obs_next_h, done_h = agents[i].\
                replay_buffer.sample_index(index, history=hist)
            obs_n.append(obs)
            obs_h_n.append(obs_h)
            obs_next_n.append(obs_next)
            obs_next_h_n.append(obs_next_h)
            act_n.append(act)
            act_h_n.append(act_h)

            ccm_act = []
            for ep in self.ccm_episode_index:
                _, act, _, _, _, _, _, _, _, _ = agents[
                    i].replay_buffer.sample_index(ep)
                act = np.array(act)
                ccm_act.append(act[:, 1] - act[:, 2])
            ccm_act_n.append(np.array(ccm_act))

        # print("Action CCM: {}".format(ccm.get_score(ccm_act_n[1],ccm_act_n[2],Emax=5,tau=1)))
        # print("Action CCM: {}".format(ccm_act_n))

        ccm_loss = np.array([0.0])
        ccm_lambda = np.array([self.args.ccm_lambda])
        ccm_switch = np.array([1.0])

        if self.agent_index != 1:
            t_start = time.time()

        # ccm_scores = [ccm.get_score(ccm_act_n[agent_index], ccm_act_n[i], e_max=5, tau=None)
        #               for i in range(len(ccm_act_n)) if i != agent_index]

        if self.args.specific_leader_ccm is None and self.args.specific_agent_ccm is None:
            ccm_scores = [
                ccm.get_score(ccm_act_n[self.agent_index],
                              ccm_act_n[i],
                              e_max=5,
                              tau=1) for i in range(self.n)
                if i != self.agent_index and agents[i].role == "adversary"
            ]

        elif self.args.specific_agent_ccm is None:
            if self.agent_index == self.args.specific_leader_ccm:
                ccm_scores = [
                    ccm.get_score(ccm_act_n[i],
                                  ccm_act_n[self.agent_index],
                                  e_max=5,
                                  tau=1) for i in range(self.n)
                    if i != self.agent_index and agents[i].role == "adversary"
                ]

            else:
                ccm_scores = [
                    ccm.get_score(ccm_act_n[self.agent_index],
                                  ccm_act_n[i],
                                  e_max=5,
                                  tau=1) for i in range(self.n)
                    if i == self.args.specific_leader_ccm
                ]

        else:
            ccm_scores = [
                ccm.get_score(ccm_act_n[self.agent_index],
                              ccm_act_n[self.args.specific_leader_ccm],
                              e_max=5,
                              tau=1) for i in range(self.n)
                if i == self.args.specific_leader_ccm
            ]

        # ccm_loss = [1*(x[0]-(x[1]-0.01)) for x in ccm_scores]
        ccm_loss = [x[0] - np.exp(x[1] - 0.01) for x in ccm_scores]
        ccm_loss = np.array([np.mean(ccm_loss)])

        # print("CCM Loop Time at Trial {}: {}".format(steps,time.time() - t_start))

        # Original implementation
        # obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index)

        # Modified
        _, _, rew, _, done, _, _, rew_h, _, done_h = self.replay_buffer.sample_index(
            index, history=0)

        obs_h_n = [[list() for _ in range(len(obs_n[0]))] if len(x) == 0 else x
                   for x in obs_h_n]
        obs_next_h_n = [
            [list() for _ in range(len(obs_next_n[0]))] if len(x) == 0 else x
            for x in obs_next_h_n
        ]
        act_h_n = [[list() for _ in range(len(act_n[0]))] if len(x) == 0 else x
                   for x in act_h_n]

        num_sample = 1
        target_q = 0.0
        target_q_next = 0.0
        for i in range(num_sample):
            target_act_next_n = [
                agents[i].p_debug['target_act'](obs_next_n[i], obs_next_h_n[i])
                for i in range(self.n)
            ]
            target_q_next = self.q_debug['target_q_values'](
                *(obs_next_n + obs_next_h_n + target_act_next_n + act_h_n))

            # TODO: Possible error point
            target_q += rew + self.args.gamma * (1.0 - done) * target_q_next

        target_q /= num_sample

        # TODO: Possible error point
        q_loss = self.q_train(*(obs_n + obs_h_n + act_n + act_h_n +
                                [target_q]))

        # Train P network
        # p_loss = self.p_train(*(obs_n + act_n))
        p_loss = self.p_train(*(obs_n + obs_h_n + act_n + act_h_n +
                                [ccm_loss] + [ccm_lambda] + [ccm_switch]))

        self.p_update()
        # self.q_update()

        return [
            q_loss, p_loss,
            np.mean(target_q),
            np.mean(rew),
            np.mean(target_q_next),
            np.std(target_q)
        ]
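
The Q-network update in ccm_update bootstraps a one-step TD target from the target networks, y = rew + gamma * (1 - done) * Q_target(s', a'), averaged over num_sample draws of the target actions. Below is a minimal, self-contained sketch of that computation (the gamma value and array shapes are illustrative assumptions, not taken from the source).

import numpy as np

def td_target(rew, done, target_q_next, gamma=0.95, num_sample=1):
    # Average the bootstrapped target over `num_sample` target-action draws;
    # terminal transitions (done == 1) contribute only their immediate reward.
    target_q = np.zeros_like(rew, dtype=np.float64)
    for _ in range(num_sample):
        target_q += rew + gamma * (1.0 - done) * target_q_next
    return target_q / num_sample

# Example: the second transition is terminal, so its target collapses to the reward.
y = td_target(np.array([1.0, 0.5]), np.array([0.0, 1.0]), np.array([2.0, 2.0]))
# y == [2.9, 0.5] with gamma = 0.95
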
Example #26
0
    def __init__(self,
                 name,
                 model,
                 obs_shape_n,
                 act_space_n,
                 agent_index,
                 args,
                 actor_lr=None,
                 critic_lr=None,
                 gamma=None,
                 num_units=None,
                 rb_size=None,
                 batch_size=None,
                 max_episode_len=None,
                 clip_norm=0.5,
                 local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args

        # training parameters
        self.actor_lr = actor_lr if actor_lr else args.lr
        self.critic_lr = critic_lr if critic_lr else args.lr
        self.gamma = gamma if gamma else args.gamma
        self.num_units = num_units if num_units else args.num_units
        self.rb_size = rb_size if rb_size else args.rb_size
        self.batch_size = batch_size if batch_size else args.batch_size
        self.max_episode_len = max_episode_len if max_episode_len else args.max_episode_len
        self.clip_norm = clip_norm

        # TODO: remove after testing
        import models.config as Config
        assert actor_lr == Config.maddpg_train_args['actor_lr']
        assert critic_lr == Config.maddpg_train_args['critic_lr']
        assert gamma == Config.maddpg_train_args['gamma']
        assert num_units == Config.maddpg_train_args['num_hidden']
        assert rb_size == Config.maddpg_train_args['rb_size']
        assert batch_size == Config.maddpg_train_args['batch_size']
        assert max_episode_len == Config.maddpg_train_args['nb_rollout_steps']
        assert clip_norm == Config.maddpg_train_args['clip_norm']

        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(
                U.BatchInput(obs_shape_n[i],
                             name="observation" + str(i)).get())

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=self.critic_lr),
            grad_norm_clipping=self.clip_norm,
            local_q_func=local_q_func,
            num_units=self.num_units)
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=self.actor_lr),
            grad_norm_clipping=self.clip_norm,
            local_q_func=local_q_func,
            num_units=self.num_units)
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(self.rb_size)
        self.max_replay_buffer_len = self.batch_size * self.max_episode_len
        self.replay_sample_index = None
        self.loss_names = [
            'q_loss', 'p_loss', 'mean_target_q', 'mean_rew',
            'mean_target_q_next', 'std_target_q'
        ]
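
A side note on the fallback pattern above (illustrative sketch, not from the source): expressions like `actor_lr if actor_lr else args.lr` silently fall back to the args value for falsy-but-valid overrides such as 0 or 0.0, whereas comparing against None preserves an explicit zero.

def resolve(override, default):
    # Return `override` unless it is None; an explicit 0 or 0.0 is preserved.
    return override if override is not None else default

assert resolve(0.0, 0.01) == 0.0    # explicit zero survives
assert resolve(None, 0.01) == 0.01  # missing value falls back to the default
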
Example #27
0
class MADDPGAgentTrainer(AgentTrainer):
    def __init__(self,
                 name,
                 model,
                 obs_shape_n,
                 act_space_n,
                 agent_index,
                 args,
                 actor_lr=None,
                 critic_lr=None,
                 gamma=None,
                 num_units=None,
                 rb_size=None,
                 batch_size=None,
                 max_episode_len=None,
                 clip_norm=0.5,
                 local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args

        # training parameters
        self.actor_lr = actor_lr if actor_lr else args.lr
        self.critic_lr = critic_lr if critic_lr else args.lr
        self.gamma = gamma if gamma else args.gamma
        self.num_units = num_units if num_units else args.num_units
        self.rb_size = rb_size if rb_size else args.rb_size
        self.batch_size = batch_size if batch_size else args.batch_size
        self.max_episode_len = max_episode_len if max_episode_len else args.max_episode_len
        self.clip_norm = clip_norm

        # TODO: remove after testing
        import models.config as Config
        assert actor_lr == Config.maddpg_train_args['actor_lr']
        assert critic_lr == Config.maddpg_train_args['critic_lr']
        assert gamma == Config.maddpg_train_args['gamma']
        assert num_units == Config.maddpg_train_args['num_hidden']
        assert rb_size == Config.maddpg_train_args['rb_size']
        assert batch_size == Config.maddpg_train_args['batch_size']
        assert max_episode_len == Config.maddpg_train_args['nb_rollout_steps']
        assert clip_norm == Config.maddpg_train_args['clip_norm']

        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(
                U.BatchInput(obs_shape_n[i],
                             name="observation" + str(i)).get())

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=self.critic_lr),
            grad_norm_clipping=self.clip_norm,
            local_q_func=local_q_func,
            num_units=self.num_units)
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=self.actor_lr),
            grad_norm_clipping=self.clip_norm,
            local_q_func=local_q_func,
            num_units=self.num_units)
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(self.rb_size)
        self.max_replay_buffer_len = self.batch_size * self.max_episode_len
        self.replay_sample_index = None
        self.loss_names = [
            'q_loss', 'p_loss', 'mean_target_q', 'mean_rew',
            'mean_target_q_next', 'std_target_q'
        ]

    def action(self, obs):
        return self.act(obs[None])[0]

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))

    def preupdate(self):
        self.replay_sample_index = None

    def update(self, agents, t):
        if len(self.replay_buffer) < self.max_replay_buffer_len:  # replay buffer is not large enough
            return
        if not t % 100 == 0:  # only update every 100 steps
            return

        self.replay_sample_index = self.replay_buffer.make_index(
            self.batch_size)
        # collect replay sample from all agents
        obs_n = []
        obs_next_n = []
        act_n = []
        index = self.replay_sample_index
        for i in range(self.n):
            obs, act, rew, obs_next, done = agents[
                i].replay_buffer.sample_index(index)
            obs_n.append(obs)
            obs_next_n.append(obs_next)
            act_n.append(act)
        obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index)

        # train q network
        num_sample = 1
        act_space = act.shape[-1]
        target_q = 0.0
        for i in range(num_sample):
            target_act_next_n = [
                agents[i].p_debug['target_act'](obs_next_n[i])
                for i in range(self.n)
            ]

            # flatten multi agent actions and observations
            act_serial_vals = self.q_debug['act_serial_values'](
                *(target_act_next_n))
            obs_serial_vals = self.q_debug['obs_serial_values'](*(obs_next_n))
            assert len(act_serial_vals) == self.batch_size
            assert len(obs_serial_vals) == self.batch_size

            # compute L2 normalized partial derivatives of target Q function wrt actions
            # NOTE: this is done one sample at a time to prevent tf.gradient from summing over all target q values
            grad_norm_value = [
                self.q_debug['grad_norm_value'](*([[obs_serial_vals[j]]] +
                                                  [[act_serial_vals[j]]]))
                for j in range(self.batch_size)
            ]
            assert len(grad_norm_value) == self.batch_size

            # scale the raw gradients by alpha
            # TODO: set alpha during init or compute as function of policy or loss
            perturb = np.array(grad_norm_value) * 0.01

            # update leader actions using gradients
            for b in range(self.batch_size):
                # find all the leaders wrt current agent (agent_index)
                leading_agents = [
                    [1.0] * act_space
                    if obs_next_n[k][b][2] > obs_next_n[self.agent_index][b][2]
                    else [0.0] * act_space for k in range(self.n)
                ]
                # filter perturbations to only apply for leading agents
                # scale by L2 norm of original actions to prevent the perturb from overwhelming action
                epsilon = perturb[b].flatten() * np.array(
                    leading_agents).flatten() * np.linalg.norm(
                        act_serial_vals[b], 2)
                act_serial_vals[b] += epsilon

            # target_q_next = self.q_debug['target_q_values'](*(obs_next_n + target_act_next_n))
            target_q_next = self.q_debug['target_q_values'](
                *([obs_serial_vals] + [act_serial_vals]))
            target_q += rew + self.gamma * (1.0 - done) * target_q_next
        target_q /= num_sample
        q_loss = self.q_train(*(obs_n + act_n + [target_q]))

        # get current actions and observations flattened
        act_serial_vals = self.q_debug['act_serial_values'](*(act_n))
        obs_serial_vals = self.q_debug['obs_serial_values'](*(obs_n))
        # compute L2 normalized partial derivatives of Q function wrt actions
        grad_norm_value = [
            self.p_debug['grad_norm_value'](*([[obs_serial_vals[j]]] +
                                              [[act_serial_vals[j]]]))
            for j in range(self.batch_size)
        ]
        assert len(grad_norm_value) == self.batch_size
        # scale the raw gradients by alpha
        perturb = np.array(grad_norm_value) * 0.01
        # update leader actions using these perturbations
        for b in range(self.batch_size):
            # find all the leaders wrt current agent (agent_index)
            leading_agents = [
                [1.0] * act_space
                if obs_next_n[k][b][2] > obs_next_n[self.agent_index][b][2]
                else [0.0] * act_space for k in range(self.n)
            ]
            # filter perturbations to only apply for leading agents
            epsilon = perturb[b].flatten() * np.array(leading_agents).flatten(
            ) * np.linalg.norm(act_serial_vals[b], 2)
            epsilon_n = [
                epsilon[k * act_space:(k * act_space) + act_space]
                for k in range(self.n)
            ]
            # update each agent action for current batch sample "b"
            for k in range(self.n):
                act_n[k][b] += epsilon_n[k]

        # train p network
        p_loss = self.p_train(*(obs_n + act_n))

        self.p_update()
        self.q_update()

        return [
            q_loss, p_loss,
            np.mean(target_q),
            np.mean(rew),
            np.mean(target_q_next),
            np.std(target_q)
        ]
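
The update above nudges the actions of leading agents along the gradient of the Q function, scaled by a fixed alpha and by the L2 norm of the joint action. Below is a minimal numpy sketch of that perturbation step (the alpha value and flat array layout are illustrative assumptions, not the source implementation).

import numpy as np

def perturb_leader_actions(joint_act, q_grad, leader_mask, alpha=0.01):
    # Scale the Q-gradient by alpha, zero it for non-leading agents, and
    # rescale by the L2 norm of the joint action so the perturbation stays
    # small relative to the original action magnitude.
    epsilon = alpha * q_grad * leader_mask * np.linalg.norm(joint_act, 2)
    return joint_act + epsilon

act = np.array([0.5, -0.5, 0.1, 0.2])    # two agents, two action dims each
grad = np.array([1.0, 0.0, -1.0, 1.0])   # dQ/da for the joint action
mask = np.array([1.0, 1.0, 0.0, 0.0])    # only the first agent is a leader
print(perturb_leader_actions(act, grad, mask))
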
def train(arglist, PID=None, lock=None):
    start_time = time.time()
    # global replay_buffer
    with U.single_threaded_session() as sess:
        # Create environment
        env = make_env(arglist.scenario, arglist, arglist.benchmark)
        # Create agents networks
        obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]

        ####changed by yuan li
        num_adversaries = copy.deepcopy(env.num_adversaries)
        arglist.num_adversaries = copy.deepcopy(num_adversaries)

        if comm_rank != 0 and comm_rank != 1:
            req = None
            wait_flag = False

            actors = get_agents(env, num_adversaries, obs_shape_n, arglist)

            U.initialize()

            #var_list = [var for var in tf.trainable_variables()]
            # load the model
            var_list_n = []
            for actor in actors:
                var_list_n.extend(actor.get_variable_list())
            saver = tf.train.Saver(var_list=var_list_n, max_to_keep=20)
            if arglist.load_dir != "":
                U.load_state(arglist.load_dir, saver)

            episode_rewards, agent_rewards, final_ep_rewards, final_ep_ag_rewards, agent_info = initialize_variables(
                env)
            obs_n = env.reset()
            step = 0
            episode_step = 0
            sample_number = 0
            t_start = time.time()
            update_time = 0
            print('Starting iterations...')

            invalid_train, red_win, red_leave, green_win, green_leave = 0, 0, 0, 0, 0

            while True:
                if not wait_flag:
                    #req = comm.irecv(350000, source=(comm_rank - 1 + comm_size) % comm_size, tag=11)
                    req = comm.irecv(350000, source=0, tag=11)
                    wait_flag = True
                else:
                    data_recv = req.test()
                    if data_recv[0]:
                        wait_flag = False
                        if data_recv[1] == 'finish':
                            #finish = True
                            comm.send('finish', dest=1, tag=11)
                            break
                        else:
                            update_start = time.time()
                            i = 0
                            j = 0
                            for var in tf.trainable_variables():
                                if 11 < (i % 24) < 24:
                                    var.load(data_recv[1][j], sess)
                                    j += 1
                                i += 1

                            #for var in var_list:
                            #    var.load(data_recv[1][i], sess)
                            #    i += 1
                            #print("111111111111111111111111,load param")
                            #for i, actor in enumerate(actors):
                            #    actor.load_weights(data_recv[1][i], sess)
                            update_end = time.time()
                            #print("step:{}, rank0_update_end_time:{}".format(step, update_end))
                            update_time += (update_end - update_start)
                            step += 1
                    else:
                        wait_flag = True
                        # get action
                        action_n = [
                            agent.action(obs)
                            for agent, obs in zip(actors, obs_n)
                        ]
                        # environment step
                        new_obs_n, rew_n, done_n, info_n = env.step(action_n)
                        episode_step += 1
                        # changed by liyuan
                        done = any(done_n)
                        terminal = (episode_step >= arglist.max_episode_len)
                        ###liyuan: compute the average win rate
                        if green_leave_screen(env) or adversary_all_die(
                                env) or adversary_leave_screen(env):
                            terminal = True

                        if adversary_all_die(env):
                            green_win += 1
                        if green_leave_screen(env):
                            invalid_train += 1
                            green_leave += 1
                        if adversary_leave_screen(env):
                            red_leave += 1

                        if episode_step >= arglist.max_episode_len:
                            for i, agent in enumerate(env.agents):
                                if agent.adversary:
                                    rew_n[i] -= 50

                        if adversary_all_die(env):
                            for i, agent in enumerate(env.agents):
                                if agent.adversary:
                                    rew_n[i] -= 100

                        if done:
                            red_win = red_win + 1
                            for i, agent in enumerate(env.agents):
                                if agent.adversary:
                                    rew_n[i] += 200
                                    rew_n[i] += (
                                        arglist.max_episode_len -
                                        episode_step) / arglist.max_episode_len

                        #send data
                        data = [obs_n, action_n, rew_n, new_obs_n, done_n]
                        comm.send(data, dest=1, tag=11)

                        sample_number += 1

                        #replay_buffer.add(obs_n, action_n, rew_n, new_obs_n, done_n)
                        obs_n = new_obs_n
                        for i, rew in enumerate(rew_n):
                            episode_rewards[-1] += rew
                            agent_rewards[i][-1] += rew

                        if done or terminal:
                            obs_n = env.reset()
                            episode_step = 0
                            episode_rewards.append(0)
                            for a in agent_rewards:
                                a.append(0)
                            agent_info.append([[]])

                        # save model, display training output
                        if (terminal or done) and (len(episode_rewards) %
                                                   arglist.save_rate == 0):
                            if red_win >= 0.8 * arglist.save_rate:
                                temp_dir = arglist.save_dir + "_" + str(
                                    len(episode_rewards)) + "_" + str(
                                        red_win) + "_{}".format(PID)
                                U.save_state(temp_dir, saver=saver)
                            # print statement depends on whether or not there are adversaries
                            if num_adversaries == 0:
                                print(
                                    "Rank {}, steps: {}, episodes: {}, mean episode reward: {}, time: {}"
                                    .format(
                                        comm_rank, sample_number,
                                        len(episode_rewards),
                                        np.mean(episode_rewards[-arglist.
                                                                save_rate:]),
                                        round(time.time() - t_start, 3)))
                            else:
                                print(
                                    "Rank {}, steps: {}, episodes: {}, mean episode reward: {}, agent episode reward: {}, time: {}"
                                    .format(
                                        comm_rank, sample_number,
                                        len(episode_rewards),
                                        np.mean(episode_rewards[-arglist.
                                                                save_rate:]),
                                        [
                                            np.mean(rew[-arglist.save_rate:])
                                            for rew in agent_rewards
                                        ], round(time.time() - t_start, 3)))
                                print(
                                    "Rank  {}, red win: {}, green win: {}, red all leave: {}, green all leave: {}"
                                    .format(comm_rank, red_win, green_win,
                                            red_leave, green_leave))

                                middle_time = time.time()
                                print(
                                    "sample_number:{}, train_step:{}, update_time:{}, total_time:{}"
                                    .format(sample_number, step, update_time,
                                            middle_time - start_time))
                                mydata = []
                                mydata.append(str(len(episode_rewards)))
                                mydata.append(
                                    str(
                                        np.mean(episode_rewards[-arglist.
                                                                save_rate:])))
                                mydata.append(
                                    str(
                                        np.mean(agent_rewards[0]
                                                [-arglist.save_rate:])))
                                mydata.append(
                                    str(
                                        np.mean(agent_rewards[1]
                                                [-arglist.save_rate:])))
                                mydata.append(
                                    str(
                                        np.mean(agent_rewards[2]
                                                [-arglist.save_rate:])))
                                mydata.append(str(red_win))
                                mydata.append(
                                    str(round(time.time() - t_start, 3)))
                                out = open('1mydata_{}.csv'.format(comm_rank),
                                           'a',
                                           newline='')
                                csv_write = csv.writer(out, dialect='excel')
                                csv_write.writerow(mydata)

                            if len(episode_rewards) > 3000:
                                U.save_state(arglist.save_dir, saver=saver)

                            invalid_train, red_win, red_leave, green_win, green_leave = 0, 0, 0, 0, 0
                            t_start = time.time()
                            # Keep track of final episode reward
                            final_ep_rewards.append(
                                np.mean(episode_rewards[-arglist.save_rate:]))
                            for rew in agent_rewards:
                                final_ep_ag_rewards.append(
                                    np.mean(rew[-arglist.save_rate:]))

            end_time = time.time()
            print("rank{}_time:{}".format(comm_rank, end_time - start_time))
            print("rank{}_update_time:{}".format(comm_rank, updata_time))
            print("rank{}_step:{}".format(comm_rank, step))

        if comm_rank == 1:
            replay_buffer = ReplayBuffer(1e6)

            wait_flag_1 = False
            wait_flag_2 = False
            wait_flag_3 = False
            req1 = None
            req2 = None
            req3 = None
            sample = 0
            step = 0
            req_list = []
            while True:
                if not wait_flag_1 or not wait_flag_2 or not wait_flag_3:
                    if not wait_flag_1:
                        req1 = comm.irecv(source=2, tag=11)
                        wait_flag_1 = True
                    if not wait_flag_2:
                        req2 = comm.irecv(source=3, tag=11)
                        wait_flag_2 = True
                    if not wait_flag_3:
                        req3 = comm.irecv(source=4, tag=11)
                        wait_flag_3 = True
                else:
                    data_recv_1 = req1.test()
                    data_recv_2 = req2.test()
                    data_recv_3 = req3.test()
                    if data_recv_1[0] or data_recv_2[0] or data_recv_3[0]:
                        if data_recv_1[0]:
                            wait_flag_1 = False
                            if data_recv_1[1] == 'finish':
                                break
                            else:
                                obs_n, action_n, rew_n, new_obs_n, done_n = data_recv_1[
                                    1]
                                replay_buffer.add(obs_n, action_n, rew_n,
                                                  new_obs_n, done_n)
                                sample += 1

                        if data_recv_2[0]:
                            wait_flag_2 = False
                            if data_recv_2[1] == 'finish':
                                break
                            else:
                                obs_n, action_n, rew_n, new_obs_n, done_n = data_recv_2[
                                    1]
                                replay_buffer.add(obs_n, action_n, rew_n,
                                                  new_obs_n, done_n)
                                sample += 1

                        if data_recv_3[0]:
                            wait_flag_3 = False
                            if data_recv_3[1] == 'finish':
                                break
                            else:
                                obs_n, action_n, rew_n, new_obs_n, done_n = data_recv_3[
                                    1]
                                replay_buffer.add(obs_n, action_n, rew_n,
                                                  new_obs_n, done_n)
                                sample += 1
                        '''
                        # time how long it takes to receive 100 samples and then send a batch
                        if (sample % 100 == 0) and len(replay_buffer) >= arglist.batch_size * arglist.max_episode_len:
                            start = time.time()
                            replay_sample_index = replay_buffer.make_index(arglist.batch_size)
                            send_data = replay_buffer.sample_index(replay_sample_index)
                            #send_data = (obs_n_a, act_n_a, rew_n_a, obs_next_n_a, done_n_a)
                            comm.send(send_data, dest=(comm_rank + 1) % comm_size, tag=11)
                            sample = 0
                            step += 1
                            end = time.time()
                            print("rank1 send sample time:", end-start)
                        '''

                    else:
                        wait_flag_1 = True
                        wait_flag_2 = True
                        wait_flag_3 = True
                        if (sample // 100 > 0) and len(
                                replay_buffer
                        ) >= arglist.batch_size * arglist.max_episode_len:
                            replay_sample_index = replay_buffer.make_index(
                                arglist.batch_size)
                            send_data = replay_buffer.sample_index(
                                replay_sample_index)
                            #send_data = (obs_n_a, act_n_a, rew_n_a, obs_next_n_a, done_n_a)
                            comm.send(send_data, dest=0, tag=11)
                            sample = 0
                            step += 1

            end_time = time.time()
            print("rank1_time:", end_time - start_time)
            print("rank1_step", step)

        if comm_rank == 0:
            extract_time = 0
            step = 0

            learners = get_agents(env, num_adversaries, obs_shape_n, arglist)

            var_list_n = []
            for learner in learners:
                var_list_n.extend(learner.get_variable_list())

            U.initialize()

            #var_list = [var for var in tf.trainable_variables()]

            # load the model
            saver = tf.train.Saver(var_list=var_list_n, max_to_keep=20)
            if arglist.load_dir != "":
                U.load_state(arglist.load_dir, saver)

            while True:
                if step >= STEP:
                    for i in range(comm_size - 2):
                        comm.send('finish', dest=(i + 2), tag=11)
                    break
                else:
                    start = time.time()
                    data_recv = comm.recv(source=1, tag=11)

                    for i, agent in enumerate(learners):
                        agent.update(learners, data_recv)

                    #dict_list = []
                    param = []
                    extract_start = time.time()
                    i = 0
                    for var in tf.trainable_variables():
                        if 11 < (i % 24) < 24:
                            param.append(sess.run(var))
                        i += 1
                    #print("2222222222222222 load weights")
                    #for var in var_list:
                    #   param.append(sess.run(var))

                    extract_end = time.time()
                    extract_time += (extract_end - extract_start)

                    for i in range(comm_size - 2):
                        comm.send(param, dest=(i + 2), tag=11)
                    #print("222222222222222222222222,send param")

                    step += 1
                    end = time.time()
                    #print("rank2 train time:{}, extract_time:{}".format(end - start, extract_end - extract_start))
            end_time = time.time()
            print("rank0_time:", end_time - start_time)
            print("rank0_extract_time:", extract_time)
            print("rank0_step:", step)
Example #29
0
class MADDPGAgentTrainer(AgentTrainer):
    def __init__(self,
                 name,
                 model,
                 obs_shape_n,
                 act_space_n,
                 agent_index,
                 args,
                 safety_layer=None,
                 local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        self.safety_layer = safety_layer
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(
                U.BatchInput(obs_shape_n[i],
                             name="observation" + str(i)).get())
        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None

    def action(self, obs, c=None, env=None):
        action = self.act(obs[None])[0]
        if_call = False
        return action, if_call

    def action_real(self, obs, c=None, env=None):
        # get action from DDPG
        action = self.act(obs[None])[0]
        action_real = action
        if_call = False
        dist = np.sqrt(
            np.sum(
                np.square(env.agents[0].state.p_pos -
                          env.world.landmarks[-1].state.p_pos)))

        # call for the safety_layer
        if self.safety_layer and c is not None and env is not None and dist > 1.5:
            # judge the collision in future 10 steps
            collision_flag = False
            env_future = copy.deepcopy(env)
            obs_future = copy.deepcopy(obs)
            trajectory = np.zeros([4, self.safety_layer.UAV_config.N + 1])
            trajectory[0, 0] = obs_future[2]
            trajectory[1, 0] = obs_future[3]
            trajectory[2, 0] = obs_future[4]
            trajectory[3, 0] = obs_future[5]
            for i in range(self.safety_layer.UAV_config.N):
                action_future = [self.act(obs_future[None])[0]]
                # environment step
                new_obs_n, rew_n, done_n, info_n = env_future.step(
                    action_future)
                is_any_collision = []
                for agent in env_future.agents:
                    temp = False
                    for _, landmark in enumerate(
                            env_future.world.landmarks[0:-1]):
                        dist = np.sqrt(np.sum(np.square(agent.state.p_pos - landmark.state.p_pos))) \
                               - (agent.size + landmark.size)
                        if dist <= 0:
                            temp = True
                    is_any_collision.append(temp)
                if is_any_collision[0]:
                    collision_flag = True
                done_future = all(done_n)
                if done_future:
                    break
                obs_future = new_obs_n[0]
                trajectory[0, i + 1] = obs_future[2]
                trajectory[1, i + 1] = obs_future[3]
                trajectory[2, i + 1] = obs_future[4]
                trajectory[3, i + 1] = obs_future[5]
            if not collision_flag:
                return action_real, action, if_call
            action, if_call = self.safety_layer.get_safe_action(
                obs, action, trajectory)
        return action_real, action, if_call

    def set_safety_layer(self, safety_layer):
        self.safety_layer = safety_layer

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))

    def preupdate(self):
        self.replay_sample_index = None

    def update(self, agents, t):
        if len(self.replay_buffer) < self.max_replay_buffer_len:  # replay buffer is not large enough
            return
        if not t % 100 == 0:  # only update every 100 steps
            return

        self.replay_sample_index = self.replay_buffer.make_index(
            self.args.batch_size)
        # collect replay sample from all agents
        obs_n = []
        obs_next_n = []
        act_n = []
        index = self.replay_sample_index
        for i in range(self.n):
            obs, act, rew, obs_next, done = agents[
                i].replay_buffer.sample_index(index)
            obs_n.append(obs)
            obs_next_n.append(obs_next)
            act_n.append(act)
        obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index)

        # train q network
        num_sample = 1
        target_q = 0.0
        for i in range(num_sample):
            target_act_next_n = [
                agents[i].p_debug['target_act'](obs_next_n[i])
                for i in range(self.n)
            ]
            target_q_next = self.q_debug['target_q_values'](
                *(obs_next_n + target_act_next_n))
            target_q += rew + self.args.gamma * (1.0 - done) * target_q_next
        target_q /= num_sample
        q_loss = self.q_train(*(obs_n + act_n + [target_q]))

        # train p network
        p_loss = self.p_train(*(obs_n + act_n))

        self.p_update()
        self.q_update()

        return [
            q_loss, p_loss,
            np.mean(target_q),
            np.mean(rew),
            np.mean(target_q_next),
            np.std(target_q)
        ]
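
The safety check in action_real rolls a copied environment forward and flags a collision when the center distance between the agent and a landmark drops below the sum of their radii. A small self-contained sketch of that overlap test (illustrative names, not the source code):

import numpy as np

def is_collision(p1, p2, r1, r2):
    # Circles overlap (or touch) when the center distance <= sum of the radii.
    return np.sqrt(np.sum(np.square(p1 - p2))) - (r1 + r2) <= 0

assert is_collision(np.array([0.0, 0.0]), np.array([0.3, 0.4]), 0.3, 0.2)      # distance 0.5 == 0.5
assert not is_collision(np.array([0.0, 0.0]), np.array([1.0, 0.0]), 0.3, 0.2)  # distance 1.0 >  0.5
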
Example #30
0
class MADDPGAgentTrainer(AgentTrainer):
    def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args, local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation"+str(i)).get())

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units
        )
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units
        )
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None

    def action(self, obs):
        return self.act(obs[None])[0]

    def experience(self, obs, act, rew, new_obs, done, terminal):
        # Store transition in the replay buffer.
        self.replay_buffer.add(obs, act, rew, new_obs, float(done))

    def preupdate(self):
        self.replay_sample_index = None

    def update(self, agents, t):
        if len(self.replay_buffer) < self.max_replay_buffer_len: # replay buffer is not large enough
            return
        if not t % 100 == 0:  # only update every 100 steps
            return

        self.replay_sample_index = self.replay_buffer.make_index(self.args.batch_size)
        # collect replay sample from all agents
        obs_n = []
        obs_next_n = []
        act_n = []
        index = self.replay_sample_index
        for i in range(self.n):
            obs, act, rew, obs_next, done = agents[i].replay_buffer.sample_index(index)
            obs_n.append(obs)
            obs_next_n.append(obs_next)
            act_n.append(act)
        obs, act, rew, obs_next, done = self.replay_buffer.sample_index(index)

        # train q network
        num_sample = 1
        target_q = 0.0
        for i in range(num_sample):
            target_act_next_n = [agents[i].p_debug['target_act'](obs_next_n[i]) for i in range(self.n)]
            target_q_next = self.q_debug['target_q_values'](*(obs_next_n + target_act_next_n))
            target_q += rew + self.args.gamma * (1.0 - done) * target_q_next
        target_q /= num_sample
        q_loss = self.q_train(*(obs_n + act_n + [target_q]))

        # train p network
        p_loss = self.p_train(*(obs_n + act_n))

        self.p_update()
        self.q_update()

        return [q_loss, p_loss, np.mean(target_q), np.mean(rew), np.mean(target_q_next), np.std(target_q)]
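
Each update above samples every agent's replay buffer with the same index list so the joint transition (obs_n, act_n, obs_next_n) stays aligned across agents; this works because all buffers are filled in lock-step. The sketch below mirrors just the part of the ReplayBuffer interface these update methods rely on (a simplified stand-in under that assumption, not the actual implementation).

import random

import numpy as np

class TinyReplayBuffer:
    def __init__(self, size):
        self._storage = []
        self._maxsize = int(size)
        self._next_idx = 0

    def __len__(self):
        return len(self._storage)

    def add(self, obs, act, rew, obs_next, done):
        data = (obs, act, rew, obs_next, done)
        if self._next_idx >= len(self._storage):
            self._storage.append(data)
        else:
            self._storage[self._next_idx] = data  # overwrite the oldest entry
        self._next_idx = (self._next_idx + 1) % self._maxsize

    def make_index(self, batch_size):
        # Uniform sampling with replacement, as in make_index above.
        return [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)]

    def sample_index(self, idxes):
        # Return column-wise arrays: obs, act, rew, obs_next, done.
        cols = list(zip(*(self._storage[i] for i in idxes)))
        return [np.array(c) for c in cols]
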