Example #1
 def __init__(self, env, num_constraints, model, obs_shape, act_space):
     self.name = "safe-layer"
     self._env = env
     self.num_constraints = num_constraints  # = num_landmarks - 1 because the last landmark is the target
     self.max_episode_length = 300
     self.batch_size = 1024
     self.lr = 0.1
     self.steps_per_epoch = 6000
     self.epochs = 250
     self.evaluation_steps = 1500
     self.replay_buffer_size = 1000000
     self.num_units = 10
     self._train_global_step = 0
     self.max_replay_buffer = self.batch_size * self.max_episode_length  # 1024 * 300 = 307200
     self.replay_buffer = ReplayBuffer(self.replay_buffer_size)  # 1e6
     obs_ph = U.BatchInput(obs_shape, name="observation").get()
     c_ph = [
         U.BatchInput([1], name="constraints_value" + str(_)).get()
         for _ in range(self.num_constraints)
     ]
     self.c_next_train, self.c_next_values, self.g_next_values = c_next(
         scope=self.name,
         make_obs_ph=obs_ph,
         act_space=act_space,
         c_ph=c_ph,
         num_constraints=self.num_constraints,
         c_next_func=model,
         optimizer=tf.train.AdamOptimizer(learning_rate=self.lr),
         grad_norm_clipping=0.5,
         num_units=self.num_units,
     )
Example #2
 def __init__(self, name, model, obs_shape_n, act_space_n, act_traj_shape_n, intent_shape, agent_index, args, local_q_func=False):
     self.name = name
     self.n = len(obs_shape_n)
     self.agent_index = agent_index
     self.args = args
     obs_ph_n = []
     act_traj_ph_n = []
     intent_ph_n = []
     for i in range(self.n):
         obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation"+str(i)).get())
         act_traj_ph_n.append(U.BatchInput(act_traj_shape_n[i], name="action_trajectory" + str(i)).get())
         intent_ph_n.append(U.BatchInput(intent_shape[i], name="intent" + str(i)).get())
     self.act_size = act_space_n[0].n
     self.get_intent, self.i_train, self.i_update, self.i_debug = i_train(
         scope=self.name,
         make_obs_ph_n=obs_ph_n,
         intent_ph_n=intent_ph_n,
         act_space_n=act_space_n,
         make_act_traj_ph_n=act_traj_ph_n,
         make_intent_ph_n=intent_ph_n,
         i_func=model,
         i_index=agent_index,
         output_size=(self.n - 1) * self.act_size,
         optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
         grad_norm_clipping=0.5,
         num_units=args.num_units,
         reuse=False
     )
     # Create all the functions necessary to train the model
     self.q_train, self.q_update, self.q_debug = q_train(
         scope=self.name,
         make_obs_ph_n=obs_ph_n,
         act_space_n=act_space_n,
         make_intent_ph_n=intent_ph_n,
         q_index=agent_index,
         q_func=model,
         optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
         grad_norm_clipping=0.5,
         local_q_func=local_q_func,
         num_units=args.num_units
     )
     self.act, self.p_train, self.p_update, self.p_debug = p_train(
         scope=self.name,
         make_obs_ph_n=obs_ph_n,
         act_space_n=act_space_n,
         make_intent_ph_n=intent_ph_n,
         p_index=agent_index,
         p_func=model,
         q_func=model,
         optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
         grad_norm_clipping=0.5,
         local_q_func=local_q_func,
         num_units=args.num_units
     )
     # Create experience buffer
     self.replay_buffer = ReplayBuffer(1e6)
     self.max_replay_buffer_len = args.batch_size * args.max_episode_len
     self.replay_sample_index = None
Example #3
 def __init__(self,
              name,
              model,
              CNN_model,
              obs_shape_n,
              obs_map_shape_n,
              act_space_n,
              agent_index,
              args,
              local_q_func=False):
     self.name = name
     self.n = len(obs_shape_n)
     self.agent_index = agent_index
     self.args = args
     obs_ph_n = []
     obs_map_ph_n = []
     for i in range(self.n):
         obs_ph_n.append(
             U.BatchInput(obs_shape_n[i],
                          name="observation" + str(i)).get())
         obs_map_ph_n.append(
             U.BatchInput(obs_map_shape_n[i],
                          name="observation_map" + str(i)).get())
     # Create all the functions necessary to train the model
     self.q_train, self.q_update, self.q_debug = q_train(
         scope=self.name,
         make_obs_ph_n=obs_ph_n,
         act_space_n=act_space_n,
         q_index=agent_index,
         q_func=model,
         shared_CNN=CNN_model,
         optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
         grad_norm_clipping=0.5,
         local_q_func=local_q_func,
         num_units=args.num_units,
         make_obs_map_ph_n=obs_map_ph_n)
     self.act, self.p_train, self.p_update, self.p_debug = p_train(
         scope=self.name,
         make_obs_ph_n=obs_ph_n,
         act_space_n=act_space_n,
         p_index=agent_index,
         p_func=model,
         q_func=model,
         shared_CNN=CNN_model,
         optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
         grad_norm_clipping=0.5,
         local_q_func=local_q_func,
         num_units=args.num_units,
         make_obs_map_ph_n=obs_map_ph_n)
     # Create experience buffer
     self.replay_buffer = ReplayBuffer(1e6)
     self.max_replay_buffer_len = args.batch_size * args.max_episode_len // 10
     self.batch_size = args.batch_size
     self.replay_sample_index = None
Example #4
    def __init__(self, n_agents, name, model, state_shape, obs_shape_n, act_space_n, agent_index, args, local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        state_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation"+str(i), lstm=args.actor_lstm or args.critic_lstm).get())
            state_ph_n.append(U.BatchInput(state_shape, name="state" + str(i)).get())

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            n_agents=n_agents,
            scope=self.name,
            make_state_ph_n=state_ph_n,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr, epsilon=args.optimizer_epsilon),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units,
            discrete_action=args.discrete_action,
            target_update_tau=args.target_update_tau,
            use_global_state=args.use_global_state,
            share_weights=args.share_weights
        )
        self.act, self.act_test, self.p_train, self.p_update, self.p_debug = p_train(
            n_agents=n_agents,
            scope=self.name,
            make_state_ph_n=state_ph_n,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr, epsilon=args.optimizer_epsilon),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units,
            discrete_action=args.discrete_action,
            target_update_tau=args.target_update_tau,
            use_global_state=args.use_global_state,
            share_weights=args.share_weights
        )

        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None
Example #5
    def __init__(self,
                 name,
                 actor_model,
                 critic_mlp_model,
                 obs_shape_n,
                 act_space_n,
                 args,
                 local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)
        self.args = args
        obs_ph_n = []
        messages_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(
                U.BatchInput(obs_shape_n[i],
                             name="observation_" + str(i)).get())
            messages_ph_n.append(
                U.BatchInput((args.dim_message, ),
                             name="message_" + str(i)).get())

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            make_meesages_ph_n=messages_ph_n,
            act_space_n=act_space_n,
            q_func=critic_mlp_model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units,
        )
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            make_meesages_ph_n=messages_ph_n,
            act_space_n=act_space_n,
            p_func=actor_model,
            q_func=critic_mlp_model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units,
            beta=args.beta,
        )
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        # self.max_replay_buffer_len = 50 * args.max_episode_len
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None
Example #6
    def __init__(self, name, p_model, model, obs_shape_n, act_space_n, agent_index, args, local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        pMA_model = p_model(args.num_adversaries, 1, agent_index)
        obs_ph_n = []
        memory_ph_in = []
        for i in range(self.n):
            obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation"+str(i)).get())
            if i < args.num_adversaries:
                memory_ph_in.append(U.BatchInput((args.memUnits, ), name="memory_state"+str(i)).get())

        if self.agent_index == 0:
            reuse = False
        else:
            reuse = True

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = qMA_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.critic_lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.critic_units,
            reuse=reuse
        )
        self.act, self.memory_out, self.p_train, self.p_update, self.p_debug = pMA_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            make_memory_ph_n=memory_ph_in,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=pMA_model.adv_model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.actor_lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            critic_units=args.critic_units,
            reuse=reuse
        )
        # Create experience buffer
        self.replay_buffer = MAReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None
Example #7
    def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        self.counter = 0

        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation"+str(i)+"_ag"+str(agent_index)).get())

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,#[lambda name: U.BatchInput(obs_shape, name=name) for obs_shape in obs_shape_n],
            act_space_n=act_space_n,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5
        )
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,#[lambda name: U.BatchInput(obs_shape, name=name) for obs_shape in obs_shape_n],
            act_space_n=act_space_n,
            p_index=0,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5
        )
        # Create experience buffer
        self.replay_buffer = [ReplayBuffer(1e6) for i in range(self.n)]
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
Example #8
 def __init__(self,
              name,
              model,
              obs_shape_n,
              act_space_n,
              agent_index,
              args,
              agent_type,
              local_q_func=False):
     self.name = name
     self.n = 1
     self.agent_index = agent_index
     self.args = args
     self.u_estimation = args.u_estimation
     self.constrained = args.constrained
     self.constraint_type = args.constraint_type
     self.agent_type = agent_type
     if self.agent_type == "good":
         cvar_alpha = args.cvar_alpha_good_agent
     elif self.agent_type == "adversary":
         cvar_alpha = args.cvar_alpha_adv_agent
     obs_ph_n = []
     obs_ph_n.append(
         U.BatchInput(obs_shape_n[agent_index], name="observation0").get())
     # Create all the functions necessary to train the model
     self.q_train, self.q_train2, self.q_train3, self.q_update, self.u_update, self.q_debug = q_train(
         scope=self.name,
         make_obs_ph_n=obs_ph_n,
         act_space_n=act_space_n,
         q_index=agent_index,
         q_func=model,
         u_func=model,
         optimizer=tf.train.AdamOptimizer(learning_rate=args.lr_critic),
         optimizer_lamda=tf.train.AdamOptimizer(
             learning_rate=args.lr_lamda),
         exp_var_alpha=args.exp_var_alpha,
         cvar_alpha=cvar_alpha,
         cvar_beta=args.cvar_beta,
         grad_norm_clipping=0.5,
         local_q_func=local_q_func,
         num_units=args.num_units,
         u_estimation=self.u_estimation,
         constrained=self.constrained,
         constraint_type=self.constraint_type,
         agent_type=self.agent_type)
     self.act, self.p_train, self.p_update, self.p_debug = p_train(
         scope=self.name,
         make_obs_ph_n=obs_ph_n,
         act_space_n=act_space_n,
         p_index=agent_index,
         p_func=model,
         q_func=model,
         optimizer=tf.train.AdamOptimizer(learning_rate=args.lr_actor),
         grad_norm_clipping=0.5,
         local_q_func=local_q_func,
         num_units=args.num_units)
     # Create experience buffer
     self.replay_buffer = ReplayBuffer(1e6)
     self.max_replay_buffer_len = args.batch_size * args.max_episode_len
     self.replay_sample_index = None
Example #9
    def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args, agent_type="good", local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation"+str(i)).get())
        
        if(agent_type == "good"):
            self.mic = float(args.good_mic)
        else:
            self.mic = float(args.adv_mic)
        
        print("MIC for ", agent_type, " agent is ", self.mic)
        self.agent_type = agent_type

        # make a multivariate for each agent. 

        self.multivariate_mean = None
        self.multivariate_cov = None 
        self.marginal_aprox_lr = 1e-2
        self.action_history = []

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), 
            mut_inf_coef=self.mic ,
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units
        )
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr), 
            mut_inf_coef=self.mic ,
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units
        )

        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None
Example #10
    def __init__(self,
                 name,
                 model,
                 obs_shape_n,
                 act_space_n,
                 agent_index,
                 args,
                 local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(
                U.BatchInput(obs_shape_n[i],
                             name="observation" + str(i)).get())

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units,
            # reuse = tf.compat.v1.AUTO_REUSE,
        )
        self.act, self.p_train, self.p_update, self.p_debug, num_actions = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units,
            # reuse = tf.compat.v1.AUTO_REUSE,
        )

        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6, args.batch_size, num_actions,
                                          obs_ph_n[0].shape[1])
        #self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.max_replay_buffer_len = args.batch_size  # I mean this is how it should be. This is what we're actually doing...

        self.replay_sample_index = None
Example #11
    def __init__(self, name, mlp_model, lstm_model, obs_shape_n, act_space_n, agent_index, args, local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation"+str(i)).get())

        # LSTM placeholders
        p_res = 7
        q_res = 1

        # set up initial states
        self.q_c, self.q_h = create_init_state(num_batches=1, len_sequence=args.num_units)
        self.p_c, self.p_h = create_init_state(num_batches=1, len_sequence=args.num_units)

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_LSTM_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=lstm_model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units
        )

        self.act, self.p_train, self.p_update, self.p_debug = p_LSTM_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=lstm_model,
            q_func=lstm_model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units,
            q_debug=self.q_debug
        )
        # Create experience buffer
        self.replay_buffer = ReplayBufferLSTM(1e6)
        # self.replay_buffer = PrioritizedReplayBuffer(10000, 0.45)
        # self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.max_replay_buffer_len = args.batch_size
        self.replay_sample_index = None

        # Information tracking
        self.tracker = InfoTracker(self.name, self.args)
Example #12
    def __init__(self,
                 name,
                 model,
                 obs_shape_n,
                 act_space_n,
                 agent_index,
                 args,
                 local_q_func=False,
                 u_estimation=False):
        print('in here')
        self.name = name
        self.n = 1  #len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        obs_ph_n.append(
            U.BatchInput(obs_shape_n[agent_index], name="observation0").get())
        self.u_estimation = u_estimation

        # Create all the functions necessary to train the model
        l = q_train(scope=self.name,
                    make_obs_ph_n=obs_ph_n,
                    act_space_n=act_space_n,
                    q_index=agent_index,
                    q_func=model,
                    u_func=model,
                    optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
                    grad_norm_clipping=0.5,
                    local_q_func=local_q_func,
                    num_units=args.num_units,
                    u_estimation=self.u_estimation)

        if self.u_estimation:
            self.q_train, self.q_update, self.u_update, self.q_debug = l
        else:
            self.q_train, self.q_update, self.q_debug = l

        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None
Example #13
    def __init__(self, name, model, obs_shape_n, act_space_n, agent_index,
                 args, local_q_func, policy_name, adversarial):
        self.name = name
        self.scope = self.name + "_" + policy_name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(
                U.BatchInput(obs_shape_n[i],
                             name="observation" + str(i)).get())

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.scope,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            adversarial=adversarial,
            adv_eps=args.adv_eps,
            adv_eps_s=args.adv_eps_s,
            num_adversaries=args.num_adversaries,
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.scope,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            adversarial=adversarial,
            adv_eps=args.adv_eps,
            adv_eps_s=args.adv_eps_s,
            num_adversaries=args.num_adversaries,
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None
        self.policy_name = policy_name
        self.adversarial = adversarial
        self.act_space_n = act_space_n
        self.local_q_func = local_q_func
Example #14
    def __init__(self, name, model_value, model_policy, obs_shape_n,
                 act_space_n, agent_index, args, hparams,
                 summary_writer=None, local_q_func=False, rngseed=None):
        self.name = name
        self.rngseed = rngseed
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        self.hparams = hparams
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(U.BatchInput(
                obs_shape_n[i], name="observation" + str(i)).get())

        # Create all the functions necessary to train the model

        # train critic
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model_value,
            optimizer=tf.train.AdamOptimizer(learning_rate=hparams['learning_rate']),
            grad_norm_clipping=hparams['grad_norm_clipping'],
            local_q_func=local_q_func,
            num_units=args.num_units
        )

        # train policy
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model_policy,
            q_func=model_value,
            optimizer=tf.train.AdamOptimizer(learning_rate=hparams['learning_rate']),
            grad_norm_clipping=hparams['grad_norm_clipping'],
            local_q_func=local_q_func,
            num_units=args.num_units
        )
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(hparams['replay_buffer_len'], self.rngseed)
        # Use a short buffer threshold only when 'test_saving' is enabled; otherwise batch_size * max_episode_len
        if hparams.get('test_saving'):
            self.max_replay_buffer_len = 100
        else:
            self.max_replay_buffer_len = hparams['batch_size'] * args.max_episode_len
        self.replay_sample_index = None
        self.summary_writer = summary_writer
Example #15
    def __init__(self, name, model, obs_shape_n, act_space_n, agent_index,
                 args):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args

        # create dummy tensor flow variables to avoid Saver error
        # TODO: remove this or turn into act function
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(
                U.BatchInput(obs_shape_n[i],
                             name="observation" + str(i)).get())
        with tf.variable_scope(self.name, reuse=None):
            self.dummy_var = U.function(obs_ph_n, outputs=tf.Variable(0))
Example #16
    def __init__(self,
                 name,
                 model,
                 obs_shape_n,
                 act_space_n,
                 agent_index,
                 args,
                 local_q_func=False):  # whether to train this agent with DDPG (local Q) instead of MADDPG
        self.name = name
        self.n = len(obs_shape_n)  # total number of agents
        self.agent_index = agent_index  # index of the current agent
        self.args = args  # training arguments passed in from the command line
        obs_ph_n = []
        for i in range(self.n):  # placeholders for batches of environment data: collect every agent's observations,
            # creating a batch placeholder sized according to each agent's observation shape
            obs_ph_n.append(
                U.BatchInput(obs_shape_n[i],
                             name="observation" + str(i)).get())

        # Create all the functions necessary to train the model
        # training op, target-network update op, and a dict of session-wrapped functions returning Q and target-Q values
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units
        )  # act function, policy training op, target-policy update op, and a dict exposing p values and target-policy actions
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None
Example #17
    def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args, local_q_func=False):
        self.name = name               # name of the agent
        self.n = len(obs_shape_n)      # number of agents
        self.agent_index = agent_index # Index of the specific agent
        self.args = args               # Settings of hyper-parameters
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation"+str(i)).get()) # Creates a placeholder for a batch of tensors of a given shape and dtype.

        # [Create all the functions necessary to train the model]
        # train:             U.function(inputs=obs_ph_n + act_ph_n + [target_ph], outputs=loss, updates=[optimize_expr])
        # update_target_q:   make_update_exp(q_func_vars, target_q_func_vars)
        # q_values:          U.function(obs_ph_n + act_ph_n, q)
        # target_q_values:   U.function(obs_ph_n + act_ph_n, target_q)
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,                                             # String: "agent_1" or "agent_2" or ...
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,                                     # action_space.
            q_index=agent_index,                                         # Index of the specific agent.
            q_func=model,                                                # Defined model.
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),     # Adam optimizer (adaptive moment estimation) with the configured learning rate
            grad_norm_clipping=0.5,                                      # Gradient norm clipping to prevent exploding gradients; norms above this value are clipped to it
            local_q_func=local_q_func,
            num_units=args.num_units                                     # Number of hidden units in each hidden layer
        )
        
        # act:                U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)
        # train:              U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr])
        # update_target_p:    make_update_exp(p_func_vars, target_p_func_vars)
        # p_values:           U.function([obs_ph_n[p_index]], p)
        # target_act:         U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample)
        self.act, self.p_train, self.p_update, self.p_debug = p_train(   
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units
        )
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None
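The callables documented in the comments above are plain `U.function` wrappers, so they can be called directly with NumPy arrays. A minimal usage sketch under assumed names (`trainer` for an instance built by this constructor, `obs` for one agent's observation vector, and `obs_n` / `act_n` for lists of batched observations and actions; none of these names appear in the example itself):

    # Hypothetical usage; follows the calling pattern of the original MADDPG codebase.
    action = trainer.act(obs[None])[0]                      # obs[None] adds the batch dimension expected by U.BatchInput
    q_vals = trainer.q_debug['q_values'](*(obs_n + act_n))  # centralized critic values for a batch of joint obs/actions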
Example #18
    def __init__(self,
                 name,
                 model,
                 obs_shape_n,
                 act_space_n,
                 agent_index,
                 args,
                 local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)  # 16
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(
                U.BatchInput(obs_shape_n[i],
                             name="observation" + str(i)).get())
            #obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation"+str(i), dtype=tf.uint8).get()) #should we specify uint8 instead of default float?

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,  # multi-layer perceptron
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,  # maddpg or ddpg
            num_units=args.num_units)
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None
Example #19
    def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args, local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation"+str(i)).get())


        self.act = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            num_units=args.num_units
        )
Example #20
    def __init__(self,
                 name,
                 model,
                 obs_shape_n,
                 act_space_n,
                 agent_index,
                 args,
                 local_q_func=False,
                 reuse=False):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(
                U.BatchInput(obs_shape_n[i],
                             name="observation" + str(i)).get())

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units,
            reuse=reuse)
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units,
            reuse=reuse,
            deterministic=args.benchmark and args.deterministic)
Example #21
    def __init__(self, name, critic_model, policy_model, obs_shape_n, act_space_n, agent_index, args, local_q_func=False):
        self.name = name
        self.n = 4
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation"+str(i)).get())

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope              =  self.name,
            make_obs_ph_n      =  obs_ph_n,
            act_space_n        =  act_space_n,
            q_index            =  agent_index,
            q_func             =  critic_model,
            optimizer          =  tf.train.AdamOptimizer(learning_rate=args['lr']),
            grad_norm_clipping =  0.5,
            local_q_func       =  local_q_func,
            num_units          =  args['num_units']
        )
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope              = self.name,
            make_obs_ph_n      = obs_ph_n,
            act_space_n        = act_space_n,
            p_index            = agent_index,
            p_func             = policy_model,
            q_func             = critic_model,
            optimizer          = tf.train.AdamOptimizer(learning_rate=args['lr']),
            grad_norm_clipping = 0.5,
            local_q_func       = local_q_func,
            num_units          = args['num_units']
        )

        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args['batch_size'] * args['max_episode_len']
        self.replay_sample_index = None
Example #22
    def __init__(self, name, model, obs_shape_n, act_space_n, agent_index, args, replay_buffer, local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)

        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())

        # Create all the functions necessary to train the model
        self.critic = Critic(name, model, obs_ph_n, act_space_n, agent_index, args, local_q_func)
        self.actor = Actor(name, model, obs_ph_n, act_space_n, agent_index, args, local_q_func)
        # Because of the TensorFlow variable scopes created by p_train and q_train, q_train must be built before p_train

        # Create experience buffer
        self.replay_buffer = replay_buffer  # ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None
Example #23
    def __init__(self,
                 name,
                 model,
                 obs_shape_n,
                 act_space_n,
                 agent_index,
                 args,
                 local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)

        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(
                U.BatchInput(obs_shape_n[i],
                             name="observation" + str(i)).get())
        # Create all the functions necessary to train the model
        self.critic = Critic(name, model, obs_ph_n, act_space_n, agent_index,
                             args, local_q_func)
        self.actor = Actor(name, model, obs_ph_n, act_space_n, agent_index,
                           args, local_q_func)
Example #24
    def __init__(self, name, model, state_shape, act_space_n, agent_index, args, local_q_func=False):
        self.name = name
        self.n = 1
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        obs_ph_n.append(U.BatchInput(state_shape, name="observation"+str(0)).get())

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units
        )
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units
        )
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.min_buffer_size = args.min_buffer_size
        self.replay_sample_index = None
Example #25
    def __init__(self,
                 name,
                 model,
                 obs_shape_n,
                 act_space_n,
                 agent_index,
                 args,
                 role="",
                 local_q_func=False):
        """
        Args:
            name (str): Name of the agent
            model (function): MLP Neural Network model for the agent.
            obs_shape_n (list): Observation shapes for all agents
            act_space_n (list): A list of the action spaces for all agents
            agent_index (int): Agent index number
            args (argparse.Namespace): Parsed commandline arguments object
            role (str): Role of the agent, e.g. adversary
            local_q_func (boolean): Flag for using local q function
        """
        super(MADDPGAgentTrainerCCM, self).__init__()

        self.name = name
        self.role = role
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        obs_ph_n = []
        act_history_ph_n = []
        obs_history_ph_n = []

        hist = self.args.training_history

        obs_history_n = [(hist * x[0], ) for x in obs_shape_n]
        act_history_n = [(hist * act.n, ) for act in act_space_n]

        # act_history_n = [Discrete(act.n*(3-1)) for act in act_space_n]
        #        for act_space in act_space_n:
        #            act_space.n = act_space.n*3
        #        if act_history_n[0].n != 15:
        #            print("Line 158")

        for i in range(self.n):
            obs_ph_n.append(
                tf_util.BatchInput(obs_shape_n[i],
                                   name="observation" + str(i)).get())
            obs_history_ph_n.append(
                tf_util.BatchInput(obs_history_n[i],
                                   name="observationhistory" + str(i)).get())
            act_history_ph_n.append(
                tf_util.BatchInput(act_history_n[i],
                                   name="actionhistory" + str(i)).get())

        # obs_ph_n = [tf.concat(3*[x],1,name="observation{}".format(i)) for i,x in enumerate(obs_ph_n)]

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            make_obs_history_n=obs_history_ph_n,
            make_act_history_n=act_history_ph_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            make_obs_history_n=obs_history_ph_n,
            make_act_history_n=act_history_ph_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = 4 * args.batch_size * args.max_episode_len
        self.replay_sample_index = None
Example #26
    def __init__(self,
                 name,
                 model,
                 obs_shape_n,
                 act_space_n,
                 agent_index,
                 args,
                 actor_lr=None,
                 critic_lr=None,
                 gamma=None,
                 num_units=None,
                 rb_size=None,
                 batch_size=None,
                 max_episode_len=None,
                 clip_norm=0.5,
                 local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args

        # training parameters
        self.actor_lr = actor_lr if actor_lr else args.lr
        self.critic_lr = critic_lr if critic_lr else args.lr
        self.gamma = gamma if gamma else args.gamma
        self.num_units = num_units if num_units else args.num_units
        self.rb_size = rb_size if rb_size else args.rb_size
        self.batch_size = batch_size if batch_size else args.batch_size
        self.max_episode_len = max_episode_len if max_episode_len else args.max_episode_len
        self.clip_norm = clip_norm

        # TODO: remove after testing
        import models.config as Config
        assert actor_lr == Config.maddpg_train_args['actor_lr']
        assert critic_lr == Config.maddpg_train_args['critic_lr']
        assert gamma == Config.maddpg_train_args['gamma']
        assert num_units == Config.maddpg_train_args['num_hidden']
        assert rb_size == Config.maddpg_train_args['rb_size']
        assert batch_size == Config.maddpg_train_args['batch_size']
        assert max_episode_len == Config.maddpg_train_args['nb_rollout_steps']
        assert clip_norm == Config.maddpg_train_args['clip_norm']

        obs_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(
                U.BatchInput(obs_shape_n[i],
                             name="observation" + str(i)).get())

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=self.critic_lr),
            grad_norm_clipping=self.clip_norm,
            local_q_func=local_q_func,
            num_units=self.num_units)
        self.act, self.p_train, self.p_update, self.p_debug = p_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            p_index=agent_index,
            p_func=model,
            q_func=model,
            optimizer=tf.train.AdamOptimizer(learning_rate=self.actor_lr),
            grad_norm_clipping=self.clip_norm,
            local_q_func=local_q_func,
            num_units=self.num_units)
        # Create experience buffer
        self.replay_buffer = ReplayBuffer(self.rb_size)
        self.max_replay_buffer_len = self.batch_size * self.max_episode_len
        self.replay_sample_index = None
        self.loss_names = [
            'q_loss', 'p_loss', 'mean_target_q', 'mean_rew',
            'mean_target_q_next', 'std_target_q'
        ]
Example #27
    def __init__(self,
                 name,
                 p_model,
                 q_model,
                 obs_shape_n,
                 act_space_n,
                 num_adversaries,
                 args,
                 local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)
        self.args = args
        self.neighbor_n = 2
        self.num_adversaries = num_adversaries
        adj_n = []
        obs_ph_n = []
        agent_n = []
        for i in range(self.n):
            obs_ph_n.append(
                U.BatchInput(obs_shape_n[i],
                             name="observation" + str(i)).get())
            adj_n.append(
                U.BatchInput([
                    self.neighbor_n,
                    num_adversaries if i < num_adversaries else
                    (self.n - num_adversaries)
                ],
                             name="adjacency" + str(i)).get())

        # Create all the functions necessary to train the model
        self.q_train, self.q_update, self.q_values, self.target_q_values = q_train(
            name=self.name,
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            adj_n=adj_n,
            act_space_n=act_space_n,
            num_adversaries=num_adversaries,
            neighbor_n=self.neighbor_n,
            q_func=q_model,
            agent_n=self.n,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)

        self.act, self.p_train, self.p_update, self.p_values, self.target_act = p_train(
            name=self.name,
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            adj_n=adj_n,
            act_space_n=act_space_n,
            neighbor_n=self.neighbor_n,
            p_index=agent_n,
            p_func=p_model,
            q_func=q_model,
            num_adversaries=self.num_adversaries,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units,
        )

        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None
Example #28
def p_train(name,
            make_obs_ph_n,
            adj_n,
            act_space_n,
            neighbor_n,
            p_index,
            p_func,
            q_func,
            num_adversaries,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            num_units=128,
            scope="trainer",
            reuse=None):
    with tf.variable_scope(scope, reuse=reuse):
        # create action distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]
        agent_n = len(obs_ph_n)
        vec_n = U.BatchInput([1, neighbor_n], name="vec").get()

        p_input1 = obs_ph_n[
            0:num_adversaries] if name == "adversaries" else obs_ph_n[
                num_adversaries:agent_n]
        p_input2 = adj_n[0:num_adversaries] if name == "adversaries" else adj_n[
            num_adversaries:agent_n]
        p_input3 = vec_n

        # call for actor network
        # NOTE: the act_space handling here may not be correct
        p = p_func(p_input1,
                   p_input2,
                   p_input3,
                   neighbor_n,
                   num_adversaries if name == "adversaries" else
                   (agent_n - num_adversaries),
                   5,
                   scope="p_func",
                   num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # wrap parameters in distribution
        act_pd = []
        act_sample = []
        for i in range(0, num_adversaries) if name == "adversaries" else range(
                num_adversaries, agent_n):
            act_pd_temp = act_pdtype_n[i].pdfromflat(
                p[i - (0 if name == "adversaries" else num_adversaries)])
            act_pd.append(act_pd_temp)
            act_sample.append(act_pd_temp.sample())

        temp = []
        for i in range(len(act_pd)):
            temp.append(act_pd[i].flatparam())

        # TODO: verify that this regularization method is correct
        p_reg = tf.reduce_mean(tf.square(temp))

        act_input_n = act_ph_n + []
        for i in range(0, num_adversaries) if name == "adversaries" else range(
                num_adversaries, agent_n):
            act_input_n[i] = act_sample[
                i - (0 if name == "adversaries" else num_adversaries)]

        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        q = []
        q_reduce_mean = []
        for a in range(0, num_adversaries) if name == "adversaries" else range(
                num_adversaries, agent_n):
            index = a if name == "adversaries" else a - num_adversaries
            temp = q_func(q_input,
                          1,
                          scope="q_func_%d" % index,
                          reuse=True,
                          num_units=num_units)[:, 0]
            q.append(temp)
            q_reduce_mean += temp
        pg_loss = -tf.reduce_mean(q)

        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars,
                                            grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n + adj_n + [vec_n],
                           outputs=loss,
                           updates=[optimize_expr])
        act = U.function(inputs=p_input1 +
                         (adj_n[0:num_adversaries] if name == "adversaries"
                          else adj_n[num_adversaries:agent_n]) + [p_input3],
                         outputs=act_sample,
                         list_output=True)
        p_values = U.function(
            p_input1 + (adj_n[0:num_adversaries] if name == "adversaries" else
                        adj_n[num_adversaries:agent_n]) + [p_input3],
            p,
            list_output=True)

        # target network
        target_p = p_func(p_input1,
                          p_input2,
                          p_input3,
                          neighbor_n,
                          num_adversaries if name == "adversaries" else
                          (agent_n - num_adversaries),
                          5,
                          scope="target_p_func",
                          num_units=num_units)
        target_p_func_vars = U.scope_vars(
            U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars,
                                          target_p_func_vars,
                                          central=True)

        target_act_sample = []
        for i in range(0, num_adversaries) if name == "adversaries" else range(
                num_adversaries, agent_n):
            target_act_sample.append(act_pdtype_n[i].pdfromflat(target_p[i - (
                0 if name == "adversaries" else num_adversaries)]).sample())
        target_act = U.function(
            inputs=p_input1 +
            (adj_n[0:num_adversaries] if name == "adversaries" else
             adj_n[num_adversaries:agent_n]) + [p_input3],
            outputs=target_act_sample,
            list_output=True)

        return act, train, update_target_p, p_values, target_act
Example #29
def create_obs_ph_n(n_agents, obs_shape_n):
    obs_ph_n = []
    for i in range(n_agents):
        obs_ph_n.append(
            U.BatchInput(obs_shape_n[i], name="observation" + str(i)).get())
    return obs_ph_n
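A quick usage sketch for this helper (the shapes below are illustrative, and `U` is assumed to be the same tf_util module used in the other examples):

    # Hypothetical 3-agent setup; one batched observation placeholder per agent.
    obs_shape_n = [(16,), (16,), (14,)]
    obs_ph_n = create_obs_ph_n(3, obs_shape_n)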
Example #30
    def __init__(self,
                 name,
                 p_policy,
                 p_predict,
                 q_model,
                 obs_shape_n,
                 act_space_n,
                 state_shape_n,
                 agent_index,
                 args,
                 local_q_func=False):
        self.name = name
        self.n = len(obs_shape_n)
        self.agent_index = agent_index
        self.args = args
        self.obs_shape = obs_shape_n[agent_index]
        self.state_shape = state_shape_n[agent_index]
        self.p_predict = p_predict
        obs_ph_n = []
        obs_next_n = []
        obs_pred_n = []
        state_ph_n = []
        for i in range(self.n):
            obs_ph_n.append(
                U.BatchInput(obs_shape_n[i],
                             name="observation" + str(i)).get())
            obs_next_n.append(
                U.BatchInput(obs_shape_n[i], name="next_obs" + str(i)).get())
            obs_pred_n.append(
                U.BatchInput(obs_shape_n[i], name="pred_obs" + str(i)).get())
            state_ph_n.append(
                U.BatchInput(state_shape_n[i], name="state" + str(i)).get())

        # Create all the functions necessary to train the critic net
        # q_train is used to optimize the Q net according to the loss on this batch
        # q_update is used to update the parameters of the target net: θ'i = τθi + (1 − τ)θ'i

        self.q_train, self.q_update, self.q_debug = q_train(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            act_space_n=act_space_n,
            q_index=agent_index,
            q_func=q_model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units)

        # step returns the action and new state given the obs and state
        # p_train is used to optimize the policy net
        # p_update is used to update the target policy net as θ'i = τθi + (1 − τ)θ'i
        self.step, self.predict, self.p_train, self.p_update, self.p_debug = p_train_recurrent(
            scope=self.name,
            make_obs_ph_n=obs_ph_n,
            make_state_ph_n=state_ph_n,
            act_space_n=act_space_n,
            make_obs_next_n=obs_next_n,
            make_obs_pred_n=obs_pred_n,
            p_index=agent_index,
            p_policy=p_policy,
            p_predict=p_predict,
            q_func=q_model,
            optimizer=tf.train.AdamOptimizer(learning_rate=args.lr),
            grad_norm_clipping=0.5,
            local_q_func=local_q_func,
            num_units=args.num_units,
            reuse=tf.AUTO_REUSE)

        # Create experience buffer
        self.replay_buffer = ReplayBuffer(1e6)
        self.max_replay_buffer_len = args.batch_size * args.max_episode_len
        self.replay_sample_index = None
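Across these examples the constructor signature follows the same basic pattern, so trainers are usually built in a loop, one per agent. A hedged sketch of that loop, assuming the `MADDPGAgentTrainer` class name and an `mlp_model` network function as in the original MADDPG codebase (`env` is a multi-agent environment and `arglist` holds the parsed command-line arguments; adapt the names to whichever variant above you are using):

    # Assumed names, shown only to illustrate how such a constructor is typically invoked.
    obs_shape_n = [env.observation_space[i].shape for i in range(env.n)]
    trainers = [
        MADDPGAgentTrainer("agent_%d" % i, mlp_model, obs_shape_n,
                           env.action_space, i, arglist,
                           local_q_func=(arglist.good_policy == 'ddpg'))
        for i in range(env.n)
    ]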