def p_train(make_obs_ph_n,
            act_space_n,
            p_index,
            p_func,
            q_func,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            num_units=64,
            scope="trainer",
            reuse=None):
    """Build the policy-training graph for the agent at ``p_index``.

    The policy network output is wrapped in the agent's action distribution,
    a sample from it replaces that agent's action placeholder, and the result
    is fed to the critic (opened with ``reuse=True``), so the policy loss is
    ``-mean(Q)`` plus a small penalty on the squared policy outputs.

    Args:
        make_obs_ph_n: list of observation placeholders, one per agent.
        act_space_n: list of action spaces, one per agent.
        p_index: index of the agent whose policy is trained.
        p_func: network builder called as
            ``p_func(input, num_outputs, scope=..., num_units=...)``.
        q_func: critic network builder, called here with ``reuse=True``.
        optimizer: optimizer handed to ``U.minimize_and_clip``.
        grad_norm_clipping: clipping value handed to ``U.minimize_and_clip``.
        local_q_func: if true, the critic only receives this agent's own
            observation and action.
        num_units: forwarded to ``p_func`` and ``q_func``.
        scope: variable scope wrapping everything built here.
        reuse: ``reuse`` flag for that variable scope.

    Returns:
        ``(act, train, update_target_p, debug)`` where ``debug`` is a dict
        holding the ``'p_values'`` and ``'target_act'`` callables.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # One action-distribution type per agent, derived from its space.
        pdtypes = [make_pdtype(space) for space in act_space_n]

        # Placeholders: observations are supplied by the caller, actions are
        # created here ([None] leaves the batch dimension open).
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            pdtype.sample_placeholder([None], name="action" + str(idx))
            for idx, pdtype in enumerate(pdtypes)
        ]

        own_pdtype = pdtypes[p_index]
        # The actor only sees its own observation.
        own_obs = obs_ph_n[p_index]

        # Actor network: one output per distribution parameter.
        p = p_func(own_obs,
                   int(own_pdtype.param_shape()[0]),
                   scope="p_func",
                   num_units=num_units)
        policy_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # Wrap the raw network output in the action distribution.
        act_pd = own_pdtype.pdfromflat(p)
        act_sample = act_pd.sample()

        # Penalty on the magnitude of the flat distribution parameters.
        flat_params = act_pd.flatparam()
        p_reg = tf.reduce_mean(tf.square(flat_params))

        # Copy the action placeholders and substitute only this agent's slot
        # with a sample from its own policy; this is what connects the policy
        # network to the critic.
        act_input_n = list(act_ph_n)
        act_input_n[p_index] = act_pd.sample()

        # Critic input: every observation followed by every action.
        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        if local_q_func:
            q_input = tf.concat([own_obs, act_input_n[p_index]], 1)

        # reuse=True: the "q_func" variables must already exist under this
        # scope; no new critic is created here.
        q_out = q_func(q_input,
                       1,
                       scope="q_func",
                       reuse=True,
                       num_units=num_units)
        q = q_out[:, 0]

        # Maximising mean Q is expressed as minimising its negation.
        pg_loss = -tf.reduce_mean(q)
        loss = pg_loss + p_reg * 1e-3

        # Only the policy variables are updated by this optimisation step.
        optimize_expr = U.minimize_and_clip(optimizer, loss, policy_vars,
                                            grad_norm_clipping)

        # Callable wrappers around the graph.
        train = U.function(inputs=obs_ph_n + act_ph_n,
                           outputs=loss,
                           updates=[optimize_expr])
        act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)
        p_values = U.function([obs_ph_n[p_index]], p)

        # Target policy network and the op that updates it from the policy.
        target_p = p_func(own_obs,
                          int(own_pdtype.param_shape()[0]),
                          scope="target_p_func",
                          num_units=num_units)
        target_policy_vars = U.scope_vars(
            U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(policy_vars, target_policy_vars)

        target_pd = own_pdtype.pdfromflat(target_p)
        target_act_sample = target_pd.sample()
        target_act = U.function(inputs=[obs_ph_n[p_index]],
                                outputs=target_act_sample)

        debug = {'p_values': p_values, 'target_act': target_act}
        return act, train, update_target_p, debug
Пример #2
0
def p_train(env,
            make_obs_ph_n,
            act_space_n,
            p_index,
            vf_func,
            shana,
            q_func,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            num_units=64,
            scope="trainer",
            reuse=None):
    """Build policy and value-function training graphs for agent ``p_index``.

    The policy object is created by calling ``shana(...)``; its sampled action
    replaces this agent's action placeholder in the critic input. A state
    value network is opened under the scope ``"vf_func"`` with ``reuse=True``.

    Args:
        env: passed to ``shana`` as ``env_spec``.
        make_obs_ph_n: list of observation placeholders, one per agent.
        act_space_n: list of action spaces, one per agent.
        p_index: index of the agent being trained.
        vf_func: accepted for interface compatibility; not referenced here
            (the value networks are built with ``q_func``).
        shana: policy class/factory; must provide ``actions_for``,
            ``get_params_internal``, ``get_actions`` and ``name``.
        q_func: network builder used for the critic and the value networks.
        optimizer: optimizer handed to ``U.minimize_and_clip``.
        grad_norm_clipping: clipping value handed to ``U.minimize_and_clip``.
        local_q_func: if true, the critic only receives this agent's own
            observation and action.
        num_units: forwarded to ``q_func``.
        scope: variable scope wrapping everything built here.
        reuse: ``reuse`` flag for that variable scope.

    Returns:
        ``(policy.get_actions, train, misaka, update_target_p, upvf, debug)``
        where ``train`` updates the policy, ``misaka`` updates the value
        function, ``update_target_p`` / ``upvf`` are the target-update ops
        built by ``make_update_exp`` and ``debug`` holds ``'target_act'``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # create distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]

        # NOTE(review): af / of / K / hidden sizes are hard-coded; presumably
        # they should be derived from the spaces -- confirm with the caller.
        policy = shana(env_spec=env,
                       af=15,
                       of=22,
                       K=2,
                       hidden_layer_sizes=(128, 128),
                       qf=q_func,
                       reg=0.001)
        act, log_pi = policy.actions_for(observations=obs_ph_n[p_index],
                                         with_log_pis=True)

        # Shallow copy so the placeholder list itself is left untouched.
        act_input_n = list(act_ph_n)
        act_input_n[p_index] = act

        p_func_vars = policy.get_params_internal()
        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        vf_input = tf.concat(obs_ph_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)

        # Both networks are opened with reuse=True, so their variables must
        # already exist under this scope.
        q = q_func(q_input, 1, scope="q_func", reuse=True,
                   num_units=num_units)[:, 0]
        vf = q_func(vf_input,
                    1,
                    scope="vf_func",
                    reuse=True,
                    num_units=num_units)[:, 0]
        vf_func_vars = U.scope_vars(U.absolute_scope_name("vf_func"))

        pg_loss = tf.reduce_mean(log_pi * tf.stop_gradient(log_pi - q + vf))

        # tf.get_collection returns a Python list. Adding that list directly
        # to pg_loss would stack it into a vector (or an empty tensor when
        # there are no regularizers), so reduce it to a scalar first.
        reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES,
                                       scope=policy.name)
        p_reg = tf.add_n(reg_losses) if reg_losses else 0.0
        loss = pg_loss + p_reg

        vf_loss = 0.5 * tf.reduce_mean((vf - tf.stop_gradient(q - log_pi))**2)

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars,
                                            grad_norm_clipping)
        mikoto = U.minimize_and_clip(optimizer, vf_loss, vf_func_vars,
                                     grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n,
                           outputs=loss,
                           updates=[optimize_expr])
        # The value-function step reports the loss it actually minimises.
        misaka = U.function(inputs=obs_ph_n + act_ph_n,
                            outputs=vf_loss,
                            updates=[mikoto])

        # target networks
        target_p = shana(env_spec=env,
                         af=15,
                         of=22,
                         K=2,
                         hidden_layer_sizes=(128, 128),
                         qf=q_func,
                         reg=0.001,
                         name='target_policy')
        target_p_func_vars = target_p.get_params_internal()

        # Called for its side effect of creating the "target_vf_func"
        # variables; the output tensor itself is not used here.
        q_func(vf_input, 1, scope="target_vf_func", num_units=num_units)
        target_vf_func_vars = U.scope_vars(
            U.absolute_scope_name("target_vf_func"))

        target_act_r, _ = target_p.actions_for(
            observations=obs_ph_n[p_index], with_log_pis=True)
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)
        upvf = make_update_exp(vf_func_vars, target_vf_func_vars)
        target_act = U.function(inputs=[obs_ph_n[p_index]],
                                outputs=target_act_r)

        return policy.get_actions, train, misaka, update_target_p, upvf, {
            'target_act': target_act
        }
Пример #3
0
def p_train(make_obs_ph_n,
            act_space_n,
            p_index,
            p_func,
            q_func,
            optimizer,
            num_outputs,
            grad_norm_clipping=None,
            local_q_func=False,
            num_units=64,
            scope="coma_trainer",
            reuse=None):
    """Build the COMA-style policy-training graph for agent ``p_index``.

    The actor is built under the scope ``"coma_p_func"`` and the critic is
    opened with ``reuse=True`` under ``"coma_q_func"``. The loss is the
    negated mean of (Q of the picked action minus the value returned by
    ``baseline_calculation``) plus a small penalty on the squared policy
    outputs.

    Args:
        make_obs_ph_n: list of observation placeholders, one per agent.
        act_space_n: list of action spaces, one per agent.
        p_index: index of the agent whose policy is trained.
        p_func: network builder for the actor.
        q_func: network builder for the critic (called with ``reuse=True``).
        optimizer: optimizer handed to ``U.minimize_and_clip``.
        num_outputs: number of critic outputs requested from ``q_func``.
        grad_norm_clipping: clipping value handed to ``U.minimize_and_clip``.
        local_q_func: if true, the critic only receives this agent's own
            observation and action.
        num_units: forwarded to ``p_func`` and ``q_func``.
        scope: variable scope wrapping everything built here.
        reuse: ``reuse`` flag for that variable scope.

    Returns:
        ``(act, train, update_target_p, debug)`` where ``debug`` holds the
        ``'p_values'`` and ``'target_act'`` callables.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # create distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders; actions are integer ids in this variant
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            tf.placeholder(tf.int32, [None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]

        # The actor only receives the local observation.
        p_input = obs_ph_n[p_index]

        p = p_func(p_input,
                   int(act_pdtype_n[p_index].param_shape()[0]),
                   scope="coma_p_func",
                   num_units=num_units)
        # The lookup name must match the scope the network was built in;
        # looking up "p_func" here would return no variables at all.
        p_func_vars = U.scope_vars(U.absolute_scope_name("coma_p_func"))

        # wrap parameters in distribution
        act_pd = act_pdtype_n[p_index].pdfromflat(p)

        # Per-action values sampled from the distribution.
        act_sample = act_pd.sample()
        # COMA needs one concrete action, hence the arg-max over each sample.
        # NOTE(review): this iterates `act_sample` in Python and calls
        # `.tolist()` on each row; that presumably fails for a symbolic
        # tensor with an open batch dimension -- looks like `tf.argmax` was
        # intended. Confirm before relying on this path.
        act_picked = [act.tolist().index(max(act)) for act in act_sample]

        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        # `+ []` makes a shallow copy so the placeholder list is not mutated.
        act_input_n = act_ph_n + []
        # Replace this agent's action with the one picked by its policy.
        act_input_n[p_index] = act_picked
        # NOTE(review): the action placeholders are rank-1 int32 while this
        # concatenates along axis 1 -- verify shapes/dtypes against q_func.
        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)
        q = q_func(q_input,
                   num_outputs,
                   scope="coma_q_func",
                   reuse=True,
                   num_units=num_units)

        # Counterfactual baseline, one value per batch row.
        baseline = [
            baseline_calculation(act_distribute, q_list)
            for act_distribute, q_list in zip(act_sample, q)
        ]
        # Q value of the action that was actually picked.
        actual_picked_q = [q_list[act] for act, q_list in zip(act_picked, q)]
        # Difference between the picked action's Q and the baseline.
        a = [q_val - b for q_val, b in zip(actual_picked_q, baseline)]

        pg_loss = -tf.reduce_mean(a)

        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars,
                                            grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n,
                           outputs=loss,
                           updates=[optimize_expr])
        act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)
        p_values = U.function([obs_ph_n[p_index]], p)

        # target network
        target_p = p_func(p_input,
                          int(act_pdtype_n[p_index].param_shape()[0]),
                          scope="coma_target_p_func",
                          num_units=num_units)
        # Same scope-name rule as for the actor variables above.
        target_p_func_vars = U.scope_vars(
            U.absolute_scope_name("coma_target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample()
        target_act = U.function(inputs=[obs_ph_n[p_index]],
                                outputs=target_act_sample)

        return act, train, update_target_p, {
            'p_values': p_values,
            'target_act': target_act
        }
Пример #4
0
def c_next(make_obs_ph,
           act_space,
           c_ph,
           c_next_func,
           num_constraints,
           optimizer,
           grad_norm_clipping,
           num_units=64,
           reuse=False,
           scope="c_next"):
    """Build one next-step constraint model per constraint.

    For constraint ``k`` a network ``g_k`` is built from the observation and
    the prediction is ``sum(c_ph[k] + g_k * action, axis=-1)``; each model is
    fitted to its own target placeholder with a mean-squared-error loss.

    Args:
        make_obs_ph: observation placeholder(s), passed to ``tf.concat``.
        act_space: sequence of action spaces; only ``act_space[0]`` is used.
        c_ph: indexable collection with one current-constraint placeholder
            per constraint.
        c_next_func: network builder called as
            ``c_next_func(input, num_outputs, scope=..., num_units=...)``.
        num_constraints: number of constraint models to build.
        optimizer: optimizer handed to ``U.minimize_and_clip``.
        grad_norm_clipping: clipping value handed to ``U.minimize_and_clip``.
        num_units: forwarded to ``c_next_func``.
        reuse: ``reuse`` flag for the variable scope.
        scope: variable scope wrapping everything built here.

    Returns:
        ``(train, c_next_values, g_next_values)``: three lists with one
        callable per constraint.
    """
    with tf.variable_scope(scope, reuse=reuse):
        constraint_ids = range(num_constraints)

        # set up placeholders
        act_pdtype = make_pdtype(act_space[0])
        obs_ph = make_obs_ph
        act_ph = act_pdtype.sample_placeholder([None], name="action")
        c_next_target_ph = [
            tf.placeholder(tf.float32, [None, 1], name="target" + str(k))
            for k in constraint_ids
        ]

        c_next_input = tf.concat(obs_ph, 1)

        # One network per constraint, each in its own sub-scope. The output
        # width is half the distribution's parameter count.
        gs_ = [
            c_next_func(c_next_input,
                        int((act_pdtype.param_shape()[0]) / 2),
                        scope="c_next_func" + str(k),
                        num_units=num_units) for k in constraint_ids
        ]

        # Predicted next constraint value (marked "to be testified" by the
        # original author).
        c_ = [
            tf.reduce_sum(c_ph[k] + tf.multiply(gs_[k], act_ph), -1)
            for k in constraint_ids
        ]

        c_next_vars = [
            U.scope_vars(U.absolute_scope_name("c_next_func" + str(k)))
            for k in constraint_ids
        ]

        # NOTE(review): c_[k] has its last axis summed away while the target
        # placeholder is [None, 1]; depending on the rank of c_ph[k] this
        # subtraction may broadcast to [None, None] -- confirm shapes.
        diff = [c_[k] - c_next_target_ph[k] for k in constraint_ids]
        c_next_loss = [tf.reduce_mean(tf.square(d)) for d in diff]

        optimize_expr = [
            U.minimize_and_clip(optimizer, c_next_loss[k], c_next_vars[k],
                                grad_norm_clipping) for k in constraint_ids
        ]

        # Create callable functions. The original list had no `for` clause,
        # so it contained one function bound to the last constraint only;
        # build one training function per constraint instead.
        train = [
            U.function(inputs=[obs_ph] + [act_ph] + [c_ph[k]] +
                       [c_next_target_ph[k]],
                       outputs=c_next_loss[k],
                       updates=[optimize_expr[k]]) for k in constraint_ids
        ]
        c_next_values = [
            U.function([obs_ph] + [act_ph] + [c_ph[k]], c_[k])
            for k in constraint_ids
        ]
        g_next_values = [
            U.function([obs_ph], gs_[k]) for k in constraint_ids
        ]
        return train, c_next_values, g_next_values
Пример #5
0
def p_train_adv(make_obs_ph_n,
                act_space_n,
                p_index,
                p_func,
                q_func,
                optimizer,
                grad_norm_clipping=None,
                local_q_func=False,
                num_units=64,
                scope="trainer",
                reuse=None):
    """Build a policy-training graph whose loss uses a per-action value sum.

    In addition to the critic value ``q`` of the sampled action, a value
    ``v`` is accumulated by evaluating the critic once for every one-hot
    action of this agent and weighting each result by the matching column of
    ``act_pd.logits``. The policy loss is ``-mean(v - q)`` plus a small
    penalty on the squared policy outputs.

    Args:
        make_obs_ph_n: list of observation placeholders, one per agent.
        act_space_n: list of action spaces; ``act_space_n[p_index].n`` gives
            the number of one-hot actions enumerated.
        p_index: index of the agent whose policy is trained.
        p_func: network builder for the actor.
        q_func: network builder for the critic (called with ``reuse=True``).
        optimizer: optimizer handed to ``U.minimize_and_clip``.
        grad_norm_clipping: clipping value handed to ``U.minimize_and_clip``.
        local_q_func: if true, the critic only receives this agent's own
            observation and action.
        num_units: forwarded to ``p_func`` and ``q_func``.
        scope: variable scope wrapping everything built here.
        reuse: ``reuse`` flag for that variable scope.

    Returns:
        ``(act, train, update_target_p, debug)`` where ``debug`` holds the
        ``'p_values'`` and ``'target_act'`` callables.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # create distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]

        p_input = obs_ph_n[p_index]

        p = p_func(p_input,
                   int(act_pdtype_n[p_index].param_shape()[0]),
                   scope="p_func",
                   num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # wrap parameters in distribution
        act_pd = act_pdtype_n[p_index].pdfromflat(p)

        act_sample = act_pd.sample()
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        # Shallow copy, then substitute this agent's action with a sample
        # from its own policy.
        act_input_n = list(act_ph_n)
        sample = act_pd.sample()
        act_input_n[p_index] = sample

        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)
        q = q_func(q_input, 1, scope="q_func", reuse=True,
                   num_units=num_units)[:, 0]

        # Value accumulator: one zero per batch row.
        batch_size = tf.shape(sample)[0]
        v = tf.tile([0.0], [batch_size])
        n_actions = act_space_n[p_index].n
        for i in range(n_actions):
            # One-hot row for action i, repeated once per batch row.
            one_hot_row = np.zeros((1, n_actions), dtype=np.float32)
            one_hot_row[0, i] = 1
            action_i = tf.tile(tf.convert_to_tensor(one_hot_row),
                               [tf.shape(sample)[0], 1])

            act_input = list(act_ph_n)
            act_input[p_index] = action_i
            q_input_tmp = tf.concat(obs_ph_n + act_input, 1)
            if local_q_func:
                # Use the enumerated action i here, not the sampled action;
                # otherwise every iteration evaluates the same critic input.
                q_input_tmp = tf.concat(
                    [obs_ph_n[p_index], act_input[p_index]], 1)

            # NOTE(review): the original comment calls this the probability
            # of action i, but `logits` are presumably unnormalised scores --
            # confirm whether a softmax is intended.
            p_i = act_pd.logits[:, i]
            q_i = q_func(q_input_tmp,
                         1,
                         scope="q_func",
                         reuse=True,
                         num_units=num_units)[:, 0]
            # Add Q(s, a_i) weighted by p_i to the running value.
            v = tf.add(v, tf.multiply(q_i, p_i))

        # NOTE(review): this is v - q; verify the intended sign of the
        # advantage term.
        advantage = tf.subtract(v, q)
        pg_loss = -tf.reduce_mean(advantage)

        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars,
                                            grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n,
                           outputs=loss,
                           updates=[optimize_expr])
        act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)
        p_values = U.function([obs_ph_n[p_index]], p)

        # target network
        target_p = p_func(p_input,
                          int(act_pdtype_n[p_index].param_shape()[0]),
                          scope="target_p_func",
                          num_units=num_units)
        target_p_func_vars = U.scope_vars(
            U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample()
        target_act = U.function(inputs=[obs_ph_n[p_index]],
                                outputs=target_act_sample)

        return act, train, update_target_p, {
            'p_values': p_values,
            'target_act': target_act
        }
Пример #6
0
def p_train(name,
            make_obs_ph_n,
            adj_n,
            act_space_n,
            neighbor_n,
            p_index,
            p_func,
            q_func,
            num_adversaries,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            num_units=128,
            scope="trainer",
            reuse=None):
    """Build a shared policy-training graph for one team of agents.

    When ``name == "adversaries"`` the team is agents
    ``[0, num_adversaries)``; otherwise it is agents
    ``[num_adversaries, len(make_obs_ph_n))``. A single ``p_func`` call
    produces the outputs for every team member; each member's sampled action
    replaces its action placeholder, and the loss is the negated mean over the
    team's critics (``"q_func_<k>"``, opened with ``reuse=True``) plus a small
    penalty on the squared policy outputs.

    Args:
        name: team selector; compared against ``"adversaries"``.
        make_obs_ph_n: list of observation placeholders, one per agent.
        adj_n: list of per-agent placeholders, sliced like the observations
            and fed to ``p_func`` as its second input.
        act_space_n: list of action spaces, one per agent.
        neighbor_n: forwarded to ``p_func`` and used for the ``"vec"`` input
            shape ``[1, neighbor_n]``.
        p_index: not referenced in this variant.
        p_func: network builder for the team policy.
        q_func: network builder for the critics (called with ``reuse=True``).
        num_adversaries: number of agents on the adversary team.
        optimizer: optimizer handed to ``U.minimize_and_clip``.
        grad_norm_clipping: clipping value handed to ``U.minimize_and_clip``.
        local_q_func: not referenced in this variant.
        num_units: forwarded to ``p_func`` and ``q_func``.
        scope: variable scope wrapping everything built here.
        reuse: ``reuse`` flag for that variable scope.

    Returns:
        ``(act, train, update_target_p, p_values, target_act)``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # create distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]
        agent_n = len(obs_ph_n)
        vec_n = U.BatchInput([1, neighbor_n], name="vec").get()

        # Resolve the team's index range once instead of repeating the
        # `name == "adversaries"` test at every use.
        if name == "adversaries":
            first, last = 0, num_adversaries
        else:
            first, last = num_adversaries, agent_n
        team_ids = range(first, last)
        team_size = last - first

        p_input1 = obs_ph_n[first:last]
        p_input2 = adj_n[first:last]
        p_input3 = vec_n

        # Actor network shared by the whole team.
        # NOTE(review): the literal 5 is presumably the per-agent action
        # dimension (the original author flagged it as "not good") -- confirm
        # and derive it from act_space_n if so.
        p = p_func(p_input1,
                   p_input2,
                   p_input3,
                   neighbor_n,
                   team_size,
                   5,
                   scope="p_func",
                   num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # wrap parameters in distribution; p is indexed by team-local position
        act_pd = []
        act_sample = []
        for i in team_ids:
            member_pd = act_pdtype_n[i].pdfromflat(p[i - first])
            act_pd.append(member_pd)
            act_sample.append(member_pd.sample())

        flat_params = [member_pd.flatparam() for member_pd in act_pd]

        # The original author questioned whether regularising the stacked
        # parameters of all team members like this is correct.
        p_reg = tf.reduce_mean(tf.square(flat_params))

        # Shallow copy, then substitute every team member's action with the
        # sample from the shared policy.
        act_input_n = list(act_ph_n)
        for i in team_ids:
            act_input_n[i] = act_sample[i - first]

        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        # One critic per team member. The former `q_reduce_mean += temp`
        # accumulator was never read and performed list `+=` on a symbolic
        # tensor (which iterates the tensor), so it has been dropped.
        q = [
            q_func(q_input,
                   1,
                   scope="q_func_%d" % (i - first),
                   reuse=True,
                   num_units=num_units)[:, 0] for i in team_ids
        ]
        pg_loss = -tf.reduce_mean(q)

        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars,
                                            grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n + adj_n + [vec_n],
                           outputs=loss,
                           updates=[optimize_expr])
        act = U.function(inputs=p_input1 + p_input2 + [p_input3],
                         outputs=act_sample,
                         list_output=True)
        p_values = U.function(p_input1 + p_input2 + [p_input3],
                              p,
                              list_output=True)

        # target network
        target_p = p_func(p_input1,
                          p_input2,
                          p_input3,
                          neighbor_n,
                          team_size,
                          5,
                          scope="target_p_func",
                          num_units=num_units)
        target_p_func_vars = U.scope_vars(
            U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars,
                                          target_p_func_vars,
                                          central=True)

        target_act_sample = [
            act_pdtype_n[i].pdfromflat(target_p[i - first]).sample()
            for i in team_ids
        ]
        target_act = U.function(inputs=p_input1 + p_input2 + [p_input3],
                                outputs=target_act_sample,
                                list_output=True)

        return act, train, update_target_p, p_values, target_act
Пример #7
0
def q_train(make_obs_ph_n,
            act_space_n,
            q_index,
            q_func,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            num_units=64,
            scope="trainer",
            reuse=None):
    """
    Build the critic (Q network) training graph for one agent.

        make_obs_ph_n (list): Observation placeholders, one per agent
        act_space_n (list): Action spaces, one per agent
        q_index (int): Index of the agent whose critic is built
        q_func (function): Network builder, called as
            q_func(input, 1, scope=..., num_units=...)
        optimizer (function): Optimizer passed to minimize_and_clip
        grad_norm_clipping (float): Clipping value passed to minimize_and_clip
        local_q_func (boolean): If true, the critic only receives this
            agent's own observation and action
        num_units (int): Forwarded to q_func
        scope (str): Variable scope wrapping everything built here
        reuse (boolean): reuse flag for that variable scope

    Returns:
        train (function): Runs one optimisation step and returns the loss
        update_target_q (function): Target-update op from make_update_exp
        q_debug (dict): Holds the 'q_values' and 'target_q_values' callables
    """
    with tf.variable_scope(scope, reuse=reuse):
        # One action-distribution type per agent.
        pdtypes = [make_pdtype(space) for space in act_space_n]

        # Placeholders for observations, actions and the regression target.
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            pdtype.sample_placeholder([None], name="action" + str(idx))
            for idx, pdtype in enumerate(pdtypes)
        ]
        target_ph = tf.placeholder(tf.float32, [None], name="target")
        critic_feed = obs_ph_n + act_ph_n

        # Critic input: all observations and actions, or only this agent's
        # own pair when a local critic is requested.
        q_input = tf.concat(critic_feed, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)

        q_out = q_func(q_input, 1, scope="q_func", num_units=num_units)
        q = q_out[:, 0]
        critic_vars = tf_util.scope_vars(
            tf_util.absolute_scope_name("q_func"))

        # Mean squared error against the supplied target.
        td_error = q - target_ph
        q_loss = tf.reduce_mean(tf.square(td_error))

        # Regulariser on the Q magnitude ("viscosity solution to the Bellman
        # differential equation"); it is built but currently left out of the
        # loss (the original kept `q_loss + 1e-3 * q_reg` commented out).
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss

        optimize_expr = tf_util.minimize_and_clip(optimizer, loss,
                                                  critic_vars,
                                                  grad_norm_clipping)

        # Callable wrappers around the graph.
        train = tf_util.function(inputs=critic_feed + [target_ph],
                                 outputs=loss,
                                 updates=[optimize_expr])
        q_values = tf_util.function(obs_ph_n + act_ph_n, q)

        # Target critic, fed with the same input tensor.
        target_out = q_func(q_input,
                            1,
                            scope="target_q_func",
                            num_units=num_units)
        target_q = target_out[:, 0]
        target_critic_vars = tf_util.scope_vars(
            tf_util.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(critic_vars, target_critic_vars)

        target_q_values = tf_util.function(obs_ph_n + act_ph_n, target_q)

        q_debug = {
            'q_values': q_values,
            'target_q_values': target_q_values
        }
        return train, update_target_q, q_debug
Пример #8
0
def q_train(make_obs_ph_n,
            act_space_n,
            q_index,
            q_func,
            optimizer,
            adversarial,
            adv_eps,
            adv_eps_s,
            num_adversaries,
            grad_norm_clipping=None,
            local_q_func=False,
            scope="trainer",
            reuse=None,
            num_units=64):
    """Build the critic (Q network) training graph for one agent.

    Args:
        make_obs_ph_n: list of observation placeholders, one per agent.
        act_space_n: list of action spaces, one per agent.
        q_index: index of the agent whose critic is built.
        q_func: network builder, called as
            ``q_func(input, 1, scope=..., num_units=...)``.
        optimizer: optimizer handed to ``U.minimize_and_clip``.
        adversarial, adv_eps, adv_eps_s, num_adversaries: only referenced by
            the disabled (commented-out) adversarial-perturbation code below;
            the active code does not read them.
        grad_norm_clipping: clipping value handed to ``U.minimize_and_clip``.
        local_q_func: if true, the critic only receives this agent's own
            observation and action.
        scope: variable scope wrapping everything built here.
        reuse: ``reuse`` flag for that variable scope.
        num_units: forwarded to ``q_func``.

    Returns:
        ``(train, update_target_q, debug)`` where ``debug`` holds the
        ``'q_values'`` and ``'target_q_values'`` callables.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # One action-distribution type per agent.
        pdtypes = [make_pdtype(space) for space in act_space_n]

        # Placeholders for observations, actions and the regression target.
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            pdtype.sample_placeholder([None], name="action" + str(idx))
            for idx, pdtype in enumerate(pdtypes)
        ]
        target_ph = tf.placeholder(tf.float32, [None], name="target")
        critic_feed = obs_ph_n + act_ph_n

        # Critic input: everything, or only this agent's own pair.
        q_input = tf.concat(critic_feed, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)

        q_out = q_func(q_input, 1, scope="q_func", num_units=num_units)
        q = q_out[:, 0]
        critic_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # Mean squared error against the supplied target.
        td_error = q - target_ph
        q_loss = tf.reduce_mean(tf.square(td_error))

        # Regulariser on the Q magnitude ("viscosity solution to the Bellman
        # differential equation"); built but not added to the loss (the
        # `+ 1e-3 * q_reg` term was commented out by the original author).
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss

        optimize_expr = U.minimize_and_clip(optimizer, loss, critic_vars,
                                            grad_norm_clipping)

        # Callable wrappers around the graph.
        train = U.function(inputs=critic_feed + [target_ph],
                           outputs=loss,
                           updates=[optimize_expr])
        q_values = U.function(obs_ph_n + act_ph_n, q)

        # Target critic, fed with the same input tensor.
        target_out = q_func(q_input,
                            1,
                            scope="target_q_func",
                            num_units=num_units)
        target_q = target_out[:, 0]

        # Disabled adversarial-perturbation variant, kept for reference:
        # if adversarial:
        #     num_agents = len(act_ph_n)
        #     if q_index < num_adversaries:
        #         adv_rate = [adv_eps_s *(i < num_adversaries) + adv_eps * (i >= num_adversaries) for i in range(num_agents)]
        #     else:
        #         adv_rate = [adv_eps_s *(i >= num_adversaries) + adv_eps * (i < num_adversaries) for i in range(num_agents)]
        #     print("      adv rate for q_index : ", q_index, adv_rate)
        #
        #     pg_loss = -tf.reduce_mean(target_q)
        #     raw_perturb = tf.gradients(pg_loss, act_ph_n)
        #     perturb = [adv_eps * tf.stop_gradient(tf.nn.l2_normalize(elem, axis = 1)) for elem in raw_perturb]
        #     new_act_n = [perturb[i] + act_ph_n[i] if i != q_index
        #             else act_ph_n[i] for i in range(len(act_ph_n))]
        #     adv_q_input = tf.concat(obs_ph_n + new_act_n, 1)
        #     target_q = q_func(adv_q_input, 1, scope ='target_q_func', reuse=True, num_units=num_units)[:,0]

        target_critic_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(critic_vars, target_critic_vars)

        target_q_values = U.function(obs_ph_n + act_ph_n, target_q)

        debug = {'q_values': q_values, 'target_q_values': target_q_values}
        return train, update_target_q, debug
Пример #9
0
def p_train(make_obs_ph_n,
            act_space_n,
            p_index,
            p_func,
            q_func,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            num_units=64,
            scope="trainer",
            reuse=None):
    """Build the policy-training graph for the agent at ``p_index``.

    Same construction as the plain MADDPG policy trainer (policy sample
    substituted into the critic input, loss ``-mean(Q)`` plus a small penalty
    on the squared policy outputs), but the number of distribution
    parameters is returned as an extra fifth value.

    Args:
        make_obs_ph_n: list of observation placeholders, one per agent.
        act_space_n: list of action spaces, one per agent.
        p_index: index of the agent whose policy is trained.
        p_func: network builder for the actor.
        q_func: network builder for the critic (called with ``reuse=True``).
        optimizer: optimizer handed to ``U.minimize_and_clip``.
        grad_norm_clipping: clipping value handed to ``U.minimize_and_clip``.
        local_q_func: if true, the critic only receives this agent's own
            observation and action.
        num_units: forwarded to ``p_func`` and ``q_func``.
        scope: variable scope wrapping everything built here.
        reuse: ``reuse`` flag for that variable scope.

    Returns:
        ``(act, train, update_target_p, debug, num_actions)`` where ``debug``
        holds the ``'p_values'`` and ``'target_act'`` callables and
        ``num_actions`` is ``int(param_shape()[0])`` of this agent's
        distribution type.
    """
    # NOTE(Daniel): a changing p_index still has to be handled somehow. One
    # idea is to shuffle the observations instead -- not the cleanest
    # solution, but nothing in here would have to change. To be tested.
    with tf.variable_scope(scope, reuse=reuse):
        # One action-distribution type per agent.
        pdtypes = [make_pdtype(space) for space in act_space_n]

        # Placeholders: observations come from the caller, actions are
        # created here.
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            pdtype.sample_placeholder([None], name="action" + str(idx))
            for idx, pdtype in enumerate(pdtypes)
        ]

        own_pdtype = pdtypes[p_index]
        own_obs = obs_ph_n[p_index]
        num_actions = int(own_pdtype.param_shape()[0])

        # Actor network.
        p = p_func(own_obs,
                   int(own_pdtype.param_shape()[0]),
                   scope="p_func",
                   num_units=num_units)
        policy_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # Wrap the raw network output in the action distribution.
        act_pd = own_pdtype.pdfromflat(p)
        act_sample = act_pd.sample()

        flat_params = act_pd.flatparam()
        p_reg = tf.reduce_mean(tf.square(flat_params))

        # Shallow copy of the placeholders with this agent's slot replaced by
        # a sample from its own policy.
        act_input_n = list(act_ph_n)
        act_input_n[p_index] = act_pd.sample()

        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        if local_q_func:
            q_input = tf.concat([own_obs, act_input_n[p_index]], 1)

        # reuse=True: the "q_func" variables must already exist in this scope.
        q_out = q_func(q_input,
                       1,
                       scope="q_func",
                       reuse=True,
                       num_units=num_units)
        q = q_out[:, 0]

        pg_loss = -tf.reduce_mean(q)
        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, policy_vars,
                                            grad_norm_clipping)

        # Callable wrappers around the graph.
        train = U.function(inputs=obs_ph_n + act_ph_n,
                           outputs=loss,
                           updates=[optimize_expr])
        act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)
        p_values = U.function([obs_ph_n[p_index]], p)

        # Target network.
        # NOTE(Daniel): could this be skipped when the same network is used?
        # Probably not; this is likely the analogue of target_q in DQN.
        target_p = p_func(own_obs,
                          int(own_pdtype.param_shape()[0]),
                          scope="target_p_func",
                          num_units=num_units)
        target_policy_vars = U.scope_vars(
            U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(policy_vars, target_policy_vars)

        target_pd = own_pdtype.pdfromflat(target_p)
        target_act_sample = target_pd.sample()
        target_act = U.function(inputs=[obs_ph_n[p_index]],
                                outputs=target_act_sample)

        debug = {'p_values': p_values, 'target_act': target_act}
        return act, train, update_target_p, debug, num_actions
Пример #10
0
def p_train(make_obs_ph_n,
            act_space_n,
            p_index,
            p_func,
            q_func,
            shared_CNN,
            optimizer,
            make_obs_map_ph_n,
            grad_norm_clipping=None,
            local_q_func=False,
            num_units=64,
            scope="trainer",
            reuse=None):
    """Build the policy-training graph for agent ``p_index`` with a CNN map encoder.

    The policy network receives the agent's own observation concatenated
    with the output of ``shared_CNN`` applied to the agent's own map
    placeholder. The Q input is the concatenation of every observation and
    action (the agent's own action being sampled from its policy) followed
    by one ``shared_CNN`` output per agent.

    Args:
        make_obs_ph_n: Sequence of observation placeholders, one per agent.
        act_space_n: Sequence of action spaces, one per agent.
        p_index: Index of the agent whose policy is being built.
        p_func: Policy network builder, called as
            ``p_func(input, n_outputs, scope=..., num_units=...)``.
        q_func: Q network builder; called here with ``reuse=True``.
        shared_CNN: Map encoder, called as ``shared_CNN(map_ph, idx, scope=...)``.
        optimizer: Optimizer handed to ``U.minimize_and_clip``.
        make_obs_map_ph_n: Sequence of map placeholders, one per agent.
        grad_norm_clipping: Forwarded to ``U.minimize_and_clip``.
        local_q_func: If true, the Q input is replaced by only the agent's
            own observation and action (the CNN features appended earlier
            are discarded).
        num_units: Forwarded to ``p_func`` and ``q_func``.
        scope: Variable scope name used for every section of the graph.
        reuse: Accepted for signature compatibility; this function never
            reads it (every ``tf.variable_scope`` call passes a literal).

    Returns:
        Tuple ``(act, train, update_target_p, debug)`` where ``debug`` is a
        dict with the keys ``'p_values'`` and ``'target_act'``.
    """
    pd_types = [make_pdtype(space) for space in act_space_n]
    own_pdtype = pd_types[p_index]

    # Placeholders: observations/maps are supplied by the caller, actions
    # are created here (one per agent).
    obs_ph_n = make_obs_ph_n
    act_ph_n = [
        pd_type.sample_placeholder([None], name="action" + str(idx))
        for idx, pd_type in enumerate(pd_types)
    ]
    obs_map_ph_n = make_obs_map_ph_n

    p_input = obs_ph_n[p_index]
    p_map_input = obs_map_ph_n[p_index]
    # Inputs shared by the act / p_values / target_act callables.
    own_feed = [obs_ph_n[p_index], obs_map_ph_n[p_index]]

    with tf.variable_scope(scope, reuse=None):
        # Encode the agent's own map and remember the encoder variables so
        # that they can be optimised together with the policy.
        map_context_input = shared_CNN(p_map_input, p_index, scope="CNN")
        CNN_vars = U.scope_vars(U.absolute_scope_name("CNN"))

        policy_in = tf.concat([p_input, map_context_input], 1)
        policy_out_dim = int(own_pdtype.param_shape()[0])
        p = p_func(policy_in,
                   policy_out_dim,
                   scope="p_func",
                   num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # Wrap the flat policy output in a distribution object.
        act_pd = own_pdtype.pdfromflat(p)
        act_sample = act_pd.sample()
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        # Every agent's action comes from a placeholder except our own,
        # which is a fresh sample from the policy so the Q value depends
        # on the policy parameters.
        act_input_n = list(act_ph_n)
        act_input_n[p_index] = act_pd.sample()
        q_input = tf.concat(obs_ph_n + act_input_n, 1)

    # Append one CNN encoding per agent to the Q input. This runs outside
    # the variable scope opened above, exactly as in the original layout.
    for agent_idx, _ in enumerate(obs_ph_n):
        agent_map_code = shared_CNN(obs_map_ph_n[agent_idx],
                                    agent_idx,
                                    scope="agent_" + str(agent_idx) + "/CNN")
        q_input = tf.concat([q_input, agent_map_code], 1)

    with tf.variable_scope(scope, reuse=None):
        if local_q_func:
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)
        q_out = q_func(q_input,
                       1,
                       scope="q_func",
                       reuse=True,
                       num_units=num_units)
        q = q_out[:, 0]

        pg_loss = -tf.reduce_mean(q)
        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars,
                                            grad_norm_clipping)

    with tf.variable_scope(scope, reuse=True):
        # Second update op: the same loss, applied to the CNN variables.
        optimize_expr2 = U.minimize_and_clip(optimizer, loss, CNN_vars,
                                             grad_norm_clipping)

    with tf.variable_scope(scope, reuse=None):
        # Callable wrappers around the graph.
        train = U.function(inputs=obs_ph_n + act_ph_n + obs_map_ph_n,
                           outputs=loss,
                           updates=[optimize_expr, optimize_expr2])
        act = U.function(inputs=list(own_feed), outputs=act_sample)
        p_values = U.function(list(own_feed), p)

        # Target policy network; it is fed the same CNN encoding tensor as
        # the online policy.
        target_in = tf.concat([p_input, map_context_input], 1)
        target_out_dim = int(own_pdtype.param_shape()[0])
        target_p = p_func(target_in,
                          target_out_dim,
                          scope="target_p_func",
                          num_units=num_units)
        target_p_func_vars = U.scope_vars(
            U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = own_pdtype.pdfromflat(target_p).sample()
        target_act = U.function(inputs=list(own_feed),
                                outputs=target_act_sample)

        debug = {'p_values': p_values, 'target_act': target_act}
        return act, train, update_target_p, debug
Пример #11
0
def p_train(make_obs_ph_n,
            act_space_n,
            before_com_func,
            channel,
            after_com_func,
            q_func,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            num_units=64,
            scope="trainer",
            reuse=None,
            beta=0.01,
            ibmac_com=True):
    """Build a joint policy-training graph for all agents with a message channel.

    Each agent's observation is encoded by ``before_com_func``; the
    (gradient-stopped) encodings of all agents are concatenated and passed
    through ``channel``, whose three outputs are split into per-agent
    messages, means and log-scale terms. With communication enabled, each
    agent's action parameters are computed by ``after_com_func`` from
    ``hidden + message``; otherwise from the hidden encoding alone.

    Args:
        make_obs_ph_n: Sequence of observation placeholders, one per agent.
        act_space_n: Sequence of action spaces, one per agent.
        before_com_func: Encoder builder, called as
            ``before_com_func(obs, num_units, scope=..., num_units=...)``.
        channel: Channel builder; must return three tensors
            (message, mean, log-scale), each splittable into
            ``num_agents`` parts along axis 1.
        after_com_func: Policy-head builder.
        q_func: Q network builder; called with ``reuse=True`` under the
            scopes ``q_func_<i>``.
        optimizer: Optimizer handed to ``U.minimize_and_clip``.
        grad_norm_clipping: Forwarded to ``U.minimize_and_clip``.
        local_q_func: Not referenced by this function.
        num_units: Width passed to the network builders.
        scope: Enclosing variable scope name.
        reuse: Forwarded to ``tf.variable_scope``.
        beta: Weight of the message KL term in the loss (communication
            mode only).
        ibmac_com: When false, messages are not added to the policy input,
            the channel variables are not optimised and the KL term is left
            out of the loss.

    Returns:
        Tuple ``(act, train, update_target_p, debug)``. ``debug`` maps
        ``'p_values'``, ``'target_act'``, ``'kl_loss'``, ``'check_values'``,
        ``'channel_com'``, ``'check_mu'``, ``'check_log'``,
        ``'check_message_n'``, ``'check_hiddens_n'`` and ``'check_entropy'``
        to callables. In no-communication mode the four entries
        ``check_values``/``channel_com``/``check_mu``/``check_log`` are stubs
        that accept any arguments and return ``0``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        clip_threshold = 1  # candidate values noted in the original: 1, 5, 10
        is_norm_training = tf.placeholder(tf.bool)
        is_inference = tf.placeholder(tf.bool)
        # Boolean flag placeholders that every callable below takes after
        # its observation (and action) inputs.
        flags = [is_norm_training, is_inference]

        ibmac_nocom = not ibmac_com
        num_agents = len(make_obs_ph_n)

        # One action-distribution type per agent.
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # Placeholders.
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(num_agents)
        ]

        # Per-agent observation encoders.
        hiddens_n = [
            before_com_func(obs_ph_n[i],
                            num_units,
                            scope="before_com_{}".format(i),
                            num_units=num_units) for i in range(num_agents)
        ]
        before_com_vars_n = [
            U.scope_vars(U.absolute_scope_name("before_com_{}".format(i)))
            for i in range(num_agents)
        ]

        # Second pass through the same encoders (reuse=True) to feed the
        # channel; gradients are stopped so the channel loss does not reach
        # the encoders through this path.
        hiddens_n_for_message = tf.concat([
            before_com_func(obs_ph_n[i],
                            num_units,
                            scope="before_com_{}".format(i),
                            reuse=True,
                            num_units=num_units) for i in range(num_agents)
        ],
                                          axis=1)
        hiddens_n_for_message = tf.stop_gradient(hiddens_n_for_message)
        channel_output = channel(hiddens_n_for_message,
                                 num_units * num_agents,
                                 scope="channel",
                                 num_units=num_units * num_agents)
        message_n, mu_message_n, logvar_message_n = [
            tf.split(item, num_or_size_splits=num_agents, axis=1)
            for item in channel_output
        ]
        # Constrain the log term so that the KL loss cannot grow too large.
        logvar_message_n = [
            tf.clip_by_value(log, -10, 10) for log in logvar_message_n
        ]

        message_n = [
            clip_message(message, clip_threshold, is_norm_training,
                         is_inference) for message in message_n
        ]

        channel_vars_n = [U.scope_vars(U.absolute_scope_name("channel"))]

        if ibmac_nocom:
            print('no_com')
            p_n = [
                after_com_func(hiddens_n[i],
                               int(act_pdtype_n[i].param_shape()[0]),
                               scope="p_func_{}".format(i),
                               num_units=num_units) for i in range(num_agents)
            ]
        else:
            # check_n exposes the policy-head inputs for debugging.
            check_n = [hiddens_n[i] + message_n[i] for i in range(num_agents)]
            p_n = [
                after_com_func(hiddens_n[i] + message_n[i],
                               int(act_pdtype_n[i].param_shape()[0]),
                               scope="p_func_{}".format(i),
                               num_units=num_units) for i in range(num_agents)
            ]
        p_func_vars = [
            U.scope_vars(U.absolute_scope_name("p_func_{}".format(i)))
            for i in range(num_agents)
        ]

        # Wrap the flat policy outputs in distribution objects.
        act_pd_n = [
            act_pdtype_n[i].pdfromflat(p_n[i]) for i in range(num_agents)
        ]

        act_sample_n = [act_pd.sample() for act_pd in act_pd_n]
        p_reg_n = [
            tf.reduce_mean(tf.square(act_pd.flatparam()))
            for act_pd in act_pd_n
        ]

        # For agent i's Q input, every action is a placeholder except
        # agent i's own, which is sampled from its policy.
        act_input_n_n = [list(act_ph_n) for _ in range(num_agents)]
        for i in range(num_agents):
            act_input_n_n[i][i] = act_pd_n[i].sample()
        q_input_n = [
            tf.concat(obs_ph_n + act_input_n, 1)
            for act_input_n in act_input_n_n
        ]

        q_n = [
            q_func(q_input_n[i],
                   1,
                   scope="q_func_{}".format(i),
                   reuse=True,
                   num_units=num_units)[:, 0] for i in range(num_agents)
        ]
        pg_loss_n = [-tf.reduce_mean(q) for q in q_n]

        # Message penalty. Reading `log` as a log standard deviation, this
        # is the closed form of KL(N(mu, sigma^2) || N(0, 10^2)).
        kl_loss_message_n = [
            1.0 / 200 * (tf.pow(mu, 2) + tf.pow(tf.exp(log), 2)) - log +
            np.log(10) - 0.5 for mu, log in zip(mu_message_n, logvar_message_n)
        ]

        # NOTE(review): if `log` is log-sigma (as the KL term above treats
        # it), the Gaussian differential entropy would be `log + 1.4189`,
        # not `exp(log) + 1.4189`. Left unchanged because this is only a
        # diagnostic output -- confirm the intended quantity.
        entropy = [tf.exp(log) + 1.4189 for log in logvar_message_n]

        pg_loss = tf.reduce_sum(pg_loss_n)
        p_reg = tf.reduce_sum(p_reg_n)
        kl_loss_message = tf.reduce_mean(kl_loss_message_n)

        if ibmac_nocom:
            loss = pg_loss + p_reg * 1e-3
        else:
            loss = pg_loss + p_reg * 1e-3 + beta * kl_loss_message

        kl_loss = U.function(inputs=obs_ph_n + act_ph_n + flags,
                             outputs=kl_loss_message)

        # Variables updated by the optimiser; the channel is included only
        # when communication is enabled.
        var_groups = []
        var_groups.extend(before_com_vars_n)
        if not ibmac_nocom:
            var_groups.extend(channel_vars_n)
        var_groups.extend(p_func_vars)
        var_list = list(itertools.chain.from_iterable(var_groups))
        optimize_expr = U.minimize_and_clip(optimizer, loss, var_list,
                                            grad_norm_clipping)

        # Callable wrappers around the graph.
        train = U.function(inputs=obs_ph_n + act_ph_n + flags,
                           outputs=loss,
                           updates=[optimize_expr])
        act = U.function(inputs=obs_ph_n + flags, outputs=act_sample_n)
        p_values = U.function(inputs=obs_ph_n + flags, outputs=p_n)
        if not ibmac_nocom:
            check_values = U.function(inputs=obs_ph_n + flags,
                                      outputs=check_n)
            channel_com = U.function(inputs=obs_ph_n + flags,
                                     outputs=channel_output)
            check_mu = U.function(inputs=obs_ph_n + flags,
                                  outputs=mu_message_n)
            check_log = U.function(inputs=obs_ph_n + flags,
                                   outputs=logvar_message_n)
        else:
            # Stubs for the communication-only diagnostics. They accept any
            # arguments so callers can invoke them exactly like the real
            # callables (which take one argument per placeholder); the old
            # one-argument lambdas raised TypeError in that case.
            def _no_com_stub(*_args, **_kwargs):
                return 0

            check_values = _no_com_stub
            channel_com = _no_com_stub
            check_mu = _no_com_stub
            check_log = _no_com_stub

        # Target network: same layout as above under "target_*" scopes.
        target_hiddens_n = [
            before_com_func(obs_ph_n[i],
                            num_units,
                            scope="target_before_com_{}".format(i),
                            num_units=num_units) for i in range(num_agents)
        ]
        target_before_com_vars = [
            U.scope_vars(
                U.absolute_scope_name("target_before_com_{}".format(i)))
            for i in range(num_agents)
        ]

        target_hiddens_n_for_message = tf.concat([
            before_com_func(obs_ph_n[i],
                            num_units,
                            scope="target_before_com_{}".format(i),
                            reuse=True,
                            num_units=num_units) for i in range(num_agents)
        ],
                                                 axis=1)
        target_hiddens_n_for_message = tf.stop_gradient(
            target_hiddens_n_for_message)
        target_channel_output = channel(target_hiddens_n_for_message,
                                        num_units * num_agents,
                                        scope="target_channel",
                                        num_units=num_units * num_agents)
        # NOTE(review): unlike message_n, the target messages are not passed
        # through clip_message -- confirm this asymmetry is intended.
        target_message_n, target_mu_message_n, target_logvar_message_n = [
            tf.split(item, num_or_size_splits=num_agents, axis=1)
            for item in target_channel_output
        ]
        target_channel_vars = [
            U.scope_vars(U.absolute_scope_name("target_channel"))
        ]
        if ibmac_nocom:
            target_p_n = [
                after_com_func(target_hiddens_n[i],
                               int(act_pdtype_n[i].param_shape()[0]),
                               scope="target_p_func_{}".format(i),
                               num_units=num_units) for i in range(num_agents)
            ]
        else:
            target_p_n = [
                after_com_func(target_hiddens_n[i] + target_message_n[i],
                               int(act_pdtype_n[i].param_shape()[0]),
                               scope="target_p_func_{}".format(i),
                               num_units=num_units) for i in range(num_agents)
            ]
        target_p_func_vars = [
            U.scope_vars(U.absolute_scope_name("target_p_func_{}".format(i)))
            for i in range(num_agents)
        ]

        # Target variables, grouped in the same order as var_list so that
        # make_update_exp receives matching sequences.
        target_var_groups = []
        target_var_groups.extend(target_before_com_vars)
        if not ibmac_nocom:
            target_var_groups.extend(target_channel_vars)
        target_var_groups.extend(target_p_func_vars)
        target_var_list = list(
            itertools.chain.from_iterable(target_var_groups))
        update_target_p = make_update_exp(var_list, target_var_list)

        target_act_sample_n = [
            act_pdtype_n[i].pdfromflat(target_p_n[i]).sample()
            for i in range(num_agents)
        ]
        target_act = U.function(inputs=obs_ph_n + flags,
                                outputs=target_act_sample_n)

        # Diagnostics that are available in both modes.
        check_message_n = U.function(inputs=obs_ph_n + flags,
                                     outputs=message_n)
        check_hiddens_n = U.function(inputs=obs_ph_n + flags,
                                     outputs=hiddens_n)
        check_entropy = U.function(inputs=obs_ph_n + flags, outputs=entropy)

        return act, train, update_target_p, {
            'p_values': p_values,
            'target_act': target_act,
            'kl_loss': kl_loss,
            'check_values': check_values,
            'channel_com': channel_com,
            'check_mu': check_mu,
            'check_log': check_log,
            'check_message_n': check_message_n,
            'check_hiddens_n': check_hiddens_n,
            'check_entropy': check_entropy
        }
Пример #12
0
def p_train(n_agents, make_state_ph_n, make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer, grad_norm_clipping=None, local_q_func=False,
            num_units=64, scope="trainer", reuse=None, discrete_action=False, target_update_tau=0.001, use_global_state=False,
            share_weights=False):
    """Build the policy-training graph for agent ``p_index``.

    Args:
        n_agents: Number of agents; size of the one-hot agent id appended to
            network inputs when ``share_weights`` is true.
        make_state_ph_n: Sequence of state placeholders, one per agent.
        make_obs_ph_n: Sequence of observation placeholders, one per agent.
        act_space_n: Sequence of action spaces, one per agent.
        p_index: Index of the agent whose policy is built.
        p_func: Policy network builder.
        q_func: Q network builder.
        optimizer: Optimizer handed to ``U.minimize_and_clip``.
        grad_norm_clipping: Forwarded to ``U.minimize_and_clip``.
        local_q_func: If true, the Q input uses only this agent's own
            observation (or state) and action.
        num_units: Width passed to the network builders.
        scope: Enclosing variable scope name.
        reuse: Forwarded to ``tf.variable_scope``.
        discrete_action: Forwarded to ``p_func`` and ``q_func``.
        target_update_tau: Forwarded to ``make_update_exp``.
        use_global_state: If true, the Q input is built from the state
            placeholders instead of the observation placeholders, and the
            train function additionally takes the state placeholders.
        share_weights: If true, a one-hot agent id is appended to the policy
            and Q inputs and the networks are built with ``reuse=True``.

    Returns:
        Tuple ``(act, act_test, train, update_target_p, debug)`` where
        ``debug`` has the keys ``'p_values'`` and ``'target_act'``.
    """

    def _append_agent_id(tensor):
        # Tile this agent's one-hot id across the batch and append it as
        # extra trailing features.
        id_row = tf.eye(n_agents)[p_index:p_index+1]
        id_block = tf.tile(id_row, [tf.shape(tensor)[0], 1])
        return tf.concat([tensor, id_block], -1)

    with tf.variable_scope(scope, reuse=reuse):
        # Distribution types: one set for training, one for test-time sampling.
        act_pdtype_n = [make_pdtype(space) for space in act_space_n]
        act_test_pdtype_n = [make_pdtype(space) for space in act_space_n]

        # Placeholders.
        obs_ph_n = make_obs_ph_n
        state_ph_n = make_state_ph_n
        act_ph_n = [
            pdtype.sample_placeholder([None], name="action"+str(idx))
            for idx, pdtype in enumerate(act_pdtype_n)
        ]
        own_obs_ph = obs_ph_n[p_index]

        p_input = own_obs_ph
        if share_weights:
            # Shared layers need to know which agent they are acting for.
            p_input = _append_agent_id(p_input)

        print("ACTPDTYPE: {}".format(act_space_n))
        print("PINDEX: {}".format(p_index))
        p = p_func(p_input,
                   int(act_pdtype_n[p_index].param_shape()[0]),
                   scope="p_func",
                   reuse=share_weights,
                   num_units=num_units,
                   constrain_out=True,
                   discrete_action=discrete_action)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # Wrap the flat policy output in distribution objects.
        act_pd = act_pdtype_n[p_index].pdfromflat(p)
        # test=True selects the distribution's test-time behaviour.
        act_test_pd = act_test_pdtype_n[p_index].pdfromflat(p, test=True)

        act_sample = act_pd.sample()
        act_test_sample = act_test_pd.sample()

        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        # All actions come from placeholders except this agent's own, which
        # is sampled from the policy.
        act_input_n = list(act_ph_n)
        act_input_n[p_index] = act_pd.sample()

        # Critic context: global states or per-agent observations.
        critic_ctx_n = state_ph_n if use_global_state else obs_ph_n
        q_input = tf.concat(critic_ctx_n + act_input_n, 1)
        if local_q_func:
            q_input = tf.concat([critic_ctx_n[p_index], act_input_n[p_index]], 1)

        if share_weights:
            q_input = _append_agent_id(q_input)

        q_out = q_func(q_input, 1, scope="q_func", reuse=share_weights, num_units=num_units,
                       constrain_out=False, discrete_action=discrete_action)
        q = q_out[:, 0]
        pg_loss = -tf.reduce_mean(q)

        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping)

        # Callable wrappers; with a global state the state placeholders come first.
        train_inputs = obs_ph_n + act_ph_n
        if use_global_state:
            train_inputs = state_ph_n + train_inputs
        train = U.function(inputs=train_inputs, outputs=loss, updates=[optimize_expr])
        act = U.function(inputs=[own_obs_ph], outputs=act_sample)
        act_test = U.function(inputs=[own_obs_ph], outputs=act_test_sample)
        p_values = U.function([own_obs_ph], p)

        # Target policy network and its update op.
        target_p = p_func(p_input,
                          int(act_pdtype_n[p_index].param_shape()[0]),
                          scope="target_p_func",
                          reuse=share_weights,
                          num_units=num_units,
                          constrain_out=True,
                          discrete_action=discrete_action)
        target_p_func_vars = U.scope_vars(U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars, target_update_tau)

        target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample()
        target_act = U.function(inputs=[own_obs_ph], outputs=target_act_sample)

        debug = {'p_values': p_values, 'target_act': target_act}
        return act, act_test, train, update_target_p, debug
Пример #13
0
def _p_train(n_agents, make_state_ph_n, make_obs_ph_n, act_space_n, p_index, p_func, q_func,
             optimizer, q_lstm_on, p_lstm_on, grad_norm_clipping=None, local_q_func=False,
             num_units=64, scope="trainer", reuse=None, q_debug=None, discrete_action=False, target_update_tau=0.001,
             use_global_state=False, share_weights=False):
    """Build the policy-training graph for agent ``p_index`` with optional LSTM state.

    Args:
        n_agents: Number of agents; size of the one-hot agent id appended to
            network inputs when ``share_weights`` is true.
        make_state_ph_n: Sequence of state placeholders, one per agent.
        make_obs_ph_n: Sequence of observation placeholders, one per agent.
        act_space_n: Sequence of action spaces, one per agent.
        p_index: Index of the agent whose policy is built.
        p_func: Policy network builder. When ``p_lstm_on`` is true it must
            return ``(output, state_out)``; otherwise just the output.
        q_func: Q network builder (called with ``reuse=True``). When
            ``q_lstm_on`` is true it must return ``(output, state_out)``.
        optimizer: Optimizer handed to ``U.minimize_and_clip``.
        q_lstm_on: Whether the critic takes LSTM state placeholders.
        p_lstm_on: Whether the actor takes LSTM state placeholders.
        grad_norm_clipping: Forwarded to ``U.minimize_and_clip``.
        local_q_func: Not referenced by this function.
        num_units: Width passed to the network builders and to
            ``get_lstm_state_ph``.
        scope: Enclosing variable scope name.
        reuse: Forwarded to ``tf.variable_scope``.
        q_debug: Not referenced by this function.
        discrete_action: Not referenced by this function.
        target_update_tau: Not referenced by this function.
        use_global_state: Use the state placeholders instead of the
            observation placeholders as network inputs.
        share_weights: Append a one-hot agent id to the network inputs.

    Returns:
        Tuple ``(act, act_test, train, update_target_p, debug)`` where
        ``debug`` has the keys ``'p_values'`` and ``'target_act'``. With
        ``p_lstm_on`` the ``act``/``act_test`` callables take
        ``(obs, c, h)`` and output ``[sample, state_out]``; without it they
        take ``(obs,)`` and output the sample only.
    """

    def _append_agent_id(tensor):
        # Tile this agent's one-hot id across the batch, add a middle axis
        # of size 1, and append it along the last axis.
        id_row = tf.eye(n_agents)[p_index:p_index + 1]
        id_block = tf.tile(id_row, [tf.shape(tensor)[0], 1])
        return tf.concat([tensor, tf.expand_dims(id_block, 1)], -1)

    with tf.variable_scope(scope, reuse=reuse):
        # Distribution types: one set for training, one for test-time sampling.
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]
        act_test_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # Placeholders.
        obs_ph_n = make_obs_ph_n
        state_ph_n = make_state_ph_n

        act_ph_n = [act_pdtype_n[i].sample_placeholder([None, 1], name="action" + str(i)) for i in
                    range(len(act_space_n))]

        p_res = int(act_pdtype_n[p_index].param_shape()[0])

        # LSTM state placeholders. The same placeholder object is repeated
        # once per agent in the *_n lists.
        p_c_ph, p_h_ph = get_lstm_state_ph(name='p_', n_batches=None, num_units=num_units)
        p_c_ph_n, p_h_ph_n = [p_c_ph] * len(obs_ph_n), [p_h_ph] * len(obs_ph_n)
        q_c_ph, q_h_ph = get_lstm_state_ph(name='q_', n_batches=None, num_units=num_units)
        q_c_ph_n, q_h_ph_n = [q_c_ph] * len(obs_ph_n), [q_h_ph] * len(obs_ph_n)

        # NOTE(review): with use_global_state the policy reads a state
        # placeholder, yet act/p_values/target_act below are fed only
        # obs_ph_n[p_index] -- confirm how callers supply the state.
        if p_lstm_on:
            if not use_global_state:
                p_input = tf.concat([obs_ph_n[p_index], p_c_ph, p_h_ph], -1)
            else:
                # Fixed: the whole state_ph_n list used to be passed here,
                # unlike the non-LSTM branch which indexes by p_index.
                p_input = tf.concat([state_ph_n[p_index], p_c_ph, p_h_ph], -1)

            if share_weights:
                p_input = _append_agent_id(p_input)

            p, p_state_out = p_func(p_input, p_res, scope="p_func", num_units=num_units)
        else:
            if not use_global_state:
                p_input = obs_ph_n[p_index]
            else:
                p_input = state_ph_n[p_index]

            if share_weights:
                p_input = _append_agent_id(p_input)

            p = p_func(p_input, p_res, scope="p_func", num_units=num_units)

        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # Wrap the flat policy output in distribution objects.
        act_pd = act_pdtype_n[p_index].pdfromflat(p)
        # test=True selects the distribution's test-time behaviour.
        act_test_pd = act_test_pdtype_n[p_index].pdfromflat(p, test=True)

        act_sample = act_pd.sample()
        act_test_sample = act_test_pd.sample()
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        # All actions come from placeholders except this agent's own, which
        # is sampled from the policy.
        act_input_n = list(act_ph_n)
        act_input_n[p_index] = act_pd.sample()

        # Critic context: global states or per-agent observations.
        obs_or_state = state_ph_n if use_global_state else obs_ph_n

        if q_lstm_on:
            q_input = tf.concat(obs_or_state + act_input_n + q_c_ph_n + q_h_ph_n, -1)
            if share_weights:
                q_input = _append_agent_id(q_input)
            q, _ = q_func(q_input, 1, scope="q_func", num_units=num_units, reuse=True)
        else:
            q_input = tf.concat(obs_or_state + act_input_n, -1)
            if share_weights:
                q_input = _append_agent_id(q_input)
            q = q_func(q_input, 1, scope="q_func", num_units=num_units, reuse=True)

        q = q[:, 0]
        pg_loss = -tf.reduce_mean(q)
        loss = pg_loss + p_reg * 1e-3
        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping)

        # Fixed: p_state_out exists only when the actor is an LSTM, so the
        # test-time callable is built per mode (previously a NameError when
        # p_lstm_on was false).
        if p_lstm_on:
            act_test = U.function(inputs=[obs_ph_n[p_index], p_c_ph, p_h_ph], outputs=[act_test_sample, p_state_out])
        else:
            act_test = U.function(inputs=[obs_ph_n[p_index]], outputs=act_test_sample)

        # Train inputs: states, observations, actions, then critic LSTM
        # state (if any), then actor LSTM state (if any).
        train_inputs = state_ph_n + obs_ph_n + act_ph_n
        if q_lstm_on:
            train_inputs = train_inputs + q_c_ph_n + q_h_ph_n
        if p_lstm_on:
            train_inputs = train_inputs + p_c_ph_n + p_h_ph_n
        train = U.function(inputs=train_inputs, outputs=loss, updates=[optimize_expr])

        if p_lstm_on:
            act = U.function(inputs=[obs_ph_n[p_index], p_c_ph, p_h_ph], outputs=[act_sample, p_state_out])
            p_values = U.function(inputs=[obs_ph_n[p_index], p_c_ph, p_h_ph], outputs=p)
            # Target network (its LSTM state output is not used).
            target_p, target_p_state_out = p_func(p_input, p_res, scope="target_p_func", num_units=num_units)
        else:
            act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)
            p_values = U.function(inputs=[obs_ph_n[p_index]], outputs=p)
            # Target network.
            target_p = p_func(p_input, p_res, scope="target_p_func", num_units=num_units)

        target_p_func_vars = U.scope_vars(U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_pd = act_pdtype_n[p_index].pdfromflat(target_p)
        target_act_sample = target_pd.sample()

        if p_lstm_on:
            target_act = U.function(inputs=[obs_ph_n[p_index], p_c_ph, p_h_ph], outputs=target_act_sample)
        else:
            target_act = U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample)

        return act, act_test, train, update_target_p, {'p_values': p_values, 'target_act': target_act}
Пример #14
0
def p_train(make_obs_ph_n,
            act_space_n,
            p_index,
            p_func,
            q_func,
            optimizer,
            index,
            grad_norm_clipping=None,
            local_q_func=False,
            num_units=64,
            scope="trainer",
            reuse=None,
            ensemble_num=5):
    """Build the policy-training graph for one indexed policy of agent ``p_index``.

    ``index`` is appended to the action placeholder names and to the
    ``p_func`` / ``target_p_func`` scope names, so several policies can be
    built side by side under the same enclosing scope.

    Args:
        make_obs_ph_n: Sequence of observation placeholders, one per agent.
        act_space_n: Sequence of action spaces, one per agent.
        p_index: Index of the agent whose policy is built.
        p_func: Policy network builder.
        q_func: Q network builder; called with scope ``"q_func"`` and
            ``reuse=True``.
        optimizer: Optimizer handed to ``U.minimize_and_clip``.
        index: Suffix used in placeholder and scope names.
        grad_norm_clipping: Forwarded to ``U.minimize_and_clip``.
        local_q_func: If true, the Q input uses only this agent's own
            observation and action.
        num_units: Width passed to the network builders.
        scope: Enclosing variable scope name.
        reuse: Forwarded to ``tf.variable_scope``.
        ensemble_num: Not referenced by this function.

    Returns:
        Tuple ``(act, train, update_target_p, debug)`` where ``debug`` has
        the keys ``'p_values'`` and ``'target_act'``.
    """
    suffix = str(index)
    with tf.variable_scope(scope, reuse=reuse):
        # One action-distribution type per agent.
        pd_types = [make_pdtype(space) for space in act_space_n]
        own_pdtype = pd_types[p_index]

        # Placeholders: observations are supplied by the caller, actions
        # are created here.
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            pd_type.sample_placeholder([None],
                                       name="action" + suffix + str(agent_idx))
            for agent_idx, pd_type in enumerate(pd_types)
        ]

        # The policy only sees this agent's own observation.
        p_input = obs_ph_n[p_index]
        own_obs_ph = obs_ph_n[p_index]

        # Online policy network and its variables.
        policy_scope = "p_func" + suffix
        p = p_func(p_input,
                   int(own_pdtype.param_shape()[0]),
                   scope=policy_scope,
                   num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name(policy_scope))

        # Wrap the flat policy output in a distribution and sample an action.
        act_pd = own_pdtype.pdfromflat(p)
        act_sample = act_pd.sample()

        # Regulariser: mean squared value of the flat distribution parameters.
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        # All actions come from placeholders except this agent's own, which
        # is a fresh sample from the policy.
        act_input_n = list(act_ph_n)
        act_input_n[p_index] = act_pd.sample()
        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        if local_q_func:
            # Local Q function: own observation and own action only.
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)
        q_out = q_func(q_input,
                       1,
                       scope="q_func",
                       reuse=True,
                       num_units=num_units)
        q = q_out[:, 0]

        # Maximise Q (minimise its negative) plus a small regulariser.
        pg_loss = -tf.reduce_mean(q)
        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars,
                                            grad_norm_clipping)

        # Callable wrappers: training step, action sampling, raw policy output.
        train = U.function(inputs=obs_ph_n + act_ph_n,
                           outputs=loss,
                           updates=[optimize_expr])
        act = U.function(inputs=[own_obs_ph], outputs=act_sample)
        p_values = U.function([own_obs_ph], p)

        # Target policy network, its variables, and the op that moves them
        # towards the online policy variables.
        target_scope = "target_p_func" + suffix
        target_p = p_func(p_input,
                          int(own_pdtype.param_shape()[0]),
                          scope=target_scope,
                          num_units=num_units)
        target_p_func_vars = U.scope_vars(
            U.absolute_scope_name(target_scope))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        # Action sampled from the target policy.
        target_act_sample = own_pdtype.pdfromflat(target_p).sample()
        target_act = U.function(inputs=[own_obs_ph],
                                outputs=target_act_sample)

        debug = {'p_values': p_values, 'target_act': target_act}
        return act, train, update_target_p, debug
Пример #15
0
def p_train(make_obs_ph_n,
            act_space_n,
            p_index,
            p_func,
            q_func,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            num_units=64,
            scope="trainer",
            reuse=None):
    """
    Build the actor (policy) update graph for one agent.

    A sample drawn from the policy output replaces this agent's entry in the
    list of actions fed to the critic. The loss is the negated mean critic
    value plus ``1e-3`` times the mean squared policy output.

    Args:
        make_obs_ph_n (list): Observation placeholders, one per agent. The
            list is indexed by ``p_index`` and concatenated with the action
            placeholders.
        act_space_n (list): Action spaces, one per agent, each passed to
            ``make_pdtype``.
        p_index (int): Index of the agent whose policy is being trained.
        p_func (callable): Builds the policy network; invoked under the
            scopes "p_func" and "target_p_func".
        q_func (callable): Builds the critic; invoked with ``reuse=True``,
            so presumably the "q_func" variables must already exist.
        optimizer: Optimizer handed to ``tf_util.minimize_and_clip``.
        grad_norm_clipping: Clipping value forwarded to
            ``tf_util.minimize_and_clip``.
        local_q_func (bool): If true the critic only receives this agent's
            own observation and sampled action.
        num_units (int): Forwarded to ``p_func`` and ``q_func``.
        scope (str): Name of the enclosing variable scope.
        reuse: ``reuse`` argument of the enclosing variable scope.

    Returns:
        tuple: ``(act, train, update_target_p, p_debug)`` where ``act`` maps
        this agent's observation to a sampled action, ``train`` runs one
        policy update and returns the loss, ``update_target_p`` is whatever
        ``make_update_exp`` returns for the policy/target variable lists, and
        ``p_debug`` is a dict with the callables 'p_values' and 'target_act'.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # One action-distribution type per agent.
        pdtypes = [make_pdtype(space) for space in act_space_n]

        # Placeholders: observations come from the caller, actions are
        # created here.
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            pdtype.sample_placeholder([None], name="action" + str(idx))
            for idx, pdtype in enumerate(pdtypes)
        ]

        own_obs = obs_ph_n[p_index]
        own_pdtype = pdtypes[p_index]
        n_params = int(own_pdtype.param_shape()[0])

        # Online policy network.
        p = p_func(own_obs, n_params, scope="p_func", num_units=num_units)
        p_func_vars = tf_util.scope_vars(tf_util.absolute_scope_name("p_func"))

        # Interpret the raw outputs as distribution parameters.
        act_pd = own_pdtype.pdfromflat(p)
        act_sample = act_pd.sample()
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        # Copy of the action placeholders in which this agent's slot is
        # driven by its own policy, linking actor and critic.
        critic_act_n = list(act_ph_n)
        critic_act_n[p_index] = act_pd.sample()

        q_input = tf.concat(obs_ph_n + critic_act_n, 1)
        if local_q_func:
            q_input = tf.concat([own_obs, critic_act_n[p_index]], 1)
        q_out = q_func(q_input,
                       1,
                       scope="q_func",
                       reuse=True,
                       num_units=num_units)
        q = q_out[:, 0]

        pg_loss = -tf.reduce_mean(q)
        loss = pg_loss + p_reg * 1e-3

        optimize_expr = tf_util.minimize_and_clip(optimizer, loss, p_func_vars,
                                                  grad_norm_clipping)

        # Callables exposed to the caller.
        train = tf_util.function(inputs=obs_ph_n + act_ph_n,
                                 outputs=loss,
                                 updates=[optimize_expr])
        act = tf_util.function(inputs=[own_obs], outputs=act_sample)
        p_values = tf_util.function([own_obs], p)

        # Target policy network and its update expression.
        target_p = p_func(own_obs,
                          n_params,
                          scope="target_p_func",
                          num_units=num_units)
        target_p_func_vars = tf_util.scope_vars(
            tf_util.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = own_pdtype.pdfromflat(target_p).sample()
        target_act = tf_util.function(inputs=[own_obs],
                                      outputs=target_act_sample)

        p_debug = {'p_values': p_values, 'target_act': target_act}
        return act, train, update_target_p, p_debug
Пример #16
0
def q_train(make_obs_ph_n,
            act_space_n,
            q_index,
            q_func,
            u_func,
            optimizer,
            optimizer_lamda,
            exp_var_alpha=None,
            cvar_alpha=None,
            cvar_beta=None,
            grad_norm_clipping=None,
            local_q_func=False,
            scope="trainer",
            reuse=None,
            num_units=64,
            u_estimation=False,
            constrained=True,
            constraint_type=None,
            agent_type=None):
    """Build the critic update graph, optionally with a risk constraint.

    Three modes are supported:

    * ``constrained=False``: plain squared TD error; a CVaR term over ``q``
      is still built and exposed through ``train3`` and the 'cvar' entry.
    * ``constrained=True, constraint_type="Exp_Var"``: the TD target is
      shifted by ``lamda_constraint * (var - exp_var_alpha)``.
    * ``constrained=True, constraint_type="CVAR"``: the TD target is shifted
      by ``lamda_constraint * (cvar_alpha - cvar)``.

    Args:
        make_obs_ph_n (list): Observation placeholders, one per agent.
        act_space_n (list): Action spaces, each passed to ``make_pdtype``.
        q_index (int): Index of the agent; also used as a suffix for the
            constraint variable names.
        q_func (callable): Builds the critic ("q_func" / "target_q_func").
        u_func (callable): Builds the second-moment network ("u_func" /
            "target_u_func"); only called when ``u_estimation`` is true.
        optimizer: Optimizer for the critic (and for ``v_constraint``).
        optimizer_lamda: Optimizer for ``lamda_constraint``.
        exp_var_alpha: Threshold used by the "Exp_Var" constraint.
        cvar_alpha: Threshold used by the "CVAR" constraint.
        cvar_beta: Level used in the CVaR term ``1 / (1 - cvar_beta)``.
        grad_norm_clipping: Forwarded to ``U.minimize_and_clip``.
        local_q_func (bool): If true the critic only sees this agent's own
            observation and action.
        scope (str): Name of the enclosing variable scope.
        reuse: Accepted for interface compatibility but not used; the scope
            is always opened with ``tf.AUTO_REUSE``.
        num_units (int): Forwarded to ``q_func`` and ``u_func``.
        u_estimation (bool): Whether to build and train the ``u`` network.
        constrained (bool): Whether to add the Lagrangian constraint term.
        constraint_type (str): "Exp_Var" or "CVAR"; required when
            ``constrained`` is true.
        agent_type: Not used by this function.

    Returns:
        tuple: ``(train, train2, train3, update_target_q, update_target_u,
        debug)``. ``train2`` (multiplier update) is ``None`` when
        unconstrained, ``train3`` (``v_constraint`` update) is ``None`` unless
        the CVaR term is built, and ``update_target_u`` is ``None`` without
        ``u_estimation``. ``debug`` holds the callables / tensors
        'q_values', 'u_values', 'target_q_values', 'target_u_values', 'var',
        'cvar', 'lamda_constraint', 'v_constraint' and 'optimize_expr';
        entries that do not apply to the chosen mode are ``None``.

    Raises:
        ValueError: If ``constrained`` is true and ``constraint_type`` is not
            "Exp_Var" or "CVAR", or if a hyperparameter required by the
            chosen mode is ``None``.
    """
    # Validate the configuration before any graph nodes are created, so that
    # a bad call fails with a clear message instead of an unbound local or a
    # TypeError inside a tensor expression.
    if constrained and constraint_type not in ("Exp_Var", "CVAR"):
        raise ValueError(
            "constraint_type must be 'Exp_Var' or 'CVAR' when constrained is "
            "True, got %r" % (constraint_type,))
    use_cvar = (not constrained) or constraint_type == "CVAR"
    if use_cvar and cvar_beta is None:
        raise ValueError("cvar_beta is required when the CVaR term is built")
    if constrained and constraint_type == "CVAR" and cvar_alpha is None:
        raise ValueError("cvar_alpha is required for constraint_type 'CVAR'")
    if constrained and constraint_type == "Exp_Var" and exp_var_alpha is None:
        raise ValueError(
            "exp_var_alpha is required for constraint_type 'Exp_Var'")

    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        # Scalar variables for the constraint: Lagrange multiplier and the
        # CVaR auxiliary variable.
        lamda_constraint = None
        v_constraint = None
        if constrained:
            lamda_constraint = tf.get_variable(
                'lamda_constraint' + str(q_index), [1],
                initializer=tf.constant_initializer(1.0),
                dtype=tf.float32)
        if use_cvar:
            v_constraint = tf.get_variable(
                'v_constraint' + str(q_index), [1],
                initializer=tf.constant_initializer(1.0),
                dtype=tf.float32)

        # Action distribution types, one per agent.
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # Placeholders.
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]
        target_ph = tf.placeholder(tf.float32, [None], name="target")
        target_ph_u = None
        if u_estimation:
            target_ph_u = tf.placeholder(tf.float32, [None], name="target_u")
        rew = tf.placeholder(tf.float32, [None], name="reward")

        # Critic.
        q_input = tf.concat(obs_ph_n + act_ph_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)
        q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0]

        # Variance estimate: either from a learned second moment ``u`` or
        # directly from the one-step target.
        if u_estimation:
            # The u network always sees every agent's observation and action.
            u_input = tf.concat(obs_ph_n + act_ph_n, 1)
            u = u_func(u_input, 1, scope="u_func", num_units=num_units)[:, 0]
            u_loss = tf.reduce_mean(
                tf.square(
                    tf.square(rew) + 2 * tf.multiply(rew, target_ph) +
                    target_ph_u - u))
            var = u - tf.square(q)
        else:
            var = tf.square(rew + target_ph) - tf.square(q)

        if use_cvar:
            cvar = v_constraint + (1.0 / (1.0 - cvar_beta)) * tf.reduce_mean(
                tf.nn.relu(q - v_constraint))
            cvar_loss = tf.reduce_mean(cvar)

        # TD loss, with the constraint folded into the target when enabled.
        if constrained:
            if constraint_type == "Exp_Var":
                constraint = lamda_constraint * (var - exp_var_alpha)
            else:  # "CVAR" (validated above)
                constraint = lamda_constraint * (cvar_alpha - cvar)
            q_loss = tf.reduce_mean(
                tf.square((target_ph + rew + constraint) - q))
        else:
            q_loss = tf.reduce_mean(tf.square(q - (target_ph + rew)))

        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # Feed lists shared by the callables below.
        base_inputs = obs_ph_n + act_ph_n
        td_inputs = base_inputs + [target_ph, rew]
        if u_estimation:
            u_func_vars = U.scope_vars(U.absolute_scope_name("u_func"))
            learn_inputs = base_inputs + [target_ph, target_ph_u, rew]
            loss = q_loss + u_loss
            learn_vars = q_func_vars + u_func_vars
            learn_outputs = [q_loss, u_loss]
        else:
            learn_inputs = td_inputs
            loss = q_loss
            learn_vars = q_func_vars
            learn_outputs = q_loss

        optimize_expr = U.minimize_and_clip(optimizer, loss, learn_vars,
                                            grad_norm_clipping)
        train = U.function(inputs=list(learn_inputs),
                           outputs=learn_outputs,
                           updates=[optimize_expr])
        var_fn = U.function(inputs=list(learn_inputs), outputs=var)

        q_values = U.function(list(base_inputs), q)

        # Target critic.
        target_q = q_func(q_input,
                          1,
                          scope="target_q_func",
                          num_units=num_units)[:, 0]
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)
        target_q_values = U.function(list(base_inputs), target_q)

        # Target second-moment network.
        u_values = None
        update_target_u = None
        target_u_values = None
        if u_estimation:
            u_values = U.function(list(base_inputs), u)
            target_u = u_func(u_input,
                              1,
                              scope="target_u_func",
                              num_units=num_units)[:, 0]
            target_u_func_vars = U.scope_vars(
                U.absolute_scope_name("target_u_func"))
            update_target_u = make_update_exp(u_func_vars, target_u_func_vars)
            target_u_values = U.function(list(base_inputs), target_u)

        # Multiplier update: minimise the negated critic loss with respect
        # to lamda_constraint only.
        train2 = None
        if constrained:
            loss2 = -loss
            optimize_expr2 = U.minimize_and_clip(optimizer_lamda, loss2,
                                                 [lamda_constraint],
                                                 grad_norm_clipping)
            train2 = U.function(inputs=list(learn_inputs),
                                outputs=loss2,
                                updates=[optimize_expr2])

        # CVaR auxiliary variable update; never fed target_ph_u.
        train3 = None
        cvar_fn = None
        if use_cvar:
            optimize_expr3 = U.minimize_and_clip(optimizer, cvar_loss,
                                                 [v_constraint],
                                                 grad_norm_clipping)
            train3 = U.function(inputs=list(td_inputs),
                                outputs=cvar_loss,
                                updates=[optimize_expr3])
            cvar_fn = U.function(inputs=list(td_inputs), outputs=cvar)

        return train, train2, train3, update_target_q, update_target_u, {
            'q_values': q_values,
            'u_values': u_values,
            'target_q_values': target_q_values,
            'target_u_values': target_u_values,
            'var': var_fn,
            'cvar': cvar_fn,
            'lamda_constraint': lamda_constraint,
            'v_constraint': v_constraint,
            'optimize_expr': optimize_expr
        }
Пример #17
0
def _p_train(make_obs_ph_n,
             act_space_n,
             p_index,
             p_func,
             q_func,
             optimizer,
             q_lstm_on,
             p_lstm_on,
             centralized_p,
             grad_norm_clipping=None,
             local_q_func=False,
             num_units=64,
             scope="trainer",
             reuse=None,
             q_debug=None):
    """Build the actor update graph for one agent, with optional LSTM state.

    When ``p_lstm_on`` is true the policy input is the agent's observation
    concatenated with the actor LSTM state placeholders, and ``p_func`` is
    expected to return ``(output, state_out)``. When ``q_lstm_on`` is true
    the critic input additionally carries the critic LSTM state placeholders
    (repeated once per agent) and ``q_func`` is expected to return a pair.

    ``centralized_p``, ``local_q_func`` and ``q_debug`` are accepted but not
    used in this function.

    Returns:
        tuple: ``(act, train, update_target_p, p_debug)``; ``p_debug`` is a
        dict with the callables 'p_values' and 'target_act'. With
        ``p_lstm_on`` the ``act`` callable also outputs the policy state.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Action distribution types, one per agent.
        pdtypes = [make_pdtype(space) for space in act_space_n]

        # Placeholders.
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            pdtype.sample_placeholder([None, 1], name="action" + str(idx))
            for idx, pdtype in enumerate(pdtypes)
        ]

        q_res = 1
        p_res = int(pdtypes[p_index].param_shape()[0])
        n_obs = len(obs_ph_n)

        # Actor LSTM state; the same placeholder is repeated per agent.
        p_c_ph, p_h_ph = get_lstm_state_ph(name='p_',
                                           n_batches=None,
                                           num_units=num_units)
        p_c_ph_n = [p_c_ph] * n_obs
        p_h_ph_n = [p_h_ph] * n_obs
        # Critic LSTM state, repeated the same way.
        q_c_ph, q_h_ph = get_lstm_state_ph(name='q_',
                                           n_batches=None,
                                           num_units=num_units)
        q_c_ph_n = [q_c_ph] * n_obs
        q_h_ph_n = [q_h_ph] * n_obs

        own_obs = obs_ph_n[p_index]

        # Online policy network.
        if p_lstm_on:
            p_input = tf.concat([own_obs, p_c_ph, p_h_ph], -1)
            p, p_state_out = p_func(p_input,
                                    p_res,
                                    scope="p_func",
                                    num_units=num_units)
        else:
            p_input = own_obs
            p = p_func(p_input, p_res, scope="p_func", num_units=num_units)

        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # Interpret the raw outputs as distribution parameters.
        act_pd = pdtypes[p_index].pdfromflat(p)
        act_sample = act_pd.sample()
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        # This agent's action slot is driven by its own policy.
        critic_act_n = list(act_ph_n)
        critic_act_n[p_index] = act_pd.sample()

        # Critic evaluation (variables reused from an existing "q_func").
        if q_lstm_on:
            q_input = tf.concat(
                obs_ph_n + critic_act_n + q_c_ph_n + q_h_ph_n, -1)
            q, _ = q_func(q_input,
                          q_res,
                          scope="q_func",
                          num_units=num_units,
                          reuse=True)
        else:
            q_input = tf.concat(obs_ph_n + critic_act_n, -1)
            q = q_func(q_input,
                       q_res,
                       scope="q_func",
                       num_units=num_units,
                       reuse=True)

        q = q[:, 0]
        loss = -tf.reduce_mean(q) + p_reg * 1e-3
        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars,
                                            grad_norm_clipping)

        # Training feed: observations and actions, then critic state, then
        # actor state, each only when the corresponding LSTM is enabled.
        train_inputs = obs_ph_n + act_ph_n
        if q_lstm_on:
            train_inputs = train_inputs + q_c_ph_n + q_h_ph_n
        if p_lstm_on:
            train_inputs = train_inputs + p_c_ph_n + p_h_ph_n
        train = U.function(inputs=train_inputs,
                           outputs=loss,
                           updates=[optimize_expr])

        # Acting / debugging callables and the target policy network.
        if p_lstm_on:
            policy_inputs = [own_obs, p_c_ph, p_h_ph]
            act = U.function(inputs=list(policy_inputs),
                             outputs=[act_sample, p_state_out])
            p_values = U.function(inputs=list(policy_inputs), outputs=p)
            target_p, _ = p_func(p_input,
                                 p_res,
                                 scope="target_p_func",
                                 num_units=num_units)
        else:
            policy_inputs = [own_obs]
            act = U.function(inputs=list(policy_inputs), outputs=act_sample)
            p_values = U.function(inputs=list(policy_inputs), outputs=p)
            target_p = p_func(p_input,
                              p_res,
                              scope="target_p_func",
                              num_units=num_units)

        target_p_func_vars = U.scope_vars(
            U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = pdtypes[p_index].pdfromflat(target_p).sample()
        target_act = U.function(inputs=list(policy_inputs),
                                outputs=target_act_sample)

        p_debug = {'p_values': p_values, 'target_act': target_act}
        return act, train, update_target_p, p_debug
Пример #18
0
def p_train(make_obs_ph_n,
            act_space_n,
            p_index,
            p_func,
            q_func,
            optimizer,
            adversarial,
            adv_eps,
            adv_eps_s,
            num_adversaries,
            grad_norm_clipping=None,
            local_q_func=False,
            num_units=64,
            scope="trainer",
            reuse=None):
    """Build the actor update graph for one agent.

    A sample from the policy output replaces this agent's entry in the
    action list fed to the critic; the loss is the negated mean critic value
    plus ``1e-3`` times the mean squared policy output.

    ``adversarial``, ``adv_eps``, ``adv_eps_s`` and ``num_adversaries`` are
    part of the signature but are not read by the active code.

    Returns:
        tuple: ``(act, train, update_target_p, p_debug)``; ``p_debug`` is a
        dict with the callables 'p_values' and 'target_act'.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Action distribution types, one per agent.
        pdtypes = [make_pdtype(space) for space in act_space_n]

        # Placeholders.
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            pdtype.sample_placeholder([None], name="action" + str(idx))
            for idx, pdtype in enumerate(pdtypes)
        ]

        own_obs = obs_ph_n[p_index]
        own_pdtype = pdtypes[p_index]
        n_params = int(own_pdtype.param_shape()[0])

        # Online policy network.
        p = p_func(own_obs, n_params, scope="p_func", num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # Interpret the raw outputs as distribution parameters.
        act_pd = own_pdtype.pdfromflat(p)
        act_sample = act_pd.sample()
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        # This agent's action slot is driven by its own policy.
        critic_act_n = list(act_ph_n)
        critic_act_n[p_index] = act_pd.sample()

        q_input = tf.concat(obs_ph_n + critic_act_n, 1)
        if local_q_func:
            q_input = tf.concat([own_obs, critic_act_n[p_index]], 1)
        q_out = q_func(q_input,
                       1,
                       scope="q_func",
                       reuse=True,
                       num_units=num_units)
        q = q_out[:, 0]
        pg_loss = -tf.reduce_mean(q)

        # NOTE: a disabled experiment used to sit here. It perturbed the
        # other agents' action inputs along the L2-normalised gradient of
        # pg_loss before evaluating the critic again. It is not active.

        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars,
                                            grad_norm_clipping)

        # Callables exposed to the caller.
        train = U.function(inputs=obs_ph_n + act_ph_n,
                           outputs=loss,
                           updates=[optimize_expr])
        act = U.function(inputs=[own_obs], outputs=act_sample)
        p_values = U.function([own_obs], p)

        # Target policy network and its update expression.
        target_p = p_func(own_obs,
                          n_params,
                          scope="target_p_func",
                          num_units=num_units)
        target_p_func_vars = U.scope_vars(
            U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = own_pdtype.pdfromflat(target_p).sample()
        target_act = U.function(inputs=[own_obs], outputs=target_act_sample)

        p_debug = {'p_values': p_values, 'target_act': target_act}
        return act, train, update_target_p, p_debug
Пример #19
0
def _q_train(make_obs_ph_n,
             act_space_n,
             q_index,
             q_func,
             optimizer,
             q_lstm_on,
             p_lstm_on,
             centralized_p,
             grad_norm_clipping=None,
             local_q_func=False,
             scope="trainer",
             reuse=None,
             num_units=64):
    """Build the critic update graph, with optional LSTM state inputs.

    The critic input is every observation and action placeholder
    concatenated on the last axis; with ``q_lstm_on`` the critic LSTM state
    placeholders (repeated once per observation) are appended and ``q_func``
    is expected to return ``(output, state_out)``. The loss is the mean
    squared difference between the critic output and the ``target``
    placeholder.

    ``p_lstm_on``, ``centralized_p`` and ``local_q_func`` are accepted but
    not used in this function.

    Returns:
        tuple: ``(train, update_target_q, q_debug)``; ``q_debug`` is a dict
        with the callables 'q_values' and 'target_q_values'. With
        ``q_lstm_on`` the 'q_values' callable also outputs the critic state.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Action distribution types, one per agent.
        pdtypes = [make_pdtype(space) for space in act_space_n]

        # Placeholders.
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            pdtype.sample_placeholder([None, 1], name="action" + str(idx))
            for idx, pdtype in enumerate(pdtypes)
        ]
        target_ph = tf.placeholder(tf.float32, [None, 1], name="target")

        q_res = 1
        # Not used below; kept so evaluation matches the original exactly.
        p_res = int(pdtypes[q_index].param_shape()[0])

        # Actor LSTM state placeholders are still created (they are graph
        # nodes) even though the critic does not consume them.
        p_c_ph, p_h_ph = get_lstm_state_ph(name='p_',
                                           n_batches=None,
                                           num_units=num_units)
        # Critic LSTM state; the same placeholder is repeated per agent.
        q_c_ph, q_h_ph = get_lstm_state_ph(name='q_',
                                           n_batches=None,
                                           num_units=num_units)
        n_obs = len(obs_ph_n)
        q_c_ph_n = [q_c_ph] * n_obs
        q_h_ph_n = [q_h_ph] * n_obs

        # Everything the critic is fed.
        critic_inputs = obs_ph_n + act_ph_n
        if q_lstm_on:
            critic_inputs = critic_inputs + q_c_ph_n + q_h_ph_n
        q_input = tf.concat(critic_inputs, -1)

        if q_lstm_on:
            q, q_state_out = q_func(q_input,
                                    q_res,
                                    scope="q_func",
                                    num_units=num_units)
        else:
            q = q_func(q_input, q_res, scope="q_func", num_units=num_units)

        q = q[:, 0]

        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        q_loss = tf.reduce_mean(tf.square(q - target_ph))

        # Regulariser is built but not added to the loss.
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss  #+ 1e-3 * q_reg

        optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars,
                                            grad_norm_clipping)

        # Callables and the target critic.
        q_outputs = [q, q_state_out] if q_lstm_on else q
        q_values = U.function(inputs=list(critic_inputs), outputs=q_outputs)
        train = U.function(inputs=critic_inputs + [target_ph],
                           outputs=loss,
                           updates=[optimize_expr])
        if q_lstm_on:
            target_q, _ = q_func(q_input,
                                 q_res,
                                 scope="target_q_func",
                                 num_units=num_units)
        else:
            target_q = q_func(q_input,
                              q_res,
                              scope="target_q_func",
                              num_units=num_units)

        target_q = target_q[:, 0]
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)

        target_q_values = U.function(inputs=list(critic_inputs),
                                     outputs=target_q)

        q_debug = {'q_values': q_values, 'target_q_values': target_q_values}
        return train, update_target_q, q_debug
Пример #20
0
def q_train(name,
            make_obs_ph_n,
            adj_n,
            act_space_n,
            num_adversaries,
            neighbor_n,
            q_func,
            agent_n,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            reuse=None,
            scope="trainer",
            num_units=64):
    """Build one critic per agent of a species over a shared input.

    The species size is ``num_adversaries`` when ``name == "adversaries"``
    and ``agent_n - num_adversaries`` otherwise. Every critic reads the same
    concatenation of all observation and action placeholders and is trained
    against its own target placeholder with a mean squared error.

    ``adj_n``, ``neighbor_n`` and ``local_q_func`` are accepted but not used
    in this function.

    Returns:
        tuple: ``(train, update_target_q, q_values, target_q_values)`` where
        ``train`` is a list with one training callable per critic,
        ``update_target_q`` is the result of ``make_update_exp`` called with
        ``central=False``, and the last two are callables returning the
        per-critic online / target values.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Action distribution types, one per agent.
        pdtypes = [make_pdtype(space) for space in act_space_n]

        # Number of agents in this species.
        if name == "adversaries":
            agent_n_species = num_adversaries
        else:
            agent_n_species = agent_n - num_adversaries
        members = range(agent_n_species)

        # Placeholders.
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            pdtype.sample_placeholder([None], name="action" + str(idx))
            for idx, pdtype in enumerate(pdtypes)
        ]
        target_ph = [
            tf.placeholder(tf.float32, [None], name="target") for _ in members
        ]

        # Online critics, all reading the same joint input.
        q_input = tf.concat(obs_ph_n + act_ph_n, 1)
        q = [
            q_func(q_input, 1, scope="q_func_%d" % k,
                   num_units=num_units)[:, 0] for k in members
        ]
        q_func_vars = [
            U.scope_vars(U.absolute_scope_name("q_func_%d" % k))
            for k in members
        ]

        # One loss and one optimisation step per critic.
        q_loss = [
            tf.reduce_mean(tf.square(q[k] - target_ph[k])) for k in members
        ]
        optimize_expr = [
            U.minimize_and_clip(optimizer, q_loss[k], q_func_vars[k],
                                grad_norm_clipping) for k in members
        ]

        # Callables exposed to the caller.
        train = [
            U.function(inputs=obs_ph_n + act_ph_n + [target_ph[k]],
                       outputs=q_loss[k],
                       updates=[optimize_expr[k]]) for k in members
        ]
        q_values = U.function(obs_ph_n + act_ph_n, q)

        # Target critics.
        target_q = [
            q_func(q_input,
                   1,
                   scope="target_q_func_%d" % k,
                   num_units=num_units)[:, 0] for k in members
        ]
        target_q_func_vars = [
            U.scope_vars(U.absolute_scope_name("target_q_func_%d" % k))
            for k in members
        ]
        update_target_q = make_update_exp(q_func_vars,
                                          target_q_func_vars,
                                          central=False)

        target_q_values = U.function(obs_ph_n + act_ph_n, target_q)

        return train, update_target_q, q_values, target_q_values
Пример #21
0
def q_train(make_obs_ph_n,
            make_meesages_ph_n,
            act_space_n,
            q_func,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            scope="trainer",
            reuse=None,
            num_units=64):
    """Build one critic per agent over observations, messages and actions.

    All critics read the same concatenation of the observation, message and
    action placeholders. Their mean squared errors against per-agent target
    placeholders are summed into one loss, which a single optimisation step
    minimises over the variables of every critic.

    ``local_q_func`` is accepted but not used in this function.

    Returns:
        tuple: ``(train, update_target_q, q_debug)``; ``q_debug`` is a dict
        with the callables 'q_values' and 'target_q_values', each returning
        one value tensor per agent.
    """
    with tf.variable_scope(scope, reuse=reuse):
        num_agents = len(make_obs_ph_n)
        agents = range(num_agents)

        # Action distribution types, one per agent.
        pdtypes = [make_pdtype(space) for space in act_space_n]

        # Placeholders.
        obs_ph_n = make_obs_ph_n
        messages_ph_n = make_meesages_ph_n
        act_ph_n = [
            pdtype.sample_placeholder([None], name="action_{}".format(idx))
            for idx, pdtype in enumerate(pdtypes)
        ]
        target_ph_n = [
            tf.placeholder(tf.float32, [None], name="target_{}".format(k))
            for k in agents
        ]

        # Joint critic input shared by all critics.
        critic_inputs = obs_ph_n + messages_ph_n + act_ph_n
        q_input = tf.concat(critic_inputs, 1)

        # Online critics.
        q_n = []
        for k in agents:
            q_out = q_func(q_input,
                           1,
                           scope="q_func_{}".format(k),
                           num_units=num_units)
            q_n.append(q_out[:, 0])
        q_func_vars = [
            U.scope_vars(U.absolute_scope_name("q_func_{}".format(k)))
            for k in agents
        ]

        # Per-agent squared errors, summed into a single loss.
        q_loss_n = []
        for q_k, target_k in zip(q_n, target_ph_n):
            q_loss_n.append(tf.reduce_mean(tf.square(q_k - target_k)))
        loss = tf.reduce_sum(q_loss_n)

        var_list = list(itertools.chain.from_iterable(q_func_vars))
        optimize_expr = U.minimize_and_clip(optimizer, loss, var_list,
                                            grad_norm_clipping)

        # Callables exposed to the caller.
        train = U.function(inputs=critic_inputs + target_ph_n,
                           outputs=loss,
                           updates=[optimize_expr])
        q_values = U.function(list(critic_inputs), q_n)

        # Target critics.
        target_q_n = []
        for k in agents:
            target_out = q_func(q_input,
                                1,
                                scope="target_q_func_{}".format(k),
                                num_units=num_units)
            target_q_n.append(target_out[:, 0])
        target_q_func_vars = [
            U.scope_vars(U.absolute_scope_name("target_q_func_{}".format(k)))
            for k in agents
        ]

        target_var_list = list(
            itertools.chain.from_iterable(target_q_func_vars))
        update_target_q = make_update_exp(var_list, target_var_list)

        target_q_values = U.function(list(critic_inputs), target_q_n)

        q_debug = {'q_values': q_values, 'target_q_values': target_q_values}
        return train, update_target_q, q_debug
Пример #22
0
def p_train_recurrent(make_obs_ph_n,
                      make_state_ph_n,
                      make_obs_next_n,
                      make_obs_pred_n,
                      act_space_n,
                      p_index,
                      p_policy,
                      p_predict,
                      q_func,
                      optimizer,
                      grad_norm_clipping=None,
                      local_q_func=False,
                      num_units=64,
                      scope="trainer",
                      reuse=None):
    """Build the training graph for one agent's recurrent policy and predictor.

    All ops are created under ``tf.variable_scope(scope, reuse=reuse)``.
    ``p_policy`` yields action-distribution parameters, a GRU output and a new
    recurrent state; ``p_predict`` maps (own action, GRU output) to a
    predicted next observation. The policy loss is
    ``-mean(Q) + 1e-3 * p_reg + 1e-3 * pred_loss``.

    Args:
        make_obs_ph_n: Per-agent observation placeholders.
        make_state_ph_n: Per-agent recurrent-state placeholders.
        make_obs_next_n: Per-agent next-observation placeholders; the entry at
            ``p_index`` is the regression target of the predictor.
        make_obs_pred_n: Per-agent placeholders passed to ``p_policy`` as its
            third input (presumably a previously predicted observation --
            TODO confirm with the caller).
        act_space_n: Per-agent action spaces, handed to ``make_pdtype``.
        p_index: Index of the agent whose policy is being built.
        p_policy: Builder of the policy network.
        p_predict: Builder of the observation-prediction network.
        q_func: Builder of the critic. It is invoked with ``reuse=True``, so
            the ``q_func`` variables are expected to exist already.
        optimizer: Optimizer forwarded to ``U.minimize_and_clip``.
        grad_norm_clipping: Clipping value forwarded to ``U.minimize_and_clip``.
        local_q_func: When true the critic only sees this agent's observation
            and action instead of all agents' observations and actions.
        num_units: Width forwarded to every network builder; also the width of
            the GRU-output placeholder used by the ``predict`` functions.
        scope: Variable scope name wrapping the whole graph.
        reuse: Reuse flag for that variable scope.

    Returns:
        ``(step, predict, train, update_target_p, extras)`` where ``extras``
        holds the callables ``'p_values'``, ``'target_step'`` and
        ``'target_predict'``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Caller-supplied placeholders, one entry per agent.
        # NOTE(review): the original comment described obs as
        # agent * batch * time_step * obs_shape; the code below reads the
        # observation size from axis 1 -- confirm the actual layout.
        obs_ph_n = make_obs_ph_n
        obs_next_n = make_obs_next_n
        state_ph_n = make_state_ph_n
        obs_pred_n = make_obs_pred_n

        # One distribution type and one action placeholder per agent.
        act_pdtype_n = [make_pdtype(space) for space in act_space_n]
        act_ph_n = [
            pdtype.sample_placeholder([None], name="action" + str(idx))
            for idx, pdtype in enumerate(act_pdtype_n)
        ]

        # Tensors belonging to the agent being trained.
        obs_input = obs_ph_n[p_index]
        state_input = state_ph_n[p_index]
        act_input = act_ph_n[p_index]
        obs_next = obs_next_n[p_index]
        obs_pred_input = obs_pred_n[p_index]
        own_pdtype = act_pdtype_n[p_index]
        obs_dim = int(obs_input.shape[1])

        # Online policy: distribution parameters, GRU output, next state.
        flat_params, gru_out, state = p_policy(
            obs_input,
            state_input,
            obs_pred_input,
            int(own_pdtype.param_shape()[0]),
            scope="p_policy",
            num_units=num_units)
        act_pd = own_pdtype.pdfromflat(flat_params)
        act_sample = act_pd.sample()

        # Online predictor of the next observation.
        obs_pred = p_predict(act_input,
                             gru_out,
                             obs_dim,
                             scope="p_predict",
                             num_units=num_units)

        # Trainable variables: policy first, then predictor.
        policy_vars = U.scope_vars(U.absolute_scope_name("p_policy"))
        predictor_vars = U.scope_vars(U.absolute_scope_name("p_predict"))
        p_func_vars = policy_vars + predictor_vars

        pred_loss = tf.reduce_mean(tf.square(obs_next - obs_pred))
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        # Feed the critic every agent's action placeholder, except that this
        # agent's slot is replaced by a fresh sample from its own policy so
        # the critic's gradient reaches the policy parameters.
        act_input_n = list(act_ph_n)
        act_input_n[p_index] = act_pd.sample()
        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_input, act_input_n[p_index]], 1)
        q_out = q_func(q_input,
                       1,
                       scope="q_func",
                       reuse=True,
                       num_units=num_units)[:, 0]
        pg_loss = -tf.reduce_mean(q_out)  # minimising this maximises Q

        total_loss = pg_loss + p_reg * 1e-3 + pred_loss * 1e-3
        optimize_expr = U.minimize_and_clip(optimizer, total_loss,
                                            p_func_vars, grad_norm_clipping)

        # Callables on the online networks.
        train = U.function(
            inputs=obs_ph_n + state_ph_n + act_ph_n + obs_next_n + obs_pred_n,
            outputs=total_loss,
            updates=[optimize_expr])
        local_inputs = [obs_input, state_input, obs_pred_input]
        step = U.function(inputs=local_inputs,
                          outputs=[act_sample, state, gru_out])
        p_values = U.function(inputs=local_inputs, outputs=flat_params)

        # Target networks, built on the same inputs under separate scopes.
        target_flat, target_gru_out, target_state = p_policy(
            obs_input,
            state_input,
            obs_pred_input,
            int(own_pdtype.param_shape()[0]),
            scope="target_p_policy",
            num_units=num_units)
        # Result is unused, but this call instantiates the target predictor
        # before its variables are collected below.
        target_obs_pred = p_predict(act_input,
                                    target_gru_out,
                                    obs_dim,
                                    scope="target_p_predict",
                                    num_units=num_units)

        target_policy_vars = U.scope_vars(
            U.absolute_scope_name("target_p_policy"))
        target_predictor_vars = U.scope_vars(
            U.absolute_scope_name("target_p_predict"))
        target_p_func_vars = target_policy_vars + target_predictor_vars
        # Original note on this update: θ'i = τθi + (1 − τ)θ'i
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)
        target_act_sample = own_pdtype.pdfromflat(target_flat).sample()

        target_step = U.function(
            inputs=local_inputs,
            outputs=[target_act_sample, target_state, target_gru_out])

        # Stand-alone prediction: the GRU output is fed through a placeholder
        # instead of being recomputed by the policy network.
        gru_temp = tf.placeholder(tf.float32, [None] + [num_units],
                                  name='gru_out')
        predict_inputs = [act_input, gru_temp]
        pred_temp = p_predict(act_input,
                              gru_temp,
                              obs_dim,
                              scope="p_predict",
                              num_units=num_units)
        predict = U.function(inputs=predict_inputs, outputs=pred_temp)
        target_pred_temp = p_predict(act_input,
                                     gru_temp,
                                     obs_dim,
                                     scope="target_p_predict",
                                     num_units=num_units)
        target_predict = U.function(inputs=predict_inputs,
                                    outputs=target_pred_temp)

        extras = {
            'p_values': p_values,
            'target_step': target_step,
            'target_predict': target_predict
        }
        return step, predict, train, update_target_p, extras
Пример #23
0
def p_train(make_obs_ph_n,
            make_meesages_ph_n,
            act_space_n,
            p_func,
            q_func,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            num_units=64,
            scope="trainer",
            reuse=None,
            beta=0.01):
    """Build one joint training graph for all agents' message-passing policies.

    Each agent ``i`` gets its own policy network ``p_func_i`` that receives
    ``[obs_i, concat(all messages)]`` and returns a 4-tuple: action
    distribution parameters, an outgoing message, and the message's ``mu`` and
    ``logvar`` tensors. All policies are optimised together on::

        sum_i(-mean(Q_i)) + 1e-3 * sum_i(p_reg_i) + beta * mean(KL)

    Args:
        make_obs_ph_n: Per-agent observation placeholders; its length defines
            the number of agents.
        make_meesages_ph_n: Per-agent message placeholders (parameter name
            kept as-is, typo included, for keyword-argument compatibility).
        act_space_n: Per-agent action spaces, handed to ``make_pdtype``.
        p_func: Builder of a policy network (see the 4-tuple above).
        q_func: Builder of a critic. Called with ``reuse=True`` on scopes
            ``q_func_<i>``, so those variables are expected to exist already.
        optimizer: Optimizer forwarded to ``U.minimize_and_clip``.
        grad_norm_clipping: Clipping value forwarded to
            ``U.minimize_and_clip``.
        local_q_func: Accepted for signature compatibility; not used here.
        num_units: Width forwarded to the network builders.
        scope: Variable scope name wrapping the whole graph.
        reuse: Reuse flag for that variable scope.
        beta: Weight of the message KL term in the loss.

    Returns:
        ``(act, train, update_target_p, extras)`` where ``extras`` holds the
        callables ``'p_values'`` and ``'target_act'``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        num_agents = len(make_obs_ph_n)

        # One action distribution type per agent.
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # Placeholders.
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(num_agents)
        ]
        messages_ph_n = make_meesages_ph_n

        # Every policy (online and target) sees the same concatenation of all
        # messages, so build that tensor once instead of once per network.
        all_messages = tf.concat(messages_ph_n, 1)

        items = [
            p_func([obs_ph_n[i], all_messages],
                   int(act_pdtype_n[i].param_shape()[0]),
                   scope="p_func_{}".format(i),
                   num_units=num_units) for i in range(num_agents)
        ]
        p_n, message_n, mu_message_n, logvar_message_n = zip(*items)

        # Clamp so the KL term below cannot blow up.
        logvar_message_n = [
            tf.clip_by_value(log, -10, 10) for log in logvar_message_n
        ]

        p_func_vars = [
            U.scope_vars(U.absolute_scope_name("p_func_{}".format(i)))
            for i in range(num_agents)
        ]

        # Wrap the raw network outputs in distributions.
        act_pd_n = [
            act_pdtype_n[i].pdfromflat(p_n[i]) for i in range(num_agents)
        ]
        act_sample_n = [act_pd.sample() for act_pd in act_pd_n]
        p_reg_n = [
            tf.reduce_mean(tf.square(act_pd.flatparam()))
            for act_pd in act_pd_n
        ]

        # Critic i sees every agent's action placeholder except its own slot,
        # which is replaced by a fresh sample from policy i so that the
        # critic's gradient reaches that policy.
        act_input_n_n = [list(act_ph_n) for _ in range(num_agents)]
        for i in range(num_agents):
            act_input_n_n[i][i] = act_pd_n[i].sample()
        q_input_n = [
            tf.concat(obs_ph_n + messages_ph_n + act_input_n, 1)
            for act_input_n in act_input_n_n
        ]

        q_n = [
            q_func(q_input_n[i],
                   1,
                   scope="q_func_{}".format(i),
                   reuse=True,
                   num_units=num_units)[:, 0] for i in range(num_agents)
        ]
        pg_loss_n = [-tf.reduce_mean(q) for q in q_n]

        # NOTE(review): as written this is the closed-form
        # KL(N(mu, sigma^2) || N(0, 1)) when `log` is the log *standard
        # deviation*, despite the "logvar" naming -- confirm against p_func.
        kl_loss_message_n = [
            0.5 * (tf.pow(mu, 2) + tf.pow(tf.exp(log), 2)) - log - 0.5
            for mu, log in zip(mu_message_n, logvar_message_n)
        ]
        kl_loss_message = tf.reduce_mean(kl_loss_message_n)

        pg_loss = tf.reduce_sum(pg_loss_n)
        p_reg = tf.reduce_sum(p_reg_n)
        loss = pg_loss + p_reg * 1e-3 + beta * kl_loss_message

        # Flat list of all agents' policy variables, agent by agent.
        var_list = list(itertools.chain.from_iterable(p_func_vars))
        optimize_expr = U.minimize_and_clip(optimizer, loss, var_list,
                                            grad_norm_clipping)

        # Callables on the online networks.
        train = U.function(inputs=obs_ph_n + messages_ph_n + act_ph_n,
                           outputs=loss,
                           updates=[optimize_expr])
        act = U.function(inputs=obs_ph_n + messages_ph_n,
                         outputs=[act_sample_n, message_n])
        p_values = U.function(inputs=obs_ph_n + messages_ph_n, outputs=p_n)

        # Target networks: same inputs, separate variable scopes.
        target_items = [
            p_func([obs_ph_n[i], all_messages],
                   int(act_pdtype_n[i].param_shape()[0]),
                   scope="target_p_func_{}".format(i),
                   num_units=num_units) for i in range(num_agents)
        ]
        # Only the action parameters and messages of the target nets are
        # consumed; their mu/logvar outputs take no part in any loss.
        target_p_n, target_message_n, _, _ = zip(*target_items)

        target_p_func_vars = [
            U.scope_vars(U.absolute_scope_name("target_p_func_{}".format(i)))
            for i in range(num_agents)
        ]
        target_var_list = list(
            itertools.chain.from_iterable(target_p_func_vars))
        update_target_p = make_update_exp(var_list, target_var_list)

        target_act_sample_n = [
            act_pdtype_n[i].pdfromflat(target_p_n[i]).sample()
            for i in range(num_agents)
        ]
        target_act = U.function(
            inputs=obs_ph_n + messages_ph_n,
            outputs=[target_act_sample_n, target_message_n])

        return act, train, update_target_p, {
            'p_values': p_values,
            'target_act': target_act
        }
Пример #24
0
def p_train(make_obs_ph_n,
            act_space_n,
            p_index,
            p_func,
            q_func,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            num_units=64,
            scope="trainer",
            reuse=None):
    """Build the policy (actor) training graph for the agent at ``p_index``.

    The policy is trained to maximise the critic's value, with a small
    penalty on the size of its raw outputs:
    ``loss = -mean(Q) + 1e-3 * mean(flatparam ** 2)``.

    Args:
        make_obs_ph_n: Per-agent observation placeholders.
        act_space_n: Per-agent action spaces, handed to ``make_pdtype``.
        p_index: Index of the agent whose policy is being built.
        p_func: Builder of the policy network.
        q_func: Builder of the critic. Called with ``reuse=True``, so the
            ``q_func`` variables are expected to exist already.
        optimizer: Optimizer forwarded to ``U.minimize_and_clip``.
        grad_norm_clipping: Clipping value forwarded to
            ``U.minimize_and_clip``.
        local_q_func: When true the critic only sees this agent's observation
            and action instead of all agents' observations and actions.
        num_units: Width forwarded to the network builders.
        scope: Variable scope name wrapping the whole graph.
        reuse: Reuse flag for that variable scope.

    Returns:
        ``(act, train, update_target_p, extras)`` where ``extras`` holds the
        callables ``'p_values'`` and ``'target_act'``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # One distribution type and one action placeholder per agent.
        act_pdtype_n = [make_pdtype(space) for space in act_space_n]
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            pdtype.sample_placeholder([None], name="action" + str(idx))
            for idx, pdtype in enumerate(act_pdtype_n)
        ]

        own_pdtype = act_pdtype_n[p_index]
        p_input = obs_ph_n[p_index]  # the policy only sees its own observation

        # Online policy network and its variables.
        flat_params = p_func(p_input,
                             int(own_pdtype.param_shape()[0]),
                             scope="p_func",
                             num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # Wrap the raw outputs in an action distribution.
        act_pd = own_pdtype.pdfromflat(flat_params)
        act_sample = act_pd.sample()
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        # The critic receives every agent's action placeholder, except that
        # this agent's slot is a fresh sample from its own policy; that is
        # what links the critic's gradient to the policy parameters.
        act_input_n = list(act_ph_n)
        act_input_n[p_index] = act_pd.sample()
        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        if local_q_func:
            q_input = tf.concat([p_input, act_input_n[p_index]], 1)
        q_out = q_func(q_input,
                       1,
                       scope="q_func",
                       reuse=True,
                       num_units=num_units)[:, 0]
        pg_loss = -tf.reduce_mean(q_out)

        total_loss = pg_loss + p_reg * 1e-3
        optimize_expr = U.minimize_and_clip(optimizer, total_loss,
                                            p_func_vars, grad_norm_clipping)

        # Callables on the online network.
        train = U.function(inputs=obs_ph_n + act_ph_n,
                           outputs=total_loss,
                           updates=[optimize_expr])
        act = U.function(inputs=[p_input], outputs=act_sample)
        p_values = U.function([p_input], flat_params)

        # Target policy network under its own scope.
        target_flat = p_func(p_input,
                             int(own_pdtype.param_shape()[0]),
                             scope="target_p_func",
                             num_units=num_units)
        target_p_func_vars = U.scope_vars(
            U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = own_pdtype.pdfromflat(target_flat).sample()
        target_act = U.function(inputs=[p_input], outputs=target_act_sample)

        extras = {'p_values': p_values, 'target_act': target_act}
        return act, train, update_target_p, extras
Пример #25
0
def q_train(make_obs_ph_n,
            act_space_n,
            q_index,
            q_func,
            u_func,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            scope="trainer",
            reuse=None,
            num_units=64,
            u_estimation=False):
    """Build the critic training graph, optionally with a second-moment head.

    The critic ``q`` regresses onto ``rew + target_ph``. With
    ``u_estimation`` enabled, a second network ``u`` regresses onto
    ``rew**2 + 2 * rew * target_ph + target_ph_u`` and the exposed variance
    estimate is ``u - q**2``; otherwise it is ``(rew + target_ph)**2 - q**2``.

    Args:
        make_obs_ph_n: Per-agent observation placeholders.
        act_space_n: Per-agent action spaces, handed to ``make_pdtype``.
        q_index: Index of the agent this critic belongs to (only used to pick
            the local inputs when ``local_q_func`` is true).
        q_func: Builder of the Q network.
        u_func: Builder of the U network; only called when ``u_estimation``.
        optimizer: Optimizer forwarded to ``U.minimize_and_clip``.
        grad_norm_clipping: Clipping value forwarded to
            ``U.minimize_and_clip``.
        local_q_func: When true, Q only sees agent ``q_index``'s observation
            and action. The U network always sees all of them.
        scope: Variable scope name wrapping the whole graph.
        reuse: Reuse flag for that variable scope.
        num_units: Width forwarded to the network builders.
        u_estimation: Enables the U network, its loss and its target copy.

    Returns:
        Without ``u_estimation``: ``(train, update_target_q, extras)`` with
        ``extras`` keys ``'q_values'``, ``'var'``, ``'target_q_values'``.
        With ``u_estimation``:
        ``(train, update_target_q, update_target_u, extras)`` where ``extras``
        additionally has ``'u_values'`` and ``'target_u_values'`` and
        ``train`` returns ``[q_loss, u_loss]``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # One action distribution type per agent.
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # Placeholders.
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]
        target_ph = tf.placeholder(tf.float32, [None], name="target")
        rew = tf.placeholder(tf.float32, [None], name="reward")
        if u_estimation:
            target_ph_u = tf.placeholder(tf.float32, [None], name="target_u")

        # Q network on all observations/actions, or only the local ones.
        q_input = tf.concat(obs_ph_n + act_ph_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)
        q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0]
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        if u_estimation:
            # U network and its regression target (see docstring).
            u_input = tf.concat(obs_ph_n + act_ph_n, 1)
            u = u_func(u_input, 1, scope="u_func", num_units=num_units)[:, 0]
            u_func_vars = U.scope_vars(U.absolute_scope_name("u_func"))
            u_loss = tf.reduce_mean(
                tf.square(
                    tf.square(rew) + 2 * tf.multiply(rew, target_ph) +
                    target_ph_u - u))
            var = u - tf.square(q)
        else:
            var = tf.square(rew + target_ph) - tf.square(q)

        q_loss = tf.reduce_mean(tf.square(q - (rew + target_ph)))

        # Optional regulariser, currently not part of the loss.
        # viscosity solution to Bellman differential equation in place of an initial condition
        q_reg = tf.reduce_mean(tf.square(q))

        if u_estimation:
            loss = q_loss + u_loss  # + 1e-3 * q_reg
            optimize_expr = U.minimize_and_clip(optimizer, loss,
                                                q_func_vars + u_func_vars,
                                                grad_norm_clipping)
            train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] +
                               [target_ph_u] + [rew],
                               outputs=[q_loss, u_loss],
                               updates=[optimize_expr])
        else:
            loss = q_loss  # + 1e-3 * q_reg
            optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars,
                                                grad_norm_clipping)
            train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] +
                               [rew],
                               outputs=loss,
                               updates=[optimize_expr])

        q_values = U.function(obs_ph_n + act_ph_n, q)

        # Single variance callable for both modes. The original also built a
        # second one (taking target_ph_u) inside the u_estimation branch, but
        # it was always overwritten by this definition, so it has been
        # dropped; the signature callers see is unchanged.
        var_fn = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] + [rew],
                            outputs=var)

        # Target Q network.
        target_q = q_func(q_input,
                          1,
                          scope="target_q_func",
                          num_units=num_units)[:, 0]
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)
        target_q_values = U.function(obs_ph_n + act_ph_n, target_q)

        if not u_estimation:
            return train, update_target_q, {
                'q_values': q_values,
                'var': var_fn,
                'target_q_values': target_q_values
            }

        # Target U network.
        u_values = U.function(obs_ph_n + act_ph_n, u)
        target_u = u_func(u_input,
                          1,
                          scope="target_u_func",
                          num_units=num_units)[:, 0]
        target_u_func_vars = U.scope_vars(
            U.absolute_scope_name("target_u_func"))
        update_target_u = make_update_exp(u_func_vars, target_u_func_vars)
        target_u_values = U.function(obs_ph_n + act_ph_n, target_u)

        return train, update_target_q, update_target_u, {
            'q_values': q_values,
            'u_values': u_values,
            'var': var_fn,
            'target_q_values': target_q_values,
            'target_u_values': target_u_values
        }
def q_train(make_obs_ph_n,
            act_space_n,
            q_index,
            q_func,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            scope="trainer",
            reuse=None,
            num_units=64):
    """Build the critic (Q network) training graph for agent ``q_index``.

    The critic is trained with a mean-squared error between its output and a
    TD target that the caller feeds through the ``target`` placeholder.

    Args:
        make_obs_ph_n: Per-agent observation placeholders.
        act_space_n: Per-agent action spaces (taken from the environment),
            handed to ``make_pdtype``.
        q_index: Index of the agent this critic belongs to (only used to pick
            the local inputs when ``local_q_func`` is true).
        q_func: Builder of the Q network.
        optimizer: Optimizer forwarded to ``U.minimize_and_clip``.
        grad_norm_clipping: Clipping value forwarded to
            ``U.minimize_and_clip``.
        local_q_func: When true (the DDPG-style setting) the critic only sees
            agent ``q_index``'s observation and action.
        scope: Variable scope name wrapping the whole graph.
        reuse: Reuse flag for that variable scope.
        num_units: Width forwarded to the network builder.

    Returns:
        ``(train, update_target_q, extras)`` where ``extras`` holds the
        callables ``'q_values'`` and ``'target_q_values'``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # One distribution type per agent, derived from its action space.
        act_pdtype_n = [make_pdtype(space) for space in act_space_n]

        # Placeholders. The leading [None] leaves the batch size open.
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            pdtype.sample_placeholder([None], name="action" + str(idx))
            for idx, pdtype in enumerate(act_pdtype_n)
        ]
        target_ph = tf.placeholder(tf.float32, [None], name="target")

        # By default the critic sees every agent's observation and action,
        # concatenated along axis 1.
        joint_inputs = obs_ph_n + act_ph_n
        q_input = tf.concat(joint_inputs, 1)
        if local_q_func:
            # DDPG-style: train on this agent's own observation/action only.
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)

        # Keep column 0 of every row of the single-output network.
        q_out = q_func(q_input, 1, scope="q_func",
                       num_units=num_units)[:, 0]
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # Mean-squared error against the fed TD target.
        q_loss = tf.reduce_mean(tf.square(q_out - target_ph))

        # Optional regulariser, currently not part of the loss.
        # viscosity solution to Bellman differential equation in place of an initial condition
        q_reg = tf.reduce_mean(tf.square(q_out))
        total_loss = q_loss  # + 1e-3 * q_reg

        # Optimisation step, with gradient clipping if requested.
        optimize_expr = U.minimize_and_clip(optimizer, total_loss,
                                            q_func_vars, grad_norm_clipping)

        # Callables (theano-style functions over the placeholders).
        train = U.function(inputs=joint_inputs + [target_ph],
                           outputs=total_loss,
                           updates=[optimize_expr])
        q_values = U.function(joint_inputs, q_out)

        # Target network on the same input, under its own scope.
        target_q = q_func(q_input,
                          1,
                          scope="target_q_func",
                          num_units=num_units)[:, 0]
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)
        target_q_values = U.function(joint_inputs, target_q)

        # Everything returned is callable directly with placeholder values.
        extras = {'q_values': q_values, 'target_q_values': target_q_values}
        return train, update_target_q, extras
Пример #27
0
def q_train(make_obs_ph_n,
            act_space_n,
            q_index,
            q_func,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            scope="trainer",
            reuse=None,
            num_units=64):
    """Build the critic (Q network) training graph for agent ``q_index``.

    Args:
        make_obs_ph_n: Per-agent observation placeholders.
        act_space_n: Per-agent action spaces, handed to ``make_pdtype``.
        q_index: Index of the agent this critic belongs to (only used to pick
            the local inputs when ``local_q_func`` is true).
        q_func: Builder of the Q network.
        optimizer: Optimizer forwarded to ``U.minimize_and_clip``.
        grad_norm_clipping: Clipping value forwarded to
            ``U.minimize_and_clip``.
        local_q_func: When true the critic only sees agent ``q_index``'s
            observation and action.
        scope: Variable scope name wrapping the whole graph.
        reuse: Reuse flag for that variable scope.
        num_units: Width forwarded to the network builder.

    Returns:
        ``(train, update_target_q, extras)`` where ``extras`` holds the
        callables ``'q_values'`` and ``'target_q_values'``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # One distribution type and one action placeholder per agent.
        act_pdtype_n = [make_pdtype(space) for space in act_space_n]
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            pdtype.sample_placeholder([None], name="action" + str(idx))
            for idx, pdtype in enumerate(act_pdtype_n)
        ]
        target_ph = tf.placeholder(tf.float32, [None], name="target")

        critic_inputs = obs_ph_n + act_ph_n
        q_input = tf.concat(critic_inputs, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)

        # q_func builds the network; its single output column is the Q value.
        q_out = q_func(q_input, 1, scope="q_func",
                       num_units=num_units)[:, 0]
        # Parameters of that network.
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # Squared-error loss of the critic against the fed target.
        q_loss = tf.reduce_mean(tf.square(q_out - target_ph))

        # Optional penalty on the Q magnitude (per the original note: akin to
        # weight decay, against over-fitting); currently not part of the loss.
        # viscosity solution to Bellman differential equation in place of an initial condition
        q_reg = tf.reduce_mean(tf.square(q_out))
        total_loss = q_loss  # + 1e-3 * q_reg

        optimize_expr = U.minimize_and_clip(optimizer, total_loss,
                                            q_func_vars, grad_norm_clipping)

        # Bundle inputs -> outputs (+ update op) into callables.
        train = U.function(inputs=critic_inputs + [target_ph],
                           outputs=total_loss,
                           updates=[optimize_expr])
        q_values = U.function(critic_inputs, q_out)

        # Target Q network, used to compute the training target. It is not
        # optimised here; its parameters are refreshed from the online
        # network through update_target_q.
        target_q = q_func(q_input,
                          1,
                          scope="target_q_func",
                          num_units=num_units)[:, 0]
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)

        # Callable that evaluates the target network.
        target_q_values = U.function(critic_inputs, target_q)

        extras = {'q_values': q_values, 'target_q_values': target_q_values}
        return train, update_target_q, extras
Пример #28
0
def q_train(make_obs_ph_n,
            act_space_n,
            q_index,
            q_func,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            scope="trainer",
            reuse=None,
            num_units=256):
    """Build the critic (Q network) training graph for agent ``q_index``.

    Action placeholders are created from ``SoftCategoricalPdType`` built
    directly on each entry of ``act_space_n``.

    Args:
        make_obs_ph_n: Per-agent observation placeholders.
        act_space_n: Per-agent action-space descriptors, passed unchanged to
            ``SoftCategoricalPdType``.
        q_index: Index of the agent this critic belongs to (only used to pick
            the local inputs when ``local_q_func`` is true).
        q_func: Builder of the Q network.
        optimizer: Optimizer forwarded to ``U.minimize_and_clip``.
        grad_norm_clipping: Clipping value forwarded to
            ``U.minimize_and_clip``.
        local_q_func: When true the critic only sees agent ``q_index``'s
            observation and action.
        scope: Variable scope name wrapping the whole graph.
        reuse: Reuse flag for that variable scope.
        num_units: Width forwarded to the network builder.

    Returns:
        ``(train, update_target_q, extras)`` where ``extras`` holds the
        callables ``'q_values'`` and ``'target_q_values'``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # One distribution type per agent, built from that agent's OWN action
        # space. The previous code used act_space_n[q_index] for every agent
        # (the loop variable was ignored), which gives the other agents'
        # action placeholders the wrong width whenever action spaces differ.
        # With identical action spaces the result is unchanged.
        act_pdtype_n = [
            SoftCategoricalPdType(act_space) for act_space in act_space_n
        ]

        # Placeholders.
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]
        target_ph = tf.placeholder(tf.float32, [None], name="target")

        # Critic input: everything, or only this agent's observation/action.
        q_input = tf.concat(obs_ph_n + act_ph_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)
        q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0]
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # Mean-squared error against the fed target.
        q_loss = tf.reduce_mean(tf.square(q - target_ph))

        # Optional regulariser, currently not part of the loss.
        # viscosity solution to Bellman differential equation in place of an initial condition
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss  # + 1e-3 * q_reg

        optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars,
                                            grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph],
                           outputs=loss,
                           updates=[optimize_expr])
        q_values = U.function(obs_ph_n + act_ph_n, q)

        # Target network on the same input, under its own scope.
        target_q = q_func(q_input,
                          1,
                          scope="target_q_func",
                          num_units=num_units)[:, 0]
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)

        target_q_values = U.function(obs_ph_n + act_ph_n, target_q)

        return train, update_target_q, {
            'q_values': q_values,
            'target_q_values': target_q_values
        }
Пример #29
0
    def __init__(self,
                 n_b_agent,
                 a_dim,
                 s_dim,
                 a_bound=1,
                 gamma=0.95,
                 tau=0.01,
                 lr_a=1e-2,
                 lr_c=1e-2,
                 memory_size=100000,
                 batch_size=64,
                 scope=""):
        """Allocate the replay memory and build actor/critic graphs.

        Args:
            n_b_agent: Number of agents; scales the width of the replay
                memory and of the joint-action placeholders.
            a_dim: Action size of a single agent.
            s_dim: State size of a single agent.
            a_bound: Stored as ``self.a_bound``; not otherwise used in the
                visible part of this method.
            gamma: Discount factor of the TD target.
            tau: Mixing factor of the soft target-network update.
            lr_a: Learning rate of the actor's Adam optimizer.
            lr_c: Learning rate of the critic's Adam optimizer.
            memory_size: Number of rows in the replay memory.
            batch_size: Stored as ``self.batch_size``.
            scope: Optional prefix used when looking up the network
                variables by scope name.
        """
        self.nb_agent = n_b_agent
        # Row width: 2 * s_dim and a_dim per agent plus two scalars
        # (presumably state, next state, joint action, reward and done flag
        # -- TODO confirm against the code that writes into the memory).
        self.memory = np.zeros(
            (memory_size,
             s_dim * 2 * self.nb_agent + a_dim * self.nb_agent + 1 + 1),
            dtype=np.float32)
        self.pointer = 0
        self.sess = tf.Session()

        self.memory_size = memory_size
        self.batch_size = batch_size
        self.memory_filled = 0

        self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound,
        # Placeholders: own state / next state, joint action / next joint
        # action of all agents, reward and done flag.
        self.S = tf.placeholder(tf.float32, [None, s_dim], 's')
        self.total_a = tf.placeholder(tf.float32,
                                      [None, self.a_dim * self.nb_agent], 'a')
        self.S_ = tf.placeholder(tf.float32, [None, s_dim], 's_')
        self.total_a_ = tf.placeholder(tf.float32,
                                       [None, self.a_dim * self.nb_agent],
                                       'a_')
        self.R = tf.placeholder(tf.float32, [None, 1], 'r')
        self.D = tf.placeholder(tf.float32, [None, 1], 'done')
        self.scope = scope

        # Actor: trainable "eval" net on S, frozen "target" net on S_.
        with tf.variable_scope('Actor'):
            self.a, self.pre_a = self._build_a(self.S,
                                               scope='eval',
                                               trainable=True)
            self.a_, *_ = self._build_a(self.S_,
                                        scope='target',
                                        trainable=False)
        # Critic: trainable "eval" net on (S, total_a), frozen "target" net
        # on (S_, total_a_).
        with tf.variable_scope('Critic'):
            # assign self.a = a in memory when calculating q for td_error,
            # otherwise the self.a is from Actor when updating Actor
            # NOTE(review): q is built on the total_a placeholder here; how
            # the actor's output is routed into it is not visible in this
            # method -- confirm in the update code.
            q = self._build_c(self.S,
                              self.total_a,
                              scope='eval',
                              trainable=True)
            q_ = self._build_c(self.S_,
                               self.total_a_,
                               scope='target',
                               trainable=False)

        # networks parameters
        # The lookup prepends "<scope>/" when a scope was given; this
        # presumably matches an enclosing variable scope opened by the
        # caller, since only 'Actor' and 'Critic' are opened here -- verify.
        prefix = (self.scope + "/") if len(self.scope) > 0 else ""
        self.ae_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope=prefix + 'Actor/eval')
        self.at_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope=prefix + 'Actor/target')
        self.ce_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope=prefix + 'Critic/eval')
        self.ct_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope=prefix + 'Critic/target')

        # target net replacement
        # Soft update: target <- (1 - tau) * target + tau * eval, pairing
        # actor then critic variables in collection order.
        self.soft_replace = [
            tf.assign(t, (1 - tau) * t + tau * e)
            for t, e in zip(self.at_params + self.ct_params, self.ae_params +
                            self.ce_params)
        ]

        # TD target; the (1 - D) factor drops the bootstrap term where the
        # done flag is 1.
        q_target = self.R + (1. - self.D) * gamma * q_
        # in the feed_dic for the td_error, the self.a should change to actions in memory
        td_error = tf.losses.mean_squared_error(labels=q_target, predictions=q)
        # self.ctrain = tf.train.AdamOptimizer(lr_c).minimize(td_error, var_list=self.ce_params)
        # Critic step: Adam on the eval-critic variables, with .5 passed as
        # the clipping argument of U.minimize_and_clip.
        optimizer = tf.train.AdamOptimizer(lr_c)
        self.ctrain = U.minimize_and_clip(optimizer, td_error, self.ce_params,
                                          .5)

        # Actor loss: maximise q, plus a small penalty on the squared
        # pre_a values returned by _build_a.
        a_reg = tf.reduce_mean(tf.reduce_sum(tf.square(self.pre_a), axis=-1))
        a_loss = -tf.reduce_mean(q) + 1e-3 * a_reg  # maximize the q
        # self.atrain = tf.train.AdamOptimizer(lr_a).minimize(a_loss, var_list=self.ae_params)
        # Actor step: a separate Adam instance on the eval-actor variables.
        optimizer = tf.train.AdamOptimizer(lr_a)
        self.atrain = U.minimize_and_clip(optimizer, a_loss, self.ae_params,
                                          .5)
        self.sess.run(tf.global_variables_initializer())