Пример #1
0
def q_train(make_obs_ph_n, act_space_n, q_index, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, scope="trainer", reuse=None, num_units=64):
    """Build the critic training graph and its target network for one agent.

    Args:
        make_obs_ph_n: list of observation inputs, one per agent.
        act_space_n: list of action spaces, one per agent.
        q_index: index of the agent this critic belongs to.
        q_func: network builder, called as
            ``q_func(input, 1, scope=..., num_units=...)``.
        optimizer: optimizer handed to ``U.minimize_and_clip``.
        grad_norm_clipping: forwarded to ``U.minimize_and_clip``.
        local_q_func: when true the critic is fed only agent ``q_index``'s own
            observation and action instead of those of all agents.
        scope: name of the variable scope wrapping everything built here.
        reuse: ``reuse`` flag of that variable scope.
        num_units: forwarded to ``q_func``.

    Returns:
        ``(train, update_target_q, debug)`` where ``debug`` maps
        ``'q_values'`` and ``'target_q_values'`` to callables.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # one action-distribution type per agent
        pdtypes = [make_pdtype(space) for space in act_space_n]

        # observations are supplied by the caller; actions and the
        # regression target get fresh placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            pdtype.sample_placeholder([None], name="action" + str(idx))
            for idx, pdtype in enumerate(pdtypes)
        ]
        target_ph = tf.placeholder(tf.float32, [None], name="target")
        joint_ph = obs_ph_n + act_ph_n

        # centralized input by default, overridden for a local critic
        q_input = tf.concat(joint_ph, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)

        q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0]
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # mean squared error against the supplied target
        q_loss = tf.reduce_mean(tf.square(q - target_ph))

        # regularizer is built but currently not part of the loss
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss  # + 1e-3 * q_reg

        optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars, grad_norm_clipping)

        # callables for training and for querying the critic
        train = U.function(
            inputs=joint_ph + [target_ph],
            outputs=loss,
            updates=[optimize_expr],
        )
        q_values = U.function(joint_ph, q)

        # target critic on the same input, under its own scope
        target_q = q_func(q_input, 1, scope="target_q_func", num_units=num_units)[:, 0]
        target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)

        target_q_values = U.function(joint_ph, target_q)

        debug = {'q_values': q_values, 'target_q_values': target_q_values}
        return train, update_target_q, debug
Пример #2
0
def i_train(make_obs_ph_n, intent_ph_n, act_space_n, make_intent_ph_n, make_act_traj_ph_n, i_func, i_index, output_size, optimizer, scope, reuse, grad_norm_clipping=None, num_units=64):
    """Build the intent-prediction training graph for agent ``i_index``.

    The network maps the agent's observation concatenated with its flattened
    action trajectory to an ``output_size``-dimensional prediction, trained
    with a mean-squared error against ``make_intent_ph_n[i_index]``.

    Args:
        make_obs_ph_n: list of observation inputs, one per agent.
        intent_ph_n: ignored; the function has always used
            ``make_intent_ph_n`` instead. Kept for interface compatibility.
        act_space_n: unused here; kept for interface compatibility.
        make_intent_ph_n: list of intent target inputs, one per agent.
        make_act_traj_ph_n: list of action-trajectory inputs, one per agent.
            Each is indexed as ``shape[1..3]``, so it must be rank 4.
        i_func: network builder, called as
            ``i_func(input, output_size, scope=..., num_units=...)``.
        i_index: index of the agent this network belongs to.
        output_size: number of outputs of ``i_func``.
        optimizer: optimizer handed to ``U.minimize_and_clip``.
        scope: name of the variable scope wrapping everything built here.
        reuse: ``reuse`` flag of that variable scope.
        grad_norm_clipping: forwarded to ``U.minimize_and_clip``.
        num_units: forwarded to ``i_func``.

    Returns:
        ``(i_values, train, update_target_i, debug)`` where ``debug`` maps
        ``'i_values'`` and ``'target_i_values'`` to callables.
    """
    with tf.variable_scope(scope, reuse=reuse):
        obs_ph_n = make_obs_ph_n
        # the intent placeholders have the same shape as the true actions,
        # so they can double as the regression target
        intent_ph_n = make_intent_ph_n
        act_traj_ph_n = make_act_traj_ph_n

        # collapse every trailing axis of each trajectory into one feature axis
        flat_act_traj_ph_n = [
            tf.reshape(a, (-1, a.shape[1] * a.shape[2] * a.shape[3]))
            for a in act_traj_ph_n
        ]

        i_input = [
            tf.concat([obs, act_traj], axis=1)
            for obs, act_traj in zip(obs_ph_n, flat_act_traj_ph_n)
        ]
        own_input = i_input[i_index]
        own_feed = [obs_ph_n[i_index], act_traj_ph_n[i_index]]

        # FIX: honor the num_units argument instead of a hard-coded 64
        i = i_func(own_input, output_size, scope="i_func", num_units=num_units)
        i_func_vars = U.scope_vars(U.absolute_scope_name("i_func"))

        # define loss
        loss = tf.reduce_mean(tf.square(i - intent_ph_n[i_index]))
        optimize_expr = U.minimize_and_clip(optimizer, loss, i_func_vars, grad_norm_clipping)
        train = U.function(inputs=obs_ph_n + act_traj_ph_n + intent_ph_n, outputs=loss, updates=[optimize_expr])
        i_values = U.function(inputs=own_feed, outputs=i)

        # FIX: the target network must see the same single-agent input as the
        # online network; it used to receive the whole per-agent list, which
        # does not match the inputs fed to target_i_values below.
        target_i = i_func(own_input, output_size, scope="target_i_func", num_units=num_units)
        target_i_func_vars = U.scope_vars(U.absolute_scope_name("target_i_func"))
        update_target_i = make_update_exp(i_func_vars, target_i_func_vars)

        target_i_values = U.function(inputs=own_feed, outputs=target_i)

        return i_values, train, update_target_i, {'i_values': i_values, 'target_i_values': target_i_values}
Пример #3
0
def pMA_train(make_obs_ph_n, make_memory_ph_n, act_space_n, p_index, p_func, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, critic_units=64, scope="trainer", reuse=None):
    """Build the memory-augmented policy training graph for agent ``p_index``.

    The policy receives all agents' observations plus this agent's memory
    input, and is trained to maximize the (already existing) critic
    ``qMA_func`` with a small penalty on the flat distribution parameters.

    Args:
        make_obs_ph_n: list of observation inputs, one per agent.
        make_memory_ph_n: list of memory inputs; entry ``p_index`` is used.
        act_space_n: list of action spaces, one per agent.
        p_index: index of the agent this policy belongs to.
        p_func: policy builder, called as
            ``p_func(obs_list, memory, num_outputs, scope=..., reuse=...)`` and
            returning ``(flat_params, memory_state)``.
        q_func: critic builder; called with ``reuse=True`` so the critic
            variables must already exist under ``qMA_func``.
        optimizer: optimizer handed to ``U.minimize_and_clip``.
        grad_norm_clipping: forwarded to ``U.minimize_and_clip``.
        local_q_func: when true the critic is fed only this agent's own
            observation and action.
        critic_units: forwarded to ``q_func`` as ``num_units``.
        scope: name of the variable scope wrapping everything built here.
        reuse: ``reuse`` flag of that scope, also forwarded to ``p_func``.

    Returns:
        ``(act, memory_out, train, update_target_p, debug)`` where ``debug``
        maps ``'p_values'`` and ``'target_act'`` to callables.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # create distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            pdtype.sample_placeholder([None], name="action" + str(i))
            for i, pdtype in enumerate(act_pdtype_n)
        ]
        memory_input = make_memory_ph_n[p_index]

        # feed list of the policy: every observation followed by the memory
        p_input = list(obs_ph_n) + [memory_input]
        num_outputs = int(act_pdtype_n[p_index].param_shape()[0])

        p, memory_state = p_func(obs_ph_n, memory_input, num_outputs,
                                 scope="pMA_func", reuse=reuse)
        p_func_vars = U.scope_vars(U.absolute_scope_name("pMA_func"))

        # wrap parameters in distribution
        act_pd = act_pdtype_n[p_index].pdfromflat(p)

        act_sample = act_pd.sample()
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        # the critic sees the other agents' recorded actions and a fresh
        # sample from this policy in slot p_index
        act_input_n = act_ph_n + []
        act_input_n[p_index] = act_pd.sample()

        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)

        q = q_func(q_input, 1, scope="qMA_func", reuse=True, num_units=critic_units)[:, 0]
        pg_loss = -tf.reduce_mean(q)

        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=p_input + act_ph_n, outputs=loss, updates=[optimize_expr])
        act = U.function(inputs=p_input, outputs=act_sample)
        memory_out = U.function(inputs=p_input, outputs=memory_state)
        p_values = U.function(p_input, p)

        # target network
        target_scope = "target_p_func"
        target_p, target_memory = p_func(obs_ph_n, memory_input, num_outputs,
                                         scope=target_scope, reuse=reuse)
        # FIX: collect the variables from the scope the target network was
        # actually built in. The lookup used "target_pMA_func", which matches
        # nothing created here, so the target update had no variables to
        # copy into. The scope name itself is kept so variable names do not
        # change.
        target_p_func_vars = U.scope_vars(U.absolute_scope_name(target_scope))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample()
        target_act = U.function(inputs=p_input, outputs=target_act_sample)

        return act, memory_out, train, update_target_p, {'p_values': p_values, 'target_act': target_act}
Пример #4
0
def p_train(env, make_obs_ph_n, act_space_n, p_index, vf_func, shana, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, num_units=64, scope="trainer", reuse=None):
    """Build policy and value-function training graphs for agent ``p_index``.

    The policy object is created by ``shana`` and trained with a
    log-probability-weighted loss; a separate value function ``vf_func`` is
    regressed onto ``q - log_pi``.

    Args:
        env: passed to ``shana`` as ``env_spec``.
        make_obs_ph_n: list of observation inputs, one per agent.
        act_space_n: list of action spaces, one per agent.
        p_index: index of the agent this policy belongs to.
        vf_func: unused; the value function is built with ``q_func``. Kept
            for interface compatibility.
        shana: policy class/factory; must provide ``actions_for``,
            ``get_params_internal``, ``get_actions`` and ``name``.
        q_func: network builder used for the critic and the value function.
            Both are called with ``reuse=True``, so their variables must
            already exist under ``q_func`` and ``vf_func``.
        optimizer: optimizer handed to ``U.minimize_and_clip``.
        grad_norm_clipping: forwarded to ``U.minimize_and_clip``.
        local_q_func: when true the critic is fed only this agent's own
            observation and action.
        num_units: forwarded to ``q_func``.
        scope: name of the variable scope wrapping everything built here.
        reuse: ``reuse`` flag of that variable scope.

    Returns:
        ``(policy.get_actions, train, train_vf, update_target_p,
        update_target_vf, {'target_act': target_act})``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # create distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]
        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            pdtype.sample_placeholder([None], name="action" + str(i))
            for i, pdtype in enumerate(act_pdtype_n)
        ]
        policy = shana(
            env_spec=env,
            af=15,
            of=22,
            K=2,
            hidden_layer_sizes=(128, 128),
            qf=q_func,
            reg=0.001
        )
        # only the log-probabilities are used below; the sampled action
        # tensor itself is not fed to the critic in this variant
        _sampled_act, log_pi = policy.actions_for(observations=obs_ph_n[p_index],
                                                  with_log_pis=True)
        act_input_n = act_ph_n + []
        p_func_vars = policy.get_params_internal()
        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        vf_input = tf.concat(obs_ph_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)
        q = q_func(q_input, 1, scope="q_func", reuse=True, num_units=num_units)[:, 0]
        vf = q_func(vf_input, 1, scope="vf_func", reuse=True, num_units=num_units)[:, 0]
        vf_func_vars = U.scope_vars(U.absolute_scope_name("vf_func"))
        pg_loss = tf.reduce_mean(log_pi * tf.stop_gradient(log_pi - q + vf))

        # FIX: tf.get_collection returns a Python list. Adding that list to
        # the scalar loss turned the loss into a vector (or an empty tensor
        # when nothing was registered). Reduce it to a scalar first.
        reg_losses = tf.get_collection(
            tf.GraphKeys.REGULARIZATION_LOSSES,
            scope=policy.name)
        p_reg = tf.reduce_sum(reg_losses) if reg_losses else 0.0
        loss = pg_loss + p_reg

        vf_loss = 0.5 * tf.reduce_mean((vf - tf.stop_gradient(q - log_pi))**2)
        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping)
        optimize_vf_expr = U.minimize_and_clip(optimizer, vf_loss, vf_func_vars, grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr])
        # FIX: report the loss this step actually optimizes (was the policy loss)
        train_vf = U.function(inputs=obs_ph_n + act_ph_n, outputs=vf_loss, updates=[optimize_vf_expr])

        # target network
        target_p = shana(
            env_spec=env,
            af=15,
            of=22,
            K=2,
            hidden_layer_sizes=(128, 128),
            qf=q_func,
            reg=0.001,
            name='target_policy'
        )
        target_p_func_vars = target_p.get_params_internal()
        # built only so that the target value-function variables exist
        target_vf = q_func(vf_input, 1, scope="target_vf_func", num_units=num_units)[:, 0]
        target_vf_func_vars = U.scope_vars(U.absolute_scope_name("target_vf_func"))
        target_act_r, _target_log_pi = target_p.actions_for(observations=obs_ph_n[p_index], with_log_pis=True)
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)
        update_target_vf = make_update_exp(vf_func_vars, target_vf_func_vars)
        target_act = U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_r)
        return policy.get_actions, train, train_vf, update_target_p, update_target_vf, {'target_act': target_act}
Пример #5
0
def m_train(act_space_n,
            m_index,
            m_func,
            optimizer,
            mut_inf_coef=0,
            grad_norm_clipping=None,
            scope="trainer",
            reuse=None,
            num_units=64):
    """Disabled trainer stub: returns ``None`` without building anything.

    The first statement is a bare ``return``, so every line after it is
    unreachable. The unreachable body looks like unfinished scaffolding for
    an "m" network modeled on a critic trainer.

    NOTE(review): before re-enabling the body, the following must be fixed:
      * ``obs_ph_n`` is the integer ``1`` and is later added to a list,
        which would raise ``TypeError``;
      * ``m_input`` is the literal ``[1]`` rather than a tensor;
      * ``q_func_vars`` is not defined in this function (``m_func_vars`` is);
      * ``m_index`` and ``mut_inf_coef`` are never used.
    """
    # Early exit: the trainer is switched off; nothing below executes.
    return
    with tf.variable_scope(scope, reuse=reuse):
        # create distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        # NOTE(review): placeholder value, not a list of observation inputs
        obs_ph_n = 1
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]
        target_ph = tf.placeholder(tf.float32, [None], name="target")

        # NOTE(review): placeholder value, not a network input tensor
        m_input = [1]
        m = m_func(m_input, 1, scope="m_func", num_units=num_units)[:, 0]
        m_func_vars = U.scope_vars(U.absolute_scope_name("m_func"))

        m_loss = tf.reduce_mean(tf.square(m - target_ph))

        # viscosity solution to Bellman differential equation in place of an initial condition
        # (computed but not added to the loss)
        m_reg = tf.reduce_mean(tf.square(m))
        loss = m_loss  #+ 1e-3 * q_reg

        # NOTE(review): q_func_vars is undefined here; m_func_vars was probably intended
        optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars,
                                            grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph],
                           outputs=loss,
                           updates=[optimize_expr])
        m_values = U.function(obs_ph_n + act_ph_n, m)

        # target network
        target_m = m_func(m_input,
                          1,
                          scope="target_m_func",
                          num_units=num_units)[:, 0]
        target_m_func_vars = U.scope_vars(
            U.absolute_scope_name("target_m_func"))
        update_target_m = make_update_exp(m_func_vars, target_m_func_vars)

        target_m_values = U.function(obs_ph_n + act_ph_n, target_m)

        return train, update_target_m, {
            'm_values': m_values,
            'target_m_values': target_m_values
        }
Пример #6
0
def p_train(make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, num_units=64, scope="trainer", reuse=None):
    """Build the actor training graph and its target network for one agent.

    The policy maps agent ``p_index``'s observation to flat distribution
    parameters. It is trained to maximize the existing critic ``q_func``
    (opened with ``reuse=True``) evaluated on a sample from the policy, plus
    a small penalty on the flat parameters.

    Args:
        make_obs_ph_n: list of observation inputs, one per agent.
        act_space_n: list of action spaces, one per agent.
        p_index: index of the agent this policy belongs to.
        p_func: policy builder taking ``inputs``, ``num_outputs``, ``scope``,
            ``num_units`` and ``num_layers`` keyword arguments.
        q_func: critic builder taking the same keywords plus ``reuse``.
        optimizer: optimizer handed to ``U.minimize_and_clip``.
        grad_norm_clipping: forwarded to ``U.minimize_and_clip``.
        local_q_func: when true the critic is fed only this agent's own
            observation and action.
        num_units: forwarded to both builders.
        scope: name of the variable scope wrapping everything built here.
        reuse: ``reuse`` flag of that variable scope.

    Returns:
        ``(act, train, update_target_p, debug)`` where ``debug`` maps
        ``'p_values'`` and ``'target_act'`` to callables.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # one action-distribution type per agent
        pdtypes = [make_pdtype(space) for space in act_space_n]
        own_pdtype = pdtypes[p_index]

        # observations come from the caller, actions get fresh placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            pdtype.sample_placeholder([None], name="action" + str(idx))
            for idx, pdtype in enumerate(pdtypes)
        ]

        p_input = obs_ph_n[p_index]
        own_obs_feed = [obs_ph_n[p_index]]

        p = p_func(
            inputs=p_input,
            num_outputs=int(own_pdtype.param_shape()[0]),
            scope="p_func",
            num_units=num_units,
            num_layers=3,
        )
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # distribution over this agent's actions
        act_pd = own_pdtype.pdfromflat(p)

        act_sample = act_pd.sample()
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        # recorded actions for everyone, a fresh policy sample in slot p_index
        act_input_n = list(act_ph_n)
        act_input_n[p_index] = act_pd.sample()

        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)
        q = q_func(
            inputs=q_input,
            num_outputs=1,
            scope="q_func",
            reuse=True,
            num_units=num_units,
            num_layers=3,
        )[:, 0]

        pg_loss = -tf.reduce_mean(q)
        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping)

        # callables for training, acting and inspecting the raw parameters
        train = U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr])
        act = U.function(inputs=own_obs_feed, outputs=act_sample)
        p_values = U.function([obs_ph_n[p_index]], p)

        # target policy with the same architecture under its own scope
        target_p = p_func(
            inputs=p_input,
            num_outputs=int(own_pdtype.param_shape()[0]),
            scope="target_p_func",
            num_units=num_units,
            num_layers=3,
        )
        target_p_func_vars = U.scope_vars(U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = own_pdtype.pdfromflat(target_p).sample()
        target_act = U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample)

        debug = {'p_values': p_values, 'target_act': target_act}
        return act, train, update_target_p, debug
Пример #7
0
def p_train(make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer, grad_norm_clipping=None, local_q_func=False,
            num_units=64, scope="trainer", reuse=None):
    """Build the actor training graph plus target-network helpers for one agent.

    Besides the usual target update, a second update op built with
    ``rate=1.0`` is returned, and the debug dictionary also exposes the
    target policy's raw parameters and distribution mode.

    Args:
        make_obs_ph_n: list of observation inputs, one per agent.
        act_space_n: list of action spaces, one per agent.
        p_index: index of the agent this policy belongs to.
        p_func: policy builder, called as
            ``p_func(input, num_outputs, scope=..., num_units=...)``.
        q_func: critic builder; opened with ``reuse=True`` under ``q_func``.
        optimizer: optimizer handed to ``U.minimize_and_clip``.
        grad_norm_clipping: forwarded to ``U.minimize_and_clip``.
        local_q_func: when true the critic is fed only this agent's own
            observation and action.
        num_units: forwarded to both builders.
        scope: name of the variable scope wrapping everything built here.
        reuse: ``reuse`` flag of that variable scope.

    Returns:
        ``(act, train, update_target_p, sync_target_p, debug)`` where
        ``debug`` maps ``'p_values'``, ``'target_p_values'``,
        ``'target_mode'`` and ``'target_act'`` to callables.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # one action-distribution type per agent
        pdtypes = [make_pdtype(space) for space in act_space_n]
        own_pdtype = pdtypes[p_index]

        # observations are handed in ready-made; actions get fresh placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            pdtype.sample_placeholder([None], name="action" + str(idx))
            for idx, pdtype in enumerate(pdtypes)
        ]

        p_input = obs_ph_n[p_index]

        p = p_func(p_input, int(own_pdtype.param_shape()[0]), scope="p_func", num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # distribution over this agent's actions
        act_pd = own_pdtype.pdfromflat(p)

        act_sample = act_pd.sample()
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        # recorded actions for everyone, a fresh policy sample in slot p_index
        act_input_n = list(act_ph_n)
        act_input_n[p_index] = act_pd.sample()

        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)
        q = q_func(q_input, 1, scope="q_func", reuse=True, num_units=num_units)[:, 0]

        pg_loss = -tf.reduce_mean(q)
        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping)

        # callables for training, acting and inspecting the raw parameters
        train = U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr])
        act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)
        p_values = U.function([obs_ph_n[p_index]], p)

        # target policy and the two update ops that write into it
        target_p = p_func(
            p_input,
            int(own_pdtype.param_shape()[0]),
            scope="target_p_func",
            num_units=num_units,
        )
        target_p_func_vars = U.scope_vars(U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)
        sync_target_p = make_update_exp(p_func_vars, target_p_func_vars, rate=1.0)

        target_act_pd = own_pdtype.pdfromflat(target_p)
        target_act_sample = target_act_pd.sample()
        target_act_mode = target_act_pd.mode()
        target_act = U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample)
        target_mode = U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_mode)
        target_p_values = U.function([obs_ph_n[p_index]], target_p)

        debug = {
            'p_values': p_values,
            'target_p_values': target_p_values,
            'target_mode': target_mode,
            'target_act': target_act,
        }
        return act, train, update_target_p, sync_target_p, debug
Пример #8
0
def q_train(n_agents, make_state_ph_n, make_obs_ph_n, act_space_n, q_index, q_func, optimizer, grad_norm_clipping=None, local_q_func=False,
            scope="trainer", reuse=None, num_units=64, discrete_action=False, target_update_tau=0.001, use_global_state=False,
            share_weights=False):
    """Build the critic training graph with optional global state and weight sharing.

    Args:
        n_agents: number of agents; size of the one-hot agent id appended
            to the critic input when ``share_weights`` is set.
        make_state_ph_n: list of state inputs, used when ``use_global_state``.
        make_obs_ph_n: list of observation inputs, used otherwise.
        act_space_n: list of action spaces, one per agent.
        q_index: index of the agent this critic belongs to.
        q_func: critic builder; receives ``reuse``, ``num_units``,
            ``constrain_out`` and ``discrete_action`` keywords.
        optimizer: optimizer handed to ``U.minimize_and_clip``.
        grad_norm_clipping: forwarded to ``U.minimize_and_clip``.
        local_q_func: when true the critic is fed only agent ``q_index``'s
            own input and action.
        scope: name of the variable scope wrapping everything built here.
        reuse: ``reuse`` flag of that variable scope.
        num_units: forwarded to ``q_func``.
        discrete_action: forwarded to ``q_func``.
        target_update_tau: third positional argument of ``make_update_exp``.
        use_global_state: selects ``make_state_ph_n`` over ``make_obs_ph_n``.
        share_weights: passed to ``q_func`` as ``reuse`` and enables the
            agent-id suffix on the input.

    Returns:
        ``(train, update_target_q, debug)`` where ``debug`` maps
        ``'q_values'`` and ``'target_q_values'`` to callables.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # one action-distribution type per agent
        pdtypes = [make_pdtype(space) for space in act_space_n]

        # critic inputs are either the global states or the observations
        obs_ph_n = make_state_ph_n if use_global_state else make_obs_ph_n
        act_ph_n = [
            pdtype.sample_placeholder([None], name="action" + str(idx))
            for idx, pdtype in enumerate(pdtypes)
        ]
        target_ph = tf.placeholder(tf.float32, [None], name="target")
        joint_ph = obs_ph_n + act_ph_n

        q_input = tf.concat(joint_ph, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)
        if share_weights:
            # layers are shared between agents, so tag every row with a
            # one-hot agent id
            agent_row = tf.eye(n_agents)[q_index:q_index+1]
            agent_id = tf.tile(agent_row, [tf.shape(q_input)[0], 1])
            q_input = tf.concat([q_input, agent_id], -1)

        q = q_func(
            q_input,
            1,
            scope="q_func",
            reuse=share_weights,
            num_units=num_units,
            constrain_out=False,
            discrete_action=discrete_action,
        )[:, 0]
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # mean squared error against the supplied target
        q_loss = tf.reduce_mean(tf.square(q - target_ph))

        # regularizer is built but currently not part of the loss
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss  # + 1e-3 * q_reg

        optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars, grad_norm_clipping)

        # callables for training and for querying the critic
        train = U.function(inputs=joint_ph + [target_ph], outputs=loss, updates=[optimize_expr])
        q_values = U.function(joint_ph, q)

        # target critic on the same input, under its own scope
        target_q = q_func(
            q_input,
            1,
            scope="target_q_func",
            reuse=share_weights,
            num_units=num_units,
            constrain_out=False,
            discrete_action=discrete_action,
        )[:, 0]
        target_q_func_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars, target_update_tau)

        target_q_values = U.function(joint_ph, target_q)

        debug = {'q_values': q_values, 'target_q_values': target_q_values}
        return train, update_target_q, debug
Пример #9
0
def p_train(env, make_obs_ph_n, act_space_n, p_index, vf_func, shana, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, num_units=64, scope="trainer", reuse=None):
    """Build the training graph for a ``shana`` policy driven by the critic.

    The previous body could not run: it referenced ``act_input_n``,
    ``act_sample``, ``p``, ``p_func`` and ``p_input``, none of which exist in
    this function, and multiplied the list returned by ``tf.get_collection``
    by a float. This version wires the policy object into the same places
    where those names were expected.

    Args:
        env: passed to ``shana`` as ``env_spec``.
        make_obs_ph_n: list of observation inputs, one per agent.
        act_space_n: list of action spaces, one per agent.
        p_index: index of the agent this policy belongs to.
        vf_func: unused; kept for interface compatibility.
        shana: policy class/factory; must provide ``actions_for``,
            ``get_params_internal`` and ``name``, and accept a ``name``
            keyword for the target copy.
        q_func: critic builder; opened with ``reuse=True`` under ``q_func``.
        optimizer: optimizer handed to ``U.minimize_and_clip``.
        grad_norm_clipping: forwarded to ``U.minimize_and_clip``.
        local_q_func: when true the critic is fed only this agent's own
            observation and action.
        num_units: forwarded to ``q_func``.
        scope: name of the variable scope wrapping everything built here.
        reuse: ``reuse`` flag of that variable scope.

    Returns:
        ``(act, train, update_target_p, debug)`` where ``debug`` maps
        ``'p_values'`` and ``'target_act'`` to callables.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # create distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]
        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            pdtype.sample_placeholder([None], name="action" + str(i))
            for i, pdtype in enumerate(act_pdtype_n)
        ]
        own_obs_feed = [obs_ph_n[p_index]]

        policy = shana(
            env_spec=env,
            af=15,
            of=22,
            K=2,
            hidden_layer_sizes=(100, 100),
            qf=q_func,
            reg=0.001
        )
        actions, log_pi = policy.actions_for(observations=obs_ph_n[p_index],
                                             with_log_pis=True)
        p_func_vars = policy.get_params_internal()

        # FIX: act_input_n was never defined. Feed the critic the recorded
        # actions of the other agents and this policy's own output in slot
        # p_index, so the critic value depends on the policy parameters.
        act_input_n = act_ph_n + []
        act_input_n[p_index] = actions

        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)
        q = q_func(q_input, 1, scope="q_func", reuse=True, num_units=num_units)[:, 0]
        pg_loss = -tf.reduce_mean(q)

        # FIX: tf.get_collection returns a Python list; reduce it to a
        # scalar before scaling (list * float raises TypeError).
        reg_losses = tf.get_collection(
            tf.GraphKeys.REGULARIZATION_LOSSES,
            scope=policy.name)
        p_reg = tf.reduce_sum(reg_losses) if reg_losses else 0.0
        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars, grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n, outputs=loss, updates=[optimize_expr])
        # FIX: act_sample / p were undefined; the policy's action tensor is
        # the only per-observation output available here.
        act = U.function(inputs=own_obs_feed, outputs=actions)
        # NOTE(review): this policy exposes no flat parameter vector, so
        # 'p_values' returns the same action tensor as `act`. Confirm
        # whether callers expect something else (e.g. log_pi).
        p_values = U.function([obs_ph_n[p_index]], actions)

        # target network
        # FIX: p_func / p_input do not exist in this function; build the
        # target as a second policy object with a distinct name.
        target_p = shana(
            env_spec=env,
            af=15,
            of=22,
            K=2,
            hidden_layer_sizes=(100, 100),
            qf=q_func,
            reg=0.001,
            name='target_policy'
        )
        target_p_func_vars = target_p.get_params_internal()
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample, _target_log_pi = target_p.actions_for(
            observations=obs_ph_n[p_index], with_log_pis=True)
        target_act = U.function(inputs=[obs_ph_n[p_index]], outputs=target_act_sample)

        return act, train, update_target_p, {'p_values': p_values, 'target_act': target_act}
Пример #10
0
def q_train(make_obs_ph_n,
            act_space_n,
            q_index,
            q_func,
            u_func,
            optimizer,
            optimizer_lamda,
            exp_var_alpha=None,
            cvar_alpha=None,
            cvar_beta=None,
            grad_norm_clipping=None,
            local_q_func=False,
            scope="trainer",
            reuse=None,
            num_units=64,
            u_estimation=False,
            constrained=True,
            constraint_type=None,
            agent_type=None):
    """Build a (optionally risk-constrained) critic training graph.

    The critic regresses onto ``target + reward``. When ``constrained`` is
    set, a Lagrangian-style penalty is subtracted from that target: either a
    variance bound (``"Exp_Var"``, using ``exp_var_alpha``) or a CVaR bound
    (``"CVAR"``, using ``cvar_alpha`` and ``cvar_beta``). The multiplier
    ``lamda_constraint`` is updated by a separate step that minimizes the
    negated loss with ``optimizer_lamda``. With ``u_estimation`` a second
    network ``u_func`` is trained alongside the critic and used in the
    variance estimate.

    Args:
        make_obs_ph_n: list of observation inputs.
        act_space_n: list of action spaces; only entry ``q_index`` is used.
        q_index: index of the agent; also suffixes the constraint variables.
        q_func: critic builder, ``q_func(input, 1, scope=..., num_units=...)``.
        u_func: builder for the second network, same call shape.
        optimizer: optimizer for the critic (and ``u``) parameters.
        optimizer_lamda: optimizer for ``lamda_constraint``.
        exp_var_alpha: threshold used by the ``"Exp_Var"`` constraint.
        cvar_alpha: threshold used by the ``"CVAR"`` constraint.
        cvar_beta: level used by the ``"CVAR"`` constraint.
        grad_norm_clipping: forwarded to ``U.minimize_and_clip``.
        local_q_func: when true only the first observation and the single
            action placeholder are fed to the critic.
        scope: name of the variable scope wrapping everything built here.
        reuse: unused; the scope is always opened with ``tf.AUTO_REUSE``.
        num_units: forwarded to ``q_func`` / ``u_func``.
        u_estimation: enables the ``u`` network.
        constrained: enables the constraint term and the multiplier update.
        constraint_type: ``"Exp_Var"`` or ``"CVAR"``; required when
            ``constrained`` is true.
        agent_type: unused; kept for interface compatibility.

    Returns:
        ``(train, train2, train3, update_target_q, update_target_u, debug)``.
        ``train2`` is the multiplier update (``None`` when unconstrained),
        ``train3`` is always ``None``, ``update_target_u`` is ``None``
        without ``u_estimation``. ``debug`` holds the value callables, the
        constraint variables and ``optimize_expr``; entries that do not
        apply to the chosen configuration are ``None``.

    Raises:
        ValueError: if ``constrained`` is true and ``constraint_type`` is
            neither ``"Exp_Var"`` nor ``"CVAR"``.
    """
    # FIX: fail early with a clear message. Previously an unsupported
    # constraint_type (including the default None) left q_loss unbound and
    # crashed later with UnboundLocalError.
    if constrained and constraint_type not in ("Exp_Var", "CVAR"):
        raise ValueError(
            "constraint_type must be 'Exp_Var' or 'CVAR' when constrained=True, "
            "got %r" % (constraint_type,))
    use_cvar = constrained and constraint_type == "CVAR"

    # FIX: every optional output starts as None so the return statement can
    # never hit an unbound name, whatever combination of flags is used.
    lamda_constraint = None
    v_constraint = None
    cvar_fn = None
    train2 = None
    # FIX: train3 used to be built only when `not constrained`, from
    # v_constraint, a variable that exists only when constrained. That
    # branch always raised NameError, so it has been removed and train3 is
    # always None.
    train3 = None
    u_values = None
    target_u_values = None
    update_target_u = None

    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        if constrained:
            lamda_constraint = tf.get_variable(
                'lamda_constraint' + str(q_index), [1],
                initializer=tf.constant_initializer(1.0),
                dtype=tf.float32)
            if use_cvar:
                v_constraint = tf.get_variable(
                    'v_constraint' + str(q_index), [1],
                    initializer=tf.constant_initializer(1.0),
                    dtype=tf.float32)

        # create distribution (single agent)
        act_pdtype = make_pdtype(act_space_n[q_index])

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [act_pdtype.sample_placeholder([None], name="action0")]
        target_ph = tf.placeholder(tf.float32, [None], name="target")
        if u_estimation:
            target_ph_u = tf.placeholder(tf.float32, [None], name="target_u")
        rew = tf.placeholder(tf.float32, [None], name="reward")

        # feed lists shared by the callables below
        joint_ph = obs_ph_n + act_ph_n
        if u_estimation:
            train_inputs = joint_ph + [target_ph] + [target_ph_u] + [rew]
        else:
            train_inputs = joint_ph + [target_ph] + [rew]

        q_input = tf.concat(joint_ph, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[0], act_ph_n[0]], 1)
        q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0]

        # variance estimate, either from the second network or directly
        # from the one-step target
        if u_estimation:
            u_input = tf.concat(joint_ph, 1)
            u = u_func(u_input, 1, scope="u_func", num_units=num_units)[:, 0]
            u_loss = tf.reduce_mean(
                tf.square(
                    tf.square(rew) + 2 * tf.multiply(rew, target_ph) +
                    target_ph_u - u))
            var = u - tf.square(q)
        else:
            var = tf.square(rew + target_ph) - tf.square(q)

        # critic loss, with the constraint term folded into the target
        if constrained:
            if constraint_type == "Exp_Var":
                constraint = lamda_constraint * (var - exp_var_alpha)
            else:  # "CVAR"
                cvar = v_constraint + (1.0 / (1.0 - cvar_beta)) * tf.reduce_mean(
                    tf.nn.relu(q - v_constraint))
                constraint = lamda_constraint * (cvar_alpha - cvar)
            q_loss = tf.reduce_mean(
                tf.square(q - (target_ph + rew - constraint)))
        else:
            q_loss = tf.reduce_mean(tf.square(q - (target_ph + rew)))

        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))
        if u_estimation:
            u_func_vars = U.scope_vars(U.absolute_scope_name("u_func"))

        if u_estimation:
            # NOTE(review): with u_estimation and "CVAR" together,
            # v_constraint is not optimized and no cvar callable is
            # exposed; that combination previously crashed at the return.
            loss = q_loss + u_loss  # + 1e-3 * q_reg
            optimize_expr = U.minimize_and_clip(optimizer, loss,
                                                q_func_vars + u_func_vars,
                                                grad_norm_clipping)
            train = U.function(inputs=train_inputs,
                               outputs=[q_loss, u_loss],
                               updates=[optimize_expr])
            var_fn = U.function(inputs=train_inputs, outputs=var)
        elif use_cvar:
            # v_constraint is learned together with the critic
            loss = q_loss
            optimize_expr = U.minimize_and_clip(optimizer, loss,
                                                q_func_vars + [v_constraint],
                                                grad_norm_clipping)
            train = U.function(inputs=train_inputs,
                               outputs=q_loss,
                               updates=[optimize_expr])
            cvar_fn = U.function(inputs=train_inputs, outputs=cvar)
            var_fn = U.function(inputs=train_inputs, outputs=var)
        else:
            loss = q_loss
            optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars,
                                                grad_norm_clipping)
            train = U.function(inputs=train_inputs,
                               outputs=q_loss,
                               updates=[optimize_expr])
            var_fn = U.function(inputs=train_inputs, outputs=var)

        # Create callable functions
        q_values = U.function(joint_ph, q)

        # target network
        target_q = q_func(q_input,
                          1,
                          scope="target_q_func",
                          num_units=num_units)[:, 0]
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)
        target_q_values = U.function(joint_ph, target_q)

        if u_estimation:
            u_values = U.function(joint_ph, u)
            target_u = u_func(u_input,
                              1,
                              scope="target_u_func",
                              num_units=num_units)[:, 0]
            target_u_func_vars = U.scope_vars(
                U.absolute_scope_name("target_u_func"))
            update_target_u = make_update_exp(u_func_vars, target_u_func_vars)
            target_u_values = U.function(joint_ph, target_u)

        if constrained:
            # the multiplier is updated by minimizing the negated loss,
            # i.e. it ascends on the loss
            loss2 = -loss
            optimize_expr2 = U.minimize_and_clip(optimizer_lamda, loss2,
                                                 [lamda_constraint],
                                                 grad_norm_clipping)
            train2 = U.function(inputs=train_inputs,
                                outputs=loss2,
                                updates=[optimize_expr2])

        return train, train2, train3, update_target_q, update_target_u, {
            'q_values': q_values,
            'u_values': u_values,
            'target_q_values': target_q_values,
            'target_u_values': target_u_values,
            'var': var_fn,
            'cvar': cvar_fn,
            'lamda_constraint': lamda_constraint,
            'v_constraint': v_constraint,
            'optimize_expr': optimize_expr
        }
Пример #11
0
def q_train(name,
            make_obs_ph_n,
            adj_n,
            act_space_n,
            num_adversaries,
            neighbor_n,
            q_func,
            agent_n,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            reuse=None,
            scope="trainer",
            num_units=64):
    """Build one critic and one target critic per agent of a single species.

    All critics share the same input tensor: every observation placeholder
    concatenated with every action placeholder along axis 1.

    Args:
        name: Species selector. ``"adversaries"`` builds ``num_adversaries``
            critics; any other value builds ``agent_n - num_adversaries``.
        make_obs_ph_n: List of observation placeholders, used as-is.
        adj_n: Not read by this function.
        act_space_n: Action spaces; one action placeholder is made per entry.
        num_adversaries: Number of adversary agents.
        neighbor_n: Not read by this function.
        q_func: Network builder, called as
            ``q_func(input, 1, scope=..., num_units=...)``.
        agent_n: Total number of agents.
        optimizer: Optimizer passed to ``U.minimize_and_clip``.
        grad_norm_clipping: Clipping value passed to ``U.minimize_and_clip``.
        local_q_func: Not read by this function.
        reuse: Forwarded to ``tf.variable_scope``.
        scope: Name of the enclosing variable scope.
        num_units: Forwarded to ``q_func``.

    Returns:
        ``(train, update_target_q, q_values, target_q_values)`` where
        ``train`` is a list holding one training function per critic,
        ``update_target_q`` is whatever ``make_update_exp(..., central=False)``
        returns, and the last two are functions evaluating all critics /
        all target critics at once.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # One action-distribution type per agent.
        pdtypes = [make_pdtype(space) for space in act_space_n]

        # How many critics this call builds (size of the selected species).
        if name == "adversaries":
            n_critics = num_adversaries
        else:
            n_critics = agent_n - num_adversaries
        critic_ids = range(n_critics)

        # Placeholders: observations are supplied, actions/targets are new.
        obs_ph_n = make_obs_ph_n
        act_ph_n = []
        for idx in range(len(act_space_n)):
            act_ph_n.append(pdtypes[idx].sample_placeholder(
                [None], name="action" + str(idx)))
        target_ph = []
        for _ in critic_ids:
            target_ph.append(
                tf.placeholder(tf.float32, [None], name="target"))

        # Shared critic input and the online critics themselves.
        q_input = tf.concat(obs_ph_n + act_ph_n, 1)
        q = [
            q_func(q_input, 1, scope="q_func_%d" % k,
                   num_units=num_units)[:, 0] for k in critic_ids
        ]
        q_func_vars = []
        for k in critic_ids:
            q_func_vars.append(
                U.scope_vars(U.absolute_scope_name("q_func_%d" % k)))

        # Mean squared error of each critic against its own target; no
        # additional regularisation term is added.
        loss = [
            tf.reduce_mean(tf.square(q_k - target_k))
            for q_k, target_k in zip(q, target_ph)
        ]

        optimize_expr = [
            U.minimize_and_clip(optimizer, loss_k, vars_k, grad_norm_clipping)
            for loss_k, vars_k in zip(loss, q_func_vars)
        ]

        # Callable wrappers: one train step per critic, one joint evaluator.
        train = []
        for target_k, loss_k, step_k in zip(target_ph, loss, optimize_expr):
            train.append(
                U.function(inputs=obs_ph_n + act_ph_n + [target_k],
                           outputs=loss_k,
                           updates=[step_k]))
        q_values = U.function(obs_ph_n + act_ph_n, q)

        # Target critics mirror the online ones under separate scopes.
        target_q = [
            q_func(q_input, 1, scope="target_q_func_%d" % k,
                   num_units=num_units)[:, 0] for k in critic_ids
        ]
        target_q_func_vars = []
        for k in critic_ids:
            target_q_func_vars.append(
                U.scope_vars(U.absolute_scope_name("target_q_func_%d" % k)))
        update_target_q = make_update_exp(q_func_vars,
                                          target_q_func_vars,
                                          central=False)

        target_q_values = U.function(obs_ph_n + act_ph_n, target_q)

        return train, update_target_q, q_values, target_q_values
Пример #12
0
def q_train(make_obs_ph_n,
            act_space_n,
            q_index,
            q_func,
            optimizer,
            adversarial,
            adv_eps,
            adv_eps_s,
            num_adversaries,
            grad_norm_clipping=None,
            local_q_func=False,
            scope="trainer",
            reuse=None,
            num_units=64):
    """Build the critic for agent ``q_index`` with an optional adversarial target.

    Args:
        make_obs_ph_n: List of observation placeholders, used as-is.
        act_space_n: Action spaces; one action placeholder is made per entry.
        q_index: Index of the agent this critic belongs to.
        q_func: Network builder, called as
            ``q_func(input, 1, scope=..., num_units=...)``.
        optimizer: Optimizer passed to ``U.minimize_and_clip``.
        adversarial: When truthy, the target critic is evaluated on actions
            of the *other* agents shifted along the normalised gradient of
            ``-mean(target_q)``.
        adv_eps: Perturbation size for agents on the opposite side of the
            ``num_adversaries`` split from ``q_index``.
        adv_eps_s: Perturbation size for agents on the same side of the
            ``num_adversaries`` split as ``q_index``.
        num_adversaries: Agents with index below this value form one side.
        grad_norm_clipping: Clipping value passed to ``U.minimize_and_clip``.
        local_q_func: When truthy the critic only sees agent ``q_index``'s
            own observation and action.
        scope: Name of the enclosing variable scope.
        reuse: Forwarded to ``tf.variable_scope``.
        num_units: Forwarded to ``q_func``.

    Returns:
        ``(train, update_target_q, debug)`` where ``debug`` maps
        ``'q_values'`` and ``'target_q_values'`` to evaluation functions.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # One action-distribution type per agent.
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]
        target_ph = tf.placeholder(tf.float32, [None], name="target")

        q_input = tf.concat(obs_ph_n + act_ph_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)
        q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0]
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        q_loss = tf.reduce_mean(tf.square(q - target_ph))

        # Mean-square penalty on q; built but currently not added to the loss.
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss

        optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars,
                                            grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph],
                           outputs=loss,
                           updates=[optimize_expr])
        q_values = U.function(obs_ph_n + act_ph_n, q)

        # target network
        target_q = q_func(q_input,
                          1,
                          scope="target_q_func",
                          num_units=num_units)[:, 0]

        if adversarial:
            # NOTE(review): this branch always feeds *all* observations and
            # actions to the target critic, so it presumably is not meant to
            # be combined with local_q_func=True -- confirm with callers.
            num_agents = len(act_ph_n)
            # Per-agent perturbation size: adv_eps_s for agents on the same
            # side of the split as q_index, adv_eps for the other side.
            if q_index < num_adversaries:
                adv_rate = [
                    adv_eps_s * (i < num_adversaries) + adv_eps *
                    (i >= num_adversaries) for i in range(num_agents)
                ]
            else:
                adv_rate = [
                    adv_eps_s * (i >= num_adversaries) + adv_eps *
                    (i < num_adversaries) for i in range(num_agents)
                ]
            print("      adv rate for q_index : ", q_index, adv_rate)

            # Direction in action space that lowers the target Q value.
            pg_loss = -tf.reduce_mean(target_q)
            raw_perturb = tf.gradients(pg_loss, act_ph_n)
            # Scale each agent's unit-norm direction by its own rate. The
            # previous code used adv_eps for everyone, leaving the adv_rate
            # computed (and logged) above unused and disagreeing with p_train.
            perturb = [
                adv_rate[i] *
                tf.stop_gradient(tf.nn.l2_normalize(elem, axis=1))
                for i, elem in enumerate(raw_perturb)
            ]
            # The agent's own action is never perturbed.
            new_act_n = [
                perturb[i] + act_ph_n[i] if i != q_index else act_ph_n[i]
                for i in range(num_agents)
            ]
            adv_q_input = tf.concat(obs_ph_n + new_act_n, 1)
            # Re-evaluate the same target network on the perturbed actions.
            target_q = q_func(adv_q_input,
                              1,
                              scope='target_q_func',
                              reuse=True,
                              num_units=num_units)[:, 0]

        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)

        target_q_values = U.function(obs_ph_n + act_ph_n, target_q)

        return train, update_target_q, {
            'q_values': q_values,
            'target_q_values': target_q_values
        }
Пример #13
0
def p_train(make_obs_ph_n,
            act_space_n,
            p_index,
            p_func,
            q_func,
            optimizer,
            adversarial,
            adv_eps,
            adv_eps_s,
            num_adversaries,
            grad_norm_clipping=None,
            local_q_func=False,
            num_units=64,
            scope="trainer",
            reuse=None):
    """Build the policy for agent ``p_index`` with an optional adversarial loss.

    The policy is trained to maximise the critic ``q_func`` (scope
    ``"q_func"``, reused, so it must already exist in ``scope``). With
    ``adversarial`` set, the critic is evaluated on the other agents' actions
    shifted along the normalised gradient of the policy loss.

    Args:
        make_obs_ph_n: List of observation placeholders, used as-is.
        act_space_n: Action spaces; one action placeholder is made per entry.
        p_index: Index of the agent this policy belongs to.
        p_func: Policy network builder, called as
            ``p_func(obs, n_params, scope=..., num_units=...)``.
        q_func: Critic network builder (called with ``reuse=True``).
        optimizer: Optimizer passed to ``U.minimize_and_clip``.
        adversarial: Enables the perturbed-action loss described above.
        adv_eps: Perturbation size for agents on the opposite side of the
            ``num_adversaries`` split from ``p_index``.
        adv_eps_s: Perturbation size for agents on the same side of the
            ``num_adversaries`` split as ``p_index``.
        num_adversaries: Agents with index below this value form one side.
        grad_norm_clipping: Clipping value passed to ``U.minimize_and_clip``.
        local_q_func: When truthy the critic only sees agent ``p_index``'s
            own observation and action.
        num_units: Forwarded to ``p_func`` and ``q_func``.
        scope: Name of the enclosing variable scope.
        reuse: Forwarded to ``tf.variable_scope``.

    Returns:
        ``(act, train, update_target_p, debug)`` where ``debug`` maps
        ``'p_values'`` and ``'target_act'`` to evaluation functions.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # One action-distribution type per agent.
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]

        # The policy only sees its own observation.
        p_input = obs_ph_n[p_index]

        p = p_func(p_input,
                   int(act_pdtype_n[p_index].param_shape()[0]),
                   scope="p_func",
                   num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # wrap parameters in distribution
        act_pd = act_pdtype_n[p_index].pdfromflat(p)

        act_sample = act_pd.sample()
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        # Copy the fed actions and substitute this agent's slot with a fresh
        # sample from its own policy so the critic is differentiable w.r.t.
        # the policy parameters.
        act_input_n = act_ph_n + []
        act_input_n[p_index] = act_pd.sample()

        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)
        q = q_func(q_input, 1, scope="q_func", reuse=True,
                   num_units=num_units)[:, 0]
        pg_loss = -tf.reduce_mean(q)

        if adversarial:
            num_agents = len(act_input_n)
            # Per-agent perturbation size: adv_eps_s for agents on the same
            # side of the split as p_index, adv_eps for the other side.
            if p_index < num_adversaries:
                adv_rate = [
                    adv_eps_s * (i < num_adversaries) + adv_eps *
                    (i >= num_adversaries) for i in range(num_agents)
                ]
            else:
                adv_rate = [
                    adv_eps_s * (i >= num_adversaries) + adv_eps *
                    (i < num_adversaries) for i in range(num_agents)
                ]
            print("      adv rate for p_index : ", p_index, adv_rate)
            # Direction in action space that increases the policy loss,
            # normalised per sample and detached from the graph.
            raw_perturb = tf.gradients(pg_loss, act_input_n)
            perturb = [
                tf.stop_gradient(tf.nn.l2_normalize(elem, axis=1))
                for elem in raw_perturb
            ]
            perturb = [perturb[i] * adv_rate[i] for i in range(num_agents)]
            # The agent's own (policy-generated) action is never perturbed.
            new_act_n = [
                perturb[i] + act_input_n[i] if i != p_index else act_input_n[i]
                for i in range(len(act_input_n))
            ]

            adv_q_input = tf.concat(obs_ph_n + new_act_n, 1)
            adv_q = q_func(adv_q_input,
                           1,
                           scope="q_func",
                           reuse=True,
                           num_units=num_units)[:, 0]
            # Train against the critic evaluated on the perturbed actions.
            # The previous code recomputed the loss from the unperturbed `q`,
            # which left `adv_q` unused and made this branch a no-op.
            pg_loss = -tf.reduce_mean(adv_q)

        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars,
                                            grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n,
                           outputs=loss,
                           updates=[optimize_expr])
        act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)
        p_values = U.function([obs_ph_n[p_index]], p)

        # target network
        target_p = p_func(p_input,
                          int(act_pdtype_n[p_index].param_shape()[0]),
                          scope="target_p_func",
                          num_units=num_units)
        target_p_func_vars = U.scope_vars(
            U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample()
        target_act = U.function(inputs=[obs_ph_n[p_index]],
                                outputs=target_act_sample)

        return act, train, update_target_p, {
            'p_values': p_values,
            'target_act': target_act
        }
def p_train(make_obs_ph_n,
            act_space_n,
            p_index,
            p_func,
            q_func,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            num_units=64,
            scope="trainer",
            reuse=None):
    """Build the policy (actor) graph for agent ``p_index`` and its target copy.

    The policy is optimised to maximise the critic registered under scope
    ``"q_func"``; that scope is opened with ``reuse=True`` so the critic must
    already have been created inside ``scope``.

    Args:
        make_obs_ph_n: List of observation placeholders, used as-is.
        act_space_n: Action spaces; one action placeholder is made per entry.
        p_index: Index of the agent this policy belongs to.
        p_func: Policy network builder, called as
            ``p_func(obs, n_params, scope=..., num_units=...)``.
        q_func: Critic network builder (called with ``reuse=True``).
        optimizer: Optimizer passed to ``U.minimize_and_clip``.
        grad_norm_clipping: Clipping value passed to ``U.minimize_and_clip``.
        local_q_func: When truthy the critic only sees agent ``p_index``'s
            own observation and action.
        num_units: Forwarded to ``p_func`` and ``q_func``.
        scope: Name of the enclosing variable scope.
        reuse: Forwarded to ``tf.variable_scope``.

    Returns:
        ``(act, train, update_target_p, debug)`` where ``debug`` maps
        ``'p_values'`` and ``'target_act'`` to evaluation functions.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # One action-distribution type per agent's action space.
        pdtypes = [make_pdtype(space) for space in act_space_n]

        # Placeholders: observations are supplied by the caller; one action
        # placeholder is created per agent (leading dimension left open).
        obs_ph_n = make_obs_ph_n
        act_ph_n = []
        for k in range(len(act_space_n)):
            act_ph_n.append(
                pdtypes[k].sample_placeholder([None], name="action" + str(k)))

        # The policy only looks at this agent's own observation.
        own_pdtype = pdtypes[p_index]
        p_input = obs_ph_n[p_index]

        # Policy network; its output width is the distribution's parameter
        # count.
        p = p_func(p_input,
                   int(own_pdtype.param_shape()[0]),
                   scope="p_func",
                   num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # Wrap the raw network output in a distribution object.
        act_pd = own_pdtype.pdfromflat(p)
        act_sample = act_pd.sample()
        # Mean-square penalty on the distribution's flat parameters.
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        # Copy the fed actions and replace only this agent's slot with a
        # fresh sample from its policy; this is what connects the policy
        # network to the critic.
        act_input_n = list(act_ph_n)
        act_input_n[p_index] = act_pd.sample()

        # Critic input: all observations plus all actions, or only this
        # agent's pair when a local critic is requested.
        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)
        # reuse=True: use the critic variables that already exist in this
        # scope instead of creating new ones.
        q = q_func(q_input, 1, scope="q_func", reuse=True,
                   num_units=num_units)[:, 0]

        # Maximising mean Q is expressed as minimising its negative; the
        # small parameter penalty is added on top.
        pg_loss = -tf.reduce_mean(q)
        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars,
                                            grad_norm_clipping)

        # Callable wrappers around the graph.
        own_obs_only = obs_ph_n[p_index]
        train = U.function(inputs=obs_ph_n + act_ph_n,
                           outputs=loss,
                           updates=[optimize_expr])
        act = U.function(inputs=[own_obs_only], outputs=act_sample)
        p_values = U.function([own_obs_only], p)

        # Target policy: same architecture under its own scope.
        target_p = p_func(p_input,
                          int(own_pdtype.param_shape()[0]),
                          scope="target_p_func",
                          num_units=num_units)
        target_p_func_vars = U.scope_vars(
            U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = own_pdtype.pdfromflat(target_p).sample()
        target_act = U.function(inputs=[own_obs_only],
                                outputs=target_act_sample)

        debug = {'p_values': p_values, 'target_act': target_act}
        return act, train, update_target_p, debug
Пример #15
0
def p_train(make_obs_ph_n,
            make_meesages_ph_n,
            act_space_n,
            p_func,
            q_func,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            num_units=64,
            scope="trainer",
            reuse=None,
            beta=0.01):
    """Build message-passing policies for all agents under one joint loss.

    Each agent ``i`` gets a policy network ``p_func_i`` fed with its own
    observation and the concatenation of all message placeholders. Every
    ``p_func`` call must return four values: action parameters, a message,
    and the message's ``mu`` and ``logvar`` tensors. Critics are looked up
    under scopes ``q_func_i`` with ``reuse=True`` and therefore must already
    exist inside ``scope``.

    Args:
        make_obs_ph_n: List of observation placeholders, one per agent.
        make_meesages_ph_n: List of message placeholders.
        act_space_n: Action spaces, indexed by agent.
        p_func: Policy network builder (see above).
        q_func: Critic network builder (called with ``reuse=True``).
        optimizer: Optimizer passed to ``U.minimize_and_clip``.
        grad_norm_clipping: Clipping value passed to ``U.minimize_and_clip``.
        local_q_func: Not read by this function.
        num_units: Forwarded to ``p_func`` and ``q_func``.
        scope: Name of the enclosing variable scope.
        reuse: Forwarded to ``tf.variable_scope``.
        beta: Weight of the message KL-style term in the loss.

    Returns:
        ``(act, train, update_target_p, debug)`` where ``debug`` maps
        ``'p_values'`` and ``'target_act'`` to evaluation functions.
    """
    with tf.variable_scope(scope, reuse=reuse):
        num_agents = len(make_obs_ph_n)
        agent_ids = range(num_agents)

        # One action-distribution type per action space.
        pdtypes = [make_pdtype(space) for space in act_space_n]

        # Placeholders.
        obs_ph_n = make_obs_ph_n
        act_ph_n = []
        for k in agent_ids:
            act_ph_n.append(
                pdtypes[k].sample_placeholder([None], name="action" + str(k)))
        messages_ph_n = make_meesages_ph_n

        # Online policies: own observation + all messages concatenated.
        items = []
        for k in agent_ids:
            policy_in = [obs_ph_n[k], tf.concat(messages_ph_n, 1)]
            items.append(
                p_func(policy_in,
                       int(pdtypes[k].param_shape()[0]),
                       scope="p_func_{}".format(k),
                       num_units=num_units))
        # Transpose the per-agent 4-tuples into four per-quantity tuples.
        p_n, message_n, mu_message_n, logvar_message_n = zip(*items)

        # Clip so the KL-style term below cannot blow up.
        logvar_message_n = [
            tf.clip_by_value(raw, -10, 10) for raw in logvar_message_n
        ]

        p_func_vars = []
        for k in agent_ids:
            p_func_vars.append(
                U.scope_vars(U.absolute_scope_name("p_func_{}".format(k))))

        # Wrap the raw policy outputs in distribution objects.
        act_pd_n = [pdtypes[k].pdfromflat(p_n[k]) for k in agent_ids]

        act_sample_n = [pd.sample() for pd in act_pd_n]
        p_reg_n = [
            tf.reduce_mean(tf.square(pd.flatparam())) for pd in act_pd_n
        ]

        # For agent k: the fed actions with slot k replaced by a fresh
        # sample from agent k's own policy.
        act_input_n_n = [list(act_ph_n) for _ in agent_ids]
        for k in agent_ids:
            act_input_n_n[k][k] = act_pd_n[k].sample()
        q_input_n = []
        for joint_actions in act_input_n_n:
            q_input_n.append(
                tf.concat(obs_ph_n + messages_ph_n + joint_actions, 1))

        # Existing critics evaluated on those inputs.
        q_n = []
        for k in agent_ids:
            q_n.append(
                q_func(q_input_n[k],
                       1,
                       scope="q_func_{}".format(k),
                       reuse=True,
                       num_units=num_units)[:, 0])
        pg_loss_n = [-tf.reduce_mean(q_k) for q_k in q_n]

        # NOTE(review): this equals KL(N(mu, s^2) || N(0, 1)) only if the
        # clipped tensor is log(s) (log-std) rather than a log-variance --
        # confirm against p_func.
        kl_loss_message_n = [
            0.5 * (tf.pow(mean, 2) + tf.pow(tf.exp(clipped), 2)) - clipped -
            0.5 for mean, clipped in zip(mu_message_n, logvar_message_n)
        ]
        kl_loss_message = tf.reduce_mean(kl_loss_message_n)

        # Joint objective summed over agents.
        pg_loss = tf.reduce_sum(pg_loss_n)
        p_reg = tf.reduce_sum(p_reg_n)
        loss = pg_loss + p_reg * 1e-3 + beta * kl_loss_message

        # Flatten the per-agent variable lists into a single list.
        var_list = list(itertools.chain.from_iterable(p_func_vars))
        optimize_expr = U.minimize_and_clip(optimizer, loss, var_list,
                                            grad_norm_clipping)

        # Callable wrappers around the graph.
        train = U.function(inputs=obs_ph_n + messages_ph_n + act_ph_n,
                           outputs=loss,
                           updates=[optimize_expr])
        act = U.function(inputs=obs_ph_n + messages_ph_n,
                         outputs=[act_sample_n, message_n])
        p_values = U.function(inputs=obs_ph_n + messages_ph_n, outputs=p_n)

        # Target policies: same construction under target_p_func_* scopes.
        target_items = []
        for k in agent_ids:
            policy_in = [obs_ph_n[k], tf.concat(messages_ph_n, 1)]
            target_items.append(
                p_func(policy_in,
                       int(pdtypes[k].param_shape()[0]),
                       scope="target_p_func_{}".format(k),
                       num_units=num_units))

        (target_p_n, target_message_n, target_mu_message_n,
         target_logvar_message_n) = zip(*target_items)
        # Clipped like the online branch; not used further in this function.
        target_logvar_message_n = [
            tf.clip_by_value(raw, -10, 10) for raw in target_logvar_message_n
        ]

        target_p_func_vars = []
        for k in agent_ids:
            target_p_func_vars.append(
                U.scope_vars(
                    U.absolute_scope_name("target_p_func_{}".format(k))))

        target_var_list = list(
            itertools.chain.from_iterable(target_p_func_vars))
        update_target_p = make_update_exp(var_list, target_var_list)

        target_act_sample_n = [
            pdtypes[k].pdfromflat(target_p_n[k]).sample() for k in agent_ids
        ]
        target_act = U.function(
            inputs=obs_ph_n + messages_ph_n,
            outputs=[target_act_sample_n, target_message_n])

        debug = {'p_values': p_values, 'target_act': target_act}
        return act, train, update_target_p, debug
Пример #16
0
def c_next(make_obs_ph,
           act_space,
           c_ph,
           c_next_func,
           num_constraints,
           optimizer,
           grad_norm_clipping,
           num_units=64,
           reuse=False,
           scope="c_next"):
    """Build one next-step constraint model per constraint.

    For constraint ``k`` a network ``c_next_func`` (scope ``c_next_funck``)
    maps the observation input to a tensor ``g_k``; the prediction is
    ``reduce_sum(c_ph[k] + g_k * action, -1)`` and is regressed onto a
    target placeholder with a mean-squared-error loss.

    Args:
        make_obs_ph: Observation placeholder(s); passed to ``tf.concat(., 1)``.
        act_space: Sequence of action spaces; only ``act_space[0]`` is used.
        c_ph: Indexable collection of current-constraint placeholders.
        c_next_func: Network builder, called as
            ``c_next_func(input, n_out, scope=..., num_units=...)``.
        num_constraints: Number of constraint models to build.
        optimizer: Optimizer passed to ``U.minimize_and_clip``.
        grad_norm_clipping: Clipping value passed to ``U.minimize_and_clip``.
        num_units: Forwarded to ``c_next_func``.
        reuse: Forwarded to ``tf.variable_scope``.
        scope: Name of the enclosing variable scope.

    Returns:
        ``(train, c_next_values, g_next_values)``: three lists, each holding
        one function per constraint.
    """
    with tf.variable_scope(scope, reuse=reuse):
        constraint_ids = range(num_constraints)

        # set up placeholders
        act_pdtype = make_pdtype(act_space[0])
        obs_ph = make_obs_ph
        act_ph = act_pdtype.sample_placeholder([None], name="action")
        c_next_target_ph = [
            tf.placeholder(tf.float32, [None, 1], name="target" + str(k))
            for k in constraint_ids
        ]

        c_next_input = tf.concat(obs_ph, 1)
        # One sensitivity network per constraint; its output width is half
        # the action distribution's parameter count.
        gs_ = [
            c_next_func(c_next_input,
                        int((act_pdtype.param_shape()[0]) / 2),
                        scope="c_next_func" + str(k),
                        num_units=num_units) for k in constraint_ids
        ]

        # NOTE(review): the original author marked this formula as still to
        # be verified. If c_ph[k] has a trailing dimension of 1 it is
        # broadcast across the action dimension *before* the sum, i.e. it is
        # counted once per action component -- confirm intended shapes.
        c_ = [
            tf.reduce_sum(c_ph[k] + tf.multiply(gs_[k], act_ph), -1)
            for k in constraint_ids
        ]

        c_next_vars = [
            U.scope_vars(U.absolute_scope_name("c_next_func" + str(k)))
            for k in constraint_ids
        ]

        # NOTE(review): c_[k] has lost its last axis while the target
        # placeholder is [None, 1]; the subtraction will broadcast unless
        # the shapes are reconciled elsewhere -- confirm.
        diff = [(c_[k] - c_next_target_ph[k]) for k in constraint_ids]
        c_next_loss = [tf.reduce_mean(tf.square(d)) for d in diff]

        optimize_expr = [
            U.minimize_and_clip(optimizer, c_next_loss[k], c_next_vars[k],
                                grad_norm_clipping) for k in constraint_ids
        ]

        # Create callable functions.
        # One training function per constraint. The previous code omitted
        # the `for` clause here, so the list held a single function built
        # from whatever index an earlier loop had leaked (the last
        # constraint), and raised NameError when num_constraints was 0.
        train = [
            U.function(inputs=[obs_ph] + [act_ph] + [c_ph[k]] +
                       [c_next_target_ph[k]],
                       outputs=c_next_loss[k],
                       updates=[optimize_expr[k]]) for k in constraint_ids
        ]
        c_next_values = [
            U.function([obs_ph] + [act_ph] + [c_ph[k]], c_[k])
            for k in constraint_ids
        ]
        g_next_values = [
            U.function([obs_ph], gs_[k]) for k in constraint_ids
        ]
        return train, c_next_values, g_next_values
def q_train(make_obs_ph_n,
            act_space_n,
            q_index,
            q_func,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            scope="trainer",
            reuse=None,
            num_units=64):
    """Build the critic for agent ``q_index`` together with its target copy.

    Args:
        make_obs_ph_n: List of observation placeholders, used as-is.
        act_space_n: Action spaces; one action placeholder is made per entry.
        q_index: Index of the agent this critic belongs to.
        q_func: Network builder, called as
            ``q_func(input, 1, scope=..., num_units=...)``.
        optimizer: Optimizer passed to ``U.minimize_and_clip``.
        grad_norm_clipping: Clipping value passed to ``U.minimize_and_clip``.
        local_q_func: When truthy the critic only sees agent ``q_index``'s
            own observation and action; otherwise it sees everyone's.
        scope: Name of the enclosing variable scope.
        reuse: Forwarded to ``tf.variable_scope``.
        num_units: Forwarded to ``q_func``.

    Returns:
        ``(train, update_target_q, debug)`` where ``debug`` maps
        ``'q_values'`` and ``'target_q_values'`` to evaluation functions.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # One action-distribution type per agent's action space.
        pdtypes = [make_pdtype(space) for space in act_space_n]

        # Three groups of placeholders: supplied observations, per-agent
        # actions, and a 1-D regression target. The leading None leaves the
        # batch size open.
        obs_ph_n = make_obs_ph_n
        act_ph_n = []
        for k in range(len(act_space_n)):
            act_ph_n.append(
                pdtypes[k].sample_placeholder([None], name="action" + str(k)))
        target_ph = tf.placeholder(tf.float32, [None], name="target")

        # Critic input: every agent's observation and action joined along
        # axis 1, or just this agent's own pair for a local critic.
        q_input = tf.concat(obs_ph_n + act_ph_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)

        # Take column 0 of the single-output network for every row.
        q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0]
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # Mean squared error between the critic and the fed target.
        q_loss = tf.reduce_mean(tf.square(q - target_ph))

        # Mean-square penalty on q; built but currently not added to the loss.
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss

        # Optimisation step, with optional gradient clipping.
        optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars,
                                            grad_norm_clipping)

        # Callable wrappers around the graph.
        train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph],
                           outputs=loss,
                           updates=[optimize_expr])
        q_values = U.function(obs_ph_n + act_ph_n, q)

        # Target critic: same architecture and input, separate scope.
        target_q = q_func(q_input,
                          1,
                          scope="target_q_func",
                          num_units=num_units)[:, 0]
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)

        target_q_values = U.function(obs_ph_n + act_ph_n, target_q)

        # Everything returned can be called directly with values for the
        # placeholders it was built from.
        debug = {'q_values': q_values, 'target_q_values': target_q_values}
        return train, update_target_q, debug
Пример #18
0
def q_train(make_obs_ph_n,
            act_space_n,
            make_obs_history_n,
            make_act_history_n,
            q_index,
            q_func,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            scope="trainer",
            reuse=None,
            num_units=64):
    """Build a history-aware critic for agent ``q_index`` and its target copy.

    Args:
        make_obs_ph_n (list): Observation placeholders of all agents.
        act_space_n (list): Action spaces of all agents.
        make_obs_history_n (list): Observation-history placeholders.
        make_act_history_n (list): Action-history placeholders.
        q_index (int): Index of the agent this critic belongs to.
        q_func (function): Network builder, called as
            ``q_func(input, 1, scope=..., num_units=...)``.
        optimizer: Optimizer passed to ``tf_util.minimize_and_clip``.
        grad_norm_clipping (float): Clipping value passed to
            ``tf_util.minimize_and_clip``.
        local_q_func (bool): When truthy the critic input is only agent
            ``q_index``'s current observation and action (no history).
        scope (str): Name of the enclosing variable scope.
        reuse (bool): Forwarded to ``tf.variable_scope``.
        num_units (int): Forwarded to ``q_func``.

    Returns:
        train (function): Runs one optimisation step and returns the loss.
        update_target_q: Result of ``make_update_exp`` for the critic and
            target-critic variables.
        q_debug (dict): ``'q_values'`` and ``'target_q_values'`` evaluators.

    All returned functions take their inputs in the order observations,
    observation history, actions, action history (plus the target for
    ``train``), regardless of ``local_q_func``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # One action-distribution type per agent's action space.
        pdtypes = [make_pdtype(space) for space in act_space_n]

        # Placeholders: everything except actions and target is supplied.
        obs_ph_n = make_obs_ph_n
        obs_history_n = make_obs_history_n
        act_history_n = make_act_history_n
        act_ph_n = []
        for k in range(len(act_space_n)):
            act_ph_n.append(
                pdtypes[k].sample_placeholder([None], name="action" + str(k)))
        target_ph = tf.placeholder(tf.float32, [None], name="target")

        # Critic input: current observations/actions of all agents together
        # with their history placeholders, joined along axis 1.
        q_input = tf.concat(
            obs_ph_n + obs_history_n + act_ph_n + act_history_n, 1)
        if local_q_func:
            # Local ('ddpg'-style) critic: own observation and action only.
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)

        q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0]
        q_func_vars = tf_util.scope_vars(tf_util.absolute_scope_name("q_func"))

        # Plain mean squared error against the fed target. (The original
        # author sketched an extra CCM-based loss term at this point; it is
        # not implemented.)
        q_loss = tf.reduce_mean(tf.square(q - target_ph))

        # Mean-square penalty on q; built but currently not added to the loss.
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss

        optimize_expr = tf_util.minimize_and_clip(optimizer, loss, q_func_vars,
                                                  grad_norm_clipping)

        # Callable wrappers around the graph.
        train = tf_util.function(
            inputs=(obs_ph_n + obs_history_n + act_ph_n + act_history_n +
                    [target_ph]),
            outputs=loss,
            updates=[optimize_expr])
        q_values = tf_util.function(
            obs_ph_n + obs_history_n + act_ph_n + act_history_n, q)

        # Target critic: same architecture and input, separate scope.
        target_q = q_func(q_input,
                          1,
                          scope="target_q_func",
                          num_units=num_units)[:, 0]
        target_q_func_vars = tf_util.scope_vars(
            tf_util.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)

        target_q_values = tf_util.function(
            obs_ph_n + obs_history_n + act_ph_n + act_history_n, target_q)

        q_debug = {'q_values': q_values, 'target_q_values': target_q_values}
        return train, update_target_q, q_debug
Пример #19
0
def q_train(make_obs_ph_n,
            act_space_n,
            q_index,
            q_func,
            u_func,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            scope="trainer",
            reuse=None,
            num_units=64,
            u_estimation=False):
    """Build the critic for agent ``q_index`` plus a return-variance estimate.

    The critic ``q`` is regressed onto ``rew + target``. With
    ``u_estimation`` a second network ``u`` is regressed onto
    ``rew**2 + 2 * rew * target + target_u`` and the variance estimate is
    ``u - q**2``; otherwise it is ``(rew + target)**2 - q**2``.

    Args:
        make_obs_ph_n: List of observation placeholders, used as-is.
        act_space_n: Action spaces; one action placeholder is made per entry.
        q_index: Index of the agent this critic belongs to.
        q_func: Critic builder, called as
            ``q_func(input, 1, scope=..., num_units=...)``.
        u_func: Builder for the ``u`` network (same call shape); only called
            when ``u_estimation`` is truthy.
        optimizer: Optimizer passed to ``U.minimize_and_clip``.
        grad_norm_clipping: Clipping value passed to ``U.minimize_and_clip``.
        local_q_func: When truthy the critic only sees agent ``q_index``'s
            own observation and action (``u`` always sees everything).
        scope: Name of the enclosing variable scope.
        reuse: Forwarded to ``tf.variable_scope``.
        num_units: Forwarded to ``q_func`` and ``u_func``.
        u_estimation: Enables the ``u`` network described above.

    Returns:
        With ``u_estimation``: ``(train, update_target_q, update_target_u,
        debug)``; ``train`` takes observations, actions, target, target_u
        and reward and outputs ``[q_loss, u_loss]``.
        Without: ``(train, update_target_q, debug)``; ``train`` takes
        observations, actions, target and reward and outputs the loss.
        In both cases ``debug['var']`` takes observations, actions, target
        and reward.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # One action-distribution type per agent.
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]
        target_ph = tf.placeholder(tf.float32, [None], name="target")
        rew = tf.placeholder(tf.float32, [None], name="reward")
        if u_estimation:
            target_ph_u = tf.placeholder(tf.float32, [None], name="target_u")

        q_input = tf.concat(obs_ph_n + act_ph_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)
        q = q_func(q_input, 1, scope="q_func", num_units=num_units)[:, 0]
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        if u_estimation:
            u_input = tf.concat(obs_ph_n + act_ph_n, 1)
            u = u_func(u_input, 1, scope="u_func", num_units=num_units)[:, 0]
            # Regress u onto (rew + target)^2 expanded, with the squared
            # target replaced by the separately fed target_u.
            u_loss = tf.reduce_mean(
                tf.square(
                    tf.square(rew) + 2 * tf.multiply(rew, target_ph) +
                    target_ph_u - u))
            var = u - tf.square(q)
            u_func_vars = U.scope_vars(U.absolute_scope_name("u_func"))
        else:
            var = tf.square(rew + target_ph) - tf.square(q)

        q_loss = tf.reduce_mean(tf.square(q - (rew + target_ph)))

        # Mean-square penalty on q; built but currently not added to the loss.
        q_reg = tf.reduce_mean(tf.square(q))

        if u_estimation:
            # Both networks are trained jointly on the summed loss.
            loss = q_loss + u_loss
            optimize_expr = U.minimize_and_clip(optimizer, loss,
                                                q_func_vars + u_func_vars,
                                                grad_norm_clipping)
            train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] +
                               [target_ph_u] + [rew],
                               outputs=[q_loss, u_loss],
                               updates=[optimize_expr])
        else:
            loss = q_loss
            optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars,
                                                grad_norm_clipping)
            train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] +
                               [rew],
                               outputs=loss,
                               updates=[optimize_expr])
        q_values = U.function(obs_ph_n + act_ph_n, q)

        # Single variance evaluator for both modes. A second variant that
        # also took target_u used to be built in the u_estimation branch but
        # was always overwritten by this one, so it has been dropped; the
        # effective call signature is unchanged.
        var_fn = U.function(inputs=obs_ph_n + act_ph_n + [target_ph] + [rew],
                            outputs=var)

        # target network
        target_q = q_func(q_input,
                          1,
                          scope="target_q_func",
                          num_units=num_units)[:, 0]
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)

        target_q_values = U.function(obs_ph_n + act_ph_n, target_q)

        if not u_estimation:
            return train, update_target_q, {
                'q_values': q_values,
                'var': var_fn,
                'target_q_values': target_q_values
            }

        # Target copy of the u network and its evaluators.
        u_values = U.function(obs_ph_n + act_ph_n, u)
        target_u = u_func(u_input,
                          1,
                          scope="target_u_func",
                          num_units=num_units)[:, 0]
        target_u_func_vars = U.scope_vars(
            U.absolute_scope_name("target_u_func"))
        update_target_u = make_update_exp(u_func_vars, target_u_func_vars)
        target_u_values = U.function(obs_ph_n + act_ph_n, target_u)

        return train, update_target_q, update_target_u, {
            'q_values': q_values,
            'u_values': u_values,
            'var': var_fn,
            'target_q_values': target_q_values,
            'target_u_values': target_u_values
        }
Пример #20
0
def q_train(make_obs_ph_n,
            act_space_n,
            q_index,
            q_func,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            scope="trainer",
            reuse=None,
            num_units=64):
    """Build the critic graph plus helpers that work on flattened inputs.

    The trained Q network is regressed onto a fed-in target. The target Q
    network is built on two flat placeholders (one row of all observations
    and one row of all actions), and the normalised gradient of the target Q
    value with respect to the flat actions is exposed as well.

    Args:
        make_obs_ph_n: list of observation placeholders, one per agent.
        act_space_n: list of action spaces, one per agent.
        q_index: agent whose observation/action are used when
            ``local_q_func`` is set.
        q_func: network builder, called as
            ``q_func(inputs, 1, scope=..., num_units=...)``.
        optimizer: optimizer handed to ``U.minimize_and_clip``.
        grad_norm_clipping: clipping value forwarded to
            ``U.minimize_and_clip``.
        local_q_func: if true, the trained Q network only sees agent
            ``q_index``'s observation and action.
        scope: name of the variable scope wrapping the whole graph.
        reuse: ``reuse`` flag of that variable scope.
        num_units: forwarded to ``q_func``.

    Returns:
        ``(train, update_target_q, debug)`` where ``debug`` maps
        ``'q_values'``, ``'target_q_values'``, ``'act_serial_values'``,
        ``'obs_serial_values'`` and ``'grad_norm_value'`` to callables.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # One action-distribution type per agent.
        pd_types = [make_pdtype(space) for space in act_space_n]

        # Inputs: the given observation placeholders, fresh action
        # placeholders and the regression target.
        obs_inputs = make_obs_ph_n
        act_inputs = []
        for agent_id, pd_type in enumerate(pd_types):
            act_inputs.append(
                pd_type.sample_placeholder([None],
                                           name="action" + str(agent_id)))
        td_target = tf.placeholder(tf.float32, [None], name="target")

        # Serialise all actions into one row per sample.
        # NOTE(review): tf.shape is given the Python list of placeholders,
        # which presumably requires every agent to share one action shape.
        act_dims = tf.shape(act_inputs)
        act_joined = tf.concat(act_inputs, 1)
        act_rows = act_dims[1]
        act_cols = act_dims[0] * act_dims[-1]
        act_serial = tf.reshape(act_joined, [act_rows, act_cols])
        act_serial_values = U.function(act_inputs, act_serial)

        # Same serialisation for the observations.
        obs_dims = tf.shape(obs_inputs)
        obs_joined = tf.concat(obs_inputs, 1)
        obs_rows = obs_dims[1]
        obs_cols = obs_dims[0] * obs_dims[-1]
        obs_serial = tf.reshape(obs_joined, [obs_rows, obs_cols])
        obs_serial_values = U.function(obs_inputs, obs_serial)

        # Flat placeholders sized from the first agent's observation
        # placeholder and action space, times the number of agents.
        obs_flat_dim = len(obs_inputs) * int(obs_inputs[0].shape[-1])
        act_flat_dim = len(act_space_n) * int(act_space_n[0].shape[-1])
        obs_flat_ph = tf.placeholder(tf.float32,
                                     shape=[None, obs_flat_dim],
                                     name="obs_flat_input")
        act_flat_ph = tf.placeholder(tf.float32,
                                     shape=[None, act_flat_dim],
                                     name="act_flat_input")

        flat_target_in = tf.concat([obs_flat_ph, act_flat_ph], axis=-1)

        # Trained critic: all observations and actions, or only the local
        # pair when local_q_func is set.
        critic_in = tf.concat(obs_inputs + act_inputs, 1)
        if local_q_func:
            critic_in = tf.concat(
                [obs_inputs[q_index], act_inputs[q_index]], 1)
        critic_out = q_func(critic_in,
                            1,
                            scope="q_func",
                            num_units=num_units)
        q = critic_out[:, 0]
        critic_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        td_error = q - td_target
        q_loss = tf.reduce_mean(tf.square(td_error))

        # Regulariser on the Q magnitude; built but currently left out of
        # the loss.
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss  # + 1e-3 * q_reg

        optimize_expr = U.minimize_and_clip(optimizer, loss, critic_vars,
                                            grad_norm_clipping)

        # Callables around the trained critic.
        train = U.function(inputs=obs_inputs + act_inputs + [td_target],
                           outputs=loss,
                           updates=[optimize_expr])
        q_values = U.function(obs_inputs + act_inputs, q)

        # Target critic, fed from the flat placeholders.
        # NOTE(review): with local_q_func set, the trained critic's input
        # looks narrower than this flat input - confirm the two variable
        # sets still line up for make_update_exp.
        target_out = q_func(flat_target_in,
                            1,
                            scope="target_q_func",
                            num_units=num_units)
        target_q = target_out[:, 0]
        target_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(critic_vars, target_vars)

        flat_feeds = [obs_flat_ph, act_flat_ph]
        target_q_values = U.function(flat_feeds, target_q)

        # Gradient of the target Q value w.r.t. the flat actions, divided
        # by its norm.
        raw_grad = tf.gradients(target_q, act_flat_ph)
        grad_norm = tf.divide(raw_grad, tf.norm(raw_grad))
        grad_norm_value = U.function([obs_flat_ph, act_flat_ph], grad_norm)

        debug = {
            'q_values': q_values,
            'target_q_values': target_q_values,
            'act_serial_values': act_serial_values,
            'obs_serial_values': obs_serial_values,
            'grad_norm_value': grad_norm_value
        }
        return train, update_target_q, debug
Пример #21
0
def q_train(make_obs_ph_n,
            act_space_n,
            q_index,
            q_func,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            scope="trainer",
            reuse=None,
            num_units=64):
    """Build the critic (Q network) training graph and its target network.

    Args:
        make_obs_ph_n: list of observation placeholders, one per agent.
        act_space_n: list of action spaces, one per agent.
        q_index: agent whose observation/action are used when
            ``local_q_func`` is set.
        q_func: network builder, called as
            ``q_func(inputs, 1, scope=..., num_units=...)``.
        optimizer: optimizer handed to ``U.minimize_and_clip``.
        grad_norm_clipping: clipping value forwarded to
            ``U.minimize_and_clip``.
        local_q_func: if true, the critic only sees agent ``q_index``'s
            observation and action instead of everyone's.
        scope: name of the variable scope wrapping the whole graph.
        reuse: ``reuse`` flag of that variable scope.
        num_units: forwarded to ``q_func``.

    Returns:
        ``(train, update_target_q, debug)`` where ``debug`` maps
        ``'q_values'`` and ``'target_q_values'`` to callables.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # One action-distribution type per agent.
        pd_types = [make_pdtype(space) for space in act_space_n]

        # Inputs: the given observation placeholders, fresh action
        # placeholders and the regression target.
        obs_inputs = make_obs_ph_n
        act_inputs = []
        for agent_id, pd_type in enumerate(pd_types):
            act_inputs.append(
                pd_type.sample_placeholder([None],
                                           name="action" + str(agent_id)))
        td_target = tf.placeholder(tf.float32, [None], name="target")

        # Critic input: everything, or only the local pair.
        critic_in = tf.concat(obs_inputs + act_inputs, 1)
        if local_q_func:
            critic_in = tf.concat(
                [obs_inputs[q_index], act_inputs[q_index]], 1)

        # q is the first output column of the network built by q_func.
        critic_out = q_func(critic_in,
                            1,
                            scope="q_func",
                            num_units=num_units)
        q = critic_out[:, 0]
        # Variables created under the "q_func" scope.
        critic_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # Squared-error loss of the critic against the fed-in target.
        td_error = q - td_target
        q_loss = tf.reduce_mean(tf.square(td_error))

        # Regulariser on the Q magnitude; built but currently left out of
        # the loss.
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss  # + 1e-3 * q_reg

        optimize_expr = U.minimize_and_clip(optimizer, loss, critic_vars,
                                            grad_norm_clipping)

        # Wrap "feed inputs -> loss, run the optimisation op" in a callable.
        train = U.function(inputs=obs_inputs + act_inputs + [td_target],
                           outputs=loss,
                           updates=[optimize_expr])
        q_values = U.function(obs_inputs + act_inputs, q)

        # Target network: same inputs, separate "target_q_func" variables
        # that are only written through update_target_q.
        target_out = q_func(critic_in,
                            1,
                            scope="target_q_func",
                            num_units=num_units)
        target_q = target_out[:, 0]
        target_vars = U.scope_vars(U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(critic_vars, target_vars)

        # Callable evaluating the target network.
        target_q_values = U.function(obs_inputs + act_inputs, target_q)

        debug = {
            'q_values': q_values,
            'target_q_values': target_q_values
        }
        return train, update_target_q, debug
Пример #22
0
def p_train_adv(make_obs_ph_n,
                act_space_n,
                p_index,
                p_func,
                q_func,
                optimizer,
                grad_norm_clipping=None,
                local_q_func=False,
                num_units=64,
                scope="trainer",
                reuse=None):
    """Build the actor training graph with a value-minus-Q objective.

    For every discrete action ``i`` of agent ``p_index`` the critic is
    evaluated with that agent's action replaced by the one-hot action ``i``;
    the results are weighted and summed into ``v``. The policy loss is
    ``-mean(v - q)`` (with ``q`` the critic value of a sampled action) plus
    a small regulariser on the distribution parameters.

    Args:
        make_obs_ph_n: list of observation placeholders, one per agent.
        act_space_n: list of action spaces; ``act_space_n[p_index].n`` is
            read, so that space must expose ``n``.
        p_index: index of the agent whose policy is trained.
        p_func: policy network builder.
        q_func: critic network builder; reused from scope ``"q_func"``.
        optimizer: optimizer handed to ``U.minimize_and_clip``.
        grad_norm_clipping: clipping value forwarded to
            ``U.minimize_and_clip``.
        local_q_func: if true, the critic only sees this agent's own
            observation and action.
        num_units: forwarded to ``p_func`` and ``q_func``.
        scope: name of the variable scope wrapping the whole graph.
        reuse: ``reuse`` flag of that variable scope.

    Returns:
        ``(act, train, update_target_p, debug)`` where ``debug`` maps
        ``'p_values'`` and ``'target_act'`` to callables.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # create distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]

        p_input = obs_ph_n[p_index]

        p = p_func(p_input,
                   int(act_pdtype_n[p_index].param_shape()[0]),
                   scope="p_func",
                   num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # wrap parameters in distribution
        act_pd = act_pdtype_n[p_index].pdfromflat(p)

        act_sample = act_pd.sample()
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        # Critic input with this agent's action replaced by a fresh sample.
        act_input_n = act_ph_n + []
        sample = act_pd.sample()
        act_input_n[p_index] = sample

        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)
        q = q_func(q_input, 1, scope="q_func", reuse=True,
                   num_units=num_units)[:, 0]

        # Value estimate: one entry per batch row, accumulated over actions.
        n_actions = act_space_n[p_index].n
        v = tf.tile([0.0], [tf.shape(sample)[0]])
        for i in range(n_actions):
            # One-hot row for action i, repeated once per batch row.
            one_hot = np.zeros((1, n_actions), dtype=np.float32)
            one_hot[0, i] = 1
            one_hot = tf.convert_to_tensor(one_hot)
            one_hot = tf.tile(one_hot, [tf.shape(sample)[0], 1])

            act_input = act_ph_n + []
            act_input[p_index] = one_hot
            q_input_tmp = tf.concat(obs_ph_n + act_input, 1)
            if local_q_func:
                # Use the enumerated one-hot action here. The previous code
                # read act_input_n (the sampled action), so every term of
                # the sum evaluated the critic on the same action.
                q_input_tmp = tf.concat(
                    [obs_ph_n[p_index], act_input[p_index]], 1)

            # Weight of action i.
            # NOTE(review): this reads the raw logits, while the intent
            # appears to be the probability pi(a_i | s) - confirm whether a
            # softmax is missing.
            p_i = act_pd.logits[:, i]
            q_i = q_func(q_input_tmp,
                         1,
                         scope="q_func",
                         reuse=True,
                         num_units=num_units)[:, 0]
            v = tf.add(v, tf.multiply(q_i, p_i))

        # NOTE(review): the loss is -mean(v - q); an advantage is usually
        # q - v, so confirm the intended sign.
        value_minus_q = tf.subtract(v, q)
        pg_loss = -tf.reduce_mean(value_minus_q)

        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars,
                                            grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n,
                           outputs=loss,
                           updates=[optimize_expr])
        act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)
        p_values = U.function([obs_ph_n[p_index]], p)

        # target network
        target_p = p_func(p_input,
                          int(act_pdtype_n[p_index].param_shape()[0]),
                          scope="target_p_func",
                          num_units=num_units)
        target_p_func_vars = U.scope_vars(
            U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample()
        target_act = U.function(inputs=[obs_ph_n[p_index]],
                                outputs=target_act_sample)

        return act, train, update_target_p, {
            'p_values': p_values,
            'target_act': target_act
        }
Пример #23
0
def p_train(make_obs_ph_n,
            act_space_n,
            p_index,
            p_func,
            q_func,
            optimizer,
            index,
            grad_norm_clipping=None,
            local_q_func=False,
            num_units=64,
            scope="trainer",
            reuse=None,
            ensemble_num=5):
    """Build the actor training graph for one indexed policy of an agent.

    ``index`` is appended to the action placeholder names and to the
    ``p_func`` / ``target_p_func`` scope names, so several policies can be
    built side by side. The critic is reused from scope ``"q_func"``.

    Args:
        make_obs_ph_n: list of observation placeholders, one per agent.
        act_space_n: list of action spaces, one per agent.
        p_index: index of the agent whose policy is trained.
        p_func: policy network builder.
        q_func: critic network builder.
        optimizer: optimizer handed to ``U.minimize_and_clip``.
        index: suffix used in placeholder and scope names.
        grad_norm_clipping: clipping value forwarded to
            ``U.minimize_and_clip``.
        local_q_func: if true, the critic only sees this agent's own
            observation and action.
        num_units: forwarded to ``p_func`` and ``q_func``.
        scope: name of the variable scope wrapping the whole graph.
        reuse: ``reuse`` flag of that variable scope.
        ensemble_num: accepted but not read inside this function.

    Returns:
        ``(act, train, update_target_p, debug)`` where ``debug`` maps
        ``'p_values'`` and ``'target_act'`` to callables.
    """
    policy_scope = "p_func" + str(index)
    target_scope = "target_p_func" + str(index)

    with tf.variable_scope(scope, reuse=reuse):
        # One action-distribution type per agent.
        pd_types = [make_pdtype(space) for space in act_space_n]

        # Observation placeholders are given; action placeholders are
        # created here, named with both the policy index and the agent id.
        obs_inputs = make_obs_ph_n
        act_inputs = []
        for agent_id, pd_type in enumerate(pd_types):
            act_inputs.append(
                pd_type.sample_placeholder([None],
                                           name="action" + str(index) +
                                           str(agent_id)))

        # The policy only sees this agent's own observation.
        own_obs = obs_inputs[p_index]
        own_pd_type = pd_types[p_index]

        p = p_func(own_obs,
                   int(own_pd_type.param_shape()[0]),
                   scope=policy_scope,
                   num_units=num_units)
        policy_vars = U.scope_vars(U.absolute_scope_name(policy_scope))

        # Wrap the network output in an action distribution and sample it.
        act_pd = own_pd_type.pdfromflat(p)
        act_sample = act_pd.sample()

        # Mean squared distribution parameter, used as a regulariser.
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        # Critic input: the fed actions, with this agent's entry replaced
        # by a fresh sample from the policy.
        critic_acts = list(act_inputs)
        critic_acts[p_index] = act_pd.sample()
        critic_in = tf.concat(obs_inputs + critic_acts, 1)
        if local_q_func:
            critic_in = tf.concat([own_obs, critic_acts[p_index]], 1)
        critic_out = q_func(critic_in,
                            1,
                            scope="q_func",
                            reuse=True,
                            num_units=num_units)
        q = critic_out[:, 0]

        # Minimising -mean(q) pushes the policy towards higher Q values.
        pg_loss = -tf.reduce_mean(q)
        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, policy_vars,
                                            grad_norm_clipping)

        # Callables: one training step, action sampling, raw parameters.
        train = U.function(inputs=obs_inputs + act_inputs,
                           outputs=loss,
                           updates=[optimize_expr])
        act = U.function(inputs=[own_obs], outputs=act_sample)
        p_values = U.function([own_obs], p)

        # Target policy network with its own variables.
        target_p = p_func(own_obs,
                          int(own_pd_type.param_shape()[0]),
                          scope=target_scope,
                          num_units=num_units)
        target_vars = U.scope_vars(U.absolute_scope_name(target_scope))
        # Per the original author's note this is a soft (incremental)
        # update of the target variables.
        update_target_p = make_update_exp(policy_vars, target_vars)

        # Action sampled from the target policy.
        target_act_sample = own_pd_type.pdfromflat(target_p).sample()
        target_act = U.function(inputs=[own_obs],
                                outputs=target_act_sample)

        debug = {
            'p_values': p_values,
            'target_act': target_act
        }
        return act, train, update_target_p, debug
Пример #24
0
def p_train(make_obs_ph_n,
            act_space_n,
            p_index,
            p_func,
            q_func,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            num_units=64,
            scope="trainer",
            reuse=None):
    """Build the actor training graph and emit TensorBoard summaries.

    Besides the loss, every training step returns a merged summary holding
    the policy loss, the mean squared ``act_pd.std`` and the histogram
    returned by ``U.minimize_and_clip``. The action distribution must
    expose ``std``, ``mean`` and ``logstd``.

    Args:
        make_obs_ph_n: list of observation placeholders, one per agent.
        act_space_n: list of action spaces, one per agent.
        p_index: index of the agent whose policy is trained.
        p_func: policy network builder.
        q_func: critic network builder; reused from scope ``"q_func"``.
        optimizer: optimizer handed to ``U.minimize_and_clip``.
        grad_norm_clipping: clipping value forwarded to
            ``U.minimize_and_clip``.
        local_q_func: if true, the critic only sees this agent's own
            observation and action.
        num_units: forwarded to ``p_func`` and ``q_func``.
        scope: name of the variable scope wrapping the whole graph.
        reuse: ``reuse`` flag of that variable scope.

    Returns:
        ``(act, train, update_target_p, debug)``. ``train`` outputs
        ``[loss, merged_summary]``; ``debug['p_values']`` outputs
        ``[mean, logstd]`` and ``debug['target_act']`` samples the target
        policy.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # One action-distribution type per agent.
        pd_types = [make_pdtype(space) for space in act_space_n]

        # Given observation placeholders, fresh action placeholders.
        obs_inputs = make_obs_ph_n
        act_inputs = []
        for agent_id, pd_type in enumerate(pd_types):
            act_inputs.append(
                pd_type.sample_placeholder([None],
                                           name="action" + str(agent_id)))

        own_obs = obs_inputs[p_index]
        own_pd_type = pd_types[p_index]

        p = p_func(own_obs,
                   int(own_pd_type.param_shape()[0]),
                   scope="p_func",
                   num_units=num_units)
        policy_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # Wrap the network output in an action distribution.
        act_pd = own_pd_type.pdfromflat(p)

        act_sample = act_pd.sample()
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        # Critic input: the fed actions, with this agent's entry replaced
        # by a fresh sample from the policy.
        critic_acts = list(act_inputs)
        critic_acts[p_index] = act_pd.sample()
        critic_in = tf.concat(obs_inputs + critic_acts, 1)
        if local_q_func:
            critic_in = tf.concat([own_obs, critic_acts[p_index]], 1)
        critic_out = q_func(critic_in,
                            1,
                            scope="q_func",
                            reuse=True,
                            num_units=num_units)
        q = critic_out[:, 0]
        pg_loss = -tf.reduce_mean(q)

        # Scalar summaries: policy loss and mean squared std.
        p_loss_summary = tf.summary.scalar('p_loss', pg_loss)
        mean_sq_std = tf.reduce_mean(tf.square(act_pd.std))
        p_cov_summary = tf.summary.scalar('p_cov', mean_sq_std)

        loss = pg_loss + p_reg * 1e-3

        # This U.minimize_and_clip variant also returns a histogram summary.
        optimize_expr, hist = U.minimize_and_clip(optimizer,
                                                  loss,
                                                  policy_vars,
                                                  grad_norm_clipping,
                                                  histogram_name='p_gradient')

        merged_summary = tf.summary.merge(
            [p_loss_summary, p_cov_summary, hist])

        # Callables: one training step, action sampling, mean/logstd.
        train = U.function(inputs=obs_inputs + act_inputs,
                           outputs=[loss, merged_summary],
                           updates=[optimize_expr])
        act = U.function(inputs=[own_obs], outputs=act_sample)
        p_values = U.function([own_obs], [act_pd.mean, act_pd.logstd])

        # Target policy network with its own variables.
        target_p = p_func(own_obs,
                          int(own_pd_type.param_shape()[0]),
                          scope="target_p_func",
                          num_units=num_units)
        target_vars = U.scope_vars(U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(policy_vars, target_vars)

        target_act_sample = own_pd_type.pdfromflat(target_p).sample()
        target_act = U.function(inputs=[own_obs],
                                outputs=target_act_sample)

        debug = {
            'p_values': p_values,
            'target_act': target_act
        }
        return act, train, update_target_p, debug
Пример #25
0
def dqn_train(make_obs_ph_n,
              act_space_n,
              p_index,
              p_func,
              q_func,
              optimizer,
              grad_norm_clipping=None,
              local_q_func="dqn",
              num_units=64,
              scope="trainer",
              reuse=None):
    """Build a training graph that regresses the policy output on a target.

    The network built by ``p_func`` is evaluated on agent ``p_index``'s
    observation; its outputs are summed over axis 1 and fitted to the
    fed-in ``target`` placeholder with a squared-error loss.

    Args:
        make_obs_ph_n: list of observation placeholders, one per agent.
        act_space_n: list of action spaces, one per agent.
        p_index: index of the agent whose network is trained.
        p_func: network builder.
        q_func: accepted for signature compatibility; not read here.
        optimizer: optimizer handed to ``U.minimize_and_clip``.
        grad_norm_clipping: clipping value forwarded to
            ``U.minimize_and_clip``.
        local_q_func: accepted for signature compatibility; not read here.
        num_units: forwarded to ``p_func``.
        scope: name of the variable scope wrapping the whole graph.
        reuse: ``reuse`` flag of that variable scope.

    Returns:
        ``(act, train, update_target_p, debug)`` where ``debug`` maps
        ``'p_values'`` and ``'target_act'`` to callables.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # create distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        # The action placeholders are inputs of `train` below although the
        # loss does not read them.
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]

        p_input = obs_ph_n[p_index]

        p = p_func(p_input,
                   int(act_pdtype_n[p_index].param_shape()[0]),
                   scope="p_func",
                   num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # wrap parameters in distribution
        act_pd = act_pdtype_n[p_index].pdfromflat(p)
        act_sample = act_pd.sample()

        target_ph = tf.placeholder(tf.float32, [None], name="target")

        # `axis` replaces the deprecated `reduction_indices` keyword; the
        # reduction itself is unchanged.
        tf_p = tf.reduce_sum(p, axis=1)
        loss = tf.reduce_mean(tf.square(tf_p - target_ph))
        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars,
                                            grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph],
                           outputs=loss,
                           updates=[optimize_expr])
        act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)
        p_values = U.function([obs_ph_n[p_index]], p)

        # target network
        target_p = p_func(p_input,
                          int(act_pdtype_n[p_index].param_shape()[0]),
                          scope="target_p_func",
                          num_units=num_units)
        target_p_func_vars = U.scope_vars(
            U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample()
        target_act = U.function(inputs=[obs_ph_n[p_index]],
                                outputs=target_act_sample)

        return act, train, update_target_p, {
            'p_values': p_values,
            'target_act': target_act
        }
Пример #26
0
def p_train(name,
            make_obs_ph_n,
            adj_n,
            act_space_n,
            neighbor_n,
            p_index,
            p_func,
            q_func,
            num_adversaries,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            num_units=128,
            scope="trainer",
            reuse=None):
    """Build one shared actor graph for a whole group of agents.

    The group is agents ``[0, num_adversaries)`` when ``name`` equals
    ``"adversaries"`` and ``[num_adversaries, agent_n)`` otherwise. A single
    ``p_func`` call produces the parameters for every agent of the group;
    the loss averages the critics ``q_func_<k>`` of all group members.

    Args:
        name: group selector; ``"adversaries"`` picks the first
            ``num_adversaries`` agents, anything else picks the rest.
        make_obs_ph_n: list of observation placeholders, one per agent.
        adj_n: list of per-agent inputs, sliced like the observations and
            passed as the second input of ``p_func``.
        act_space_n: list of action spaces, one per agent.
        neighbor_n: forwarded to ``p_func``; also sizes the ``"vec"`` input.
        p_index: accepted but not read inside this function.
        p_func: policy network builder for the whole group.
        q_func: critic network builder; reused from scopes ``"q_func_<k>"``.
        num_adversaries: number of agents in the adversary group.
        optimizer: optimizer handed to ``U.minimize_and_clip``.
        grad_norm_clipping: clipping value forwarded to
            ``U.minimize_and_clip``.
        local_q_func: accepted but not read inside this function.
        num_units: forwarded to ``p_func`` and ``q_func``.
        scope: name of the variable scope wrapping the whole graph.
        reuse: ``reuse`` flag of that variable scope.

    Returns:
        ``(act, train, update_target_p, p_values, target_act)``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # create distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]
        agent_n = len(obs_ph_n)
        vec_n = U.BatchInput([1, neighbor_n], name="vec").get()

        # Agent indices of the group handled here. `first` is also the
        # offset between a global agent index and its position in the group.
        if name == "adversaries":
            first, last = 0, num_adversaries
        else:
            first, last = num_adversaries, agent_n
        group_ids = range(first, last)
        group_size = last - first

        p_input1 = obs_ph_n[first:last]
        p_input2 = adj_n[first:last]
        p_input3 = vec_n
        # Inputs shared by every callable that only runs the policy.
        policy_inputs = p_input1 + p_input2 + [p_input3]

        # call for actor network
        # NOTE(review): the per-agent output size is hard-coded to 5 and is
        # not derived from act_space_n.
        p = p_func(p_input1,
                   p_input2,
                   p_input3,
                   neighbor_n,
                   group_size,
                   5,
                   scope="p_func",
                   num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # wrap parameters in distribution
        act_pd = []
        act_sample = []
        for i in group_ids:
            act_pd_temp = act_pdtype_n[i].pdfromflat(p[i - first])
            act_pd.append(act_pd_temp)
            act_sample.append(act_pd_temp.sample())

        flat_params = [pd.flatparam() for pd in act_pd]

        # NOTE(review): the original author questioned whether this
        # regulariser (mean square over all group parameters) is correct.
        p_reg = tf.reduce_mean(tf.square(flat_params))

        # Critic input: the fed actions, with the group's entries replaced
        # by the sampled ones.
        act_input_n = act_ph_n + []
        for i in group_ids:
            act_input_n[i] = act_sample[i - first]

        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        # One critic per group member. The former `q_reduce_mean += ...`
        # accumulator was dropped: it extended a Python list by iterating a
        # symbolic tensor (a TypeError in graph mode) and was never read.
        q = []
        for a in group_ids:
            index = a - first
            q.append(
                q_func(q_input,
                       1,
                       scope="q_func_%d" % index,
                       reuse=True,
                       num_units=num_units)[:, 0])
        pg_loss = -tf.reduce_mean(q)

        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars,
                                            grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n + adj_n + [vec_n],
                           outputs=loss,
                           updates=[optimize_expr])
        act = U.function(inputs=policy_inputs,
                         outputs=act_sample,
                         list_output=True)
        p_values = U.function(policy_inputs, p, list_output=True)

        # target network
        target_p = p_func(p_input1,
                          p_input2,
                          p_input3,
                          neighbor_n,
                          group_size,
                          5,
                          scope="target_p_func",
                          num_units=num_units)
        target_p_func_vars = U.scope_vars(
            U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars,
                                          target_p_func_vars,
                                          central=True)

        target_act_sample = [
            act_pdtype_n[i].pdfromflat(target_p[i - first]).sample()
            for i in group_ids
        ]
        target_act = U.function(inputs=policy_inputs,
                                outputs=target_act_sample,
                                list_output=True)

        return act, train, update_target_p, p_values, target_act
Пример #27
0
def p_train(make_obs_ph_n,
            act_space_n,
            p_index,
            p_func,
            q_func,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            num_units=64,
            scope="trainer",
            reuse=None):
    """Build the actor training graph plus a critic-gradient helper.

    In addition to the usual policy objective (``-mean(q)`` plus a small
    parameter regulariser), the critic in scope ``"q_func"`` is evaluated
    once more on two flat placeholders, and the normalised gradient of
    that value with respect to the flat actions is exposed as a callable.

    Args:
        make_obs_ph_n: list of observation placeholders, one per agent.
        act_space_n: list of action spaces, one per agent.
        p_index: index of the agent whose policy is trained.
        p_func: policy network builder.
        q_func: critic network builder; reused from scope ``"q_func"``.
        optimizer: optimizer handed to ``U.minimize_and_clip``.
        grad_norm_clipping: clipping value forwarded to
            ``U.minimize_and_clip``.
        local_q_func: if true, the critic used in the loss only sees this
            agent's own observation and action.
        num_units: forwarded to ``p_func`` and ``q_func``.
        scope: name of the variable scope wrapping the whole graph.
        reuse: ``reuse`` flag of that variable scope.

    Returns:
        ``(act, train, update_target_p, debug)`` where ``debug`` maps
        ``'p_values'``, ``'target_act'`` and ``'grad_norm_value'`` to
        callables.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # One action-distribution type per agent.
        pd_types = [make_pdtype(space) for space in act_space_n]

        # Given observation placeholders, fresh action placeholders.
        obs_inputs = make_obs_ph_n
        act_inputs = []
        for agent_id, pd_type in enumerate(pd_types):
            act_inputs.append(
                pd_type.sample_placeholder([None],
                                           name="action" + str(agent_id)))

        own_obs = obs_inputs[p_index]
        own_pd_type = pd_types[p_index]

        p = p_func(own_obs,
                   int(own_pd_type.param_shape()[0]),
                   scope="p_func",
                   num_units=num_units)
        policy_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # Wrap the network output in an action distribution.
        act_pd = own_pd_type.pdfromflat(p)

        act_sample = act_pd.sample()
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        # Critic input: the fed actions, with this agent's entry replaced
        # by a fresh sample from the policy.
        critic_acts = list(act_inputs)
        critic_acts[p_index] = act_pd.sample()
        critic_in = tf.concat(obs_inputs + critic_acts, 1)
        if local_q_func:
            critic_in = tf.concat([own_obs, critic_acts[p_index]], 1)
        critic_out = q_func(critic_in,
                            1,
                            scope="q_func",
                            reuse=True,
                            num_units=num_units)
        q = critic_out[:, 0]
        pg_loss = -tf.reduce_mean(q)

        # ---- critic gradient on flattened inputs ----
        # Flat widths come from the first agent's observation placeholder
        # and action space, times the number of agents.
        obs_flat_dim = len(obs_inputs) * int(obs_inputs[0].shape[-1])
        act_flat_dim = len(act_space_n) * int(act_space_n[0].shape[-1])
        obs_flat_ph = tf.placeholder(tf.float32,
                                     shape=[None, obs_flat_dim],
                                     name="obs_flat_input")
        act_flat_ph = tf.placeholder(tf.float32,
                                     shape=[None, act_flat_dim],
                                     name="act_flat_input")

        # NOTE(review): with local_q_func set, the reused critic was built
        # on a narrower input than this flat one - confirm that case works.
        flat_critic_in = tf.concat([obs_flat_ph, act_flat_ph], axis=-1)
        flat_critic_out = q_func(flat_critic_in,
                                 1,
                                 scope="q_func",
                                 reuse=True,
                                 num_units=num_units)
        serial_q = flat_critic_out[:, 0]

        # Gradient of that Q value w.r.t. the flat actions, divided by its
        # norm.
        raw_grad = tf.gradients(serial_q, act_flat_ph)
        grad_norm = tf.divide(raw_grad, tf.norm(raw_grad))
        grad_norm_value = U.function([obs_flat_ph, act_flat_ph], grad_norm)
        # ---- end of critic gradient section ----

        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, policy_vars,
                                            grad_norm_clipping)

        # Callables: one training step, action sampling, raw parameters.
        train = U.function(inputs=obs_inputs + act_inputs,
                           outputs=loss,
                           updates=[optimize_expr])
        act = U.function(inputs=[own_obs], outputs=act_sample)
        p_values = U.function([own_obs], p)

        # Target policy network with its own variables.
        target_p = p_func(own_obs,
                          int(own_pd_type.param_shape()[0]),
                          scope="target_p_func",
                          num_units=num_units)
        target_vars = U.scope_vars(U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(policy_vars, target_vars)

        target_act_sample = own_pd_type.pdfromflat(target_p).sample()
        target_act = U.function(inputs=[own_obs],
                                outputs=target_act_sample)

        debug = {
            'p_values': p_values,
            'target_act': target_act,
            'grad_norm_value': grad_norm_value
        }
        return act, train, update_target_p, debug
Пример #28
0
def p_train_recurrent(make_obs_ph_n,
                      make_state_ph_n,
                      make_obs_next_n,
                      make_obs_pred_n,
                      act_space_n,
                      p_index,
                      p_policy,
                      p_predict,
                      q_func,
                      optimizer,
                      grad_norm_clipping=None,
                      local_q_func=False,
                      num_units=64,
                      scope="trainer",
                      reuse=None):
    """Build the training graph of a recurrent policy with an observation predictor.

    Two networks are trained jointly: ``p_policy`` (action distribution
    parameters, an intermediate output and a new state) and ``p_predict``
    (a prediction of the next observation from the action and that
    intermediate output). The loss is ``-mean(q)`` from the reused critic
    plus small weights on a parameter regulariser and on the squared
    prediction error.

    Args:
        make_obs_ph_n: list of observation placeholders, one per agent.
        make_state_ph_n: list of recurrent-state placeholders, one per agent.
        make_obs_next_n: list of next-observation placeholders, one per agent.
        make_obs_pred_n: list of predicted-observation placeholders, one per
            agent; the selected one is fed into ``p_policy``.
        act_space_n: list of action spaces, one per agent.
        p_index: index of the agent whose policy is trained.
        p_policy: policy network builder returning three values.
        p_predict: observation-prediction network builder.
        q_func: critic network builder; reused from scope ``"q_func"``.
        optimizer: optimizer handed to ``U.minimize_and_clip``.
        grad_norm_clipping: clipping value forwarded to
            ``U.minimize_and_clip``.
        local_q_func: if true, the critic only sees this agent's own
            observation and action.
        num_units: forwarded to the network builders; also the width of the
            ``gru_out`` placeholder created below.
        scope: name of the variable scope wrapping the whole graph.
        reuse: ``reuse`` flag of that variable scope.

    Returns:
        ``(step, predict, train, update_target_p, debug)`` where ``debug``
        maps ``'p_values'``, ``'target_step'`` and ``'target_predict'`` to
        callables.
    """

    with tf.variable_scope(scope, reuse=reuse):
        # create distributions

        # set up placeholders
        # Original note: all obs, in shape
        # Agent_num * batch_size * time_step * obs_shape.
        # NOTE(review): obs_input.shape[1] is used below as the size of the
        # predicted observation - confirm the actual layout.
        obs_ph_n = make_obs_ph_n  # all obs, in shape Agent_num * batch_size * time_step * obs_shape
        obs_next_n = make_obs_next_n
        state_ph_n = make_state_ph_n
        obs_pred_n = make_obs_pred_n

        # used for action
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]

        # Per-agent slices: everything the policy and predictor of agent
        # p_index consume.
        obs_input = obs_ph_n[p_index]
        state_input = state_ph_n[p_index]
        act_input = act_ph_n[p_index]
        obs_next = obs_next_n[p_index]
        obs_pred_input = obs_pred_n[p_index]

        # get output and state
        p, gru_out, state = p_policy(
            obs_input,
            state_input,
            obs_pred_input,
            int(act_pdtype_n[p_index].param_shape()[0]),
            scope="p_policy",
            num_units=num_units)
        act_pd = act_pdtype_n[p_index].pdfromflat(
            p)  # wrap parameters in distribution
        act_sample = act_pd.sample()  # sample an action

        # predict the next obs from the fed action and the policy's
        # intermediate output
        obs_pred = p_predict(act_input,
                             gru_out,
                             int(obs_input.shape[1]),
                             scope="p_predict",
                             num_units=num_units)

        # variables for optimization: both networks are trained together
        p_func_vars = U.scope_vars(
            U.absolute_scope_name("p_policy")) + U.scope_vars(
                U.absolute_scope_name("p_predict"))

        pred_loss = tf.reduce_mean(tf.square(obs_next -
                                             obs_pred))  # predict loss
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))  # reg item
        # use critic net to get the loss about policy
        act_input_n = act_ph_n + []
        act_input_n[p_index] = act_pd.sample(
        )  # only modify the action of this agent
        q_input = tf.concat(
            obs_ph_n + act_input_n,
            1)  # get the input for Q net (all obs + all action)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)
        q = q_func(q_input, 1, scope="q_func", reuse=True,
                   num_units=num_units)[:, 0]  # get q values
        pg_loss = -tf.reduce_mean(q)  # calculate loss to maximize Q values

        # Regulariser and prediction error both enter with weight 1e-3.
        loss = pg_loss + p_reg * 1e-3 + pred_loss * 1e-3
        optimize_expr = U.minimize_and_clip(
            optimizer, loss, p_func_vars,
            grad_norm_clipping)  # update p Net parameters

        # Create callable functions
        # update P NET
        train = U.function(inputs=obs_ph_n + state_ph_n + act_ph_n +
                           obs_next_n + obs_pred_n,
                           outputs=loss,
                           updates=[optimize_expr])
        # return action, new state and the policy's intermediate output
        step = U.function(inputs=[obs_ph_n[p_index]] + [state_ph_n[p_index]] +
                          [obs_pred_n[p_index]],
                          outputs=[act_sample] + [state] + [gru_out])
        p_values = U.function(inputs=[obs_ph_n[p_index]] +
                              [state_ph_n[p_index]] + [obs_pred_n[p_index]],
                              outputs=p)

        # target network
        target_p, target_gru_out, target_state = \
            p_policy(obs_input, state_input, obs_pred_input,
                     int(act_pdtype_n[p_index].param_shape()[0]), scope="target_p_policy", num_units=num_units)
        # The returned tensor is not used afterwards.
        # NOTE(review): presumably this call exists so that the
        # "target_p_predict" variables are created before they are collected
        # below - confirm.
        target_obs_pred = p_predict(act_input,
                                    target_gru_out,
                                    int(obs_input.shape[1]),
                                    scope="target_p_predict",
                                    num_units=num_units)

        target_p_func_vars = U.scope_vars(U.absolute_scope_name("target_p_policy")) + \
                             U.scope_vars(U.absolute_scope_name("target_p_predict"))
        # update the parameters θ'i = τθi + (1 − τ)θ'i
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)
        target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample()

        target_step = U.function(inputs=[obs_ph_n[p_index]] +
                                 [state_ph_n[p_index]] + [obs_pred_n[p_index]],
                                 outputs=[target_act_sample] + [target_state] +
                                 [target_gru_out])

        # return predicted obs: the predictors are rebuilt on a placeholder
        # so they can be run without running the policy
        gru_temp = tf.placeholder(tf.float32, [None] + [num_units],
                                  name='gru_out')
        # NOTE(review): "p_predict" and "target_p_predict" are built a second
        # time here without an explicit reuse flag; this presumably relies on
        # p_predict sharing variables per scope internally - confirm.
        pred_temp = p_predict(act_input,
                              gru_temp,
                              int(obs_input.shape[1]),
                              scope="p_predict",
                              num_units=num_units)
        predict = U.function(inputs=[act_ph_n[p_index]] + [gru_temp],
                             outputs=pred_temp)
        target_pred_temp = p_predict(act_input,
                                     gru_temp,
                                     int(obs_input.shape[1]),
                                     scope="target_p_predict",
                                     num_units=num_units)
        target_predict = U.function(inputs=[act_ph_n[p_index]] + [gru_temp],
                                    outputs=target_pred_temp)

        return step, predict, train, update_target_p, {
            'p_values': p_values,
            'target_step': target_step,
            'target_predict': target_predict
        }
Пример #29
0
def p_train(make_obs_ph_n,
            act_space_n,
            make_obs_history_n,
            make_act_history_n,
            p_index,
            p_func,
            q_func,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            num_units=64,
            scope="trainer",
            reuse=None):
    """
    Build the policy (actor) training graph for one agent, guided by its Q-value.

    The policy sees the agent's current observation concatenated with its
    observation history. The critic (``q_func``, reused from an existing
    "q_func" scope) sees the current and historical observations and actions
    of all agents, with this agent's action replaced by a sample from the
    policy so that the Q-value is differentiable w.r.t. the policy variables.

    Args:
        make_obs_ph_n (list): Observation placeholders, one per agent
        act_space_n (list): A list of the action spaces for all agents
        make_obs_history_n (list): Observation-history placeholders, one per agent
        make_act_history_n (list): Action-history placeholders, one per agent
        p_index (int): Agent index number
        p_func (function): Model builder for the policy network
        q_func (function): Model builder for the Q network
        optimizer (function): Network Optimizer function
        grad_norm_clipping (float): Value by which to clip the norm of the gradient
        local_q_func (boolean): Flag for using a local q function (critic
            only sees this agent's own observation and action)
        num_units (int): The number outputs for the layers of the model
        scope (str): The name of the scope
        reuse (boolean): Flag specifying whether to reuse the scope

    Returns:
        act (function): Action function; takes this agent's observation and
            observation history and returns a sampled action.
        train (function): Training function for P network. Inputs, in order:
            all observations, all observation histories, all actions, all
            action histories, then the "ccm", "lambda" and "switch" values.
        update_target_p (function): Update function for updating P network values
        p_debug (dict): Contains 'p_values' and 'target_act' of the P network
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Create distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # Set up placeholders
        obs_ph_n = make_obs_ph_n
        obs_history_n = make_obs_history_n
        act_history_n = make_act_history_n

        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]
        # Extra per-sample inputs that blend the Q-based objective with an
        # externally supplied "ccm" term (see pg_loss below).
        ccm_ph_n = [
            tf.placeholder(tf.float32, [None], name="ccm" + str(p_index))
        ]
        ccm_lambda = [
            tf.placeholder(tf.float32, [None], name="lambda" + str(p_index))
        ]
        ccm_switch = [
            tf.placeholder(tf.float32, [None], name="switch" + str(p_index))
        ]

        # Policy input: current observation plus observation history.
        p_input = tf.concat([obs_ph_n[p_index], obs_history_n[p_index]], 1)

        p = p_func(p_input,
                   int(act_pdtype_n[p_index].param_shape()[0]),
                   scope="p_func",
                   num_units=num_units)
        p_func_vars = tf_util.scope_vars(tf_util.absolute_scope_name("p_func"))

        # Wrap parameters in distribution
        act_pd = act_pdtype_n[p_index].pdfromflat(p)

        act_sample = act_pd.sample()
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        # Shallow copy of the action placeholders with this agent's entry
        # replaced by a sample from its own policy.
        act_input_n = act_ph_n + []
        act_input_n[p_index] = act_pd.sample()

        # Critic input: current plus previous time-steps for all agents.
        # Use act_input_n (not act_ph_n) so that q depends on the policy
        # output; feeding only placeholders would leave pg_loss with no
        # gradient path to p_func_vars.
        q_input = tf.concat(
            obs_ph_n + obs_history_n + act_input_n + act_history_n, 1)

        if local_q_func:
            # Only have observations about myself when 'ddpg'
            # Importantly... [my.x, my.y, my.dx, my.dy, r1.x]
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)

        q = q_func(q_input, 1, scope="q_func", reuse=True,
                   num_units=num_units)[:, 0]

        # Blend of the usual policy-gradient objective (-mean(q)) and the
        # fed "ccm" value, weighted by lambda and gated by switch.
        # NOTE(review): the ccm/lambda/switch placeholders have shape [None],
        # so pg_loss (and therefore loss) is not reduced to a scalar here;
        # confirm that this is intended.
        pg_loss = -(1 - ccm_lambda[0]) * tf.reduce_mean(q) * (1 - ccm_switch[0]) - \
                  ccm_lambda[0] * ccm_ph_n[0] * ccm_switch[0]

        loss = pg_loss + p_reg * 1e-3

        optimize_expr = tf_util.minimize_and_clip(optimizer, loss, p_func_vars,
                                                  grad_norm_clipping)

        # Create callable functions
        train = tf_util.function(inputs=obs_ph_n + obs_history_n + act_ph_n +
                                 act_history_n + ccm_ph_n + ccm_lambda +
                                 ccm_switch,
                                 outputs=loss,
                                 updates=[optimize_expr])

        # Acting only needs this agent's own observation and history.
        act = tf_util.function(inputs=[obs_ph_n[p_index]] +
                               [obs_history_n[p_index]],
                               outputs=act_sample)
        p_values = tf_util.function(inputs=[obs_ph_n[p_index]] +
                                    [obs_history_n[p_index]],
                                    outputs=p)

        # target network
        target_p = p_func(p_input,
                          int(act_pdtype_n[p_index].param_shape()[0]),
                          scope="target_p_func",
                          num_units=num_units)
        target_p_func_vars = tf_util.scope_vars(
            tf_util.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample()

        target_act = tf_util.function(inputs=[obs_ph_n[p_index]] +
                                      [obs_history_n[p_index]],
                                      outputs=target_act_sample)

        return act, train, update_target_p, {
            'p_values': p_values,
            'target_act': target_act
        }
Пример #30
0
def p_train(make_obs_ph_n,
            act_space_n,
            p_index,
            p_func,
            q_func,
            optimizer,
            num_outputs,
            grad_norm_clipping=None,
            local_q_func=False,
            num_units=64,
            scope="coma_trainer",
            reuse=None):
    """
    Build the COMA-style policy (actor) training graph for one agent.

    The actor maps the agent's local observation to action-distribution
    parameters. The loss is the negated mean advantage (Q-value of the picked
    action minus a counterfactual baseline) plus a small regulariser on the
    distribution parameters.

    Args:
        make_obs_ph_n (list): Observation placeholders, one per agent
        act_space_n (list): A list of the action spaces for all agents
        p_index (int): Agent index number
        p_func (function): Model builder for the policy network
        q_func (function): Model builder for the Q network
        optimizer (function): Network Optimizer function
        num_outputs (int): Number of outputs requested from the Q network
        grad_norm_clipping (float): Value by which to clip the norm of the gradient
        local_q_func (boolean): Flag for using a local q function
        num_units (int): The number outputs for the layers of the model
        scope (str): The name of the scope
        reuse (boolean): Flag specifying whether to reuse the scope

    Returns:
        act (function): Maps this agent's observation to a sampled action.
        train (function): Training function; inputs are all observations
            followed by all actions, output is the loss.
        update_target_p (function): Update function for the target P network
        p_debug (dict): Contains 'p_values' and 'target_act' of the P network
    """
    # The sub-scope names are bound once and used both to build the networks
    # and to look their variables up again. Looking up "p_func" /
    # "target_p_func" while the networks live under "coma_p_func" /
    # "coma_target_p_func" would not find the variables that were created.
    p_scope = "coma_p_func"
    target_p_scope = "coma_target_p_func"

    with tf.variable_scope(scope, reuse=reuse):
        # create distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]

        # set up placeholders
        obs_ph_n = make_obs_ph_n
        # Actions are fed as integer indices, one per sample.
        act_ph_n = [
            tf.placeholder(tf.int32, [None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]

        # The actor's input is the agent's local observation only.
        p_input = obs_ph_n[p_index]

        p = p_func(p_input,
                   int(act_pdtype_n[p_index].param_shape()[0]),
                   scope=p_scope,
                   num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name(p_scope))

        # wrap parameters in distribution
        act_pd = act_pdtype_n[p_index].pdfromflat(p)

        # Probability of each action.
        act_sample = act_pd.sample()
        # sample() is a Gumbel-softmax; COMA training needs one specific
        # action, hence the argmax over each row.
        # NOTE(review): this and the comprehensions further down iterate over
        # `act_sample` / `q` with plain Python loops and call `.tolist()` on
        # the rows. If these are symbolic graph tensors with an unknown batch
        # dimension that will not work -- confirm, and consider graph ops
        # (e.g. tf.argmax) instead.
        act_picked = [act.tolist().index(max(act)) for act in act_sample]

        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))

        # `+ []` makes a shallow copy so act_ph_n itself is left untouched
        # by the item assignment below.
        act_input_n = act_ph_n + []
        # Replace the current agent's action with the one picked by its policy.
        act_input_n[p_index] = act_picked
        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)
        q = q_func(q_input,
                   num_outputs,
                   scope="coma_q_func",
                   reuse=True,
                   num_units=num_units)

        # Counterfactual baseline, one value per sample.
        baseline = [
            baseline_calculation(act_distribute, q_list)
            for act_distribute, q_list in zip(act_sample, q)
        ]
        # Q-value of the action that was actually picked.
        actual_picked_q = [q_list[act] for act, q_list in zip(act_picked, q)]
        # Advantage: picked action's Q minus the counterfactual baseline.
        a = [picked_q - b for picked_q, b in zip(actual_picked_q, baseline)]

        pg_loss = -tf.reduce_mean(a)

        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss, p_func_vars,
                                            grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n,
                           outputs=loss,
                           updates=[optimize_expr])
        act = U.function(inputs=[obs_ph_n[p_index]], outputs=act_sample)
        p_values = U.function([obs_ph_n[p_index]], p)

        # target network
        target_p = p_func(p_input,
                          int(act_pdtype_n[p_index].param_shape()[0]),
                          scope=target_p_scope,
                          num_units=num_units)
        target_p_func_vars = U.scope_vars(
            U.absolute_scope_name(target_p_scope))
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)

        target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample()
        target_act = U.function(inputs=[obs_ph_n[p_index]],
                                outputs=target_act_sample)

        return act, train, update_target_p, {
            'p_values': p_values,
            'target_act': target_act
        }