Example #1
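These four snippets follow the TF1-style trainer code from the openai/maddpg repository and are not self-contained. They assume the imports below; the module paths are an assumption based on that codebase (adjust them to your project layout), and make_update_exp, used by Examples #1, #2, and #4, is defined in Example #3.

import tensorflow as tf
import maddpg.common.tf_util as U                    # assumed helper module (U.function, U.scope_vars, ...)
from maddpg.common.distributions import make_pdtype  # assumed action-distribution factory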
def c_train(make_obs_ph_n,
            make_target_loc_ph_n,
            c_index,
            c_func,
            q_func,
            optimizer,
            scope="trainer",
            num_units=128,
            grad_norm_clipping=None,
            reuse=tf.AUTO_REUSE,
            local_q_func=False):
    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        obs_ph_n = make_obs_ph_n
        target_loc_ph = make_target_loc_ph_n[c_index]  # float32 placeholder of shape [None, 2] for the target location
        self_obs_ph = obs_ph_n[c_index]
        labels_ph = tf.placeholder(tf.float32, [None, 2], name="labels")
        # classifier ("prior") network: 2-way softmax over (own observation, target location)
        c_input = tf.concat((self_obs_ph, target_loc_ph), 1)
        c = c_func(c_input, 2, scope="c_func", type='cls', num_units=num_units)
        c_pred = tf.nn.softmax(c)
        c_flags = tf.greater(c_pred[:, 0], 0.5)  # boolean flag: True when class 0 is the more likely class
        c_func_vars = U.scope_vars(U.absolute_scope_name("c_func"))
        # loss and optimization
        loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=c,
                                                    labels=labels_ph))
        optimize_expr = U.minimize_and_clip(optimizer, loss, c_func_vars,
                                            grad_norm_clipping)
        # Create callable functions
        c_train = U.function(
            inputs=[obs_ph_n[c_index], target_loc_ph, labels_ph],
            outputs=loss,
            updates=[optimize_expr])
        c_act = U.function(inputs=[obs_ph_n[c_index], target_loc_ph],
                           outputs=c_flags)
        c_values = U.function([obs_ph_n[c_index], target_loc_ph],
                              outputs=c_pred)
        # target network
        target_c_values = c_func(c_input,
                                 2,
                                 scope="target_c_func",
                                 type='cls',
                                 num_units=num_units)
        target_c_pred = tf.nn.softmax(target_c_values)
        target_c_flags = tf.greater(target_c_pred[:, 0], 0.5)
        target_c_func_vars = U.scope_vars(
            U.absolute_scope_name("target_c_func"))
        update_target_c = make_update_exp(c_func_vars, target_c_func_vars)
        target_c_act = U.function(inputs=[obs_ph_n[c_index], target_loc_ph],
                                  outputs=target_c_flags)
        return c_act, c_train, update_target_c, {
            'c_values': c_values,
            'target_c_act': target_c_act
        }
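A minimal usage sketch for the classifier trainer, under the imports above. mlp_model, the placeholder shapes, and the batch arrays are all illustrative assumptions, not part of the original code:

# Hypothetical single-agent setup; shapes and mlp_model are assumptions.
obs_ph_n = [tf.placeholder(tf.float32, [None, 4], name="obs0")]
target_loc_ph_n = [tf.placeholder(tf.float32, [None, 2], name="target_loc0")]
c_act, c_update, update_target_c, c_debug = c_train(
    make_obs_ph_n=obs_ph_n,
    make_target_loc_ph_n=target_loc_ph_n,
    c_index=0,
    c_func=mlp_model,  # assumed builder: (input, num_outputs, scope, type, num_units) -> logits
    q_func=mlp_model,  # unused by c_train; kept for the uniform trainer signature
    optimizer=tf.train.AdamOptimizer(1e-3))
# inside a default session (e.g. U.single_threaded_session()), initialize variables first:
U.initialize()
loss = c_update(obs_batch, loc_batch, onehot_labels)  # one supervised step; labels are one-hot, shape [batch, 2]
update_target_c()                                     # soft-update the target classifier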
Example #2
def q_train(make_obs_ph_n,
            act_space_n,
            q_index,
            q_func,
            optimizer,
            grad_norm_clipping=None,
            local_q_func=False,
            scope="trainer",
            reuse=tf.AUTO_REUSE,
            num_units=128):
    with tf.variable_scope(scope, reuse=reuse):
        # create action distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]
        # set up placeholders
        obs_ph_n = make_obs_ph_n
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]
        target_ph = tf.placeholder(tf.float32, [None], name="target")
        # q network: centralized critic over all agents' observations and actions
        q_input = tf.concat(obs_ph_n + act_ph_n, 1)
        if local_q_func:
            # DDPG-style local critic: condition only on this agent's observation and action
            q_input = tf.concat([obs_ph_n[q_index], act_ph_n[q_index]], 1)
        q = q_func(q_input, 1, scope="q_func", type='fit',
                   num_units=num_units)[:, 0]
        q_func_vars = U.scope_vars(U.absolute_scope_name("q_func"))
        q_loss = tf.reduce_mean(tf.square(q - target_ph))
        # optional regularizer ("viscosity solution" to the Bellman equation in place of an initial condition)
        q_reg = tf.reduce_mean(tf.square(q))
        loss = q_loss  # + 1e-3 * q_reg to enable the regularizer (disabled here)
        optimize_expr = U.minimize_and_clip(optimizer, loss, q_func_vars,
                                            grad_norm_clipping)
        # Create callable functions
        train = U.function(inputs=obs_ph_n + act_ph_n + [target_ph],
                           outputs=loss,
                           updates=[optimize_expr])
        q_values = U.function(obs_ph_n + act_ph_n, q)
        # target network
        target_q = q_func(q_input,
                          1,
                          scope="target_q_func",
                          type='fit',
                          num_units=num_units)[:, 0]
        target_q_func_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(q_func_vars, target_q_func_vars)
        target_q_values = U.function(obs_ph_n + act_ph_n, target_q)
        return train, update_target_q, {
            'q_values': q_values,
            'target_q_values': target_q_values
        }
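The critic trainer is used the same way; a hedged sketch of one TD update, where act_space_n, mlp_model, the 0.95 discount, and the batch arrays are illustrative:

train_q, update_target_q, q_debug = q_train(
    make_obs_ph_n=obs_ph_n,
    act_space_n=act_space_n,  # e.g. the gym action spaces of all agents
    q_index=0,
    q_func=mlp_model,
    optimizer=tf.train.AdamOptimizer(1e-3))
# TD target from the target critic: y = r + gamma * Q'(s', a')
target_q_next = q_debug['target_q_values'](*(obs_next_n + act_next_n))
y = rew_batch + 0.95 * target_q_next
q_loss = train_q(*(obs_n + act_n + [y]))
update_target_q()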
Example #3
def make_update_exp(vals, target_vals):
    polyak = 1.0 - 1e-2  # soft-update rate: target <- 0.99 * target + 0.01 * source
    expression = []
    # pair source and target variables by name so each assign hits its counterpart
    for var, var_target in zip(sorted(vals, key=lambda v: v.name),
                               sorted(target_vals, key=lambda v: v.name)):
        expression.append(
            var_target.assign(polyak * var_target + (1.0 - polyak) * var))
    expression = tf.group(*expression)
    return U.function([], [], updates=[expression])
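Each call to the returned function moves every target variable 1% of the way toward its source twin, so against a frozen source the target closes a fraction 1 - 0.99^k of the gap after k calls. A plain-Python check of that schedule (illustrative, not part of the graph):

polyak = 0.99
for k in (1, 10, 100, 460):
    print(k, round(1.0 - polyak ** k, 2))  # fraction of the gap closed after k calls
# -> 0.01, 0.1, 0.63, 0.99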
Example #4
def p_m_train(make_obs_ph_n,
              make_message_ph_n,
              act_space_n,
              num_agents_obs,
              p_index,
              m_func,
              p_func,
              q_func,
              optimizer,
              grad_norm_clipping=None,
              local_q_func=False,
              num_units=128,
              scope="trainer",
              reuse=tf.AUTO_REUSE):
    with tf.variable_scope(scope, reuse=reuse):
        # create action distributions
        act_pdtype_n = [make_pdtype(act_space) for act_space in act_space_n]
        # set up placeholders
        obs_ph_n = make_obs_ph_n
        message_ph_n = make_message_ph_n
        act_ph_n = [
            act_pdtype_n[i].sample_placeholder([None], name="action" + str(i))
            for i in range(len(act_space_n))
        ]
        blz_distribution = tf.placeholder(tf.float32,
                                          [None, act_space_n[p_index].n],
                                          name="blz_distribution")
        m_input = message_ph_n[p_index]
        encode_dim = m_input.get_shape().as_list()[-1]
        # message encoder
        message_encode = m_func(m_input,
                                encode_dim,
                                num_agents_obs,
                                scope='m_func',
                                num_units=num_units)
        m_func_vars = U.scope_vars(U.absolute_scope_name("m_func"))
        # policy
        p_input = tf.concat((obs_ph_n[p_index], message_encode), 1)
        p = p_func(p_input,
                   int(act_pdtype_n[p_index].param_shape()[0]),
                   scope="p_func",
                   type='fit',
                   num_units=num_units)
        p_func_vars = U.scope_vars(U.absolute_scope_name("p_func"))
        # wrap parameters in distribution
        act_pd = act_pdtype_n[p_index].pdfromflat(p)
        act_sample = act_pd.sample()
        p_reg = tf.reduce_mean(tf.square(act_pd.flatparam()))
        act_input_n = list(act_ph_n)  # shallow copy; this agent's entry is replaced below
        # correlation regularizer (currently disabled; see the loss below)
        k = tf.keras.losses.KLDivergence()
        # KL_reg = k(blz_distribution, act_sample)
        # q network: swap in this agent's differentiable sampled action
        act_input_n[p_index] = act_pd.sample()
        q_input = tf.concat(obs_ph_n + act_input_n, 1)
        if local_q_func:
            q_input = tf.concat([obs_ph_n[p_index], act_input_n[p_index]], 1)
        q = q_func(q_input,
                   1,
                   scope="q_func",
                   type='fit',
                   reuse=True,
                   num_units=num_units)[:, 0]
        # loss and optimization
        pg_loss = -tf.reduce_mean(q)
        loss = pg_loss  # + KL_reg * 1e-2 to enable the correlation regularizer
        # note: minimize_and_clip expects a flat variable list, so the policy and
        # encoder variables are concatenated rather than nested in a sub-list
        optimize_expr = U.minimize_and_clip(optimizer, loss,
                                            p_func_vars + m_func_vars,
                                            grad_norm_clipping)
        # Create callable functions
        train = U.function(inputs=obs_ph_n + message_ph_n + act_ph_n +
                           [blz_distribution],
                           outputs=loss,
                           updates=[optimize_expr])
        act = U.function(inputs=[obs_ph_n[p_index], message_ph_n[p_index]],
                         outputs=act_sample)
        p_values = U.function([obs_ph_n[p_index], message_ph_n[p_index]],
                              outputs=p)
        # target network
        target_message_encode = m_func(m_input,
                                       encode_dim,
                                       num_agents_obs,
                                       scope='target_m_func',
                                       num_units=num_units)
        target_m_func_vars = U.scope_vars(
            U.absolute_scope_name("target_m_func"))
        target_p_input = tf.concat((obs_ph_n[p_index], target_message_encode), 1)
        target_p = p_func(target_p_input,
                          int(act_pdtype_n[p_index].param_shape()[0]),
                          scope="target_p_func",
                          type='fit',
                          num_units=num_units)
        target_p_func_vars = U.scope_vars(
            U.absolute_scope_name("target_p_func"))
        update_target_m = make_update_exp(m_func_vars, target_m_func_vars)
        update_target_p = make_update_exp(p_func_vars, target_p_func_vars)
        target_act_sample = act_pdtype_n[p_index].pdfromflat(target_p).sample()
        target_act = U.function(
            inputs=[obs_ph_n[p_index], message_ph_n[p_index]],
            outputs=target_act_sample)
        return act, train, update_target_p, update_target_m, {
            'p_values': p_values,
            'target_act': target_act
        }
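A hedged sketch of wiring the actor/message trainer to the pieces above. Note that p_m_train builds its critic with reuse=True, so q_train must already have created the "q_func" scope under the same outer scope. message_model, the message placeholders, and the batch arrays are assumptions:

act, train_p, update_target_p, update_target_m, p_debug = p_m_train(
    make_obs_ph_n=obs_ph_n,
    make_message_ph_n=message_ph_n,  # assumed [None, encode_dim] placeholders, one per agent
    act_space_n=act_space_n,
    num_agents_obs=num_agents_obs,
    p_index=0,
    m_func=message_model,            # assumed encoder: (input, encode_dim, num_agents_obs, scope, num_units)
    p_func=mlp_model,
    q_func=mlp_model,
    optimizer=tf.train.AdamOptimizer(1e-4))
p_loss = train_p(*(obs_n + message_n + act_n + [blz_batch]))  # blz_batch: [batch, n_actions] Boltzmann targets
update_target_p()
update_target_m()
actions = act(obs_n[0], message_n[0])  # sample actions for agent 0 from a batch of obs/messages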