예제 #1
0
def q_train_ddpg(observPlaceHolderList, actionSpaceList, q_index, q_func, optimizer, grad_norm_clipping=None, scope="trainer", reuse=None, num_units=64):
    """Build the critic (Q-function) training graph for one DDPG agent.

    The critic network only consumes the observation and action placeholders
    at ``q_index``. The returned callables are nevertheless declared with the
    full lists of observation and action placeholders as inputs.

    Args:
        observPlaceHolderList: list of observation placeholders, one per agent.
        actionSpaceList: action spaces, one per agent; each is handed to
            ``make_pdtype`` to obtain the matching action placeholder.
        q_index: index of the agent whose critic is being built.
        q_func: network builder, called as
            ``q_func(input, 1, scope=..., num_units=...)``.
        optimizer: forwarded to ``U.minimize_and_clip``.
        grad_norm_clipping: forwarded to ``U.minimize_and_clip``.
        scope: name of the enclosing ``tf.variable_scope``.
        reuse: ``reuse`` flag of the enclosing ``tf.variable_scope``.
        num_units: forwarded to ``q_func``.

    Returns:
        A tuple ``(train, update_target_q, debug)`` where ``debug`` is a dict
        with the keys ``'q_values'`` and ``'target_q_values'``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        pd_types = [make_pdtype(space) for space in actionSpaceList]

        # One action placeholder per agent, then the regression target for Q.
        action_inputs = [
            pd_type.sample_placeholder([None], name="action" + str(idx))
            for idx, pd_type in enumerate(pd_types)
        ]
        td_target = tf.placeholder(tf.float32, [None], name="target")

        # DDPG-specific: the critic sees only this agent's own observation
        # and action rather than the concatenation over all agents.
        critic_input = tf.concat(
            [observPlaceHolderList[q_index], action_inputs[q_index]], 1)
        q_estimate = q_func(
            critic_input, 1, scope="q_func", num_units=num_units)[:, 0]
        critic_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        # Plain mean-squared error against the target; no regularisation term.
        loss = tf.reduce_mean(tf.square(q_estimate - td_target))

        optimize_expr = U.minimize_and_clip(
            optimizer, loss, critic_vars, grad_norm_clipping)

        # Callable wrappers around the graph.
        train = U.function(
            inputs=observPlaceHolderList + action_inputs + [td_target],
            outputs=loss,
            updates=[optimize_expr])
        q_values = U.function(observPlaceHolderList + action_inputs, q_estimate)

        # Target network: same input, separate variables under "target_q_func".
        target_q = q_func(
            critic_input, 1, scope="target_q_func", num_units=num_units)[:, 0]
        target_critic_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(critic_vars, target_critic_vars)

        target_q_values = U.function(
            observPlaceHolderList + action_inputs, target_q)

    debug = {'q_values': q_values, 'target_q_values': target_q_values}
    return train, update_target_q, debug
예제 #2
0
def p_train(make_obs_ph_n, act_space_n, p_index, p_func, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, num_units=64, scope="trainer", reuse=None):
    """Build the policy (actor) training graph for the agent at ``p_index``.

    The policy is trained to maximise the critic's value of an action sampled
    from the policy, with a small penalty on the squared policy parameters.

    Args:
        make_obs_ph_n: list of observation placeholders, one per agent.
        act_space_n: action spaces, one per agent; each is handed to
            ``make_pdtype``.
        p_index: index of the agent whose policy is being built.
        p_func: policy network builder, called as
            ``p_func(obs, n_params, scope=..., num_units=...)``.
        q_func: critic network builder; called with ``reuse=True`` under the
            scope ``"q_func"``, so those variables must already exist
            (presumably created by the matching q_train call -- confirm).
        optimizer: forwarded to ``U.minimize_and_clip``.
        grad_norm_clipping: forwarded to ``U.minimize_and_clip``.
        local_q_func: when true the critic input is only this agent's own
            observation and sampled action (DDPG); otherwise it is the
            concatenation of all observations and all actions.
        num_units: forwarded to ``p_func`` and ``q_func``.
        scope: name of the enclosing ``tf.variable_scope``.
        reuse: ``reuse`` flag of the enclosing ``tf.variable_scope``.

    Returns:
        A tuple ``(act, train, update_target_p, debug)`` where ``debug`` is a
        dict with the keys ``'p_values'`` and ``'target_act'``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Distribution types, one per agent (the original author noted these
        # as SoftCategoricalPdType objects in their setup).
        pd_types = [make_pdtype(space) for space in act_space_n]

        # Placeholders. In the author's run the 0th action placeholder was
        # named 'agent_0_1/action0:0' with shape (?, 5).
        obs_inputs = make_obs_ph_n
        action_inputs = [
            pd_type.sample_placeholder([None], name="action" + str(idx))
            for idx, pd_type in enumerate(pd_types)
        ]

        own_obs = obs_inputs[p_index]

        # Policy network: maps this agent's observation to the flat
        # parameters of its action distribution.
        flat_params = p_func(
            own_obs,
            int(pd_types[p_index].param_shape()[0]),
            scope="p_func",
            num_units=num_units)
        policy_vars = U.scope_vars(U.absolute_scope_name("p_func"))

        # Wrap the flat parameters in a distribution object.
        policy_pd = pd_types[p_index].pdfromflat(flat_params)

        sampled_action = policy_pd.sample()
        param_penalty = tf.reduce_mean(tf.square(policy_pd.flatparam()))

        # Copy the placeholder list and substitute this agent's slot with a
        # (second, independent) sample so gradients flow into the policy.
        critic_actions = list(action_inputs)
        critic_actions[p_index] = policy_pd.sample()

        # The joint input is always built first, then overridden for DDPG,
        # which uses only the agent's own observation and action.
        critic_input = tf.concat(obs_inputs + critic_actions, 1)
        if local_q_func:
            critic_input = tf.concat([own_obs, critic_actions[p_index]], 1)

        q_estimate = q_func(
            critic_input, 1, scope="q_func", reuse=True,
            num_units=num_units)[:, 0]
        pg_loss = -tf.reduce_mean(q_estimate)

        loss = pg_loss + param_penalty * 1e-3

        optimize_expr = U.minimize_and_clip(
            optimizer, loss, policy_vars, grad_norm_clipping)

        # Callable wrappers around the graph.
        train = U.function(
            inputs=obs_inputs + action_inputs,
            outputs=loss,
            updates=[optimize_expr])
        act = U.function(inputs=[obs_inputs[p_index]], outputs=sampled_action)
        p_values = U.function([obs_inputs[p_index]], flat_params)

        # Target policy: same input, separate variables under "target_p_func".
        target_flat_params = p_func(
            own_obs,
            int(pd_types[p_index].param_shape()[0]),
            scope="target_p_func",
            num_units=num_units)
        target_policy_vars = U.scope_vars(
            U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(policy_vars, target_policy_vars)

        target_sample = pd_types[p_index].pdfromflat(
            target_flat_params).sample()
        target_act = U.function(
            inputs=[obs_inputs[p_index]], outputs=target_sample)

    debug = {'p_values': p_values, 'target_act': target_act}
    return act, train, update_target_p, debug
예제 #3
0
def make_update_exp(vals, target_vals, polyak=1.0 - 1e-2):
    """Build a callable that soft-updates target variables toward source ones.

    Both collections are sorted by ``.name`` and paired positionally; each
    target variable is then assigned
    ``polyak * target + (1 - polyak) * source``.

    Args:
        vals: source variables (each must expose ``.name``).
        target_vals: target variables (each must expose ``.name`` and
            ``.assign``).
        polyak: weight kept on the old target value. Defaults to
            ``1.0 - 1e-2``, the value that used to be hard-coded here.

    Returns:
        The result of ``U.function([], [], updates=[...])`` wrapping the
        grouped assignment ops.

    Note:
        Pairing uses ``zip``, so if the two collections differ in length the
        surplus variables of the longer one are silently left out.
    """
    def _by_name(variable):
        return variable.name

    paired = zip(sorted(vals, key=_by_name), sorted(target_vals, key=_by_name))
    assignments = [
        target.assign(polyak * target + (1.0 - polyak) * source)
        for source, target in paired
    ]
    grouped = tf.group(*assignments)
    return U.function([], [], updates=[grouped])
예제 #4
0
def p_train(observPlaceHolderList, actionSpaceList, agentIndex, getMLPModel, q_func, optimizer,
            grad_norm_clipping=None, ddpg=False, num_units=64, scope="trainer", reuse=None):
    """Build the policy (actor) training graph for the agent at ``agentIndex``.

    The loss is the negated mean critic value of an action sampled from the
    policy, plus ``1e-3`` times the mean squared policy parameters.

    Args:
        observPlaceHolderList: list of observation placeholders, one per agent.
        actionSpaceList: action spaces, one per agent; each is handed to
            ``make_pdtype``.
        agentIndex: index of the agent whose policy is being built.
        getMLPModel: policy network builder, called as
            ``getMLPModel(obs, n_params, scope=..., num_units=...)``.
        q_func: critic network builder; called with ``reuse=True`` under the
            scope ``"q_func"``, so those variables must already exist.
        optimizer: forwarded to ``U.minimize_and_clip``.
        grad_norm_clipping: forwarded to ``U.minimize_and_clip``.
        ddpg: when true the critic input is only this agent's own observation
            and sampled action; otherwise all observations and all actions.
        num_units: forwarded to ``getMLPModel`` and ``q_func``.
        scope: name of the enclosing ``tf.variable_scope``.
        reuse: ``reuse`` flag of the enclosing ``tf.variable_scope``.

    Returns:
        A tuple ``(act, train, update_target_p, debug)`` where ``debug`` is a
        dict with the keys ``'p_values'`` and ``'target_act'``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Distribution types, one per agent.
        pd_types = [make_pdtype(space) for space in actionSpaceList]

        # Action placeholders, one per agent.
        action_inputs = [
            pd_type.sample_placeholder([None], name="action" + str(idx))
            for idx, pd_type in enumerate(pd_types)
        ]

        own_obs = observPlaceHolderList[agentIndex]

        # NOTE: the online policy's variable scope is literally named
        # "getMLPModel"; the lookup below relies on the same string.
        flat_params = getMLPModel(
            own_obs,
            int(pd_types[agentIndex].param_shape()[0]),
            scope="getMLPModel",
            num_units=num_units)
        policy_vars = U.scope_vars(U.absolute_scope_name("getMLPModel"))

        # Wrap the flat parameters in a distribution object.
        policy_pd = pd_types[agentIndex].pdfromflat(flat_params)

        sampled_action = policy_pd.sample()
        param_penalty = tf.reduce_mean(tf.square(policy_pd.flatparam()))

        # Copy the placeholder list and substitute this agent's slot with a
        # (second, independent) sample so gradients flow into the policy.
        critic_actions = list(action_inputs)
        critic_actions[agentIndex] = policy_pd.sample()

        # The joint input is always built first, then overridden for DDPG.
        critic_input = tf.concat(observPlaceHolderList + critic_actions, 1)
        if ddpg:
            critic_input = tf.concat(
                [observPlaceHolderList[agentIndex], critic_actions[agentIndex]], 1)
        q_estimate = q_func(
            critic_input, 1, scope="q_func", reuse=True,
            num_units=num_units)[:, 0]
        pg_loss = -tf.reduce_mean(q_estimate)

        loss = pg_loss + param_penalty * 1e-3

        optimize_expr = U.minimize_and_clip(
            optimizer, loss, policy_vars, grad_norm_clipping)

        # Callable wrappers around the graph.
        train = U.function(
            inputs=observPlaceHolderList + action_inputs,
            outputs=loss,
            updates=[optimize_expr])
        act = U.function(
            inputs=[observPlaceHolderList[agentIndex]], outputs=sampled_action)
        p_values = U.function([observPlaceHolderList[agentIndex]], flat_params)

        # Target policy: same input, separate variables under "target_p_func".
        target_flat_params = getMLPModel(
            own_obs,
            int(pd_types[agentIndex].param_shape()[0]),
            scope="target_p_func",
            num_units=num_units)
        target_policy_vars = U.scope_vars(
            U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(policy_vars, target_policy_vars)

        target_sample = pd_types[agentIndex].pdfromflat(
            target_flat_params).sample()
        target_act = U.function(
            inputs=[observPlaceHolderList[agentIndex]], outputs=target_sample)

    debug = {'p_values': p_values, 'target_act': target_act}
    return act, train, update_target_p, debug
예제 #5
0
def q_train(make_obs_ph_n, act_space_n, q_index, q_func, optimizer, grad_norm_clipping=None, local_q_func=False, scope="trainer", reuse=None, num_units=64):
    """Build the critic (Q-function) training graph for the agent at ``q_index``.

    Args:
        make_obs_ph_n: list of observation placeholders, one per agent.
        act_space_n: action spaces, one per agent; each is handed to
            ``make_pdtype``.
        q_index: index of the agent whose critic is being built.
        q_func: network builder, called as
            ``q_func(input, 1, scope=..., num_units=...)``.
        optimizer: forwarded to ``U.minimize_and_clip``.
        grad_norm_clipping: forwarded to ``U.minimize_and_clip``.
        local_q_func: false for MADDPG (critic sees every agent's observation
            and action), true for DDPG (critic sees only this agent's own).
        scope: name of the enclosing ``tf.variable_scope``.
        reuse: ``reuse`` flag of the enclosing ``tf.variable_scope``.
        num_units: forwarded to ``q_func``; the original author's note ties
            this to a ``--num-units`` command-line option (default 64).

    Returns:
        A tuple ``(train, update_target_q, debug)`` where ``debug`` is a dict
        with the keys ``'q_values'`` and ``'target_q_values'``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # Distribution types, one per agent.
        pd_types = [make_pdtype(space) for space in act_space_n]

        # Placeholders: per-agent actions and the regression target for Q.
        obs_inputs = make_obs_ph_n
        action_inputs = [
            pd_type.sample_placeholder([None], name="action" + str(idx))
            for idx, pd_type in enumerate(pd_types)
        ]
        td_target = tf.placeholder(tf.float32, [None], name="target")

        # The joint input is always built first, then overridden when the
        # critic is local to this agent.
        critic_input = tf.concat(obs_inputs + action_inputs, 1)
        if local_q_func:
            critic_input = tf.concat(
                [obs_inputs[q_index], action_inputs[q_index]], 1)
        q_estimate = q_func(
            critic_input, 1, scope="q_func", num_units=num_units)[:, 0]
        critic_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        q_loss = tf.reduce_mean(tf.square(q_estimate - td_target))

        # Original author's note: viscosity solution to Bellman differential
        # equation in place of an initial condition. The term is still added
        # to the graph but is deliberately left out of the loss.
        q_reg = tf.reduce_mean(tf.square(q_estimate))
        loss = q_loss

        optimize_expr = U.minimize_and_clip(
            optimizer, loss, critic_vars, grad_norm_clipping)

        # Callable wrappers around the graph.
        train = U.function(
            inputs=obs_inputs + action_inputs + [td_target],
            outputs=loss,
            updates=[optimize_expr])
        q_values = U.function(obs_inputs + action_inputs, q_estimate)

        # Target network: same input, separate variables under "target_q_func".
        target_q = q_func(
            critic_input, 1, scope="target_q_func", num_units=num_units)[:, 0]
        target_critic_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(critic_vars, target_critic_vars)

        target_q_values = U.function(obs_inputs + action_inputs, target_q)

    debug = {'q_values': q_values, 'target_q_values': target_q_values}
    return train, update_target_q, debug
def p_train(observPlaceHolderList,
            actionSpaceList,
            agentIndex,
            p_func,
            q_func,
            optimizer,
            grad_norm_clipping=None,
            ddpg=False,
            num_units=64,
            scope="trainer",
            reuse=None):
    """Build the policy (actor) training graph for the agent at ``agentIndex``.

    Actions are produced by perturbing the policy logits with Gumbel noise
    (``-log(-log(u))`` for uniform ``u``) and passing the result through
    ``U.softmax``. The loss is the negated mean critic value of that action
    plus ``1e-3`` times the mean squared logits.

    Args:
        observPlaceHolderList: list of observation placeholders, one per agent.
        actionSpaceList: action spaces, one per agent; each must expose ``.n``,
            which sets the width of the action placeholder / policy output.
        agentIndex: index of the agent whose policy is being built.
        p_func: policy network builder, called as
            ``p_func(obs, n_out, scope=..., num_units=...)``.
        q_func: critic network builder; called with ``reuse=True`` under the
            scope ``"q_func"``, so those variables must already exist.
        optimizer: forwarded to ``U.minimize_and_clip``.
        grad_norm_clipping: forwarded to ``U.minimize_and_clip``. Defaults to
            ``None``, matching the sibling ``q_train``.
        ddpg: when true the critic input is only this agent's own observation
            and sampled action; otherwise all observations and all actions.
            Defaults to ``False``, matching the sibling ``q_train``.
        num_units: forwarded to ``p_func`` and ``q_func``.
        scope: name of the enclosing ``tf.variable_scope``.
        reuse: ``reuse`` flag of the enclosing ``tf.variable_scope``.

    Returns:
        A tuple ``(act, train, update_target_p, debug)`` where ``debug`` is a
        dict with the keys ``'p_values'`` and ``'target_act'``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # set up placeholders
        actionPlaceHolderList = [
            tf.placeholder(dtype=tf.float32,
                           shape=[None] + [actionSpaceList[i].n],
                           name="action" + str(i))
            for i in range(len(actionSpaceList))
        ]

        policyNetInput = observPlaceHolderList[
            agentIndex]  # personal observation
        policyOutputShape = int(actionSpaceList[agentIndex].n)
        policyTrainOutput = p_func(policyNetInput,
                                   policyOutputShape,
                                   scope="p_func",
                                   num_units=num_units)
        policyNetVariables = U.scope_vars(U.absolute_scope_name("p_func"))

        # NOTE(review): the online policy's noise op carries a fixed op-level
        # seed (seed=0) while the target policy's noise below is unseeded;
        # kept as-is -- confirm this asymmetry is intended.
        sampleNoise = tf.random_uniform(tf.shape(policyTrainOutput), seed=0)
        actionSample = U.softmax(policyTrainOutput -
                                 tf.log(-tf.log(sampleNoise)),
                                 axis=-1)  # output of function act
        p_reg = tf.reduce_mean(tf.square(policyTrainOutput))

        # Copy the placeholder list and replace this agent's slot with the
        # sampled action so gradients flow from the critic into the policy.
        actionInputPlaceHolderList = actionPlaceHolderList + []
        actionInputPlaceHolderList[agentIndex] = actionSample

        # The joint input is always built first, then overridden for DDPG.
        qNetInput = tf.concat(
            observPlaceHolderList + actionInputPlaceHolderList, 1)
        if ddpg:
            qNetInput = tf.concat(
                [observPlaceHolderList[agentIndex], actionSample], 1)

        q = q_func(qNetInput,
                   1,
                   scope="q_func",
                   reuse=True,
                   num_units=num_units)[:, 0]
        pg_loss = -tf.reduce_mean(q)

        # Original author's note: this optimisation step is the same in the
        # DDPG variant.
        loss = pg_loss + p_reg * 1e-3

        optimize_expr = U.minimize_and_clip(optimizer, loss,
                                            policyNetVariables,
                                            grad_norm_clipping)

        # Create callable functions
        train = U.function(inputs=observPlaceHolderList +
                           actionPlaceHolderList,
                           outputs=loss,
                           updates=[optimize_expr])
        act = U.function(inputs=[observPlaceHolderList[agentIndex]],
                         outputs=actionSample)
        p_values = U.function([observPlaceHolderList[agentIndex]],
                              policyTrainOutput)

        # target network: same input, separate variables under "target_p_func"
        target_p = p_func(policyNetInput,
                          policyOutputShape,
                          scope="target_p_func",
                          num_units=num_units)
        targetNetVariables = U.scope_vars(
            U.absolute_scope_name("target_p_func"))
        update_target_p = make_update_exp(policyNetVariables,
                                          targetNetVariables)

        uTarget = tf.random_uniform(tf.shape(target_p))
        target_act_sample = U.softmax(target_p - tf.log(-tf.log(uTarget)),
                                      axis=-1)
        target_act = U.function(inputs=[observPlaceHolderList[agentIndex]],
                                outputs=target_act_sample)

        return act, train, update_target_p, {
            'p_values': p_values,
            'target_act': target_act
        }
def q_train(observPlaceHolderList,
            actionSpaceList,
            agentIndex,
            q_func,
            optimizer,
            grad_norm_clipping=None,
            ddpg=False,
            scope="trainer",
            reuse=None,
            num_units=64):
    """Build the critic (Q-function) training graph for the agent at ``agentIndex``.

    Args:
        observPlaceHolderList: list of observation placeholders, one per agent.
        actionSpaceList: action spaces, one per agent; each must expose ``.n``,
            which sets the width of that agent's action placeholder.
        agentIndex: index of the agent whose critic is being built.
        q_func: network builder, called as
            ``q_func(input, 1, scope=..., num_units=...)``.
        optimizer: forwarded to ``U.minimize_and_clip``.
        grad_norm_clipping: forwarded to ``U.minimize_and_clip``.
        ddpg: when true the critic input is only this agent's own observation
            and action; otherwise all observations and all actions.
        scope: name of the enclosing ``tf.variable_scope``.
        reuse: ``reuse`` flag of the enclosing ``tf.variable_scope``.
        num_units: forwarded to ``q_func``.

    Returns:
        A tuple ``(train, update_target_q, debug)`` where ``debug`` is a dict
        with the keys ``'q_values'`` and ``'target_q_values'``.
    """
    with tf.variable_scope(scope, reuse=reuse):
        # One float32 action placeholder per agent, of width ``space.n``.
        action_inputs = [
            tf.placeholder(dtype=tf.float32,
                           shape=[None] + [space.n],
                           name="action" + str(idx))
            for idx, space in enumerate(actionSpaceList)
        ]
        td_target = tf.placeholder(tf.float32, [None], name="target")

        # The joint input is always built first (the original author noted
        # shape (?, 24) in their setup), then overridden for DDPG (noted as
        # shape (?, 13)).
        critic_input = tf.concat(observPlaceHolderList + action_inputs, 1)
        if ddpg:
            own_pair = [
                observPlaceHolderList[agentIndex],
                action_inputs[agentIndex],
            ]
            critic_input = tf.concat(own_pair, 1)

        # Select column 0 so the estimate has the same rank as the target.
        q_estimate = q_func(
            critic_input, 1, scope="q_func", num_units=num_units)[:, 0]
        critic_vars = U.scope_vars(U.absolute_scope_name("q_func"))

        loss = tf.reduce_mean(tf.square(q_estimate - td_target))
        optimize_expr = U.minimize_and_clip(
            optimizer, loss, critic_vars, grad_norm_clipping)

        # Callable wrappers around the graph.
        train = U.function(
            inputs=observPlaceHolderList + action_inputs + [td_target],
            outputs=loss,
            updates=[optimize_expr])
        q_values = U.function(observPlaceHolderList + action_inputs, q_estimate)

        # Target network: same input, separate variables under "target_q_func".
        target_q = q_func(
            critic_input, 1, scope="target_q_func", num_units=num_units)[:, 0]
        target_critic_vars = U.scope_vars(
            U.absolute_scope_name("target_q_func"))
        update_target_q = make_update_exp(critic_vars, target_critic_vars)

        target_q_values = U.function(
            observPlaceHolderList + action_inputs, target_q)

    debug = {
        'q_values': q_values,
        'target_q_values': target_q_values
    }
    return train, update_target_q, debug