Example #1
def approx_advantage_policy_gradient(model, network, observs, actions, config):
    """
    Policy gradient of the advantage function. Estimates the advantage from
    learned value and action-value functions.
    """
    with tf.variable_scope('behavior'):
        state = model.add_input('state', observs)
        hidden = network(model, state)
        value = model.add_output(
            'value', tf.squeeze(dense(hidden, 1, tf.identity), [1]))
        qvalues = model.add_output(
            'qvalues', dense(hidden, actions, tf.identity))
        logits = dense(hidden, actions, tf.identity)
        policy = tf.nn.softmax(logits)
        # tf.multinomial expects unnormalized log-probabilities, so sample from the logits.
        model.add_output('choice', tf.squeeze(tf.multinomial(logits, 1), [1]))
    with tf.variable_scope('learning'):
        action = model.add_input('action', type_=tf.int32)
        action = tf.one_hot(action, actions)
        return_ = model.add_input('return_')
        # Q-value of the taken action; the advantage is held constant for the actor term.
        qvalue = tf.reduce_sum(qvalues * action, 1)
        advantage = tf.stop_gradient(qvalue - value)
        logprob = tf.log(tf.reduce_sum(policy * action, 1) + 1e-13)
        entropy = -tf.reduce_sum(tf.log(policy + 1e-13) * policy, 1)
        actor = config.actor_weight * advantage * logprob
        critic = config.critic_weight * (return_ - value)**2 / 2
        qcritic = config.critic_weight * (return_ - qvalue)**2 / 2
        entropy = config.entropy_weight * entropy
        # Train both critics while maximizing the actor and entropy terms.
        model.add_cost('cost', critic + qcritic - actor - entropy)
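In standard notation, the cost above amounts to an advantage actor-critic objective (a sketch of what the terms compute, not a formula taken from the source):

\mathcal{L} = \tfrac{\beta_c}{2}\,(R - V(s))^2 + \tfrac{\beta_c}{2}\,(R - Q(s, a))^2 - \beta_a\, A(s, a)\,\log \pi(a \mid s) - \beta_e\, H(\pi(\cdot \mid s)), \qquad A(s, a) = Q(s, a) - V(s)

where \beta_a, \beta_c, \beta_e correspond to config.actor_weight, config.critic_weight and config.entropy_weight, and stop_gradient keeps the advantage out of the differentiation so only the log-probability carries the actor gradient.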
Example #2
def minecraft_large(model, x):
    # Barron, Whitehead, Yeung (2016)
    x = conv2d(x, 64, 3, 1, tf.nn.relu, 2)
    x = conv2d(x, 128, 3, 1, tf.nn.relu, 2)
    x = conv2d(x, 256, 3, 1, tf.nn.relu, 2)
    x = conv2d(x, 215, 3, 1, tf.nn.relu, 2)
    x = conv2d(x, 215, 3, 1, tf.nn.relu, 2)
    x = dense(x, 4096, tf.nn.relu)
    x = dense(x, 4096, tf.nn.relu)
    return x
Example #3
def doom_large(model, x):
    # Kempka et al. (2016)
    x = conv2d(x, 32, 7, 1, tf.nn.relu, 2)
    x = conv2d(x, 32, 5, 1, tf.nn.relu, 2)
    x = conv2d(x, 32, 3, 1, tf.nn.relu, 2)
    x = dense(x, 1024, tf.nn.relu)
    return x
Example #4
def dqn_2015(model, x):
    # Mnih et al. (2015)
    x = conv2d(x, 32, 8, 4, tf.nn.relu)
    x = conv2d(x, 64, 4, 2, tf.nn.relu)
    x = conv2d(x, 64, 3, 1, tf.nn.relu)
    x = dense(x, 512, tf.nn.relu)
    return x
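The network definitions in these examples rely on conv2d(x, filters, size, stride, activation, pool=None) and dense(x, units, activation) helpers that are not shown. A minimal sketch of what they might look like in TensorFlow 1.x, assuming dense flattens convolutional feature maps and the trailing pool argument is a max-pooling size; the real helpers in the library may differ:

import tensorflow as tf

def conv2d(x, filters, size, stride, activation, pool=None):
    # Hypothetical helper matching the call sites above.
    x = tf.layers.conv2d(x, filters, size, stride, padding='same', activation=activation)
    if pool:
        x = tf.layers.max_pooling2d(x, pool, pool)
    return x

def dense(x, units, activation):
    # Flatten convolutional feature maps before the fully connected layer.
    if len(x.shape) == 4:
        x = tf.layers.flatten(x)
    return tf.layers.dense(x, units, activation)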
Example #5
def a3c_lstm(model, x):
    # Mnih et al. (2016)
    x = conv2d(x, 16, 8, 4, tf.nn.relu)
    x = conv2d(x, 32, 4, 2, tf.nn.relu)
    x = dense(x, 256, tf.nn.relu)
    # x = rnn(model, x, 256, tf.nn.rnn_cell.LSTMCell)
    x = rnn(model, x, 256)
    return x
Example #6
def policy_gradient(model, network, observs, actions, config):
    """
    Policy gradient of the return.
    """
    with tf.variable_scope('behavior'):
        state = model.add_input('state', observs)
        hidden = network(model, state)
        value = model.add_output(
            'value', tf.squeeze(dense(hidden, 1, tf.identity), [1]))
        logits = dense(hidden, actions, tf.identity)
        policy = tf.nn.softmax(logits)
        # tf.multinomial expects unnormalized log-probabilities, so sample from the logits.
        model.add_output('choice', tf.squeeze(tf.multinomial(logits, 1), [1]))
    with tf.variable_scope('learning'):
        action = model.add_input('action', type_=tf.int32)
        action = tf.one_hot(action, actions)
        return_ = model.add_input('return_')
        logprob = tf.log(tf.reduce_sum(policy * action, 1) + 1e-13)
        entropy = -tf.reduce_sum(tf.log(policy + 1e-13) * policy, 1)
        actor = config.actor_weight * return_ * logprob
        entropy = config.entropy_weight * entropy
        model.add_cost('cost', -actor - entropy)
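This cost is the plain REINFORCE objective with an entropy bonus; in the same notation as above (again a sketch, not taken from the source):

\mathcal{L} = -\beta_a\, R\,\log \pi(a \mid s) - \beta_e\, H(\pi(\cdot \mid s))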
Example #7
def q_function(model, network, observs, actions, config=None):
    """
    Action value approximation.
    """
    with tf.variable_scope('behavior'):
        state = model.add_input('state', observs)
        hidden = network(model, state)
        qvalues = dense(hidden, actions, tf.identity)
        qvalues = model.add_output('qvalues', qvalues)
        model.add_output('choice', tf.argmax(qvalues, 1))
    with tf.variable_scope('learning'):
        action = model.add_input('action', type_=tf.int32)
        action = tf.one_hot(action, actions)
        return_ = model.add_input('return_')
        model.add_output('qvalue', tf.reduce_max(qvalues, 1))
        model.add_cost('cost',
                       (tf.reduce_sum(action * qvalues, 1) - return_)**2)
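How return_ is produced is outside these examples; depending on the algorithm it could be a Monte Carlo return or a bootstrapped target. A minimal sketch of the one-step Q-learning variant, using a hypothetical helper fed with the 'qvalue' output evaluated on the next state:

import numpy as np

def one_step_target(reward, done, next_qvalue, discount=0.99):
    # Hypothetical helper: r + gamma * max_a' Q(s', a'), with bootstrapping
    # switched off at episode boundaries. Arguments are 1-D arrays of equal length.
    return reward + discount * next_qvalue * (1.0 - done.astype(np.float32))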
Example #8
def control(model, x):
    # x = dense(x, 100, tf.nn.relu)
    # x = dense(x, 50, tf.nn.relu)
    x = dense(x, 32, tf.nn.relu)
    x = dense(x, 32, tf.nn.relu)
    return x
Example #9
def test(model, x):
    x = dense(x, 8, tf.nn.relu)
    x = dense(x, 8, tf.nn.relu)
    return x
Example #10
def dqn_2013(model, x):
    # Mnih et al. (2013)
    x = conv2d(x, 16, 8, 4, tf.nn.relu)
    x = conv2d(x, 32, 4, 2, tf.nn.relu)
    x = dense(x, 256, tf.nn.relu)
    return x
Example #11
def minecraft_small(model, x):
    # Barron, Whitehead, Yeung (2016)
    x = conv2d(x, 32, 8, 4, tf.nn.relu)
    x = conv2d(x, 64, 4, 2, tf.nn.relu)
    x = dense(x, 512, tf.nn.relu)
    return x
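Pairing a network with an objective only needs an object that provides add_input, add_output and add_cost. The library's Model class is not shown here; the stand-in below is purely illustrative (the constructor, the shape handling and the mean reduction in add_cost are assumptions) and, together with the conv2d/dense sketch above, is enough to build the dqn_2013 Q-function graph:

import tensorflow as tf

class Model:
    # Minimal stand-in for the library's Model class, inferred from the calls above.
    def __init__(self):
        self.inputs, self.outputs, self.costs = {}, {}, {}

    def add_input(self, name, shape=None, type_=tf.float32):
        self.inputs[name] = tf.placeholder(type_, (None,) + tuple(shape or ()))
        return self.inputs[name]

    def add_output(self, name, tensor):
        self.outputs[name] = tensor
        return tensor

    def add_cost(self, name, tensor):
        self.costs[name] = tf.reduce_mean(tensor)
        return self.costs[name]

model = Model()
q_function(model, dqn_2013, observs=(84, 84, 4), actions=6)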