import tensorflow as tf


def approx_advantage_policy_gradient(model, network, observs, actions, config):
    """
    Policy gradient of the advantage function. Estimates the advantage from
    learned value and action-value functions.
    """
    with tf.variable_scope('behavior'):
        state = model.add_input('state', observs)
        hidden = network(model, state)
        value = model.add_output(
            'value', tf.squeeze(dense(hidden, 1, tf.identity), [1]))
        qvalues = model.add_output(
            'qvalues', dense(hidden, actions, tf.identity))
        logits = dense(hidden, actions, tf.identity)
        policy = tf.nn.softmax(logits)
        # tf.multinomial expects unnormalized log probabilities, so sample
        # from the logits rather than from the softmax output.
        model.add_output('choice', tf.squeeze(tf.multinomial(logits, 1), [1]))
    with tf.variable_scope('learning'):
        action = model.add_input('action', type_=tf.int32)
        action = tf.one_hot(action, actions)
        return_ = model.add_input('return_')
        # Q-value of the chosen action; the advantage acts as a constant
        # weight on the actor term due to the stop_gradient.
        qvalue = tf.reduce_sum(qvalues * action, 1)
        advantage = tf.stop_gradient(qvalue - value)
        logprob = tf.log(tf.reduce_sum(policy * action, 1) + 1e-13)
        entropy = -tf.reduce_sum(tf.log(policy + 1e-13) * policy)
        actor = config.actor_weight * advantage * logprob
        critic = config.critic_weight * (return_ - value) ** 2 / 2
        qcritic = config.critic_weight * (return_ - qvalue) ** 2 / 2
        entropy = config.entropy_weight * entropy
        # Train both critics; without the qcritic term the action-value head
        # would receive no gradient and the advantage estimate would be
        # meaningless.
        model.add_cost('cost', critic + qcritic - actor - entropy)

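# Illustrative only: a tiny NumPy sketch of the actor term above, showing how
# the advantage Q(s, a) - V(s), held constant by stop_gradient, scales the
# log-probability of the chosen action. All numbers are made up.
import numpy as np

_probs = np.array([0.2, 0.5, 0.3])   # softmax policy over three actions
_chosen = 1                          # index of the action that was taken
_qvalue, _value = 1.4, 1.0           # learned Q(s, a) and V(s) estimates

_advantage = _qvalue - _value                 # 0.4, used as a constant weight
_logprob = np.log(_probs[_chosen] + 1e-13)    # log pi(a | s)
_actor = _advantage * _logprob                # maximized, hence subtracted from the cost
print(_advantage, _logprob, _actor)
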
def minecraft_large(model, x):
    # Barron, Whitehead, Yeung (2016)
    x = conv2d(x, 64, 3, 1, tf.nn.relu, 2)
    x = conv2d(x, 128, 3, 1, tf.nn.relu, 2)
    x = conv2d(x, 256, 3, 1, tf.nn.relu, 2)
    x = conv2d(x, 215, 3, 1, tf.nn.relu, 2)
    x = conv2d(x, 215, 3, 1, tf.nn.relu, 2)
    x = dense(x, 4096, tf.nn.relu)
    x = dense(x, 4096, tf.nn.relu)
    return x

def doom_large(model, x):
    # Kempka et al. (2016)
    x = conv2d(x, 32, 7, 1, tf.nn.relu, 2)
    x = conv2d(x, 32, 5, 1, tf.nn.relu, 2)
    x = conv2d(x, 32, 3, 1, tf.nn.relu, 2)
    x = dense(x, 1024, tf.nn.relu)
    return x

def dqn_2015(model, x):
    # Mnih et al. (2015)
    x = conv2d(x, 32, 8, 4, tf.nn.relu)
    x = conv2d(x, 64, 4, 2, tf.nn.relu)
    x = conv2d(x, 64, 3, 1, tf.nn.relu)
    x = dense(x, 512, tf.nn.relu)
    return x

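# Shape walk-through for dqn_2015, assuming the canonical 84x84 input frames
# of Mnih et al. (2015) and VALID padding; the padding behavior of the conv2d
# helper used here is an assumption.
def _conv_output_size(size, kernel, stride):
    # Spatial output size of a VALID convolution along one dimension.
    return (size - kernel) // stride + 1

_size = 84
for _kernel, _stride in [(8, 4), (4, 2), (3, 1)]:
    _size = _conv_output_size(_size, _kernel, _stride)
    print(_size)  # 20, then 9, then 7: a 7x7x64 feature map before dense(512)
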
def a3c_lstm(model, x):
    # Mnih et al. (2016)
    x = conv2d(x, 16, 8, 4, tf.nn.relu)
    x = conv2d(x, 32, 4, 2, tf.nn.relu)
    x = dense(x, 256, tf.nn.relu)
    # x = rnn(model, x, 256, tf.nn.rnn_cell.LSTMCell)
    x = rnn(model, x, 256)
    return x

def policy_gradient(model, network, observs, actions, config):
    """
    Policy gradient of the return.
    """
    with tf.variable_scope('behavior'):
        state = model.add_input('state', observs)
        hidden = network(model, state)
        # The value head is exposed as an output but not trained by the cost
        # below.
        value = model.add_output(
            'value', tf.squeeze(dense(hidden, 1, tf.identity), [1]))
        logits = dense(hidden, actions, tf.identity)
        policy = tf.nn.softmax(logits)
        # tf.multinomial expects unnormalized log probabilities, so sample
        # from the logits rather than from the softmax output.
        model.add_output('choice', tf.squeeze(tf.multinomial(logits, 1), [1]))
    with tf.variable_scope('learning'):
        action = model.add_input('action', type_=tf.int32)
        action = tf.one_hot(action, actions)
        return_ = model.add_input('return_')
        logprob = tf.log(tf.reduce_sum(policy * action, 1) + 1e-13)
        entropy = -tf.reduce_sum(tf.log(policy + 1e-13) * policy)
        actor = config.actor_weight * return_ * logprob
        entropy = config.entropy_weight * entropy
        model.add_cost('cost', -actor - entropy)

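# Illustrative only: the entropy bonus above discourages the softmax policy
# from collapsing onto a single action. A peaked distribution has low entropy
# and a uniform one has high entropy, so subtracting the weighted entropy
# from the cost rewards exploration. The numbers are made up.
import numpy as np

_peaked = np.array([0.98, 0.01, 0.01])
_uniform = np.array([1 / 3, 1 / 3, 1 / 3])
for _policy in (_peaked, _uniform):
    print(-np.sum(np.log(_policy + 1e-13) * _policy))  # ~0.11 vs ~1.10
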
def q_function(model, network, observs, actions, config=None):
    """
    Action value approximation.
    """
    with tf.variable_scope('behavior'):
        state = model.add_input('state', observs)
        hidden = network(model, state)
        qvalues = dense(hidden, actions, tf.identity)
        qvalues = model.add_output('qvalues', qvalues)
        model.add_output('choice', tf.argmax(qvalues, 1))
    with tf.variable_scope('learning'):
        action = model.add_input('action', type_=tf.int32)
        action = tf.one_hot(action, actions)
        return_ = model.add_input('return_')
        model.add_output('qvalue', tf.reduce_max(qvalues, 1))
        model.add_cost(
            'cost', (tf.reduce_sum(action * qvalues, 1) - return_) ** 2)

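# Illustrative only: the cost above regresses the Q-value of the taken action
# toward the observed return and leaves the other action values untouched.
# A tiny NumPy sketch with made-up numbers:
import numpy as np

_qvalues = np.array([0.5, 1.2, -0.3])  # predicted Q(s, a) for three actions
_action = np.array([0.0, 1.0, 0.0])    # one-hot encoding of the taken action
_return = 2.0                          # empirical (or bootstrapped) return

_taken_q = np.sum(_action * _qvalues)  # 1.2
_cost = (_taken_q - _return) ** 2      # 0.64
print(_taken_q, _cost)
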
def control(model, x):
    # x = dense(x, 100, tf.nn.relu)
    # x = dense(x, 50, tf.nn.relu)
    x = dense(x, 32, tf.nn.relu)
    x = dense(x, 32, tf.nn.relu)
    return x

def test(model, x):
    x = dense(x, 8, tf.nn.relu)
    x = dense(x, 8, tf.nn.relu)
    return x

def dqn_2013(model, x):
    # Mnih et al. (2013)
    x = conv2d(x, 16, 8, 4, tf.nn.relu)
    x = conv2d(x, 32, 4, 2, tf.nn.relu)
    x = dense(x, 256, tf.nn.relu)
    return x

def minecraft_small(model, x):
    # Barron, Whitehead, Yeung (2016)
    x = conv2d(x, 32, 8, 4, tf.nn.relu)
    x = conv2d(x, 64, 4, 2, tf.nn.relu)
    x = dense(x, 512, tf.nn.relu)
    return x

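# The layer helpers used throughout this file are provided by the surrounding
# project and are not defined here. Below is one possible implementation
# consistent with how dense and conv2d are called above; the actual helpers
# may differ. Padding, pooling, and flattening behavior are assumptions, and
# the rnn helper (which would also have to register recurrent state with the
# model) is omitted.
import tensorflow as tf


def dense(x, size, activation):
    # Flatten convolutional feature maps before the fully connected layer.
    if len(x.shape) > 2:
        x = tf.layers.flatten(x)
    return tf.layers.dense(x, size, activation)


def conv2d(x, filters, size, stride, activation, pool=None):
    x = tf.layers.conv2d(
        x, filters, size, stride, 'valid', activation=activation)
    if pool:
        # The trailing argument in calls like conv2d(..., tf.nn.relu, 2) is
        # read here as a max-pooling factor; this is an inferred convention.
        x = tf.layers.max_pooling2d(x, pool, pool)
    return x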