Example #1
 def test_target_var_init(self):
     """ test target_var_init op, sets target and main variables equal
     """
     with tf.variable_scope(TARGET):
         target_val = tf_utils.mlp(self.obs_ph, (4, ), activation=tf.tanh)
     with tf.variable_scope(MAIN):
         main_val = tf_utils.mlp(self.obs_ph, (4, ), activation=tf.tanh)
     with self.agent.sess as sess:
         sess.run(tf.global_variables_initializer())
         target_vars = tf_utils.var_list(TARGET)
         main_vars = tf_utils.var_list(MAIN)
         target_nps, main_nps = sess.run((target_vars, main_vars))
         for targ, upd in zip(target_nps, main_nps):
             assert targ.shape == upd.shape
             # the biases should actually be the same, all zeros
             if len(targ.shape) > 1:
                 assert not (targ == upd).all()
         # now set target and main equal
         init_op = self.agent.target_var_init()
         sess.run(init_op)
         # now make sure all target and main parameters are equal
         target_vars = tf_utils.var_list(TARGET)
         main_vars = tf_utils.var_list(MAIN)
         target_nps, main_nps = sess.run((target_vars, main_vars))
         for targ, upd in zip(target_nps, main_nps):
             assert targ.shape == upd.shape
             np.testing.assert_allclose(targ, upd)
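The test above leans on two helpers that are not shown in this example: tf_utils.var_list, which gathers the trainable variables under a scope, and the agent's target_var_init, which returns an op that copies the main-network weights into the target network. A minimal sketch of how they might look (the scope names and bodies are assumptions inferred from the calls above, not the repo's actual code):

import tensorflow as tf

MAIN, TARGET = 'main', 'target'  # assumed scope names for the two networks

def var_list(scope):
    # All trainable variables created under the given variable-scope name.
    return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)

def target_var_init():
    # A single grouped op that copies every main-network variable into the
    # matching target-network variable (paired by creation order).
    assigns = [tf.assign(targ, main)
               for main, targ in zip(var_list(MAIN), var_list(TARGET))]
    return tf.group(*assigns)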
Example #2
 def build_value_function(obs_ph, hidden_sizes, activation):
     """ build the graph for the value function """
     with tf.variable_scope('val'):
         val = mlp(obs_ph,
                   hidden_sizes=hidden_sizes + (1, ),
                   activation=activation)
     return tf.reshape(val, [-1])
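A possible call site (hypothetical; obs_dim and the layer sizes are chosen only for illustration), showing why the final (1, ) layer plus the reshape yields one scalar value per observation:

obs_dim = 8  # illustrative observation dimension
obs_ph = tf.placeholder(dtype=tf.float32, shape=[None, obs_dim])
val = build_value_function(obs_ph, hidden_sizes=(64, 64), activation=tf.tanh)
# the mlp output has shape [batch_size, 1]; the reshape flattens it to [batch_size]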
Example #3
 def test_mlp_multiple_layers(self):
     """ test that mlp builds multiple layers, with weights of the correct
     shapes """
     batch_size = 3
     input_dim = 2
     hidden_sizes = (5, 4, 3)
     x = np.zeros(shape=[batch_size, input_dim], dtype=np.float32)
     with self.cached_session() as sess:
         x_ph = tf.placeholder(dtype=tf.float32, shape=[None, input_dim])
         ret = tf_utils.mlp(x_ph, hidden_sizes=hidden_sizes)
         sess.run(tf.global_variables_initializer())
         n_trainable_variables = 6 # 3 kernels and 3 biases
         ret_eval = sess.run(ret, feed_dict={x_ph: x})
         self.assertEqual(ret_eval.shape, (batch_size, 3))
         trainable_variables = sess.run(tf.trainable_variables())
         variable_shapes = [var.shape for var in trainable_variables]
         self.assertEqual(len(trainable_variables), n_trainable_variables)
         # kernels
         self.assertIn((2, 5), variable_shapes)
         self.assertIn((5, 4), variable_shapes)
         self.assertIn((4, 3), variable_shapes)
         # biases
         self.assertIn((5,), variable_shapes)
         self.assertIn((4,), variable_shapes)
         self.assertIn((3,), variable_shapes)
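The tf_utils.mlp helper itself is not shown in these examples. Consistent with the tests here (one kernel and one bias per entry in hidden_sizes, and a linear final layer unless output_activation is given), it could look roughly like the sketch below; the tanh default activation is an assumption:

def mlp(x, hidden_sizes, activation=tf.tanh, output_activation=None):
    # Stack dense layers: every layer but the last uses activation,
    # the last uses output_activation (None means a linear output layer).
    for size in hidden_sizes[:-1]:
        x = tf.layers.dense(x, units=size, activation=activation)
    return tf.layers.dense(x, units=hidden_sizes[-1], activation=output_activation)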
Example #4
 def build_action_value_function(obs_ph, act_ph, hidden_sizes, activation):
     """ build the action-value function estimator, Q """
     features = tf.concat([obs_ph, act_ph], 1)
     qval = tf_utils.mlp(features,
                         hidden_sizes=hidden_sizes + (1, ),
                         activation=activation)
     return tf.reshape(qval, [-1])
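A hypothetical usage sketch: the observation and action placeholders are concatenated along axis 1, so the MLP sees one (obs, act) feature vector per row and emits one Q(s, a) estimate per row (dimensions and layer sizes below are illustrative):

obs_dim, act_dim = 8, 2  # illustrative dimensions
obs_ph = tf.placeholder(dtype=tf.float32, shape=[None, obs_dim])
act_ph = tf.placeholder(dtype=tf.float32, shape=[None, act_dim])
qval = build_action_value_function(obs_ph, act_ph,
                                   hidden_sizes=(256, 256),
                                   activation=tf.nn.relu)
# qval has shape [batch_size]: one action-value estimate per (s, a) pair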
Example #5
def mlp_categorical_policy(obs, act, hidden_sizes, activation, action_space):
    """ Build Stochastic Categorical Policy """
    n_cat = action_space.n # number of categories
    mlp_hidden_sizes = hidden_sizes + (n_cat,)
    logits = mlp(obs, hidden_sizes=mlp_hidden_sizes, activation=activation)
    log_probs = logits_to_log_probs(logits)
    pi = sample_actions(logits)
    logp = log_prob_of_action(log_probs, act)
    logp_pi = log_prob_of_action(log_probs, pi)
    return pi, logp, logp_pi
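The three helpers this policy calls are not shown. Minimal sketches of what they could be, assuming act is an integer action placeholder (the names mirror the calls above; the bodies are assumptions):

def logits_to_log_probs(logits):
    # Normalize the logits into log-probabilities over the n_cat actions.
    return tf.nn.log_softmax(logits)

def sample_actions(logits):
    # Draw one action index per row from the categorical distribution.
    return tf.squeeze(tf.multinomial(logits, num_samples=1), axis=1)

def log_prob_of_action(log_probs, act):
    # Select log pi(a|s) for the chosen action in each row.
    n_cat = log_probs.shape.as_list()[-1]
    return tf.reduce_sum(tf.one_hot(act, depth=n_cat) * log_probs, axis=1)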
Example #6
def mlp_gaussian_policy(x, a, hidden_sizes, activation, action_space,
                        output_activation=None):
    """ Builds symbols to sample actions and compute log-probs of actions.  """
    act_dim = np.prod(action_space.shape)
    mlp_hidden_sizes = hidden_sizes + (act_dim,)
    mu = mlp(x, hidden_sizes=mlp_hidden_sizes, activation=activation,
             output_activation=output_activation)
    log_std = make_log_std_var(act_dim)
    pi = sample_actions(mu, log_std)
    logp = gaussian_likelihood(a, mu, log_std)
    logp_pi = gaussian_likelihood(pi, mu, log_std)
    return pi, logp, logp_pi
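The Gaussian policy likewise depends on helpers that are not shown here. The sketches below follow the usual diagonal-Gaussian construction (a state-independent trainable log-std, reparameterized sampling, and a log-density summed over action dimensions); the names match the calls above but the bodies are assumptions:

import numpy as np

EPS = 1e-8

def make_log_std_var(act_dim):
    # One trainable log standard deviation per action dimension.
    return tf.get_variable('log_std',
                           initializer=-0.5 * np.ones(act_dim, dtype=np.float32))

def sample_actions(mu, log_std):
    # Reparameterized sample: mean plus sigma-scaled standard-normal noise.
    return mu + tf.random_normal(tf.shape(mu)) * tf.exp(log_std)

def gaussian_likelihood(x, mu, log_std):
    # Log-density of x under the diagonal Gaussian N(mu, exp(log_std)^2).
    pre_sum = -0.5 * (((x - mu) / (tf.exp(log_std) + EPS)) ** 2
                      + 2 * log_std + np.log(2 * np.pi))
    return tf.reduce_sum(pre_sum, axis=1)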
Example #7
def mlp_deterministic_policy(obs,
                             hidden_sizes,
                             activation,
                             action_space,
                             output_activation=None):
    """ Build a MLP deterministic policy (ie observation in, action out. No
    stochasticity, no sampling from a distribution """
    act_dim = np.prod(action_space.shape)
    mlp_hidden_sizes = hidden_sizes + (act_dim, )
    return tf_utils.mlp(obs,
                        hidden_sizes=mlp_hidden_sizes,
                        activation=activation,
                        output_activation=output_activation)
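A possible call site in a DDPG-style setup. The tanh squashing and the rescaling by action_space.high are assumptions not shown in the example; they keep the deterministic actions inside the environment's bounds:

import gym

env = gym.make('Pendulum-v0')  # illustrative continuous-action environment
action_space = env.action_space
obs_ph = tf.placeholder(tf.float32, shape=[None, env.observation_space.shape[0]])
act_limit = action_space.high[0]  # assumes a symmetric, uniform action bound
with tf.variable_scope('pi'):
    pi = act_limit * mlp_deterministic_policy(obs_ph,
                                              hidden_sizes=(256, 256),
                                              activation=tf.nn.relu,
                                              action_space=action_space,
                                              output_activation=tf.tanh)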
Example #8
 def test_mlp_smoke(self):
     """ Smoke test mlp """
     batch_size = 3
     input_dim = 2
     output_dim = 4
     x = np.zeros(shape=[batch_size, input_dim], dtype=np.float32)
     with self.cached_session() as sess:
         x_ph = tf.placeholder(dtype=tf.float32, shape=[None, input_dim])
         ret = tf_utils.mlp(x_ph, hidden_sizes=(output_dim,))
         sess.run(tf.global_variables_initializer())
         n_trainable_variables = 2 # 1 kernel and 1 bias
         trainable_variables = tf.trainable_variables()
         self.assertEqual(len(trainable_variables), n_trainable_variables)
         ret_eval = sess.run(ret, feed_dict={x_ph: x})
         self.assertEqual(ret_eval.shape, (batch_size, output_dim))