def test_target_var_init(self):
    """Test that target_var_init builds an op syncing target vars to main vars.

    Builds two identically-shaped MLPs under the TARGET and MAIN scopes,
    checks their (randomly initialized) kernels differ, runs the sync op,
    then checks every target variable now equals its main counterpart.
    """
    with tf.variable_scope(TARGET):
        target_val = tf_utils.mlp(self.obs_ph, (4, ), activation=tf.tanh)
    with tf.variable_scope(MAIN):
        main_val = tf_utils.mlp(self.obs_ph, (4, ), activation=tf.tanh)
    with self.agent.sess as sess:
        sess.run(tf.global_variables_initializer())
        target_vars = tf_utils.var_list(TARGET)
        main_vars = tf_utils.var_list(MAIN)
        target_nps, main_nps = sess.run((target_vars, main_vars))
        for targ, upd in zip(target_nps, main_nps):
            assert targ.shape == upd.shape
            # the biases should actually be the same, all zeros, so only
            # kernels (rank > 1) are expected to differ after random init
            if len(targ.shape) > 1:
                assert not (targ == upd).all()
        # now set target and main equal
        init_op = self.agent.target_var_init()
        # FIX: the returned op was previously built but never executed, so
        # the equality assertions below were not actually testing the sync.
        sess.run(init_op)
        # now make sure all target and main parameters are equal
        target_vars = tf_utils.var_list(TARGET)
        main_vars = tf_utils.var_list(MAIN)
        target_nps, main_nps = sess.run((target_vars, main_vars))
        for targ, upd in zip(target_nps, main_nps):
            assert targ.shape == upd.shape
            np.testing.assert_allclose(targ, upd)
def build_value_function(obs_ph, hidden_sizes, activation):
    """Construct the state-value network V(s) under the 'val' variable scope.

    An extra single-unit layer is appended to produce one value per
    observation; the (batch, 1) output is flattened to rank 1.
    """
    with tf.variable_scope('val'):
        raw = mlp(obs_ph, hidden_sizes=hidden_sizes + (1, ), activation=activation)
    # squeeze the trailing singleton dimension: (batch, 1) -> (batch,)
    return tf.reshape(raw, [-1])
def test_mlp_multiple_layers(self):
    """mlp should stack one dense layer per entry in hidden_sizes,
    creating kernels and biases of the correct shapes."""
    batch = 3
    in_dim = 2
    sizes = (5, 4, 3)
    inputs = np.zeros(shape=[batch, in_dim], dtype=np.float32)
    with self.cached_session() as sess:
        ph = tf.placeholder(dtype=tf.float32, shape=[None, in_dim])
        out = tf_utils.mlp(ph, hidden_sizes=sizes)
        sess.run(tf.global_variables_initializer())
        out_np = sess.run(out, feed_dict={ph: inputs})
        self.assertEqual(out_np.shape, (batch, sizes[-1]))
        var_vals = sess.run(tf.trainable_variables())
        shapes = [v.shape for v in var_vals]
        # one kernel + one bias per layer: 3 kernels and 3 biases
        self.assertEqual(len(var_vals), 2 * len(sizes))
        layer_dims = (in_dim,) + sizes
        for fan_in, fan_out in zip(layer_dims, layer_dims[1:]):
            self.assertIn((fan_in, fan_out), shapes)  # kernel
            self.assertIn((fan_out,), shapes)         # bias
def build_action_value_function(obs_ph, act_ph, hidden_sizes, activation):
    """Construct the action-value estimator Q(s, a).

    Observation and action placeholders are concatenated along the feature
    axis and fed through an MLP with a final single-unit layer.
    """
    joined = tf.concat([obs_ph, act_ph], 1)
    raw_q = tf_utils.mlp(joined, hidden_sizes=hidden_sizes + (1, ), activation=activation)
    # flatten the (batch, 1) output to a rank-1 tensor of Q-values
    return tf.reshape(raw_q, [-1])
def mlp_categorical_policy(obs, act, hidden_sizes, activation, action_space):
    """Build a stochastic categorical policy over a discrete action space.

    Returns (pi, logp, logp_pi): sampled actions, log-probability of the
    given actions `act`, and log-probability of the sampled actions.
    """
    num_actions = action_space.n  # one logit per discrete action
    logits = mlp(obs, hidden_sizes=hidden_sizes + (num_actions,),
                 activation=activation)
    log_probs = logits_to_log_probs(logits)
    pi = sample_actions(logits)
    logp = log_prob_of_action(log_probs, act)
    logp_pi = log_prob_of_action(log_probs, pi)
    return pi, logp, logp_pi
def mlp_gaussian_policy(x, a, hidden_sizes, activation, action_space, output_activation=None):
    """Build a diagonal-Gaussian policy.

    The MLP outputs the mean `mu`; a separate learned variable holds the
    log standard deviation. Returns (pi, logp, logp_pi): sampled actions,
    log-likelihood of the given actions `a`, and log-likelihood of the
    sampled actions.
    """
    act_dim = np.prod(action_space.shape)
    mu = mlp(x, hidden_sizes=hidden_sizes + (act_dim,),
             activation=activation, output_activation=output_activation)
    log_std = make_log_std_var(act_dim)
    pi = sample_actions(mu, log_std)
    return (pi,
            gaussian_likelihood(a, mu, log_std),
            gaussian_likelihood(pi, mu, log_std))
def mlp_deterministic_policy(obs, hidden_sizes, activation, action_space, output_activation=None):
    """Build an MLP deterministic policy: observation in, action out.

    No stochasticity — the action is the network output directly, with no
    sampling from a distribution.
    """
    out_sizes = hidden_sizes + (np.prod(action_space.shape), )
    return tf_utils.mlp(obs, hidden_sizes=out_sizes, activation=activation,
                        output_activation=output_activation)
def test_mlp_smoke(self):
    """Smoke test: a single-layer mlp builds, initializes, and produces
    output of the expected shape."""
    batch, in_dim, out_dim = 3, 2, 4
    inputs = np.zeros(shape=[batch, in_dim], dtype=np.float32)
    with self.cached_session() as sess:
        ph = tf.placeholder(dtype=tf.float32, shape=[None, in_dim])
        out = tf_utils.mlp(ph, hidden_sizes=(out_dim,))
        sess.run(tf.global_variables_initializer())
        # a single dense layer creates exactly one kernel and one bias
        self.assertEqual(len(tf.trainable_variables()), 2)
        out_np = sess.run(out, feed_dict={ph: inputs})
        self.assertEqual(out_np.shape, (batch, out_dim))