def test_update_both(self):
    """Run one update with ``update_actor=True`` and compare variables.

    Trainable variables under the ``td3/critic`` and ``td3/actor`` scopes
    are fetched before and after a single ``self.network.update`` call on
    a random batch of 32 transitions. Each before/after pair is handed to
    ``assert_variable_mismatch``, and the returned actor loss must not be
    ``None``.
    """
    batch_size = 32
    state_shape = self.params.state_shape

    # Random transition batch; the draw order matches the keyword order
    # of the update call below.
    batch = {
        'obs_t': np.random.random((batch_size, ) + state_shape),
        'actions_t': np.random.random(
            (batch_size, self.params.num_actions)),
        'rewards_tp1': np.random.random((batch_size, )),
        'obs_tp1': np.random.random((batch_size, ) + state_shape),
        'dones_tp1': np.random.random((batch_size, )),
    }

    trainable = tf.GraphKeys.TRAINABLE_VARIABLES
    critic_vars = tf.get_collection(trainable, 'td3/critic')
    actor_vars = tf.get_collection(trainable, 'td3/actor')

    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())

        critic_before = sess.run(critic_vars)
        actor_before = sess.run(actor_vars)

        _critic_loss, actor_loss = self.network.update(
            update_actor=True, **batch)

        critic_after = sess.run(critic_vars)
        actor_after = sess.run(actor_vars)

    assert_variable_mismatch(critic_before, critic_after)
    assert_variable_mismatch(actor_before, actor_after)
    assert actor_loss is not None
def test_make_fcs(self):
    """Build layers with ``_make_fcs`` and check shapes, calls and training.

    The test verifies, in order:
      * the variables collected under the ``hiddens`` scope pass
        ``assert_hidden_variable_shape`` for the given input and ``fcs``;
      * the mocked activation was called ``len(fcs)`` times;
      * each initial variable value passes ``assert_variable_range`` with
        bounds -0.1 and 0.1;
      * the snapshots taken before and after one Adam step are handed to
        ``assert_variable_mismatch``.
    """
    low, high = -0.1, 0.1

    inpt = make_tf_inpt()
    fcs = make_fcs()
    activation = mock_activation()
    initializer = tf.random_uniform_initializer(low, high)
    out = _make_fcs(fcs, inpt, activation, initializer)

    # to check connection
    train_op = tf.train.AdamOptimizer(1e-4).minimize(tf.reduce_mean(out))

    # check variable shapes
    hidden_vars = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, 'hiddens')
    assert_hidden_variable_shape(hidden_vars, inpt, fcs)

    # check if activation is actually called
    assert activation.call_count == len(fcs)

    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())

        initial_values = sess.run(hidden_vars)
        for value in initial_values:
            assert_variable_range(value, low, high)

        sess.run(train_op)
        trained_values = sess.run(hidden_vars)

    assert_variable_mismatch(initial_values, trained_values)
def test_success(self):
    """Run the ops from ``build_optim`` once and compare two variables.

    Two variables of the same random shape (each dimension in 1..10) are
    created as ``var1`` and ``var2``. ``build_optim`` is called with
    ``var1``, a rate of 1e-4 and the scope string ``'var1'``. After running
    the returned ops once, the ``var1`` snapshots go to
    ``assert_variable_mismatch`` and the ``var2`` snapshots go to
    ``assert_variable_match``.
    """
    # Tuple elements are evaluated left to right, so the two dimension
    # draws happen in the same order as separate assignments would.
    shape = (np.random.randint(10) + 1, np.random.randint(10) + 1)

    var1 = tf.Variable(np.random.random(shape), name='var1')
    var2 = tf.Variable(np.random.random(shape), name='var2')
    tracked = [var1, var2]

    optimize_ops = build_optim(var1, 1e-4, 'var1')

    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())
        var1_before, var2_before = sess.run(tracked)
        sess.run(optimize_ops)
        var1_after, var2_after = sess.run(tracked)

    assert_variable_mismatch(var1_before, var1_after)
    assert_variable_match(var2_before, var2_after)
def test_with_share_false(self):
    """Exercise ``stochastic_policy_function`` with ``share=False``.

    Checks performed:
      * a sample from the returned distribution has the input's batch
        size on axis 0 and ``num_actions`` on axis 1;
      * ``policy/hiddens`` variables pass ``assert_hidden_variable_shape``;
      * the first ``policy/mean`` variable has shape
        ``(fcs[-1], num_actions)`` and the first ``policy/logstd`` variable
        has shape ``(1, num_actions)``;
      * the initial mean passes ``assert_variable_range`` with bounds -0.1
        and 0.1, and the initial logstd is all zeros;
      * snapshots of mean/logstd and of the hidden variables taken before
        and after one Adam step go to ``assert_variable_mismatch``.
    """
    trainable = tf.GraphKeys.TRAINABLE_VARIABLES
    low, high = -0.1, 0.1

    inpt = make_tf_inpt()
    fcs = make_fcs()
    num_actions = np.random.randint(10) + 1
    w_init = tf.random_uniform_initializer(low, high)
    b_init = tf.random_uniform_initializer(low, high)

    dist = stochastic_policy_function(
        fcs,
        inpt,
        num_actions,
        share=False,
        w_init=w_init,
        last_w_init=w_init,
        last_b_init=b_init)

    # to check connection
    train_op = tf.train.AdamOptimizer(1e-4).minimize(
        tf.reduce_mean(dist.sample(1)))

    # output shape: (batch, num_actions)
    assert int(dist.sample(1)[0].shape[0]) == int(inpt.shape[0])
    assert int(dist.sample(1)[0].shape[1]) == num_actions

    hidden_vars = tf.get_collection(trainable, 'policy/hiddens')
    assert_hidden_variable_shape(hidden_vars, inpt, fcs)

    mean_var = tf.get_collection(trainable, 'policy/mean')[0]
    assert int(mean_var.shape[0]) == fcs[-1]
    assert int(mean_var.shape[1]) == num_actions

    logstd_var = tf.get_collection(trainable, 'policy/logstd')[0]
    assert int(logstd_var.shape[0]) == 1
    assert int(logstd_var.shape[1]) == num_actions

    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())

        mean_before = sess.run(mean_var)
        assert_variable_range(mean_before, low, high)

        logstd_before = sess.run(logstd_var)
        assert np.all(logstd_before == np.zeros_like(logstd_before))

        hiddens_before = sess.run(hidden_vars)

        sess.run(train_op)

        mean_after, logstd_after = sess.run([mean_var, logstd_var])
        assert_variable_mismatch(
            [mean_before, logstd_before], [mean_after, logstd_after])

        hiddens_after = sess.run(hidden_vars)
        assert_variable_mismatch(hiddens_before, hiddens_after)
def test_q_function(self):
    """Build ``q_function`` and check its shapes, init range and training.

    Checks performed:
      * the output has the input's batch size on axis 0 and size 1 on
        axis 1;
      * the weight of the hidden layer at ``concat_index`` has an input
        dimension equal to the preceding layer width (or the input width
        for index 0) plus the action width;
      * the first ``action_value/output`` variable has shape
        ``(fcs[-1], 1)``;
      * every trainable variable under ``action_value`` passes
        ``assert_variable_range`` with bounds -0.1 and 0.1 after
        initialisation;
      * snapshots of those variables taken before and after one Adam step
        go to ``assert_variable_mismatch``.
    """
    inpt = make_tf_inpt()
    fcs = make_fcs()
    w_init = tf.random_uniform_initializer(-0.1, 0.1)
    action = tf.constant(
        np.random.random(
            (int(inpt.shape[0]), np.random.randint(10) + 1)),
        dtype=tf.float32)
    concat_index = np.random.randint(len(fcs))

    value = q_function(
        fcs,
        inpt,
        action,
        concat_index,
        w_init=w_init,
        last_w_init=w_init,
        last_b_init=w_init)

    # to check connection
    optimizer = tf.train.AdamOptimizer(1e-4)
    optimize_expr = optimizer.minimize(tf.reduce_mean(value))

    assert int(value.shape[0]) == int(inpt.shape[0])
    assert int(value.shape[1]) == 1

    hiddens = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, 'action_value/hiddens')

    # Index with a stride of 2 to reach the concat layer's first variable
    # (presumably hiddens alternates weight, bias per layer - confirm
    # against _make_fcs).
    concat = hiddens[concat_index * 2]
    if concat_index == 0:
        dim = int(inpt.shape[1])
    else:
        dim = fcs[concat_index - 1]
    assert int(concat.shape[0]) == dim + int(action.shape[1])

    output = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, 'action_value/output')[0]
    assert int(output.shape[0]) == fcs[-1]
    assert int(output.shape[1]) == 1

    # tf.get_collection filters with re.match, i.e. from the start of the
    # variable name. The variables here are named 'action_value/...', so
    # the scope has to be 'action_value'. The previous 'value' scope did
    # not match any of them, which left the checks below with nothing to
    # verify.
    variable = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, 'action_value')

    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())

        before = sess.run(variable)
        for var in before:
            assert_variable_range(var, -0.1, 0.1)

        sess.run(optimize_expr)
        after = sess.run(variable)

    assert_variable_mismatch(before, after)
def test_update(self):
    """Run one ``self.network.update`` call and compare ``ddpg`` variables.

    A random batch of 32 transitions is fed to ``update``. The trainable
    variables under the ``ddpg`` scope are fetched before and after the
    call, and both snapshots are handed to ``assert_variable_mismatch``.
    """
    batch_size = 32
    state_shape = self.params.state_shape

    # Random transition batch; the draw order matches the keyword order
    # of the update call below.
    batch = {
        'obs_t': np.random.random((batch_size, ) + state_shape),
        'actions_t': np.random.random(
            (batch_size, self.params.num_actions)),
        'rewards_tp1': np.random.random((batch_size, )),
        'obs_tp1': np.random.random((batch_size, ) + state_shape),
        'dones_tp1': np.random.random((batch_size, )),
    }

    ddpg_vars = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, 'ddpg')

    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())
        values_before = sess.run(ddpg_vars)

        # The losses are not inspected; unpacking still requires update()
        # to return exactly two values.
        _critic_loss, _actor_loss = self.network.update(**batch)

        values_after = sess.run(ddpg_vars)

    assert_variable_mismatch(values_before, values_after)
def test_deterministic_policy_function(self):
    """Build ``deterministic_policy_function`` and check shapes and training.

    Checks performed:
      * the policy output has the input's batch size on axis 0 and
        ``num_actions`` on axis 1;
      * ``policy/hiddens`` variables pass ``assert_hidden_variable_shape``;
      * the first ``policy/output`` variable has shape
        ``(fcs[-1], num_actions)``;
      * every trainable variable under ``policy`` passes
        ``assert_variable_range`` with bounds -0.1 and 0.1 after
        initialisation;
      * snapshots of those variables taken before and after one Adam step
        go to ``assert_variable_mismatch``.
    """
    trainable = tf.GraphKeys.TRAINABLE_VARIABLES
    low, high = -0.1, 0.1

    inpt = make_tf_inpt()
    fcs = make_fcs()
    num_actions = np.random.randint(10) + 1
    initializer = tf.random_uniform_initializer(low, high)

    policy = deterministic_policy_function(
        fcs,
        inpt,
        num_actions,
        w_init=initializer,
        last_w_init=initializer,
        last_b_init=initializer)

    # to check connection
    train_op = tf.train.AdamOptimizer(1e-4).minimize(
        tf.reduce_mean(policy))

    assert int(policy.shape[0]) == int(inpt.shape[0])
    assert int(policy.shape[1]) == num_actions

    hidden_vars = tf.get_collection(trainable, 'policy/hiddens')
    assert_hidden_variable_shape(hidden_vars, inpt, fcs)

    output_var = tf.get_collection(trainable, 'policy/output')[0]
    assert int(output_var.shape[0]) == fcs[-1]
    assert int(output_var.shape[1]) == num_actions

    policy_vars = tf.get_collection(trainable, 'policy')

    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())

        initial_values = sess.run(policy_vars)
        for value in initial_values:
            assert_variable_range(value, low, high)

        sess.run(train_op)
        trained_values = sess.run(policy_vars)

    assert_variable_mismatch(initial_values, trained_values)
def test_value_function(self):
    """Build ``value_function`` and check its shapes, init range and training.

    Checks performed:
      * the output has the input's batch size on axis 0 and size 1 on
        axis 1;
      * ``value/hiddens`` variables pass ``assert_hidden_variable_shape``;
      * the first ``value/output`` variable has shape ``(fcs[-1], 1)``;
      * every trainable variable under ``value`` passes
        ``assert_variable_range`` with bounds -0.1 and 0.1 after
        initialisation;
      * snapshots of those variables taken before and after one Adam step
        go to ``assert_variable_mismatch``.
    """
    trainable = tf.GraphKeys.TRAINABLE_VARIABLES
    low, high = -0.1, 0.1

    inpt = make_tf_inpt()
    fcs = make_fcs()
    w_init = tf.random_uniform_initializer(low, high)
    b_init = tf.random_uniform_initializer(low, high)

    value = value_function(
        fcs,
        inpt,
        w_init=w_init,
        last_w_init=w_init,
        last_b_init=b_init)

    # to check connection
    train_op = tf.train.AdamOptimizer(1e-4).minimize(
        tf.reduce_mean(value))

    assert int(value.shape[0]) == int(inpt.shape[0])
    assert int(value.shape[1]) == 1

    hidden_vars = tf.get_collection(trainable, 'value/hiddens')
    assert_hidden_variable_shape(hidden_vars, inpt, fcs)

    output_var = tf.get_collection(trainable, 'value/output')[0]
    assert int(output_var.shape[0]) == fcs[-1]
    assert int(output_var.shape[1]) == 1

    value_vars = tf.get_collection(trainable, 'value')

    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())

        initial_values = sess.run(value_vars)
        for item in initial_values:
            assert_variable_range(item, low, high)

        sess.run(train_op)
        trained_values = sess.run(value_vars)

    assert_variable_mismatch(initial_values, trained_values)