def test_target_var_init(self):
    """ test target_var_init op, sets target and main variables equal """
    with tf.variable_scope(TARGET):
        target_val = tf_utils.mlp(self.obs_ph, (4,), activation=tf.tanh)
    with tf.variable_scope(MAIN):
        main_val = tf_utils.mlp(self.obs_ph, (4,), activation=tf.tanh)
    with self.agent.sess as sess:
        sess.run(tf.global_variables_initializer())
        target_vars = tf_utils.var_list(TARGET)
        main_vars = tf_utils.var_list(MAIN)
        target_nps, main_nps = sess.run((target_vars, main_vars))
        for targ, upd in zip(target_nps, main_nps):
            assert targ.shape == upd.shape
            # the biases should actually be the same, all zeros
            if len(targ.shape) > 1:
                assert not (targ == upd).all()
        # now set target and main equal; target_var_init builds and runs
        # the assign ops itself, so there is no op to capture here
        self.agent.target_var_init()
        # now make sure all target and main parameters are equal
        target_vars = tf_utils.var_list(TARGET)
        main_vars = tf_utils.var_list(MAIN)
        target_nps, main_nps = sess.run((target_vars, main_vars))
        for targ, upd in zip(target_nps, main_nps):
            assert targ.shape == upd.shape
            np.testing.assert_allclose(targ, upd)
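# The tests lean on tf_utils.var_list to gather variables by scope. A minimal
# sketch of what it presumably does is below; this is an assumption for
# illustration, not the repo's actual implementation.
def var_list_sketch(scope):
    """ return the trainable variables whose names fall under `scope` """
    return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)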
def target_var_init(self):
    """ builds and runs a tensorflow op that sets each target variable
    equal to its corresponding main (updated) variable """
    op_list = [
        tf.assign(target_var, updated_var)
        for target_var, updated_var in zip(
            tf_utils.var_list(TARGET), tf_utils.var_list(MAIN))
    ]
    self.sess.run(tf.group(*op_list))
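# Typical usage (a sketch; `agent` is illustrative): run once right after
# global initialization, so the target network starts identical to the main
# network before any polyak updates happen.
#
#   agent.sess.run(tf.global_variables_initializer())
#   agent.target_var_init()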
def test_build_policy_and_qval(self):
    """ smoke test, make sure the number of parameters is right """
    pi, qval, qval_pi = self.agent.build_policy_and_qval(
        self.obs_ph, self.act_ph, self.env.action_space)
    with self.cached_session() as sess:
        sess.run(tf.global_variables_initializer())
        pi_vars = tf_utils.var_list(POLICY)
        assert len(pi_vars) == 4  # 2 kernels and 2 biases
        qval_vars = tf_utils.var_list(QVAL)
        assert len(qval_vars) == 4  # 2 kernels and 2 biases
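# For context, a plausible sketch of the kind of graph build_policy_and_qval
# constructs: a policy head under POLICY and a Q head under QVAL, each a
# two-layer mlp (so var_list finds 2 kernels and 2 biases per scope, matching
# the assertions above). Hidden sizes, activations, and output scaling are
# assumptions for illustration, not the repo's actual architecture; the reuse
# below assumes tf_utils.mlp builds layers with deterministic variable names.
def build_policy_and_qval_sketch(obs_ph, act_ph, action_space):
    act_dim = action_space.shape[0]
    with tf.variable_scope(POLICY):
        # tanh keeps actions in [-1, 1]; scale up to the env's action bounds
        pi = action_space.high[0] * tf_utils.mlp(
            obs_ph, (64, act_dim), activation=tf.tanh)
    with tf.variable_scope(QVAL):
        qval = tf_utils.mlp(
            tf.concat([obs_ph, act_ph], axis=-1), (64, 1),
            activation=tf.nn.relu)
    with tf.variable_scope(QVAL, reuse=True):
        # same Q weights, evaluated at the policy's own action
        qval_pi = tf_utils.mlp(
            tf.concat([obs_ph, pi], axis=-1), (64, 1),
            activation=tf.nn.relu)
    return pi, qval, qval_pi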
def build_target_update_op(self):
    """ returns tensorflow operation to update target parameters
    toward the main (updated) parameters by polyak averaging """
    op_list = [
        tf.assign(target_var,
                  self.polyak * target_var + (1 - self.polyak) * updated_var)
        for target_var, updated_var in zip(
            tf_utils.var_list(TARGET), tf_utils.var_list(MAIN))
    ]
    return tf.group(*op_list)
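# A quick numeric illustration of the polyak average above (pure numpy, no
# graph; the polyak value and step count are assumptions): with polyak close
# to 1, the target parameters only slowly track the main parameters, which
# stabilizes the bootstrapped Q targets.
def polyak_update_example():
    polyak, target, main = 0.995, np.zeros(3), np.ones(3)
    for _ in range(1000):
        target = polyak * target + (1 - polyak) * main
    return target  # ~0.993 after 1000 steps, still short of main's 1.0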
def build_policy_loss(self, qval_pi):
    """ build loss function and train op for the deterministic policy """
    # maximize Q(s, pi(s)) by minimizing its negative mean
    loss = -tf.reduce_mean(qval_pi)
    train_op = tf.train.AdamOptimizer(learning_rate=self.pi_lr).minimize(
        loss, var_list=tf_utils.var_list(MAIN + '/' + POLICY))
    return loss, train_op
def build_qval_loss(self, qval, qval_target):
    """ build loss and train op for the action-value function """
    # tf.losses.mean_squared_error takes labels first; name the arguments
    # so the target is unambiguously the label
    loss = tf.losses.mean_squared_error(labels=qval_target, predictions=qval)
    train_op = tf.train.AdamOptimizer(learning_rate=self.q_lr).minimize(
        loss, var_list=tf_utils.var_list(MAIN + '/' + QVAL))
    return loss, train_op
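# A hedged sketch of how the pieces above fit together in one training step;
# the attribute names on `agent` (q_train_op, pi_train_op, target_update_op)
# are assumptions for illustration. Because each minimize() call was given
# its own var_list, the policy step cannot modify the Q-network weights and
# vice versa; the target update runs last, after both gradient steps.
def train_step_sketch(agent, sess, feed_dict):
    sess.run(agent.q_train_op, feed_dict=feed_dict)
    sess.run(agent.pi_train_op, feed_dict=feed_dict)
    sess.run(agent.target_update_op)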