def test_get_target_ops(self):
    var = tf.compat.v1.get_variable('var', [1],
                                    initializer=tf.constant_initializer(1))
    target_var = tf.compat.v1.get_variable(
        'target_var', [1], initializer=tf.constant_initializer(2))
    self.sess.run(tf.compat.v1.global_variables_initializer())
    assert target_var.eval() == 2
    update_ops = get_target_ops([var], [target_var])
    self.sess.run(update_ops)
    assert target_var.eval() == 1
def test_get_target_ops_tau(self):
    var = tf.compat.v1.get_variable('var', [1],
                                    initializer=tf.constant_initializer(1))
    target_var = tf.compat.v1.get_variable(
        'target_var', [1], initializer=tf.constant_initializer(2))
    self.sess.run(tf.compat.v1.global_variables_initializer())
    assert target_var.eval() == 2
    init_ops, update_ops = get_target_ops([var], [target_var], tau=0.2)
    self.sess.run(update_ops)
    assert np.allclose(target_var.eval(), 1.8)
    self.sess.run(init_ops)
    assert np.allclose(target_var.eval(), 1)
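
# NOTE: hedged sketch, not the library's own implementation. The two tests
# above pin down the contract of `get_target_ops`: without `tau` it returns
# hard-copy assign ops, and with `tau` it returns (hard-copy init ops,
# soft-update ops) where target <- tau * source + (1 - tau) * target.
# A minimal helper consistent with that contract could look like this:
import tensorflow as tf


def get_target_ops_sketch(variables, target_variables, tau=None):
    """Sketch of the target-update helper exercised by the tests above."""
    # Hard-copy ops: used both for target initialization and, when no tau
    # is given, as the update ops themselves.
    init_ops = [
        tf.compat.v1.assign(target, var)
        for var, target in zip(variables, target_variables)
    ]
    if tau is None:
        return init_ops
    # Soft updates blend the online weights into the target weights.
    update_ops = [
        tf.compat.v1.assign(target, tau * var + (1. - tau) * target)
        for var, target in zip(variables, target_variables)
    ]
    return init_ops, update_ops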
def _init_opt(self):
    """Build the loss function and init the optimizer."""
    with tf.name_scope(self._name):
        # Create target policy (actor) and qf (critic) networks
        with tf.name_scope('inputs'):
            obs_dim = self._env_spec.observation_space.flat_dim
            y = tf.compat.v1.placeholder(tf.float32,
                                         shape=(None, 1),
                                         name='input_y')
            obs = tf.compat.v1.placeholder(tf.float32,
                                           shape=(None, obs_dim),
                                           name='input_observation')
            actions = tf.compat.v1.placeholder(
                tf.float32,
                shape=(None, self._env_spec.action_space.flat_dim),
                name='input_action')

        policy_network_outputs = self._target_policy.build(obs,
                                                           name='policy')
        target_qf_outputs = self._target_qf.build(obs, actions, name='qf')
        target_qf2_outputs = self._target_qf2.build(obs, actions, name='qf')

        self._target_policy_f_prob_online = compile_function(
            inputs=[obs], outputs=policy_network_outputs)
        self._target_qf_f_prob_online = compile_function(
            inputs=[obs, actions], outputs=target_qf_outputs)
        self._target_qf2_f_prob_online = compile_function(
            inputs=[obs, actions], outputs=target_qf2_outputs)

        # Set up target init and update functions
        with tf.name_scope('setup_target'):
            policy_init_op, policy_update_op = get_target_ops(
                self.policy.get_global_vars(),
                self._target_policy.get_global_vars(), self._tau)
            qf_init_ops, qf_update_ops = get_target_ops(
                self.qf.get_global_vars(),
                self._target_qf.get_global_vars(), self._tau)
            qf2_init_ops, qf2_update_ops = get_target_ops(
                self.qf2.get_global_vars(),
                self._target_qf2.get_global_vars(), self._tau)
            target_init_op = policy_init_op + qf_init_ops + qf2_init_ops
            target_update_op = (policy_update_op + qf_update_ops +
                                qf2_update_ops)

        f_init_target = compile_function(inputs=[],
                                         outputs=target_init_op)
        f_update_target = compile_function(inputs=[],
                                           outputs=target_update_op)

        # Set up policy training function
        next_action = self.policy.build(obs, name='policy_action')
        next_qval = self.qf.build(obs,
                                  next_action,
                                  name='policy_action_qval')
        with tf.name_scope('action_loss'):
            action_loss = -tf.reduce_mean(next_qval)

        with tf.name_scope('minimize_action_loss'):
            policy_optimizer = make_optimizer(
                self._policy_optimizer,
                learning_rate=self._policy_lr,
                name='PolicyOptimizer')
            policy_train_op = policy_optimizer.minimize(
                action_loss, var_list=self.policy.get_trainable_vars())

        f_train_policy = compile_function(
            inputs=[obs], outputs=[policy_train_op, action_loss])

        # Set up qf training function
        qval = self.qf.build(obs, actions, name='q_value')
        q2val = self.qf2.build(obs, actions, name='q2_value')
        with tf.name_scope('qval1_loss'):
            qval1_loss = tf.reduce_mean(
                tf.math.squared_difference(y, qval))
        with tf.name_scope('qval2_loss'):
            qval2_loss = tf.reduce_mean(
                tf.math.squared_difference(y, q2val))
        with tf.name_scope('minimize_qf_loss'):
            qf_optimizer = make_optimizer(self._qf_optimizer,
                                          learning_rate=self._qf_lr,
                                          name='QFunctionOptimizer')
            qf_train_op = qf_optimizer.minimize(
                qval1_loss, var_list=self.qf.get_trainable_vars())
            qf2_train_op = qf_optimizer.minimize(
                qval2_loss, var_list=self.qf2.get_trainable_vars())

        f_train_qf = compile_function(
            inputs=[y, obs, actions],
            outputs=[qf_train_op, qval1_loss, qval])
        f_train_qf2 = compile_function(
            inputs=[y, obs, actions],
            outputs=[qf2_train_op, qval2_loss, q2val])

        self._f_train_policy = f_train_policy
        self._f_train_qf = f_train_qf
        self._f_init_target = f_init_target
        self._f_update_target = f_update_target
        self._f_train_qf2 = f_train_qf2
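
# NOTE: hedged sketch, not part of the snippet above. The TD3-style
# `_init_opt` only builds the graph; the TD target fed into the `y`
# placeholder is computed outside the graph at optimization time. Assuming a
# standard clipped double-Q target and a `self._discount` attribute (not
# shown above), with `next_obs`, `rewards`, `dones` being illustrative
# minibatch arrays from a replay buffer, the computation might look like:
import numpy as np


def compute_td3_target_sketch(self, next_obs, rewards, dones):
    # Target actor proposes next actions (target-policy smoothing noise
    # could be added and clipped here).
    next_actions = self._target_policy_f_prob_online(next_obs)
    # Clipped double-Q: bootstrap from the smaller of the two target critics.
    q1 = self._target_qf_f_prob_online(next_obs, next_actions)
    q2 = self._target_qf2_f_prob_online(next_obs, next_actions)
    min_q = np.minimum(q1, q2)
    # Terminal transitions (dones == 1) do not bootstrap.
    return rewards + (1. - dones) * self._discount * min_q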
def _init_opt(self):
    """Build the loss function and init the optimizer."""
    with tf.name_scope(self._name):
        # Create target policy and qf network
        with tf.name_scope('inputs'):
            obs_dim = self._env_spec.observation_space.flat_dim
            input_y = tf.compat.v1.placeholder(tf.float32,
                                               shape=(None, 1),
                                               name='input_y')
            obs = tf.compat.v1.placeholder(tf.float32,
                                           shape=(None, obs_dim),
                                           name='input_observation')
            actions = tf.compat.v1.placeholder(
                tf.float32,
                shape=(None, self._env_spec.action_space.flat_dim),
                name='input_action')

        policy_network_outputs = self._target_policy.build(obs,
                                                           name='policy')
        target_qf_outputs = self._target_qf.build(obs, actions, name='qf')

        self._target_policy_f_prob_online = compile_function(
            inputs=[obs], outputs=policy_network_outputs)
        self._target_qf_f_prob_online = compile_function(
            inputs=[obs, actions], outputs=target_qf_outputs)

        # Set up target init and update function
        with tf.name_scope('setup_target'):
            ops = get_target_ops(self.policy.get_global_vars(),
                                 self._target_policy.get_global_vars(),
                                 self._tau)
            policy_init_ops, policy_update_ops = ops
            qf_init_ops, qf_update_ops = get_target_ops(
                self._qf.get_global_vars(),
                self._target_qf.get_global_vars(), self._tau)
            target_init_op = policy_init_ops + qf_init_ops
            target_update_op = policy_update_ops + qf_update_ops

        f_init_target = compile_function(inputs=[],
                                         outputs=target_init_op)
        f_update_target = compile_function(inputs=[],
                                           outputs=target_update_op)

        # Set up policy training function
        next_action = self.policy.build(obs, name='policy_action')
        next_qval = self._qf.build(obs,
                                   next_action,
                                   name='policy_action_qval')
        with tf.name_scope('action_loss'):
            action_loss = -tf.reduce_mean(next_qval)
            if self._policy_weight_decay > 0.:
                regularizer = tf.keras.regularizers.l2(
                    self._policy_weight_decay)
                for var in self.policy.get_regularizable_vars():
                    policy_reg = regularizer(var)
                    action_loss += policy_reg

        with tf.name_scope('minimize_action_loss'):
            policy_optimizer = make_optimizer(
                self._policy_optimizer,
                learning_rate=self._policy_lr,
                name='PolicyOptimizer')
            policy_train_op = policy_optimizer.minimize(
                action_loss, var_list=self.policy.get_trainable_vars())

        f_train_policy = compile_function(
            inputs=[obs], outputs=[policy_train_op, action_loss])

        # Set up qf training function
        qval = self._qf.build(obs, actions, name='q_value')
        with tf.name_scope('qval_loss'):
            qval_loss = tf.reduce_mean(
                tf.compat.v1.squared_difference(input_y, qval))
            if self._qf_weight_decay > 0.:
                regularizer = tf.keras.regularizers.l2(
                    self._qf_weight_decay)
                for var in self._qf.get_regularizable_vars():
                    qf_reg = regularizer(var)
                    qval_loss += qf_reg

        with tf.name_scope('minimize_qf_loss'):
            qf_optimizer = make_optimizer(self._qf_optimizer,
                                          learning_rate=self._qf_lr,
                                          name='QFunctionOptimizer')
            qf_train_op = qf_optimizer.minimize(
                qval_loss, var_list=self._qf.get_trainable_vars())

        f_train_qf = compile_function(
            inputs=[input_y, obs, actions],
            outputs=[qf_train_op, qval_loss, qval])

        self._f_train_policy = f_train_policy
        self._f_train_qf = f_train_qf
        self._f_init_target = f_init_target
        self._f_update_target = f_update_target
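
# NOTE: hedged sketch, not taken from this snippet. For context, this is one
# way the compiled DDPG-style functions above could be driven from the
# optimization loop, assuming the standard target
# y = r + gamma * (1 - done) * Q'(s', mu'(s')) and a `self._discount`
# attribute; the transition arrays are illustrative names for a replay
# minibatch.
def optimize_policy_sketch(self, obs, actions, rewards, next_obs, dones):
    # Target actor/critic produce the bootstrapped target values.
    target_actions = self._target_policy_f_prob_online(next_obs)
    target_qvals = self._target_qf_f_prob_online(next_obs, target_actions)
    ys = rewards + (1. - dones) * self._discount * target_qvals

    # One critic step, one actor step, then soft-update the target networks.
    _, qval_loss, qval = self._f_train_qf(ys, obs, actions)
    _, action_loss = self._f_train_policy(obs)
    self._f_update_target()
    return qval_loss, action_loss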
def _init_opt(self):
    """Initialize the networks and Ops.

    Assume discrete space for dqn, so action dimension
    will always be action_space.n
    """
    action_dim = self._env_spec.action_space.n

    # build q networks
    with tf.name_scope(self._name):
        action_t_ph = tf.compat.v1.placeholder(tf.int32,
                                               None,
                                               name='action')
        reward_t_ph = tf.compat.v1.placeholder(tf.float32,
                                               None,
                                               name='reward')
        done_t_ph = tf.compat.v1.placeholder(tf.float32, None, name='done')

        with tf.name_scope('update_ops'):
            target_update_op = get_target_ops(
                self._qf.get_global_vars(),
                self._target_qf.get_global_vars())

        self._qf_update_ops = compile_function(inputs=[],
                                               outputs=target_update_op)

        with tf.name_scope('td_error'):
            # Q-value of the selected action
            action = tf.one_hot(action_t_ph,
                                action_dim,
                                on_value=1.,
                                off_value=0.)
            q_selected = tf.reduce_sum(
                self._qf.q_vals * action,  # yapf: disable
                axis=1)

            # r + Q'(s', argmax_a(Q(s', _))) - Q(s, a)
            if self._double_q:
                target_qval_with_online_q = self._qf.build(
                    self._target_qf.input, self._qf.name)
                future_best_q_val_action = tf.argmax(
                    target_qval_with_online_q, 1)
                future_best_q_val = tf.reduce_sum(
                    self._target_qf.q_vals *
                    tf.one_hot(future_best_q_val_action,
                               action_dim,
                               on_value=1.,
                               off_value=0.),
                    axis=1)
            else:
                # r + max_a(Q'(s', _)) - Q(s, a)
                future_best_q_val = tf.reduce_max(self._target_qf.q_vals,
                                                  axis=1)

            q_best_masked = (1.0 - done_t_ph) * future_best_q_val
            # if done, it's just reward
            # else reward + discount * future_best_q_val
            target_q_values = (reward_t_ph +
                               self._discount * q_best_masked)

            # td_error = q_selected - tf.stop_gradient(target_q_values)
            loss = tf.compat.v1.losses.huber_loss(
                q_selected, tf.stop_gradient(target_q_values))
            loss = tf.reduce_mean(loss)

        with tf.name_scope('optimize_ops'):
            qf_optimizer = make_optimizer(self._qf_optimizer,
                                          learning_rate=self._qf_lr)
            if self._grad_norm_clipping is not None:
                gradients = qf_optimizer.compute_gradients(
                    loss, var_list=self._qf.get_trainable_vars())
                for i, (grad, var) in enumerate(gradients):
                    if grad is not None:
                        gradients[i] = (tf.clip_by_norm(
                            grad, self._grad_norm_clipping), var)
                optimize_loss = qf_optimizer.apply_gradients(gradients)
            else:
                optimize_loss = qf_optimizer.minimize(
                    loss, var_list=self._qf.get_trainable_vars())

        self._train_qf = compile_function(
            inputs=[
                self._qf.input, action_t_ph, reward_t_ph, done_t_ph,
                self._target_qf.input
            ],
            outputs=[loss, optimize_loss])
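
# NOTE: hedged usage sketch, not from this snippet. The compiled DQN
# functions above could be invoked roughly like this per training iteration.
# The transition arrays are illustrative names for a replay-buffer minibatch
# (ordered to match the `inputs` list of `self._train_qf`), and
# `self._target_network_update_freq` is a hypothetical attribute controlling
# how often the hard target update runs.
def train_once_sketch(self, itr, observations, actions, rewards, dones,
                      next_observations):
    # One gradient step on the Huber TD loss; the optimize op runs as a
    # side effect of fetching the outputs.
    loss, _ = self._train_qf(observations, actions, rewards, dones,
                             next_observations)

    # Periodically hard-copy the online Q-network weights into the target.
    if itr % self._target_network_update_freq == 0:
        self._qf_update_ops()
    return loss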