def test_get_target_ops_tau(self):
    var = tf.get_variable('var', [1], initializer=tf.constant_initializer(1))
    target_var = tf.get_variable('target_var', [1],
                                 initializer=tf.constant_initializer(2))
    self.sess.run(tf.global_variables_initializer())
    assert target_var.eval() == 2
    init_ops, update_ops = get_target_ops([var], [target_var], tau=0.2)
    self.sess.run(update_ops)
    assert np.allclose(target_var.eval(), 1.8)
    self.sess.run(init_ops)
    assert np.allclose(target_var.eval(), 1)
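# A minimal sketch of the soft-update semantics the test above exercises,
# assuming `tensor_utils.get_target_ops` pairs variables by position. The real
# helper may differ; this only spells out the arithmetic behind the asserts:
#   init_ops:   target <- online                           (hard copy)
#   update_ops: target <- tau * online + (1 - tau) * target
def _sketch_get_target_ops(online_vars, target_vars, tau=1.0):
    init_ops = [tf.compat.v1.assign(target, online)
                for online, target in zip(online_vars, target_vars)]
    update_ops = [
        tf.compat.v1.assign(target, tau * online + (1. - tau) * target)
        for online, target in zip(online_vars, target_vars)
    ]
    return init_ops, update_ops
# With var == 1, target_var == 2 and tau == 0.2, one run of update_ops gives
# 0.2 * 1 + 0.8 * 2 == 1.8, and init_ops copies target_var back to 1.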
def init_opt(self): """Build the loss function and init the optimizer.""" with tf.name_scope(self.name, 'DDPG'): # Create target policy and qf network self.target_policy_f_prob_online = tensor_utils.compile_function( inputs=[self.target_policy.model.networks['default'].input], outputs=self.target_policy.model.networks['default'].outputs) self.target_qf_f_prob_online = tensor_utils.compile_function( inputs=self.target_qf.model.networks['default'].inputs, outputs=self.target_qf.model.networks['default'].outputs) # Set up target init and update function with tf.name_scope('setup_target'): ops = tensor_utils.get_target_ops( self.policy.get_global_vars(), self.target_policy.get_global_vars(), self.tau) policy_init_ops, policy_update_ops = ops qf_init_ops, qf_update_ops = tensor_utils.get_target_ops( self.qf.get_global_vars(), self.target_qf.get_global_vars(), self.tau) target_init_op = policy_init_ops + qf_init_ops target_update_op = policy_update_ops + qf_update_ops f_init_target = tensor_utils.compile_function( inputs=[], outputs=target_init_op) f_update_target = tensor_utils.compile_function( inputs=[], outputs=target_update_op) with tf.name_scope('inputs'): if self.input_include_goal: obs_dim = self.env_spec.observation_space.\ flat_dim_with_keys(['observation', 'desired_goal']) else: obs_dim = self.env_spec.observation_space.flat_dim input_y = tf.compat.v1.placeholder(tf.float32, shape=(None, 1), name='input_y') obs = tf.compat.v1.placeholder(tf.float32, shape=(None, obs_dim), name='input_observation') actions = tf.compat.v1.placeholder( tf.float32, shape=(None, self.env_spec.action_space.flat_dim), name='input_action') # Set up policy training function next_action = self.policy.get_action_sym(obs, name='policy_action') next_qval = self.qf.get_qval_sym(obs, next_action, name='policy_action_qval') with tf.name_scope('action_loss'): action_loss = -tf.reduce_mean(next_qval) if self.policy_weight_decay > 0.: policy_reg = tc.layers.apply_regularization( tc.layers.l2_regularizer(self.policy_weight_decay), weights_list=self.policy.get_regularizable_vars()) action_loss += policy_reg with tf.name_scope('minimize_action_loss'): policy_train_op = self.policy_optimizer( self.policy_lr, name='PolicyOptimizer').minimize( action_loss, var_list=self.policy.get_trainable_vars()) f_train_policy = tensor_utils.compile_function( inputs=[obs], outputs=[policy_train_op, action_loss]) # Set up qf training function qval = self.qf.get_qval_sym(obs, actions, name='q_value') with tf.name_scope('qval_loss'): qval_loss = tf.reduce_mean( tf.compat.v1.squared_difference(input_y, qval)) if self.qf_weight_decay > 0.: qf_reg = tc.layers.apply_regularization( tc.layers.l2_regularizer(self.qf_weight_decay), weights_list=self.qf.get_regularizable_vars()) qval_loss += qf_reg with tf.name_scope('minimize_qf_loss'): qf_train_op = self.qf_optimizer( self.qf_lr, name='QFunctionOptimizer').minimize( qval_loss, var_list=self.qf.get_trainable_vars()) f_train_qf = tensor_utils.compile_function( inputs=[input_y, obs, actions], outputs=[qf_train_op, qval_loss, qval]) self.f_train_policy = f_train_policy self.f_train_qf = f_train_qf self.f_init_target = f_init_target self.f_update_target = f_update_target
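# Hedged sketch (not part of init_opt above): the critic target fed through the
# `input_y` placeholder is typically assembled in the optimization loop from
# the two compiled target-network functions,
#   y = r + discount * (1 - done) * Q_target(s', mu_target(s')).
# The helper below is illustrative only; `algo.discount` and the exact call
# signatures of the compiled functions are assumptions.
def _sketch_ddpg_critic_targets(algo, rewards, next_obs, dones):
    next_actions = algo.target_policy_f_prob_online(next_obs)
    next_qvals = algo.target_qf_f_prob_online(next_obs, next_actions)
    # Bootstrap only from non-terminal transitions; the result matches the
    # (batch_size, 1) shape of the `input_y` placeholder.
    return rewards.reshape(-1, 1) + algo.discount * (
        1.0 - dones.reshape(-1, 1)) * next_qvals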
def init_opt(self): """ Initialize the networks and Ops. Assume discrete space for dqn, so action dimension will always be action_space.n """ action_dim = self.env_spec.action_space.n self.episode_rewards = [] self.episode_qf_losses = [] # build q networks with tf.name_scope(self.name, 'DQN'): action_t_ph = tf.compat.v1.placeholder(tf.int32, None, name='action') reward_t_ph = tf.compat.v1.placeholder(tf.float32, None, name='reward') done_t_ph = tf.compat.v1.placeholder(tf.float32, None, name='done') with tf.name_scope('update_ops'): target_update_op = tensor_utils.get_target_ops( self.qf.get_global_vars(), self.target_qf.get_global_vars()) self._qf_update_ops = tensor_utils.compile_function( inputs=[], outputs=target_update_op) with tf.name_scope('td_error'): # Q-value of the selected action action = tf.one_hot(action_t_ph, action_dim) q_selected = tf.reduce_sum( self.qf.q_vals * action, # yapf: disable axis=1) # r + Q'(s', argmax_a(Q(s', _)) - Q(s, a) if self.double_q: target_qval_with_online_q = self.qf.get_qval_sym( self.target_qf.input, self.qf.name) future_best_q_val_action = tf.argmax( target_qval_with_online_q, 1) future_best_q_val = tf.reduce_sum( self.target_qf.q_vals * tf.one_hot(future_best_q_val_action, action_dim), axis=1) else: # r + max_a(Q'(s', _)) - Q(s, a) future_best_q_val = tf.reduce_max(self.target_qf.q_vals, axis=1) q_best_masked = (1.0 - done_t_ph) * future_best_q_val # if done, it's just reward # else reward + discount * future_best_q_val target_q_values = (reward_t_ph + self.discount * q_best_masked) # td_error = q_selected - tf.stop_gradient(target_q_values) loss = tf.compat.v1.losses.huber_loss( q_selected, tf.stop_gradient(target_q_values)) loss = tf.reduce_mean(loss) with tf.name_scope('optimize_ops'): optimizer = self.qf_optimizer(self.qf_lr) if self.grad_norm_clipping is not None: gradients = optimizer.compute_gradients( loss, var_list=self.qf.get_trainable_vars()) for i, (grad, var) in enumerate(gradients): if grad is not None: gradients[i] = (tf.clip_by_norm( grad, self.grad_norm_clipping), var) optimize_loss = optimizer.apply_gradients(gradients) else: optimize_loss = optimizer.minimize( loss, var_list=self.qf.get_trainable_vars()) self._train_qf = tensor_utils.compile_function( inputs=[ self.qf.input, action_t_ph, reward_t_ph, done_t_ph, self.target_qf.input ], outputs=[loss, optimize_loss])
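# Hedged NumPy illustration of the TD target assembled in the graph above. In
# the double-Q branch the online network picks the argmax action and the
# target network evaluates it; otherwise the target network's own max is used.
# Array names are assumptions, used only to make the formula concrete.
import numpy as np

def _sketch_dqn_target(rewards, dones, q_online_next, q_target_next, discount,
                       double_q=True):
    if double_q:
        best_actions = np.argmax(q_online_next, axis=1)
        future_q = q_target_next[np.arange(len(best_actions)), best_actions]
    else:
        future_q = np.max(q_target_next, axis=1)
    # If done, the target is just the reward; otherwise bootstrap.
    return rewards + discount * (1.0 - dones) * future_q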
def init_opt(self): """Build the loss function and init the optimizer.""" with tf.name_scope(self._name): # Create target policy (actor) and qf (critic) networks with tf.name_scope('inputs'): obs_dim = self.env_spec.observation_space.flat_dim y = tf.compat.v1.placeholder(tf.float32, shape=(None, 1), name='input_y') obs = tf.compat.v1.placeholder(tf.float32, shape=(None, obs_dim), name='input_observation') actions = tf.compat.v1.placeholder( tf.float32, shape=(None, self.env_spec.action_space.flat_dim), name='input_action') policy_network_outputs = self._target_policy.build(obs, name='policy') target_qf_outputs = self._target_qf.build(obs, actions, name='qf') target_qf2_outputs = self._target_qf2.build(obs, actions, name='qf') self.target_policy_f_prob_online = tensor_utils.compile_function( inputs=[obs], outputs=policy_network_outputs) self.target_qf_f_prob_online = tensor_utils.compile_function( inputs=[obs, actions], outputs=target_qf_outputs) self.target_qf2_f_prob_online = tensor_utils.compile_function( inputs=[obs, actions], outputs=target_qf2_outputs) # Set up target init and update functions with tf.name_scope('setup_target'): policy_init_op, policy_update_op = tensor_utils.get_target_ops( self.policy.get_global_vars(), self._target_policy.get_global_vars(), self._tau) qf_init_ops, qf_update_ops = tensor_utils.get_target_ops( self.qf.get_global_vars(), self._target_qf.get_global_vars(), self._tau) qf2_init_ops, qf2_update_ops = tensor_utils.get_target_ops( self.qf2.get_global_vars(), self._target_qf2.get_global_vars(), self._tau) target_init_op = policy_init_op + qf_init_ops + qf2_init_ops target_update_op = (policy_update_op + qf_update_ops + qf2_update_ops) f_init_target = tensor_utils.compile_function( inputs=[], outputs=target_init_op) f_update_target = tensor_utils.compile_function( inputs=[], outputs=target_update_op) # Set up policy training function next_action = self.policy.build(obs, name='policy_action') next_qval = self.qf.build(obs, next_action, name='policy_action_qval') with tf.name_scope('action_loss'): action_loss = -tf.reduce_mean(next_qval) with tf.name_scope('minimize_action_loss'): policy_optimizer = make_optimizer( self._policy_optimizer, learning_rate=self._policy_lr, name='PolicyOptimizer') policy_train_op = policy_optimizer.minimize( action_loss, var_list=self.policy.get_trainable_vars()) f_train_policy = tensor_utils.compile_function( inputs=[obs], outputs=[policy_train_op, action_loss]) # Set up qf training function qval = self.qf.build(obs, actions, name='q_value') q2val = self.qf2.build(obs, actions, name='q2_value') with tf.name_scope('qval1_loss'): qval1_loss = tf.reduce_mean(tf.math.squared_difference( y, qval)) with tf.name_scope('qval2_loss'): qval2_loss = tf.reduce_mean( tf.math.squared_difference(y, q2val)) with tf.name_scope('minimize_qf_loss'): qf_optimizer = make_optimizer(self._qf_optimizer, learning_rate=self._qf_lr, name='QFunctionOptimizer') qf_train_op = qf_optimizer.minimize( qval1_loss, var_list=self.qf.get_trainable_vars()) qf2_train_op = qf_optimizer.minimize( qval2_loss, var_list=self.qf2.get_trainable_vars()) f_train_qf = tensor_utils.compile_function( inputs=[y, obs, actions], outputs=[qf_train_op, qval1_loss, qval]) f_train_qf2 = tensor_utils.compile_function( inputs=[y, obs, actions], outputs=[qf2_train_op, qval2_loss, q2val]) self.f_train_policy = f_train_policy self.f_train_qf = f_train_qf self.f_init_target = f_init_target self.f_update_target = f_update_target self.f_train_qf2 = f_train_qf2
def init_opt(self): """Build the loss function and init the optimizer.""" with tf.name_scope(self.name, 'TD3'): # Create target policy (actor) and qf (critic) networks self.target_policy_f_prob_online = tensor_utils.compile_function( inputs=[self.target_policy.model.networks['default'].input], outputs=self.target_policy.model.networks['default'].outputs) self.target_qf_f_prob_online = tensor_utils.compile_function( inputs=self.target_qf.model.networks['default'].inputs, outputs=self.target_qf.model.networks['default'].outputs) self.target_qf2_f_prob_online = tensor_utils.compile_function( inputs=self.target_qf2.model.networks['default'].inputs, outputs=self.target_qf2.model.networks['default'].outputs) # Set up target init and update functions with tf.name_scope('setup_target'): policy_init_op, policy_update_op = tensor_utils.get_target_ops( self.policy.get_global_vars(), self.target_policy.get_global_vars(), self.tau) qf_init_ops, qf_update_ops = tensor_utils.get_target_ops( self.qf.get_global_vars(), self.target_qf.get_global_vars(), self.tau) qf2_init_ops, qf2_update_ops = tensor_utils.get_target_ops( self.qf2.get_global_vars(), self.target_qf2.get_global_vars(), self.tau) target_init_op = policy_init_op + qf_init_ops + qf2_init_ops target_update_op = (policy_update_op + qf_update_ops + qf2_update_ops) f_init_target = tensor_utils.compile_function( inputs=[], outputs=target_init_op) f_update_target = tensor_utils.compile_function( inputs=[], outputs=target_update_op) with tf.name_scope('inputs'): if self.input_include_goal: obs_dim = self.env_spec.observation_space.\ flat_dim_with_keys(['observation', 'desired_goal']) else: obs_dim = self.env_spec.observation_space.flat_dim y = tf.placeholder(tf.float32, shape=(None, 1), name='input_y') obs = tf.placeholder(tf.float32, shape=(None, obs_dim), name='input_observation') actions = tf.placeholder( tf.float32, shape=(None, self.env_spec.action_space.flat_dim), name='input_action') # Set up policy training function next_action = self.policy.get_action_sym(obs, name='policy_action') next_qval = self.qf.get_qval_sym(obs, next_action, name='policy_action_qval') with tf.name_scope('action_loss'): action_loss = -tf.reduce_mean(next_qval) with tf.name_scope('minimize_action_loss'): policy_train_op = self.policy_optimizer( self.policy_lr, name='PolicyOptimizer').minimize( action_loss, var_list=self.policy.get_trainable_vars()) f_train_policy = tensor_utils.compile_function( inputs=[obs], outputs=[policy_train_op, action_loss]) # Set up qf training function qval = self.qf.get_qval_sym(obs, actions, name='q_value') q2val = self.qf2.get_qval_sym(obs, actions, name='q2_value') with tf.name_scope('qval1_loss'): qval1_loss = tf.reduce_mean(tf.squared_difference(y, qval)) with tf.name_scope('qval2_loss'): qval2_loss = tf.reduce_mean(tf.squared_difference(y, q2val)) with tf.name_scope('minimize_qf_loss'): qf_train_op = self.qf_optimizer( self.qf_lr, name='QFunctionOptimizer').minimize( qval1_loss, var_list=self.qf.get_trainable_vars()) qf2_train_op = self.qf_optimizer( self.qf_lr, name='QFunctionOptimizer').minimize( qval2_loss, var_list=self.qf2.get_trainable_vars()) f_train_qf = tensor_utils.compile_function( inputs=[y, obs, actions], outputs=[qf_train_op, qval1_loss, qval]) f_train_qf2 = tensor_utils.compile_function( inputs=[y, obs, actions], outputs=[qf2_train_op, qval2_loss, q2val]) self.f_train_policy = f_train_policy self.f_train_qf = f_train_qf self.f_init_target = f_init_target self.f_update_target = f_update_target self.f_train_qf2 = f_train_qf2
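# Hedged sketch: TD3 conventionally feeds both critic updates the same clipped
# double-Q target, min(Q1_target, Q2_target), evaluated at the target policy's
# (usually noise-smoothed) next action. init_opt above only consumes the
# resulting `y` placeholder; the attribute and helper names below are
# assumptions for illustration.
import numpy as np

def _sketch_td3_targets(algo, rewards, next_obs, dones, discount):
    next_actions = algo.target_policy_f_prob_online(next_obs)
    q1 = algo.target_qf_f_prob_online(next_obs, next_actions)
    q2 = algo.target_qf2_f_prob_online(next_obs, next_actions)
    target_q = np.minimum(q1, q2)  # clipped double-Q estimate
    return rewards.reshape(-1, 1) + discount * (
        1.0 - dones.reshape(-1, 1)) * target_q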
def init_opt(self): """Build the loss function and init the optimizer.""" with tf.name_scope(self.name, 'JoLeDDPG'): # Create target policy and qf network self.target_policy_f_prob_online = tensor_utils.compile_function( inputs=[self.target_policy.model.networks['default'].input], outputs=self.target_policy.model.networks['default'].outputs) self.target_qf_f_prob_online = tensor_utils.compile_function( inputs=self.target_qf.model.networks['default'].inputs, outputs=self.target_qf.model.networks['default'].outputs) # Set up target init and update function with tf.name_scope('setup_target'): ops = tensor_utils.get_target_ops( self.policy.get_global_vars(), self.target_policy.get_global_vars(), self.tau) policy_init_ops, policy_update_ops = ops qf_init_ops, qf_update_ops = tensor_utils.get_target_ops( self.qf.get_global_vars(), self.target_qf.get_global_vars(), self.tau) target_init_op = policy_init_ops + qf_init_ops target_update_op = policy_update_ops + qf_update_ops f_init_target = tensor_utils.compile_function( inputs=[], outputs=target_init_op) f_update_target = tensor_utils.compile_function( inputs=[], outputs=target_update_op) with tf.name_scope('inputs'): if self.input_include_goal: obs_dim = self.env_spec.observation_space.\ flat_dim_with_keys(['observation', 'desired_goal']) else: obs_dim = self.env_spec.observation_space.flat_dim y = tf.compat.v1.placeholder(tf.float32, shape=(None, 1), name='input_y') obs = tf.compat.v1.placeholder(tf.float32, shape=(None, obs_dim), name='input_observation') actions = tf.compat.v1.placeholder( tf.float32, shape=(None, self.env_spec.action_space.flat_dim), name='input_action') next_obs = tf.compat.v1.placeholder(tf.float32, shape=(None, obs_dim), name='next_observation') reward = tf.compat.v1.placeholder(tf.float32, shape=(None, 1), name='reward') # Set up policy training function next_action = self.policy.get_action_sym(obs, name='policy_action') next_qval = self.qf.get_qval_sym(obs, next_action, name='policy_action_qval') with tf.name_scope('action_loss'): action_loss = -tf.reduce_mean(next_qval) if self.policy_weight_decay > 0.: policy_reg = tc.layers.apply_regularization( tc.layers.l2_regularizer(self.policy_weight_decay), weights_list=self.policy.get_regularizable_vars()) action_loss += policy_reg with tf.name_scope('minimize_action_loss'): policy_train_op = self.policy_optimizer( self.policy_lr, name='PolicyOptimizer').minimize( action_loss, var_list=self.policy.get_trainable_vars()) f_train_policy = tensor_utils.compile_function( inputs=[obs], outputs=[policy_train_op, action_loss]) # Set up qf training function qval = self.qf.get_qval_sym(obs, actions, name='q_value') with tf.name_scope('qval_loss'): qval_loss = tf.reduce_mean( tf.compat.v1.squared_difference(y, qval)) if self.qf_weight_decay > 0.: qf_reg = tc.layers.apply_regularization( tc.layers.l2_regularizer(self.qf_weight_decay), weights_list=self.qf.get_regularizable_vars()) qval_loss += qf_reg with tf.name_scope('minimize_qf_loss'): qf_train_op = self.qf_optimizer( self.qf_lr, name='QFunctionOptimizer').minimize( qval_loss, var_list=self.qf.get_trainable_vars()) f_train_qf = tensor_utils.compile_function( inputs=[y, obs, actions], outputs=[qf_train_op, qval_loss, qval]) mean, var = tf.nn.moments(obs, axes=[0]) #Set up of environment model training function predicted_next_obs = self.obs_model.get_fval_sym(obs, actions, name='obs_value') predicted_reward = self.reward_model.get_fval_sym( obs, actions, name='reward_value') with tf.name_scope('model_loss'): #change to predict the delta 
of s obs_model_loss = tf.reduce_mean( tf.compat.v1.squared_difference(next_obs - obs, predicted_next_obs)) reward_model_loss = tf.reduce_mean( tf.compat.v1.squared_difference(reward, predicted_reward)) with tf.name_scope('minimize_obs_model_loss'): obs_train_op = self.obs_model_optimizer( self.obs_model_lr, name='ObsModelOptimizer').minimize( obs_model_loss, var_list=self.obs_model.get_trainable_vars()) reward_train_op = self.reward_model_optimizer( self.reward_model_lr, name='RewardModelOptimizer').minimize( reward_model_loss, var_list=self.reward_model.get_trainable_vars()) f_train_obs_model = tensor_utils.compile_function( inputs=[next_obs, obs, actions], outputs=[obs_train_op, obs_model_loss]) f_train_reward_model = tensor_utils.compile_function( inputs=[reward, obs, actions], outputs=[reward_train_op, reward_model_loss]) f_obs_model_predict = tensor_utils.compile_function( inputs=[obs, actions], outputs=[predicted_next_obs, obs + predicted_next_obs]) f_reward_model_predict = tensor_utils.compile_function( inputs=[obs, actions], outputs=[predicted_reward]) # Set up the separate environment model training function sepe_predicted_next_obs = self.sepe_obs_model.get_fval_sym( obs, actions, name='sepe_obs_value') sepe_predicted_reward = self.sepe_reward_model.get_fval_sym( obs, actions, name='sepe_reward_value') with tf.name_scope('model_loss'): #change to predict the delta of s sepe_obs_model_loss = tf.reduce_mean( tf.compat.v1.squared_difference(next_obs - obs, sepe_predicted_next_obs)) sepe_reward_model_loss = tf.reduce_mean( tf.compat.v1.squared_difference(reward, sepe_predicted_reward)) with tf.name_scope('minimize_obs_model_loss'): sepe_obs_train_op = self.obs_model_optimizer( self.obs_model_lr, name='SepeObsModelOptimizer').minimize( sepe_obs_model_loss, var_list=self.sepe_obs_model.get_trainable_vars()) sepe_reward_train_op = self.reward_model_optimizer( self.reward_model_lr, name='SepeRewardModelOptimizer').minimize( sepe_reward_model_loss, var_list=self.sepe_reward_model.get_trainable_vars()) f_train_sepe_obs_model = tensor_utils.compile_function( inputs=[next_obs, obs, actions], outputs=[sepe_obs_train_op, sepe_obs_model_loss]) f_train_sepe_reward_model = tensor_utils.compile_function( inputs=[reward, obs, actions], outputs=[sepe_reward_train_op, sepe_reward_model_loss]) self.f_train_sepe_obs_model = f_train_sepe_obs_model self.f_train_sepe_reward_model = f_train_sepe_reward_model # Copy the parameters of the separate env models when necessary with tf.name_scope('copy_sepe_env_models'): copy_sepe_obs_model_ops = tensor_utils.get_target_ops( self.sepe_obs_model.get_global_vars(), self.obs_model.get_global_vars()) copy_sepe_reward_model_ops = tensor_utils.get_target_ops( self.sepe_reward_model.get_global_vars(), self.reward_model.get_global_vars()) f_copy_sepe_obs_model = tensor_utils.compile_function( inputs=[], outputs=copy_sepe_obs_model_ops) f_copy_sepe_reward_model = tensor_utils.compile_function( inputs=[], outputs=copy_sepe_reward_model_ops) self.f_copy_sepe_reward_model = f_copy_sepe_reward_model self.f_copy_sepe_obs_model = f_copy_sepe_obs_model predicted_next_action = self.target_policy.get_action_sym( obs + predicted_next_obs, name='policy_jole') qval_jole = predicted_reward + self.discount * self.target_qf.get_qval_sym( obs + predicted_next_obs, predicted_next_action, name="qval_jole") with tf.name_scope('jole_loss'): jole_loss = tf.reduce_mean( tf.compat.v1.squared_difference(qval, qval_jole)) with tf.name_scope('minimize_jole_loss'): jole_train_op_qf = self.jole_optimizer(
self.jole_lr, name="JoleOptimizer").minimize( jole_loss, var_list=self.qf.get_trainable_vars()) jole_train_op_reward = self.jole_optimizer( self.jole_lr * 0.001, name="JoleOptimizer").minimize( jole_loss, var_list=self.reward_model.get_trainable_vars()) jole_train_op_obs = self.jole_optimizer( self.jole_lr * 0.00001, name="JoleOptimizer").minimize( jole_loss, var_list=self.obs_model.get_trainable_vars()) f_train_jole = tensor_utils.compile_function( inputs=[obs, actions], outputs=[ jole_train_op_qf, jole_train_op_reward, jole_train_op_obs, jole_loss ]) f_cal_jole_loss = tensor_utils.compile_function( inputs=[obs, actions], outputs=[jole_loss]) self.f_train_jole = f_train_jole self.f_cal_jole_loss = f_cal_jole_loss self.f_train_reward_model = f_train_reward_model self.f_train_obs_model = f_train_obs_model self.f_train_policy = f_train_policy self.f_train_qf = f_train_qf self.f_init_target = f_init_target self.f_update_target = f_update_target self.f_obs_model_predict = f_obs_model_predict self.f_reward_model_predict = f_reward_model_predict
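# Hedged usage sketch of the JoLe (joint learning) functions compiled above.
# f_cal_jole_loss evaluates, and f_train_jole minimizes, the consistency error
# between the critic and the learned dynamics/reward models,
#   jole_loss = mean((Q(s, a) - [r_hat(s, a)
#                     + discount * Q'(s_hat', mu'(s_hat'))])^2),
# updating the critic, the reward model and the observation model with the
# three progressively smaller learning rates seen above. The loop shape below
# is an assumption for illustration only.
def _sketch_jole_step(algo, obs_batch, action_batch):
    loss_before, = algo.f_cal_jole_loss(obs_batch, action_batch)
    *_, loss_after = algo.f_train_jole(obs_batch, action_batch)
    return loss_before, loss_after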
def init_opt(self): """Build the loss function and init the optimizer.""" with tf.name_scope(self._name): # Create target policy and qf network with tf.name_scope('inputs'): obs_dim = self.env_spec.observation_space.flat_dim input_y = tf.compat.v1.placeholder(tf.float32, shape=(None, 1), name='input_y') obs = tf.compat.v1.placeholder(tf.float32, shape=(None, obs_dim), name='input_observation') actions = tf.compat.v1.placeholder( tf.float32, shape=(None, self.env_spec.action_space.flat_dim), name='input_action') policy_network_outputs = self._target_policy.get_action_sym( obs, name='policy') target_qf_outputs = self._target_qf.get_qval_sym(obs, actions, name='qf') self.target_policy_f_prob_online = tensor_utils.compile_function( inputs=[obs], outputs=policy_network_outputs) self.target_qf_f_prob_online = tensor_utils.compile_function( inputs=[obs, actions], outputs=target_qf_outputs) # Set up target init and update function with tf.name_scope('setup_target'): ops = tensor_utils.get_target_ops( self.policy.get_global_vars(), self._target_policy.get_global_vars(), self._tau) policy_init_ops, policy_update_ops = ops qf_init_ops, qf_update_ops = tensor_utils.get_target_ops( self._qf.get_global_vars(), self._target_qf.get_global_vars(), self._tau) target_init_op = policy_init_ops + qf_init_ops target_update_op = policy_update_ops + qf_update_ops f_init_target = tensor_utils.compile_function( inputs=[], outputs=target_init_op) f_update_target = tensor_utils.compile_function( inputs=[], outputs=target_update_op) with tf.name_scope('inputs'): obs_dim = self.env_spec.observation_space.flat_dim input_y = tf.compat.v1.placeholder(tf.float32, shape=(None, 1), name='input_y') obs = tf.compat.v1.placeholder(tf.float32, shape=(None, obs_dim), name='input_observation') actions = tf.compat.v1.placeholder( tf.float32, shape=(None, self.env_spec.action_space.flat_dim), name='input_action') # Set up policy training function next_action = self.policy.get_action_sym(obs, name='policy_action') next_qval = self._qf.get_qval_sym(obs, next_action, name='policy_action_qval') with tf.name_scope('action_loss'): action_loss = -tf.reduce_mean(next_qval) if self._policy_weight_decay > 0.: regularizer = tf.keras.regularizers.l2( self._policy_weight_decay) for var in self.policy.get_regularizable_vars(): policy_reg = regularizer(var) action_loss += policy_reg with tf.name_scope('minimize_action_loss'): policy_optimizer = make_optimizer( self._policy_optimizer, learning_rate=self._policy_lr, name='PolicyOptimizer') policy_train_op = policy_optimizer.minimize( action_loss, var_list=self.policy.get_trainable_vars()) f_train_policy = tensor_utils.compile_function( inputs=[obs], outputs=[policy_train_op, action_loss]) # Set up qf training function qval = self._qf.get_qval_sym(obs, actions, name='q_value') with tf.name_scope('qval_loss'): qval_loss = tf.reduce_mean( tf.compat.v1.squared_difference(input_y, qval)) if self._qf_weight_decay > 0.: regularizer = tf.keras.regularizers.l2( self._qf_weight_decay) for var in self._qf.get_regularizable_vars(): qf_reg = regularizer(var) qval_loss += qf_reg with tf.name_scope('minimize_qf_loss'): qf_optimizer = make_optimizer(self._qf_optimizer, learning_rate=self._qf_lr, name='QFunctionOptimizer') qf_train_op = qf_optimizer.minimize( qval_loss, var_list=self._qf.get_trainable_vars()) f_train_qf = tensor_utils.compile_function( inputs=[input_y, obs, actions], outputs=[qf_train_op, qval_loss, qval]) self.f_train_policy = f_train_policy self.f_train_qf = f_train_qf self.f_init_target = 
f_init_target self.f_update_target = f_update_target
def init_opt(self): """ Initialize the networks and Ops. Assume discrete space for dqn, so action dimension will always be action_space.n """ action_dim = self.env_spec.action_space.n obs_dim = self.env_spec.observation_space.flat_dim self.episode_rewards = [] self.episode_qf_losses = [] with tf.name_scope(self.name, "input"): action_t_ph = tf.compat.v1.placeholder(tf.int32, None, name='action') reward_t_ph = tf.compat.v1.placeholder(tf.float32, None, name='reward') done_t_ph = tf.compat.v1.placeholder(tf.float32, None, name='done') action = tf.one_hot(action_t_ph, action_dim) next_obs = tf.compat.v1.placeholder(tf.float32, (None, obs_dim), name='next_observations') jole_obs = tf.compat.v1.placeholder(tf.float32, (None, obs_dim), name='jole_input_observations') jole_actions_discrete = tf.compat.v1.placeholder( tf.int32, None, name='jole_input_action') jole_actions = tf.one_hot(jole_actions_discrete, action_dim) jole_clip_return_min = tf.compat.v1.placeholder( tf.float32, shape=(), name="jole_clip_return_min") jole_clip_return_max = tf.compat.v1.placeholder( tf.float32, shape=(), name="jole_clip_return_max") use_jole = tf.compat.v1.placeholder(tf.float32, shape=(), name="use_jole") obs = self.qf.input # set up jole with tf.name_scope(self.name, "jole"): #get Q(s,a) jole_qval = tf.reduce_sum( self.qf.get_qval_sym(jole_obs, name='jole_q_value') * jole_actions, axis=1) # get predicted next observations and actions jole_predicted_next_obs = tf.reshape(tf.reduce_sum( tf.reshape(self.obs_model.get_fval_sym(jole_obs, name='jole_obs_value'), shape=(-1, action_dim, obs_dim)) * tf.expand_dims(jole_actions, -1), axis=1), shape=(-1, obs_dim)) jole_predicted_reward = tf.reduce_sum( self.reward_model.get_fval_sym( jole_obs, name='jole_reward_value') * jole_actions, axis=1) jole_predicted_terminal = self.get_terminal_status( jole_predicted_next_obs) #jole_predicted_terminal = 0 #jole_predicted_terminal = tf.argmax(self.terminal_model.get_fval_sym(jole_predicted_next_obs, name='jole_terminal_value'), axis=-1) # r + Q'(s', argmax_a(Q(s', _)) - Q(s, a) if self.double_q: jole_target_qval_with_online_q = self.qf.get_qval_sym( jole_predicted_next_obs, name="jole_next_obs_value") jole_future_best_q_val_action = tf.argmax( jole_target_qval_with_online_q, 1) jole_future_best_q_val = tf.reduce_sum( self.target_qf.get_qval_sym(jole_predicted_next_obs, name="jole_next_obs_value") * tf.one_hot(jole_future_best_q_val_action, action_dim), axis=1) else: # r + max_a(Q'(s', _)) - Q(s, a) jole_future_best_q_val = tf.reduce_max( self.target_qf.get_qval_sym(jole_predicted_next_obs, name="jole_next_obs_value"), axis=1) #jole_done_t_ph = tf.condition jole_q_best_masked = (1.0 - tf.cast( jole_predicted_terminal, tf.float32)) * jole_future_best_q_val #jole_q_best_masked = jole_future_best_q_val # if done, it's just reward # else reward + discount * future_best_q_val jole_target_q_values_before_clip = ( jole_predicted_reward + self.discount * jole_q_best_masked) jole_target_q_values = jole_target_q_values_before_clip #tf.clip_by_value(jole_target_q_values_before_clip, jole_clip_return_min, jole_clip_return_max) jole_loss = tf.reduce_mean( tf.compat.v1.squared_difference(jole_qval, jole_target_q_values)) self.f_cal_jole_loss = tensor_utils.compile_function( inputs=[ jole_obs, jole_actions_discrete, jole_clip_return_min, jole_clip_return_max, use_jole ], outputs=[ jole_loss, jole_qval, jole_target_q_values, jole_target_q_values_before_clip ]) #train the env model with tf.name_scope(self.name, "env_model"): predicted_next_obs = 
tf.reduce_sum( tf.reshape(self.obs_model.get_fval_sym(obs, name='obs_value'), shape=(-1, action_dim, obs_dim)) * tf.expand_dims(action, -1), axis=1) predicted_reward = tf.reduce_sum( self.reward_model.get_fval_sym(obs, name='reward_value') * action, axis=1) #change to predict the delta of s original_obs_model_loss = tf.reduce_mean( tf.compat.v1.squared_difference(next_obs, predicted_next_obs)) obs_model_loss = original_obs_model_loss + use_jole * 0.0001 * jole_loss original_reward_model_loss = tf.reduce_mean( tf.compat.v1.squared_difference(reward_t_ph, predicted_reward)) reward_model_loss = original_reward_model_loss + use_jole * 0.0001 * jole_loss predicted_terminal = self.terminal_model.get_fval_sym( next_obs, name="terminal_value") terminal_model_loss = tf.nn.sparse_softmax_cross_entropy_with_logits( logits=predicted_terminal, labels=tf.cast(tf.squeeze(done_t_ph), dtype=tf.int32)) terminal_model_accurate = tf.reduce_sum(1 - tf.abs( tf.argmax(predicted_terminal, axis=-1) - tf.cast(tf.squeeze(done_t_ph), dtype=tf.int64))) with tf.name_scope('minimize_obs_model_loss'): obs_train_op = self.obs_model_optimizer( self.obs_model_lr, name='ObsModelOptimizer').minimize( obs_model_loss, var_list=self.obs_model.get_trainable_vars()) reward_train_op = self.reward_model_optimizer( self.reward_model_lr, name='RewardModelOptimizer').minimize( reward_model_loss, var_list=self.reward_model.get_trainable_vars()) terminal_train_op = self.terminal_model_optimizer( self.terminal_model_lr, name='TerminalModelOptimizer').minimize( terminal_model_loss, var_list=self.terminal_model.get_trainable_vars()) self.f_train_obs_model = tensor_utils.compile_function( inputs=[ next_obs, obs, action_t_ph, jole_obs, jole_actions_discrete, jole_clip_return_min, jole_clip_return_max, use_jole ], outputs=[ obs_train_op, obs_model_loss, original_obs_model_loss ]) self.f_train_reward_model = tensor_utils.compile_function( inputs=[ reward_t_ph, obs, action_t_ph, jole_obs, jole_actions_discrete, jole_clip_return_min, jole_clip_return_max, use_jole ], outputs=[ reward_train_op, reward_model_loss, original_reward_model_loss ]) self.f_train_terminal_model = tensor_utils.compile_function( inputs=[next_obs, done_t_ph], outputs=[ terminal_train_op, terminal_model_loss, terminal_model_accurate ]) self.f_obs_model_predict = tensor_utils.compile_function( inputs=[obs, action_t_ph], outputs=[predicted_next_obs - obs, predicted_next_obs]) self.f_reward_model_predict = tensor_utils.compile_function( inputs=[obs, action_t_ph], outputs=[predicted_reward]) self.f_terminal_model_predict = tensor_utils.compile_function( inputs=[next_obs], outputs=[ predicted_terminal, tf.argmax(predicted_terminal, axis=-1) ]) sepe_predicted_next_obs = tf.reduce_sum(tf.reshape( self.sepe_obs_model.get_fval_sym(obs, name='obs_value'), shape=(-1, action_dim, obs_dim)) * tf.expand_dims(action, -1), axis=1) sepe_predicted_reward = tf.reduce_sum( self.sepe_reward_model.get_fval_sym(obs, name='reward_value') * action, axis=1) #change to predict the delta of s sepe_obs_model_loss = tf.reduce_mean( tf.compat.v1.squared_difference(next_obs, sepe_predicted_next_obs)) sepe_reward_model_loss = tf.reduce_mean( tf.compat.v1.squared_difference(reward_t_ph, sepe_predicted_reward)) with tf.name_scope('minimize_sepe_obs_model_loss'): sepe_obs_train_op = self.obs_model_optimizer( self.obs_model_lr, name='SepeObsModelOptimizer').minimize( sepe_obs_model_loss, var_list=self.sepe_obs_model.get_trainable_vars()) sepe_reward_train_op = self.reward_model_optimizer( self.reward_model_lr, 
name='SepeRewardModelOptimizer').minimize( sepe_reward_model_loss, var_list=self.sepe_reward_model.get_trainable_vars()) f_train_sepe_obs_model = tensor_utils.compile_function( inputs=[next_obs, obs, action_t_ph], outputs=[sepe_obs_train_op, sepe_obs_model_loss]) f_train_sepe_reward_model = tensor_utils.compile_function( inputs=[reward_t_ph, obs, action_t_ph], outputs=[sepe_reward_train_op, sepe_reward_model_loss]) self.f_train_sepe_obs_model = f_train_sepe_obs_model self.f_train_sepe_reward_model = f_train_sepe_reward_model # Copy the parameter of seperate env models when necessary with tf.name_scope('copy_sepe_env_models'): copy_sepe_obs_model_ops = tensor_utils.get_target_ops( self.sepe_obs_model.get_global_vars(), self.obs_model.get_global_vars()) copy_sepe_reward_model_ops = tensor_utils.get_target_ops( self.sepe_reward_model.get_global_vars(), self.reward_model.get_global_vars()) self.f_copy_sepe_obs_model = tensor_utils.compile_function( inputs=[], outputs=copy_sepe_obs_model_ops) self.f_copy_sepe_reward_model = tensor_utils.compile_function( inputs=[], outputs=copy_sepe_reward_model_ops) # build q networks with tf.name_scope(self.name, 'DQN'): with tf.name_scope('update_ops'): target_update_op = tensor_utils.get_target_ops( self.qf.get_global_vars(), self.target_qf.get_global_vars()) self._qf_update_ops = tensor_utils.compile_function( inputs=[], outputs=target_update_op) with tf.name_scope('td_error'): # Q-value of the selected action q_selected = tf.reduce_sum( self.qf.q_vals * action, # yapf: disable axis=1) # r + Q'(s', argmax_a(Q(s', _)) - Q(s, a) if self.double_q: target_qval_with_online_q = self.qf.get_qval_sym( self.target_qf.input, self.qf.name) future_best_q_val_action = tf.argmax( target_qval_with_online_q, 1) future_best_q_val = tf.reduce_sum( self.target_qf.q_vals * tf.one_hot(future_best_q_val_action, action_dim), axis=1) else: # r + max_a(Q'(s', _)) - Q(s, a) future_best_q_val = tf.reduce_max(self.target_qf.q_vals, axis=1) q_best_masked = (1.0 - done_t_ph) * future_best_q_val # if done, it's just reward # else reward + discount * future_best_q_val target_q_values = (reward_t_ph + self.discount * q_best_masked) # td_error = q_selected - tf.stop_gradient(target_q_values) loss = tf.reduce_mean( tf.compat.v1.squared_difference( tf.stop_gradient(target_q_values), q_selected)) #loss = tf.compat.v1.losses.huber_loss( # q_selected, tf.stop_gradient(target_q_values)) #loss = tf.reduce_mean(loss) loss += use_jole * 0.2 * jole_loss with tf.name_scope('optimize_ops'): optimizer = self.qf_optimizer(self.qf_lr) if self.grad_norm_clipping is not None: gradients = optimizer.compute_gradients( loss, var_list=self.qf.get_trainable_vars()) for i, (grad, var) in enumerate(gradients): if grad is not None: gradients[i] = (tf.clip_by_norm( grad, self.grad_norm_clipping), var) optimize_loss = optimizer.apply_gradients(gradients) else: optimize_loss = optimizer.minimize( loss, var_list=self.qf.get_trainable_vars()) self._train_qf = tensor_utils.compile_function( inputs=[ self.qf.input, action_t_ph, reward_t_ph, done_t_ph, self.target_qf.input, jole_obs, jole_actions_discrete, use_jole, jole_clip_return_max, jole_clip_return_min ], outputs=[loss, optimize_loss, q_selected, target_q_values]) for variable in tf.trainable_variables(): print(variable)
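# Hedged illustration of the discrete-action model parameterization used
# above: the observation model emits one next-state prediction per action (a
# flat vector of length action_dim * obs_dim), and the one-hot action selects
# the relevant row. The scalar objectives are then gated by the `use_jole`
# placeholder, e.g.
#   qf loss    = TD loss         + use_jole * 0.2    * jole_loss
#   model loss = regression loss + use_jole * 0.0001 * jole_loss
# The NumPy restatement below uses assumed names and serves only as a sketch.
import numpy as np

def _sketch_select_per_action_prediction(model_output_flat, actions,
                                         action_dim, obs_dim):
    per_action = model_output_flat.reshape(-1, action_dim, obs_dim)
    return per_action[np.arange(len(actions)), actions]  # (batch, obs_dim)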
def init_opt(self): """Build the loss function and init the optimizer.""" with tf.name_scope(self.name, 'JoLeDDPG'): # Create target policy and qf network self.target_policy_f_prob_online = tensor_utils.compile_function( inputs=[self.target_policy.model.networks['default'].input], outputs=self.target_policy.model.networks['default'].outputs) self.target_qf_f_prob_online = tensor_utils.compile_function( inputs=self.target_qf.model.networks['default'].inputs, outputs=self.target_qf.model.networks['default'].outputs) # Set up target init and update function with tf.name_scope('setup_target'): ops = tensor_utils.get_target_ops( self.policy.get_global_vars(), self.target_policy.get_global_vars(), self.tau) policy_init_ops, policy_update_ops = ops qf_init_ops, qf_update_ops = tensor_utils.get_target_ops( self.qf.get_global_vars(), self.target_qf.get_global_vars(), self.tau) target_init_op = policy_init_ops + qf_init_ops target_update_op = policy_update_ops + qf_update_ops f_init_target = tensor_utils.compile_function( inputs=[], outputs=target_init_op) f_update_target = tensor_utils.compile_function( inputs=[], outputs=target_update_op) with tf.name_scope('inputs'): if self.input_include_goal: obs_dim = self.env_spec.observation_space.\ flat_dim_with_keys(['observation', 'desired_goal']) else: obs_dim = self.env_spec.observation_space.flat_dim y = tf.compat.v1.placeholder(tf.float32, shape=(None, 1), name='input_y') obs = tf.compat.v1.placeholder(tf.float32, shape=(None, obs_dim), name='input_observation') actions = tf.compat.v1.placeholder( tf.float32, shape=(None, self.env_spec.action_space.flat_dim), name='input_action') next_obs = tf.compat.v1.placeholder(tf.float32, shape=(None, obs_dim), name='next_observation') reward = tf.compat.v1.placeholder(tf.float32, shape=(None, 1), name='reward') jole_obs = tf.compat.v1.placeholder( tf.float32, shape=(None, obs_dim), name='jole_input_observation') jole_actions = tf.compat.v1.placeholder( tf.float32, shape=(None, self.env_spec.action_space.flat_dim), name='jole_input_action') jole_clip_return_min = tf.compat.v1.placeholder( tf.float32, shape=(), name="jole_clip_return_min") jole_clip_return_max = tf.compat.v1.placeholder( tf.float32, shape=(), name="jole_clip_return_max") use_jole = tf.compat.v1.placeholder(tf.float32, shape=(), name="use_jole") reguzs = tf.compat.v1.placeholder(tf.float32, shape=(self.num_z, None, self.dim_z), name='reguzs') eps = tf.compat.v1.placeholder(tf.float32, shape=(None, self.dim_z), name='eps') # Set up policy training function next_action = self.policy.get_action_sym(obs, name='policy_action') next_qval = self.qf.get_qval_sym(obs, next_action, name='policy_action_qval') with tf.name_scope('action_loss'): action_loss = -tf.reduce_mean(next_qval) if self.policy_weight_decay > 0.: policy_reg = tc.layers.apply_regularization( tc.layers.l2_regularizer(self.policy_weight_decay), weights_list=self.policy.get_regularizable_vars()) action_loss += policy_reg with tf.name_scope('minimize_action_loss'): policy_train_op = self.policy_optimizer( self.policy_lr, name='PolicyOptimizer').minimize( action_loss, var_list=self.policy.get_trainable_vars()) f_train_policy = tensor_utils.compile_function( inputs=[obs], outputs=[policy_train_op, action_loss]) #get jole loss jole_qval = self.qf.get_qval_sym( jole_obs, jole_actions, name='jole_q_value') #[batch_size] jole_ys_samples_before_clip = [] jole_ys_samples = [] for i in range(self.num_z): regu_z = reguzs[i] jole_predicted_next_obs = self.obs_model_generator.get_fval_sym( jole_obs, 
jole_actions, regu_z, name="jole_obs_value{}".format(i)) jole_predicted_reward = self.reward_model_generator.get_fval_sym( jole_obs, jole_actions, regu_z, name="jole_reward{}".format(i)) jole_predicted_next_action = self.target_policy.get_action_sym( jole_predicted_next_obs, name='jole_policy_action{}'.format(i)) jole_ys_before_clip = jole_predicted_reward + self.discount * self.target_qf.get_qval_sym( jole_predicted_next_obs, jole_predicted_next_action, name="jole_ys_{}".format(i)) jole_ys_sample = jole_ys_before_clip #tf.clip_by_value(jole_ys_before_clip, jole_clip_return_min, jole_clip_return_max) jole_ys_samples_before_clip.append(jole_ys_before_clip) jole_ys_samples.append(jole_ys_sample) jole_ys = tf.reduce_mean(jole_ys_samples, axis=0) #[batch_size] jole_ys_before_clip = tf.reduce_mean(jole_ys_samples_before_clip, axis=0) with tf.name_scope('jole_loss'): jole_loss = tf.reduce_mean( tf.compat.v1.squared_difference(jole_qval, jole_ys)) f_cal_jole_loss = tensor_utils.compile_function( inputs=[ jole_obs, jole_actions, jole_clip_return_min, jole_clip_return_max, use_jole, reguzs ], outputs=[jole_loss, jole_qval, jole_ys, jole_ys_before_clip]) # Set up qf training function qval = self.qf.get_qval_sym(obs, actions, name='q_value') with tf.name_scope('qval_loss'): qval_loss = tf.reduce_mean( tf.compat.v1.squared_difference(y, qval)) if self.qf_weight_decay > 0.: qf_reg = tc.layers.apply_regularization( tc.layers.l2_regularizer(self.qf_weight_decay), weights_list=self.qf.get_regularizable_vars()) qval_loss += qf_reg qval_loss += use_jole * 0.2 * jole_loss with tf.name_scope('minimize_qf_loss'): qf_train_op = self.qf_optimizer( self.qf_lr, name='QFunctionOptimizer').minimize( qval_loss, var_list=self.qf.get_trainable_vars()) f_train_qf = tensor_utils.compile_function( inputs=[ y, obs, actions, jole_obs, jole_actions, jole_clip_return_min, jole_clip_return_max, use_jole, reguzs ], outputs=[qf_train_op, qval_loss, qval]) #obs cvae model z_obs_mean, z_obs_log_sigma = self.obs_model_recognition.get_fval_sym( obs, actions, next_obs, "obs_model_recoginition") z_obs_sample = z_obs_mean + tf.multiply( tf.sqrt(tf.exp(z_obs_log_sigma)), eps) obs_recontr_mean = self.obs_model_generator.get_fval_sym( obs, actions, z_obs_sample, "obs_model_generator") reconstr_loss_obs = tf.reduce_mean( tf.reduce_sum(tf.square(next_obs - obs_recontr_mean), 1)) latent_loss_obs = tf.reduce_mean(-0.5 * tf.reduce_sum( 1. + z_obs_log_sigma - tf.square(z_obs_mean) - tf.exp(z_obs_log_sigma), 1)) cvae_cost_obs = reconstr_loss_obs + latent_loss_obs #reward cvae model z_reward_mean, z_reward_log_sigma = self.reward_model_recognition.get_fval_sym( obs, actions, reward, "reward_model_recoginition") z_reward_sample = z_reward_mean + tf.multiply( tf.sqrt(tf.exp(z_reward_log_sigma)), eps) reward_recontr_mean = self.reward_model_generator.get_fval_sym( obs, actions, z_reward_sample, "reward_model_generator") reconstr_loss_reward = tf.reduce_mean( tf.reduce_sum(tf.square(reward - reward_recontr_mean), 1)) latent_loss_reward = tf.reduce_mean(-0.5 * tf.reduce_sum( 1. 
+ z_reward_log_sigma - tf.square(z_reward_mean) - tf.exp(z_reward_log_sigma), 1)) cvae_cost_reward = reconstr_loss_reward + latent_loss_reward with tf.name_scope('model_loss'): #change to predict the delta of s obs_model_loss = cvae_cost_obs + use_jole * 0.0001 * jole_loss reward_model_loss = cvae_cost_reward + use_jole * 0.000001 * jole_loss with tf.name_scope('minimize_obs_model_loss'): obs_train_op = self.obs_model_optimizer( self.obs_model_lr, name='ObsModelOptimizer').minimize( obs_model_loss, var_list=self.obs_model_generator.get_trainable_vars() + self.obs_model_recognition.get_trainable_vars()) reward_train_op = self.reward_model_optimizer( self.reward_model_lr, name='RewardModelOptimizer' ).minimize( reward_model_loss, var_list=self.reward_model_generator.get_trainable_vars() + self.reward_model_recognition.get_trainable_vars()) self.f_train_obs_model = tensor_utils.compile_function( inputs=[ next_obs, obs, actions, jole_obs, jole_actions, jole_clip_return_min, jole_clip_return_max, use_jole, reguzs, eps ], outputs=[obs_train_op, cvae_cost_obs, reconstr_loss_obs]) self.f_train_reward_model = tensor_utils.compile_function( inputs=[ reward, obs, actions, jole_obs, jole_actions, jole_clip_return_min, jole_clip_return_max, use_jole, reguzs, eps ], outputs=[ reward_train_op, cvae_cost_reward, reconstr_loss_reward ]) # Set up the separate environment model training function # separate obs cvae model sz_obs_mean, sz_obs_log_sigma = self.sepe_obs_model_recognition.get_fval_sym( obs, actions, next_obs, "obs_model_recoginition") sz_obs_sample = sz_obs_mean + tf.multiply( tf.sqrt(tf.exp(sz_obs_log_sigma)), eps) sobs_recontr_mean = self.sepe_obs_model_generator.get_fval_sym( obs, actions, sz_obs_sample, "obs_model_generator") sreconstr_loss_obs = tf.reduce_mean( tf.reduce_sum(tf.square(next_obs - sobs_recontr_mean), 1)) slatent_loss_obs = tf.reduce_mean(-0.5 * tf.reduce_sum( 1. + sz_obs_log_sigma - tf.square(sz_obs_mean) - tf.exp(sz_obs_log_sigma), 1)) scvae_cost_obs = sreconstr_loss_obs + slatent_loss_obs # separate reward cvae model sz_reward_mean, sz_reward_log_sigma = self.sepe_reward_model_recognition.get_fval_sym( obs, actions, reward, "reward_model_recoginition") sz_reward_sample = sz_reward_mean + tf.multiply( tf.sqrt(tf.exp(sz_reward_log_sigma)), eps) sreward_recontr_mean = self.sepe_reward_model_generator.get_fval_sym( obs, actions, sz_reward_sample, "reward_model_generator") sreconstr_loss_reward = tf.reduce_mean( tf.reduce_sum(tf.square(reward - sreward_recontr_mean), 1)) slatent_loss_reward = tf.reduce_mean(-0.5 * tf.reduce_sum( 1. + sz_reward_log_sigma - tf.square(sz_reward_mean) - tf.exp(sz_reward_log_sigma), 1)) scvae_cost_reward = sreconstr_loss_reward + slatent_loss_reward with tf.name_scope('separate_model_loss'): #change to predict the delta of s sobs_model_loss = scvae_cost_obs sreward_model_loss = scvae_cost_reward with tf.name_scope('minimize_separate_model_loss'): sepe_obs_train_op = self.obs_model_optimizer( self.obs_model_lr, name='SepeObsModelOptimizer').minimize( sobs_model_loss, var_list=self.sepe_obs_model_generator. get_trainable_vars() + self.sepe_obs_model_recognition.get_trainable_vars()) sepe_reward_train_op = self.reward_model_optimizer( self.reward_model_lr, name='SepeRewardModelOptimizer' ).minimize( sreward_model_loss, var_list=self.sepe_reward_model_generator.
get_trainable_vars() + self.sepe_reward_model_recognition.get_trainable_vars()) self.f_train_sepe_obs_model = tensor_utils.compile_function( inputs=[next_obs, obs, actions, eps], outputs=[ sepe_obs_train_op, scvae_cost_obs, sreconstr_loss_obs ]) self.f_train_sepe_reward_model = tensor_utils.compile_function( inputs=[reward, obs, actions, eps], outputs=[ sepe_reward_train_op, scvae_cost_reward, sreconstr_loss_reward ]) # Copy the parameters of the separate env models when necessary with tf.name_scope('copy_sepe_env_models'): copy_sepe_obs_model_recognition_ops = tensor_utils.get_target_ops( self.sepe_obs_model_recognition.get_global_vars(), self.obs_model_recognition.get_global_vars()) copy_sepe_obs_model_generator_ops = tensor_utils.get_target_ops( self.sepe_obs_model_generator.get_global_vars(), self.obs_model_generator.get_global_vars()) copy_sepe_reward_model_recognition_ops = tensor_utils.get_target_ops( self.sepe_reward_model_recognition.get_global_vars(), self.reward_model_recognition.get_global_vars()) copy_sepe_reward_model_generator_ops = tensor_utils.get_target_ops( self.sepe_reward_model_generator.get_global_vars(), self.reward_model_generator.get_global_vars()) f_copy_sepe_obs_model = tensor_utils.compile_function( inputs=[], outputs=[ copy_sepe_obs_model_recognition_ops, copy_sepe_obs_model_generator_ops ]) f_copy_sepe_reward_model = tensor_utils.compile_function( inputs=[], outputs=[ copy_sepe_reward_model_recognition_ops, copy_sepe_reward_model_generator_ops ]) self.f_copy_sepe_reward_model = f_copy_sepe_reward_model self.f_copy_sepe_obs_model = f_copy_sepe_obs_model self.f_cal_jole_loss = f_cal_jole_loss self.f_train_policy = f_train_policy self.f_train_qf = f_train_qf self.f_init_target = f_init_target self.f_update_target = f_update_target
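# Hedged sketch of the conditional-VAE objective wired up above: the
# recognition network outputs (mean, log-variance) of q(z | s, a, x), a latent
# is drawn via the reparameterization trick, and the cost is reconstruction
# error plus the KL divergence to a standard normal prior,
#   z    = mean + sqrt(exp(log_var)) * eps,   eps ~ N(0, I)
#   cost = E[ ||x - x_hat(z)||^2 ] + KL(q(z | .) || N(0, I)).
# Standalone NumPy restatement with illustrative names only.
import numpy as np

def _sketch_cvae_cost(x, x_hat, z_mean, z_log_var):
    reconstr = np.mean(np.sum((x - x_hat) ** 2, axis=1))
    kl = np.mean(-0.5 * np.sum(
        1. + z_log_var - z_mean ** 2 - np.exp(z_log_var), axis=1))
    return reconstr + kl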