Example #1
    def test_get_target_ops_tau(self):
        var = tf.get_variable('var', [1],
                              initializer=tf.constant_initializer(1))
        target_var = tf.get_variable('target_var', [1],
                                     initializer=tf.constant_initializer(2))
        self.sess.run(tf.global_variables_initializer())
        assert target_var.eval() == 2
        init_ops, update_ops = get_target_ops([var], [target_var], tau=0.2)
        self.sess.run(update_ops)
        assert np.allclose(target_var.eval(), 1.8)
        self.sess.run(init_ops)
        assert np.allclose(target_var.eval(), 1)
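The arithmetic here follows the usual soft-update rule: with tau=0.2 the update gives 0.2 * 1 + 0.8 * 2 = 1.8, while the init op hard-copies the online value over the target. A minimal sketch of what get_target_ops presumably builds (the actual garage implementation may differ in details):

import tensorflow as tf

def get_target_ops(variables, target_variables, tau=None):
    """Sketch: build hard-copy init ops and soft-update ops for target networks."""
    init_ops = []
    update_ops = []
    for var, target_var in zip(variables, target_variables):
        # Hard copy: target <- online
        init_ops.append(tf.compat.v1.assign(target_var, var))
        if tau is not None:
            # Soft update: target <- tau * online + (1 - tau) * target
            update_ops.append(
                tf.compat.v1.assign(target_var,
                                    tau * var + (1.0 - tau) * target_var))
    if tau is None:
        # Without tau (as in the DQN examples below), only the copy ops are returned.
        return init_ops
    return init_ops, update_ops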
Example #2
File: ddpg.py  Project: wjssx/garage
    def init_opt(self):
        """Build the loss function and init the optimizer."""
        with tf.name_scope(self.name, 'DDPG'):
            # Create target policy and qf network
            self.target_policy_f_prob_online = tensor_utils.compile_function(
                inputs=[self.target_policy.model.networks['default'].input],
                outputs=self.target_policy.model.networks['default'].outputs)
            self.target_qf_f_prob_online = tensor_utils.compile_function(
                inputs=self.target_qf.model.networks['default'].inputs,
                outputs=self.target_qf.model.networks['default'].outputs)

            # Set up target init and update function
            with tf.name_scope('setup_target'):
                ops = tensor_utils.get_target_ops(
                    self.policy.get_global_vars(),
                    self.target_policy.get_global_vars(), self.tau)
                policy_init_ops, policy_update_ops = ops
                qf_init_ops, qf_update_ops = tensor_utils.get_target_ops(
                    self.qf.get_global_vars(),
                    self.target_qf.get_global_vars(), self.tau)
                target_init_op = policy_init_ops + qf_init_ops
                target_update_op = policy_update_ops + qf_update_ops

            f_init_target = tensor_utils.compile_function(
                inputs=[], outputs=target_init_op)
            f_update_target = tensor_utils.compile_function(
                inputs=[], outputs=target_update_op)

            with tf.name_scope('inputs'):
                if self.input_include_goal:
                    obs_dim = self.env_spec.observation_space.\
                        flat_dim_with_keys(['observation', 'desired_goal'])
                else:
                    obs_dim = self.env_spec.observation_space.flat_dim
                input_y = tf.compat.v1.placeholder(tf.float32,
                                                   shape=(None, 1),
                                                   name='input_y')
                obs = tf.compat.v1.placeholder(tf.float32,
                                               shape=(None, obs_dim),
                                               name='input_observation')
                actions = tf.compat.v1.placeholder(
                    tf.float32,
                    shape=(None, self.env_spec.action_space.flat_dim),
                    name='input_action')
            # Set up policy training function
            next_action = self.policy.get_action_sym(obs, name='policy_action')
            next_qval = self.qf.get_qval_sym(obs,
                                             next_action,
                                             name='policy_action_qval')
            with tf.name_scope('action_loss'):
                action_loss = -tf.reduce_mean(next_qval)
                if self.policy_weight_decay > 0.:
                    policy_reg = tc.layers.apply_regularization(
                        tc.layers.l2_regularizer(self.policy_weight_decay),
                        weights_list=self.policy.get_regularizable_vars())
                    action_loss += policy_reg

            with tf.name_scope('minimize_action_loss'):
                policy_train_op = self.policy_optimizer(
                    self.policy_lr, name='PolicyOptimizer').minimize(
                        action_loss, var_list=self.policy.get_trainable_vars())

            f_train_policy = tensor_utils.compile_function(
                inputs=[obs], outputs=[policy_train_op, action_loss])

            # Set up qf training function
            qval = self.qf.get_qval_sym(obs, actions, name='q_value')
            with tf.name_scope('qval_loss'):
                qval_loss = tf.reduce_mean(
                    tf.compat.v1.squared_difference(input_y, qval))
                if self.qf_weight_decay > 0.:
                    qf_reg = tc.layers.apply_regularization(
                        tc.layers.l2_regularizer(self.qf_weight_decay),
                        weights_list=self.qf.get_regularizable_vars())
                    qval_loss += qf_reg

            with tf.name_scope('minimize_qf_loss'):
                qf_train_op = self.qf_optimizer(
                    self.qf_lr, name='QFunctionOptimizer').minimize(
                        qval_loss, var_list=self.qf.get_trainable_vars())

            f_train_qf = tensor_utils.compile_function(
                inputs=[input_y, obs, actions],
                outputs=[qf_train_op, qval_loss, qval])

            self.f_train_policy = f_train_policy
            self.f_train_qf = f_train_qf
            self.f_init_target = f_init_target
            self.f_update_target = f_update_target
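For context, the functions compiled above would typically be driven from the optimization step roughly as below. This is a hedged sketch rather than garage's own code; the minibatch names (observations, actions, rewards, terminals, next_observations) and self.discount are illustrative.

# Hypothetical DDPG training step using the functions built in init_opt().
next_actions = self.target_policy_f_prob_online(next_observations)
target_qvals = self.target_qf_f_prob_online(next_observations, next_actions)
# Bellman target: y = r + discount * (1 - done) * Q'(s', mu'(s'))
ys = rewards + self.discount * (1.0 - terminals) * target_qvals
_, qval_loss, qvals = self.f_train_qf(ys, observations, actions)
_, action_loss = self.f_train_policy(observations)
self.f_update_target()  # soft-update the target networks with rate tau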
Example #3
    def init_opt(self):
        """
        Initialize the networks and Ops.

        Assume a discrete action space for DQN, so the action
        dimension will always be action_space.n.
        """
        action_dim = self.env_spec.action_space.n

        self.episode_rewards = []
        self.episode_qf_losses = []

        # build q networks
        with tf.name_scope(self.name, 'DQN'):
            action_t_ph = tf.compat.v1.placeholder(tf.int32,
                                                   None,
                                                   name='action')
            reward_t_ph = tf.compat.v1.placeholder(tf.float32,
                                                   None,
                                                   name='reward')
            done_t_ph = tf.compat.v1.placeholder(tf.float32, None, name='done')

            with tf.name_scope('update_ops'):
                target_update_op = tensor_utils.get_target_ops(
                    self.qf.get_global_vars(),
                    self.target_qf.get_global_vars())

            self._qf_update_ops = tensor_utils.compile_function(
                inputs=[], outputs=target_update_op)

            with tf.name_scope('td_error'):
                # Q-value of the selected action
                action = tf.one_hot(action_t_ph, action_dim)
                q_selected = tf.reduce_sum(
                    self.qf.q_vals * action,  # yapf: disable
                    axis=1)

                # r + Q'(s', argmax_a(Q(s', _))) - Q(s, a)
                if self.double_q:
                    target_qval_with_online_q = self.qf.get_qval_sym(
                        self.target_qf.input, self.qf.name)
                    future_best_q_val_action = tf.argmax(
                        target_qval_with_online_q, 1)
                    future_best_q_val = tf.reduce_sum(
                        self.target_qf.q_vals *
                        tf.one_hot(future_best_q_val_action, action_dim),
                        axis=1)
                else:
                    # r + max_a(Q'(s', _)) - Q(s, a)
                    future_best_q_val = tf.reduce_max(self.target_qf.q_vals,
                                                      axis=1)

                q_best_masked = (1.0 - done_t_ph) * future_best_q_val
                # if done, it's just reward
                # else reward + discount * future_best_q_val
                target_q_values = (reward_t_ph + self.discount * q_best_masked)

                # td_error = q_selected - tf.stop_gradient(target_q_values)
                loss = tf.compat.v1.losses.huber_loss(
                    q_selected, tf.stop_gradient(target_q_values))
                loss = tf.reduce_mean(loss)

            with tf.name_scope('optimize_ops'):
                optimizer = self.qf_optimizer(self.qf_lr)
                if self.grad_norm_clipping is not None:
                    gradients = optimizer.compute_gradients(
                        loss, var_list=self.qf.get_trainable_vars())
                    for i, (grad, var) in enumerate(gradients):
                        if grad is not None:
                            gradients[i] = (tf.clip_by_norm(
                                grad, self.grad_norm_clipping), var)
                    optimize_loss = optimizer.apply_gradients(gradients)
                else:
                    optimize_loss = optimizer.minimize(
                        loss, var_list=self.qf.get_trainable_vars())

            self._train_qf = tensor_utils.compile_function(
                inputs=[
                    self.qf.input, action_t_ph, reward_t_ph, done_t_ph,
                    self.target_qf.input
                ],
                outputs=[loss, optimize_loss])
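The td_error block above builds the standard (double) DQN target. The same computation on concrete arrays, written in NumPy purely for illustration:

import numpy as np

def dqn_targets(rewards, dones, target_q_next, discount, online_q_next=None):
    """Sketch of the targets built in the graph above.

    target_q_next: Q'(s', .) from the target network, shape (batch, n_actions).
    online_q_next: Q(s', .) from the online network; if given, use double DQN.
    """
    if online_q_next is not None:
        # Double DQN: pick the action with the online net, value it with the target net.
        best_actions = np.argmax(online_q_next, axis=1)
        future_best_q = target_q_next[np.arange(len(best_actions)), best_actions]
    else:
        # Vanilla DQN: max over the target net's values.
        future_best_q = np.max(target_q_next, axis=1)
    # If done, the target is just the reward; otherwise r + discount * future_best_q.
    return rewards + discount * (1.0 - dones) * future_best_q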
Example #4
    def init_opt(self):
        """Build the loss function and init the optimizer."""
        with tf.name_scope(self._name):
            # Create target policy (actor) and qf (critic) networks
            with tf.name_scope('inputs'):
                obs_dim = self.env_spec.observation_space.flat_dim
                y = tf.compat.v1.placeholder(tf.float32,
                                             shape=(None, 1),
                                             name='input_y')
                obs = tf.compat.v1.placeholder(tf.float32,
                                               shape=(None, obs_dim),
                                               name='input_observation')
                actions = tf.compat.v1.placeholder(
                    tf.float32,
                    shape=(None, self.env_spec.action_space.flat_dim),
                    name='input_action')

            policy_network_outputs = self._target_policy.build(obs,
                                                               name='policy')
            target_qf_outputs = self._target_qf.build(obs, actions, name='qf')
            target_qf2_outputs = self._target_qf2.build(obs,
                                                        actions,
                                                        name='qf')

            self.target_policy_f_prob_online = tensor_utils.compile_function(
                inputs=[obs], outputs=policy_network_outputs)

            self.target_qf_f_prob_online = tensor_utils.compile_function(
                inputs=[obs, actions], outputs=target_qf_outputs)

            self.target_qf2_f_prob_online = tensor_utils.compile_function(
                inputs=[obs, actions], outputs=target_qf2_outputs)

            # Set up target init and update functions
            with tf.name_scope('setup_target'):
                policy_init_op, policy_update_op = tensor_utils.get_target_ops(
                    self.policy.get_global_vars(),
                    self._target_policy.get_global_vars(), self._tau)
                qf_init_ops, qf_update_ops = tensor_utils.get_target_ops(
                    self.qf.get_global_vars(),
                    self._target_qf.get_global_vars(), self._tau)
                qf2_init_ops, qf2_update_ops = tensor_utils.get_target_ops(
                    self.qf2.get_global_vars(),
                    self._target_qf2.get_global_vars(), self._tau)
                target_init_op = policy_init_op + qf_init_ops + qf2_init_ops
                target_update_op = (policy_update_op + qf_update_ops +
                                    qf2_update_ops)

            f_init_target = tensor_utils.compile_function(
                inputs=[], outputs=target_init_op)
            f_update_target = tensor_utils.compile_function(
                inputs=[], outputs=target_update_op)

            # Set up policy training function
            next_action = self.policy.build(obs, name='policy_action')
            next_qval = self.qf.build(obs,
                                      next_action,
                                      name='policy_action_qval')
            with tf.name_scope('action_loss'):
                action_loss = -tf.reduce_mean(next_qval)

            with tf.name_scope('minimize_action_loss'):
                policy_optimizer = make_optimizer(
                    self._policy_optimizer,
                    learning_rate=self._policy_lr,
                    name='PolicyOptimizer')
                policy_train_op = policy_optimizer.minimize(
                    action_loss, var_list=self.policy.get_trainable_vars())

            f_train_policy = tensor_utils.compile_function(
                inputs=[obs], outputs=[policy_train_op, action_loss])

            # Set up qf training function
            qval = self.qf.build(obs, actions, name='q_value')
            q2val = self.qf2.build(obs, actions, name='q2_value')
            with tf.name_scope('qval1_loss'):
                qval1_loss = tf.reduce_mean(tf.math.squared_difference(
                    y, qval))
            with tf.name_scope('qval2_loss'):
                qval2_loss = tf.reduce_mean(
                    tf.math.squared_difference(y, q2val))

            with tf.name_scope('minimize_qf_loss'):
                qf_optimizer = make_optimizer(self._qf_optimizer,
                                              learning_rate=self._qf_lr,
                                              name='QFunctionOptimizer')
                qf_train_op = qf_optimizer.minimize(
                    qval1_loss, var_list=self.qf.get_trainable_vars())
                qf2_train_op = qf_optimizer.minimize(
                    qval2_loss, var_list=self.qf2.get_trainable_vars())

            f_train_qf = tensor_utils.compile_function(
                inputs=[y, obs, actions],
                outputs=[qf_train_op, qval1_loss, qval])
            f_train_qf2 = tensor_utils.compile_function(
                inputs=[y, obs, actions],
                outputs=[qf2_train_op, qval2_loss, q2val])

            self.f_train_policy = f_train_policy
            self.f_train_qf = f_train_qf
            self.f_init_target = f_init_target
            self.f_update_target = f_update_target
            self.f_train_qf2 = f_train_qf2
Example #5
    def init_opt(self):
        """Build the loss function and init the optimizer."""
        with tf.name_scope(self.name, 'TD3'):
            # Create target policy (actor) and qf (critic) networks
            self.target_policy_f_prob_online = tensor_utils.compile_function(
                inputs=[self.target_policy.model.networks['default'].input],
                outputs=self.target_policy.model.networks['default'].outputs)

            self.target_qf_f_prob_online = tensor_utils.compile_function(
                inputs=self.target_qf.model.networks['default'].inputs,
                outputs=self.target_qf.model.networks['default'].outputs)

            self.target_qf2_f_prob_online = tensor_utils.compile_function(
                inputs=self.target_qf2.model.networks['default'].inputs,
                outputs=self.target_qf2.model.networks['default'].outputs)

            # Set up target init and update functions
            with tf.name_scope('setup_target'):
                policy_init_op, policy_update_op = tensor_utils.get_target_ops(
                    self.policy.get_global_vars(),
                    self.target_policy.get_global_vars(), self.tau)
                qf_init_ops, qf_update_ops = tensor_utils.get_target_ops(
                    self.qf.get_global_vars(),
                    self.target_qf.get_global_vars(), self.tau)
                qf2_init_ops, qf2_update_ops = tensor_utils.get_target_ops(
                    self.qf2.get_global_vars(),
                    self.target_qf2.get_global_vars(), self.tau)
                target_init_op = policy_init_op + qf_init_ops + qf2_init_ops
                target_update_op = (policy_update_op + qf_update_ops +
                                    qf2_update_ops)

            f_init_target = tensor_utils.compile_function(
                inputs=[], outputs=target_init_op)
            f_update_target = tensor_utils.compile_function(
                inputs=[], outputs=target_update_op)

            with tf.name_scope('inputs'):
                if self.input_include_goal:
                    obs_dim = self.env_spec.observation_space.\
                        flat_dim_with_keys(['observation', 'desired_goal'])
                else:
                    obs_dim = self.env_spec.observation_space.flat_dim
                y = tf.placeholder(tf.float32, shape=(None, 1), name='input_y')
                obs = tf.placeholder(tf.float32,
                                     shape=(None, obs_dim),
                                     name='input_observation')
                actions = tf.placeholder(
                    tf.float32,
                    shape=(None, self.env_spec.action_space.flat_dim),
                    name='input_action')

            # Set up policy training function
            next_action = self.policy.get_action_sym(obs, name='policy_action')
            next_qval = self.qf.get_qval_sym(obs,
                                             next_action,
                                             name='policy_action_qval')
            with tf.name_scope('action_loss'):
                action_loss = -tf.reduce_mean(next_qval)

            with tf.name_scope('minimize_action_loss'):
                policy_train_op = self.policy_optimizer(
                    self.policy_lr, name='PolicyOptimizer').minimize(
                        action_loss, var_list=self.policy.get_trainable_vars())

            f_train_policy = tensor_utils.compile_function(
                inputs=[obs], outputs=[policy_train_op, action_loss])

            # Set up qf training function
            qval = self.qf.get_qval_sym(obs, actions, name='q_value')
            q2val = self.qf2.get_qval_sym(obs, actions, name='q2_value')
            with tf.name_scope('qval1_loss'):
                qval1_loss = tf.reduce_mean(tf.squared_difference(y, qval))
            with tf.name_scope('qval2_loss'):
                qval2_loss = tf.reduce_mean(tf.squared_difference(y, q2val))

            with tf.name_scope('minimize_qf_loss'):
                qf_train_op = self.qf_optimizer(
                    self.qf_lr, name='QFunctionOptimizer').minimize(
                        qval1_loss, var_list=self.qf.get_trainable_vars())
                qf2_train_op = self.qf_optimizer(
                    self.qf_lr, name='QFunctionOptimizer').minimize(
                        qval2_loss, var_list=self.qf2.get_trainable_vars())

            f_train_qf = tensor_utils.compile_function(
                inputs=[y, obs, actions],
                outputs=[qf_train_op, qval1_loss, qval])
            f_train_qf2 = tensor_utils.compile_function(
                inputs=[y, obs, actions],
                outputs=[qf2_train_op, qval2_loss, q2val])

            self.f_train_policy = f_train_policy
            self.f_train_qf = f_train_qf
            self.f_init_target = f_init_target
            self.f_update_target = f_update_target
            self.f_train_qf2 = f_train_qf2
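Both critic losses above regress onto the same externally supplied y. In TD3 that target is normally the clipped double-Q estimate, which the caller would assemble from the compiled target functions along the lines of the sketch below; the batch names, noise constants, and self.discount are illustrative assumptions, not taken from this code.

import numpy as np

# Hypothetical assembly of the target fed in as `y`:
#   y = r + discount * (1 - done) * min(Q1'(s', a'), Q2'(s', a')),
#   a' = clip(mu'(s') + clipped Gaussian noise, action bounds)
target_noise_sigma, target_noise_clip = 0.2, 0.5  # illustrative constants
next_actions = self.target_policy_f_prob_online(next_observations)
noise = np.clip(np.random.normal(0.0, target_noise_sigma, next_actions.shape),
                -target_noise_clip, target_noise_clip)
next_actions = np.clip(next_actions + noise,
                       self.env_spec.action_space.low,
                       self.env_spec.action_space.high)
q1 = self.target_qf_f_prob_online(next_observations, next_actions)
q2 = self.target_qf2_f_prob_online(next_observations, next_actions)
ys = rewards + self.discount * (1.0 - terminals) * np.minimum(q1, q2)
_, qf_loss, _ = self.f_train_qf(ys, observations, actions)
_, qf2_loss, _ = self.f_train_qf2(ys, observations, actions)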
Example #6
    def init_opt(self):
        """Build the loss function and init the optimizer."""
        with tf.name_scope(self.name, 'JoLeDDPG'):
            # Create target policy and qf network
            self.target_policy_f_prob_online = tensor_utils.compile_function(
                inputs=[self.target_policy.model.networks['default'].input],
                outputs=self.target_policy.model.networks['default'].outputs)
            self.target_qf_f_prob_online = tensor_utils.compile_function(
                inputs=self.target_qf.model.networks['default'].inputs,
                outputs=self.target_qf.model.networks['default'].outputs)

            # Set up target init and update function
            with tf.name_scope('setup_target'):
                ops = tensor_utils.get_target_ops(
                    self.policy.get_global_vars(),
                    self.target_policy.get_global_vars(), self.tau)
                policy_init_ops, policy_update_ops = ops
                qf_init_ops, qf_update_ops = tensor_utils.get_target_ops(
                    self.qf.get_global_vars(),
                    self.target_qf.get_global_vars(), self.tau)
                target_init_op = policy_init_ops + qf_init_ops
                target_update_op = policy_update_ops + qf_update_ops

            f_init_target = tensor_utils.compile_function(
                inputs=[], outputs=target_init_op)
            f_update_target = tensor_utils.compile_function(
                inputs=[], outputs=target_update_op)

            with tf.name_scope('inputs'):
                if self.input_include_goal:
                    obs_dim = self.env_spec.observation_space.\
                        flat_dim_with_keys(['observation', 'desired_goal'])
                else:
                    obs_dim = self.env_spec.observation_space.flat_dim
                y = tf.compat.v1.placeholder(tf.float32,
                                             shape=(None, 1),
                                             name='input_y')
                obs = tf.compat.v1.placeholder(tf.float32,
                                               shape=(None, obs_dim),
                                               name='input_observation')
                actions = tf.compat.v1.placeholder(
                    tf.float32,
                    shape=(None, self.env_spec.action_space.flat_dim),
                    name='input_action')
                next_obs = tf.compat.v1.placeholder(tf.float32,
                                                    shape=(None, obs_dim),
                                                    name='next_observation')
                reward = tf.compat.v1.placeholder(tf.float32,
                                                  shape=(None, 1),
                                                  name='reward')

            # Set up policy training function
            next_action = self.policy.get_action_sym(obs, name='policy_action')
            next_qval = self.qf.get_qval_sym(obs,
                                             next_action,
                                             name='policy_action_qval')
            with tf.name_scope('action_loss'):
                action_loss = -tf.reduce_mean(next_qval)
                if self.policy_weight_decay > 0.:
                    policy_reg = tc.layers.apply_regularization(
                        tc.layers.l2_regularizer(self.policy_weight_decay),
                        weights_list=self.policy.get_regularizable_vars())
                    action_loss += policy_reg

            with tf.name_scope('minimize_action_loss'):
                policy_train_op = self.policy_optimizer(
                    self.policy_lr, name='PolicyOptimizer').minimize(
                        action_loss, var_list=self.policy.get_trainable_vars())

            f_train_policy = tensor_utils.compile_function(
                inputs=[obs], outputs=[policy_train_op, action_loss])

            # Set up qf training function
            qval = self.qf.get_qval_sym(obs, actions, name='q_value')
            with tf.name_scope('qval_loss'):
                qval_loss = tf.reduce_mean(
                    tf.compat.v1.squared_difference(y, qval))
                if self.qf_weight_decay > 0.:
                    qf_reg = tc.layers.apply_regularization(
                        tc.layers.l2_regularizer(self.qf_weight_decay),
                        weights_list=self.qf.get_regularizable_vars())
                    qval_loss += qf_reg

            with tf.name_scope('minimize_qf_loss'):
                qf_train_op = self.qf_optimizer(
                    self.qf_lr, name='QFunctionOptimizer').minimize(
                        qval_loss, var_list=self.qf.get_trainable_vars())

            f_train_qf = tensor_utils.compile_function(
                inputs=[y, obs, actions],
                outputs=[qf_train_op, qval_loss, qval])

            mean, var = tf.nn.moments(obs, axes=[0])
            # Set up the environment model training function
            predicted_next_obs = self.obs_model.get_fval_sym(obs,
                                                             actions,
                                                             name='obs_value')
            predicted_reward = self.reward_model.get_fval_sym(
                obs, actions, name='reward_value')
            with tf.name_scope('model_loss'):
                # Changed to predict the delta of s (next_obs - obs)
                obs_model_loss = tf.reduce_mean(
                    tf.compat.v1.squared_difference(next_obs - obs,
                                                    predicted_next_obs))
                reward_model_loss = tf.reduce_mean(
                    tf.compat.v1.squared_difference(reward, predicted_reward))

            with tf.name_scope('minimize_obs_model_loss'):
                obs_train_op = self.obs_model_optimizer(
                    self.obs_model_lr, name='ObsModelOptimizer').minimize(
                        obs_model_loss,
                        var_list=self.obs_model.get_trainable_vars())
                reward_train_op = self.reward_model_optimizer(
                    self.reward_model_lr,
                    name='RewardModelOptimizer').minimize(
                        reward_model_loss,
                        var_list=self.reward_model.get_trainable_vars())

            f_train_obs_model = tensor_utils.compile_function(
                inputs=[next_obs, obs, actions],
                outputs=[obs_train_op, obs_model_loss])
            f_train_reward_model = tensor_utils.compile_function(
                inputs=[reward, obs, actions],
                outputs=[reward_train_op, reward_model_loss])
            f_obs_model_predict = tensor_utils.compile_function(
                inputs=[obs, actions],
                outputs=[predicted_next_obs, obs + predicted_next_obs])
            f_reward_model_predict = tensor_utils.compile_function(
                inputs=[obs, actions], outputs=[predicted_reward])

            # Set up the separate environment model training functions
            sepe_predicted_next_obs = self.sepe_obs_model.get_fval_sym(
                obs, actions, name='sepe_obs_value')
            sepe_predicted_reward = self.sepe_reward_model.get_fval_sym(
                obs, actions, name='sepe_reward_value')
            with tf.name_scope('model_loss'):
                # Changed to predict the delta of s (next_obs - obs)
                sepe_obs_model_loss = tf.reduce_mean(
                    tf.compat.v1.squared_difference(next_obs - obs,
                                                    sepe_predicted_next_obs))
                sepe_reward_model_loss = tf.reduce_mean(
                    tf.compat.v1.squared_difference(reward,
                                                    sepe_predicted_reward))

            with tf.name_scope('minimize_obs_model_loss'):
                sepe_obs_train_op = self.obs_model_optimizer(
                    self.obs_model_lr, name='SepeObsModelOptimizer').minimize(
                        sepe_obs_model_loss,
                        var_list=self.sepe_obs_model.get_trainable_vars())
                sepe_reward_train_op = self.reward_model_optimizer(
                    self.reward_model_lr,
                    name='SepeRewardModelOptimizer').minimize(
                        sepe_reward_model_loss,
                        var_list=self.sepe_reward_model.get_trainable_vars())

            f_train_sepe_obs_model = tensor_utils.compile_function(
                inputs=[next_obs, obs, actions],
                outputs=[sepe_obs_train_op, sepe_obs_model_loss])
            f_train_sepe_reward_model = tensor_utils.compile_function(
                inputs=[reward, obs, actions],
                outputs=[sepe_reward_train_op, sepe_reward_model_loss])

            self.f_train_sepe_obs_model = f_train_sepe_obs_model
            self.f_train_sepe_reward_model = f_train_sepe_reward_model

            # Copy the parameters of the separate env models when necessary
            with tf.name_scope('copy_sepe_env_models'):
                copy_sepe_obs_model_ops = tensor_utils.get_target_ops(
                    self.sepe_obs_model.get_global_vars(),
                    self.obs_model.get_global_vars())

                copy_sepe_reward_model_ops = tensor_utils.get_target_ops(
                    self.sepe_reward_model.get_global_vars(),
                    self.reward_model.get_global_vars())

            f_copy_sepe_obs_model = tensor_utils.compile_function(
                inputs=[], outputs=copy_sepe_obs_model_ops)
            f_copy_sepe_reward_model = tensor_utils.compile_function(
                inputs=[], outputs=copy_sepe_reward_model_ops)

            self.f_copy_sepe_reward_model = f_copy_sepe_reward_model
            self.f_copy_sepe_obs_model = f_copy_sepe_obs_model

            predicted_next_action = self.target_policy.get_action_sym(
                obs + predicted_next_obs, name='policy_jole')
            qval_jole = predicted_reward + self.discount * self.target_qf.get_qval_sym(
                obs + predicted_next_obs,
                predicted_next_action,
                name="qval_jole")
            with tf.name_scope('jole_loss'):
                jole_loss = tf.reduce_mean(
                    tf.compat.v1.squared_difference(qval, qval_jole))

            with tf.name_scope('minimize_jole_loss'):
                jole_train_op_qf = self.jole_optimizer(
                    self.jole_lr, name="JoleOptimizer").minimize(
                        jole_loss, var_list=self.qf.get_trainable_vars())
                jole_train_op_reward = self.jole_optimizer(
                    self.jole_lr * 0.001, name="JoleOptimizer").minimize(
                        jole_loss,
                        var_list=self.reward_model.get_trainable_vars())
                jole_train_op_obs = self.jole_optimizer(
                    self.jole_lr * 0.00001, name="JoleOptimizer").minimize(
                        jole_loss,
                        var_list=self.obs_model.get_trainable_vars())

            f_train_jole = tensor_utils.compile_function(
                inputs=[obs, actions],
                outputs=[
                    jole_train_op_qf, jole_train_op_reward, jole_train_op_obs,
                    jole_loss
                ])

            f_cal_jole_loss = tensor_utils.compile_function(
                inputs=[obs, actions], outputs=[jole_loss])

            self.f_train_jole = f_train_jole
            self.f_cal_jole_loss = f_cal_jole_loss

            self.f_train_reward_model = f_train_reward_model
            self.f_train_obs_model = f_train_obs_model

            self.f_train_policy = f_train_policy
            self.f_train_qf = f_train_qf
            self.f_init_target = f_init_target
            self.f_update_target = f_update_target

            self.f_obs_model_predict = f_obs_model_predict
            self.f_reward_model_predict = f_reward_model_predict
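The jole_loss above ties the learned models to the critic: Q(s, a) is regressed toward r_hat(s, a) + discount * Q'(s_hat', pi'(s_hat')), where s_hat' = s + predicted delta and r_hat comes from the reward model, with much smaller learning rates on the model parameters than on the critic. A compact restatement of that consistency target (a sketch, not the project's code):

def jole_consistency_target(obs, predicted_delta, predicted_reward,
                            target_policy_fn, target_qf_fn, discount):
    """Sketch of the target that jole_loss compares Q(s, a) against."""
    next_obs_hat = obs + predicted_delta              # obs model predicts the delta of s
    next_action_hat = target_policy_fn(next_obs_hat)  # pi'(s_hat')
    # r_hat + discount * Q'(s_hat', pi'(s_hat'))
    return predicted_reward + discount * target_qf_fn(next_obs_hat, next_action_hat)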
Example #7
File: ddpg.py  Project: thanhkaist/garage
    def init_opt(self):
        """Build the loss function and init the optimizer."""
        with tf.name_scope(self._name):
            # Create target policy and qf network
            with tf.name_scope('inputs'):
                obs_dim = self.env_spec.observation_space.flat_dim
                input_y = tf.compat.v1.placeholder(tf.float32,
                                                   shape=(None, 1),
                                                   name='input_y')
                obs = tf.compat.v1.placeholder(tf.float32,
                                               shape=(None, obs_dim),
                                               name='input_observation')
                actions = tf.compat.v1.placeholder(
                    tf.float32,
                    shape=(None, self.env_spec.action_space.flat_dim),
                    name='input_action')

            policy_network_outputs = self._target_policy.get_action_sym(
                obs, name='policy')
            target_qf_outputs = self._target_qf.get_qval_sym(obs,
                                                             actions,
                                                             name='qf')

            self.target_policy_f_prob_online = tensor_utils.compile_function(
                inputs=[obs], outputs=policy_network_outputs)
            self.target_qf_f_prob_online = tensor_utils.compile_function(
                inputs=[obs, actions], outputs=target_qf_outputs)

            # Set up target init and update function
            with tf.name_scope('setup_target'):
                ops = tensor_utils.get_target_ops(
                    self.policy.get_global_vars(),
                    self._target_policy.get_global_vars(), self._tau)
                policy_init_ops, policy_update_ops = ops
                qf_init_ops, qf_update_ops = tensor_utils.get_target_ops(
                    self._qf.get_global_vars(),
                    self._target_qf.get_global_vars(), self._tau)
                target_init_op = policy_init_ops + qf_init_ops
                target_update_op = policy_update_ops + qf_update_ops

            f_init_target = tensor_utils.compile_function(
                inputs=[], outputs=target_init_op)
            f_update_target = tensor_utils.compile_function(
                inputs=[], outputs=target_update_op)

            # Set up policy training function
            next_action = self.policy.get_action_sym(obs, name='policy_action')
            next_qval = self._qf.get_qval_sym(obs,
                                              next_action,
                                              name='policy_action_qval')
            with tf.name_scope('action_loss'):
                action_loss = -tf.reduce_mean(next_qval)
                if self._policy_weight_decay > 0.:
                    regularizer = tf.keras.regularizers.l2(
                        self._policy_weight_decay)
                    for var in self.policy.get_regularizable_vars():
                        policy_reg = regularizer(var)
                        action_loss += policy_reg

            with tf.name_scope('minimize_action_loss'):
                policy_optimizer = make_optimizer(
                    self._policy_optimizer,
                    learning_rate=self._policy_lr,
                    name='PolicyOptimizer')
                policy_train_op = policy_optimizer.minimize(
                    action_loss, var_list=self.policy.get_trainable_vars())

            f_train_policy = tensor_utils.compile_function(
                inputs=[obs], outputs=[policy_train_op, action_loss])

            # Set up qf training function
            qval = self._qf.get_qval_sym(obs, actions, name='q_value')
            with tf.name_scope('qval_loss'):
                qval_loss = tf.reduce_mean(
                    tf.compat.v1.squared_difference(input_y, qval))
                if self._qf_weight_decay > 0.:
                    regularizer = tf.keras.regularizers.l2(
                        self._qf_weight_decay)
                    for var in self._qf.get_regularizable_vars():
                        qf_reg = regularizer(var)
                        qval_loss += qf_reg

            with tf.name_scope('minimize_qf_loss'):
                qf_optimizer = make_optimizer(self._qf_optimizer,
                                              learning_rate=self._qf_lr,
                                              name='QFunctionOptimizer')
                qf_train_op = qf_optimizer.minimize(
                    qval_loss, var_list=self._qf.get_trainable_vars())

            f_train_qf = tensor_utils.compile_function(
                inputs=[input_y, obs, actions],
                outputs=[qf_train_op, qval_loss, qval])

            self.f_train_policy = f_train_policy
            self.f_train_qf = f_train_qf
            self.f_init_target = f_init_target
            self.f_update_target = f_update_target
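Unlike the earlier examples, this version expresses weight decay with tf.keras.regularizers.l2 instead of tf.contrib.layers: the per-variable loop adds c * sum(w ** 2) to the loss for each regularizable variable (the older tf.contrib form used above is built on tf.nn.l2_loss, which includes a factor of 1/2, so the two conventions differ by that constant). A small illustrative check, runnable eagerly under TF2:

import tensorflow as tf

w = tf.constant([[1.0, -2.0], [3.0, 0.5]])
c = 0.01
keras_term = tf.keras.regularizers.l2(c)(w)       # c * sum(w ** 2)
manual_term = c * tf.reduce_sum(tf.square(w))
# Both evaluate to 0.01 * (1 + 4 + 9 + 0.25) = 0.1425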
Example #8
    def init_opt(self):
        """
        Initialize the networks and Ops.

        Assume a discrete action space for DQN, so the action
        dimension will always be action_space.n.
        """
        action_dim = self.env_spec.action_space.n
        obs_dim = self.env_spec.observation_space.flat_dim

        self.episode_rewards = []
        self.episode_qf_losses = []

        with tf.name_scope(self.name, "input"):
            action_t_ph = tf.compat.v1.placeholder(tf.int32,
                                                   None,
                                                   name='action')
            reward_t_ph = tf.compat.v1.placeholder(tf.float32,
                                                   None,
                                                   name='reward')
            done_t_ph = tf.compat.v1.placeholder(tf.float32, None, name='done')
            action = tf.one_hot(action_t_ph, action_dim)
            next_obs = tf.compat.v1.placeholder(tf.float32, (None, obs_dim),
                                                name='next_observations')

            jole_obs = tf.compat.v1.placeholder(tf.float32, (None, obs_dim),
                                                name='jole_input_observations')
            jole_actions_discrete = tf.compat.v1.placeholder(
                tf.int32, None, name='jole_input_action')
            jole_actions = tf.one_hot(jole_actions_discrete, action_dim)
            jole_clip_return_min = tf.compat.v1.placeholder(
                tf.float32, shape=(), name="jole_clip_return_min")
            jole_clip_return_max = tf.compat.v1.placeholder(
                tf.float32, shape=(), name="jole_clip_return_max")
            use_jole = tf.compat.v1.placeholder(tf.float32,
                                                shape=(),
                                                name="use_jole")
            obs = self.qf.input

        # set up jole
        with tf.name_scope(self.name, "jole"):
            #get Q(s,a)
            jole_qval = tf.reduce_sum(
                self.qf.get_qval_sym(jole_obs, name='jole_q_value') *
                jole_actions,
                axis=1)
            # get predicted next observations and actions
            jole_predicted_next_obs = tf.reshape(tf.reduce_sum(
                tf.reshape(self.obs_model.get_fval_sym(jole_obs,
                                                       name='jole_obs_value'),
                           shape=(-1, action_dim, obs_dim)) *
                tf.expand_dims(jole_actions, -1),
                axis=1),
                                                 shape=(-1, obs_dim))
            jole_predicted_reward = tf.reduce_sum(
                self.reward_model.get_fval_sym(
                    jole_obs, name='jole_reward_value') * jole_actions,
                axis=1)
            jole_predicted_terminal = self.get_terminal_status(
                jole_predicted_next_obs)

            #jole_predicted_terminal = 0
            #jole_predicted_terminal = tf.argmax(self.terminal_model.get_fval_sym(jole_predicted_next_obs, name='jole_terminal_value'), axis=-1)

            # r + Q'(s', argmax_a(Q(s', _))) - Q(s, a)
            if self.double_q:
                jole_target_qval_with_online_q = self.qf.get_qval_sym(
                    jole_predicted_next_obs, name="jole_next_obs_value")
                jole_future_best_q_val_action = tf.argmax(
                    jole_target_qval_with_online_q, 1)
                jole_future_best_q_val = tf.reduce_sum(
                    self.target_qf.get_qval_sym(jole_predicted_next_obs,
                                                name="jole_next_obs_value") *
                    tf.one_hot(jole_future_best_q_val_action, action_dim),
                    axis=1)
            else:
                # r + max_a(Q'(s', _)) - Q(s, a)
                jole_future_best_q_val = tf.reduce_max(
                    self.target_qf.get_qval_sym(jole_predicted_next_obs,
                                                name="jole_next_obs_value"),
                    axis=1)
            #jole_done_t_ph = tf.condition
            jole_q_best_masked = (1.0 - tf.cast(
                jole_predicted_terminal, tf.float32)) * jole_future_best_q_val
            #jole_q_best_masked = jole_future_best_q_val
            # if done, it's just reward
            # else reward + discount * future_best_q_val
            jole_target_q_values_before_clip = (
                jole_predicted_reward + self.discount * jole_q_best_masked)
            jole_target_q_values = jole_target_q_values_before_clip  #tf.clip_by_value(jole_target_q_values_before_clip, jole_clip_return_min, jole_clip_return_max)

            jole_loss = tf.reduce_mean(
                tf.compat.v1.squared_difference(jole_qval,
                                                jole_target_q_values))

            self.f_cal_jole_loss = tensor_utils.compile_function(
                inputs=[
                    jole_obs, jole_actions_discrete, jole_clip_return_min,
                    jole_clip_return_max, use_jole
                ],
                outputs=[
                    jole_loss, jole_qval, jole_target_q_values,
                    jole_target_q_values_before_clip
                ])

        #train the env model
        with tf.name_scope(self.name, "env_model"):

            predicted_next_obs = tf.reduce_sum(
                tf.reshape(self.obs_model.get_fval_sym(obs, name='obs_value'),
                           shape=(-1, action_dim, obs_dim)) *
                tf.expand_dims(action, -1),
                axis=1)
            predicted_reward = tf.reduce_sum(
                self.reward_model.get_fval_sym(obs, name='reward_value') *
                action,
                axis=1)

            # TODO: change to predict the delta of s (here the model still predicts next_obs directly)
            original_obs_model_loss = tf.reduce_mean(
                tf.compat.v1.squared_difference(next_obs, predicted_next_obs))
            obs_model_loss = original_obs_model_loss + use_jole * 0.0001 * jole_loss
            original_reward_model_loss = tf.reduce_mean(
                tf.compat.v1.squared_difference(reward_t_ph, predicted_reward))
            reward_model_loss = original_reward_model_loss + use_jole * 0.0001 * jole_loss

            predicted_terminal = self.terminal_model.get_fval_sym(
                next_obs, name="terminal_value")
            terminal_model_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=predicted_terminal,
                labels=tf.cast(tf.squeeze(done_t_ph), dtype=tf.int32))

            terminal_model_accurate = tf.reduce_sum(1 - tf.abs(
                tf.argmax(predicted_terminal, axis=-1) -
                tf.cast(tf.squeeze(done_t_ph), dtype=tf.int64)))

            with tf.name_scope('minimize_obs_model_loss'):
                obs_train_op = self.obs_model_optimizer(
                    self.obs_model_lr, name='ObsModelOptimizer').minimize(
                        obs_model_loss,
                        var_list=self.obs_model.get_trainable_vars())
                reward_train_op = self.reward_model_optimizer(
                    self.reward_model_lr,
                    name='RewardModelOptimizer').minimize(
                        reward_model_loss,
                        var_list=self.reward_model.get_trainable_vars())
                terminal_train_op = self.terminal_model_optimizer(
                    self.terminal_model_lr,
                    name='TerminalModelOptimizer').minimize(
                        terminal_model_loss,
                        var_list=self.terminal_model.get_trainable_vars())

            self.f_train_obs_model = tensor_utils.compile_function(
                inputs=[
                    next_obs, obs, action_t_ph, jole_obs,
                    jole_actions_discrete, jole_clip_return_min,
                    jole_clip_return_max, use_jole
                ],
                outputs=[
                    obs_train_op, obs_model_loss, original_obs_model_loss
                ])
            self.f_train_reward_model = tensor_utils.compile_function(
                inputs=[
                    reward_t_ph, obs, action_t_ph, jole_obs,
                    jole_actions_discrete, jole_clip_return_min,
                    jole_clip_return_max, use_jole
                ],
                outputs=[
                    reward_train_op, reward_model_loss,
                    original_reward_model_loss
                ])
            self.f_train_terminal_model = tensor_utils.compile_function(
                inputs=[next_obs, done_t_ph],
                outputs=[
                    terminal_train_op, terminal_model_loss,
                    terminal_model_accurate
                ])
            self.f_obs_model_predict = tensor_utils.compile_function(
                inputs=[obs, action_t_ph],
                outputs=[predicted_next_obs - obs, predicted_next_obs])
            self.f_reward_model_predict = tensor_utils.compile_function(
                inputs=[obs, action_t_ph], outputs=[predicted_reward])
            self.f_terminal_model_predict = tensor_utils.compile_function(
                inputs=[next_obs],
                outputs=[
                    predicted_terminal,
                    tf.argmax(predicted_terminal, axis=-1)
                ])

            sepe_predicted_next_obs = tf.reduce_sum(tf.reshape(
                self.sepe_obs_model.get_fval_sym(obs, name='obs_value'),
                shape=(-1, action_dim, obs_dim)) * tf.expand_dims(action, -1),
                                                    axis=1)
            sepe_predicted_reward = tf.reduce_sum(
                self.sepe_reward_model.get_fval_sym(obs, name='reward_value') *
                action,
                axis=1)
            # TODO: change to predict the delta of s (here the model still predicts next_obs directly)
            sepe_obs_model_loss = tf.reduce_mean(
                tf.compat.v1.squared_difference(next_obs,
                                                sepe_predicted_next_obs))
            sepe_reward_model_loss = tf.reduce_mean(
                tf.compat.v1.squared_difference(reward_t_ph,
                                                sepe_predicted_reward))

            with tf.name_scope('minimize_sepe_obs_model_loss'):
                sepe_obs_train_op = self.obs_model_optimizer(
                    self.obs_model_lr, name='SepeObsModelOptimizer').minimize(
                        sepe_obs_model_loss,
                        var_list=self.sepe_obs_model.get_trainable_vars())
                sepe_reward_train_op = self.reward_model_optimizer(
                    self.reward_model_lr,
                    name='SepeRewardModelOptimizer').minimize(
                        sepe_reward_model_loss,
                        var_list=self.sepe_reward_model.get_trainable_vars())

            f_train_sepe_obs_model = tensor_utils.compile_function(
                inputs=[next_obs, obs, action_t_ph],
                outputs=[sepe_obs_train_op, sepe_obs_model_loss])
            f_train_sepe_reward_model = tensor_utils.compile_function(
                inputs=[reward_t_ph, obs, action_t_ph],
                outputs=[sepe_reward_train_op, sepe_reward_model_loss])

            self.f_train_sepe_obs_model = f_train_sepe_obs_model
            self.f_train_sepe_reward_model = f_train_sepe_reward_model

            # Copy the parameters of the separate env models when necessary
            with tf.name_scope('copy_sepe_env_models'):
                copy_sepe_obs_model_ops = tensor_utils.get_target_ops(
                    self.sepe_obs_model.get_global_vars(),
                    self.obs_model.get_global_vars())

                copy_sepe_reward_model_ops = tensor_utils.get_target_ops(
                    self.sepe_reward_model.get_global_vars(),
                    self.reward_model.get_global_vars())

            self.f_copy_sepe_obs_model = tensor_utils.compile_function(
                inputs=[], outputs=copy_sepe_obs_model_ops)
            self.f_copy_sepe_reward_model = tensor_utils.compile_function(
                inputs=[], outputs=copy_sepe_reward_model_ops)

        # build q networks
        with tf.name_scope(self.name, 'DQN'):
            with tf.name_scope('update_ops'):
                target_update_op = tensor_utils.get_target_ops(
                    self.qf.get_global_vars(),
                    self.target_qf.get_global_vars())

            self._qf_update_ops = tensor_utils.compile_function(
                inputs=[], outputs=target_update_op)

            with tf.name_scope('td_error'):
                # Q-value of the selected action
                q_selected = tf.reduce_sum(
                    self.qf.q_vals * action,  # yapf: disable
                    axis=1)

                # r + Q'(s', argmax_a(Q(s', _))) - Q(s, a)
                if self.double_q:
                    target_qval_with_online_q = self.qf.get_qval_sym(
                        self.target_qf.input, self.qf.name)
                    future_best_q_val_action = tf.argmax(
                        target_qval_with_online_q, 1)
                    future_best_q_val = tf.reduce_sum(
                        self.target_qf.q_vals *
                        tf.one_hot(future_best_q_val_action, action_dim),
                        axis=1)
                else:
                    # r + max_a(Q'(s', _)) - Q(s, a)
                    future_best_q_val = tf.reduce_max(self.target_qf.q_vals,
                                                      axis=1)

                q_best_masked = (1.0 - done_t_ph) * future_best_q_val
                # if done, it's just reward
                # else reward + discount * future_best_q_val
                target_q_values = (reward_t_ph + self.discount * q_best_masked)

                # td_error = q_selected - tf.stop_gradient(target_q_values)
                loss = tf.reduce_mean(
                    tf.compat.v1.squared_difference(
                        tf.stop_gradient(target_q_values), q_selected))
                #loss = tf.compat.v1.losses.huber_loss(
                #    q_selected, tf.stop_gradient(target_q_values))
                #loss = tf.reduce_mean(loss)
                loss += use_jole * 0.2 * jole_loss

            with tf.name_scope('optimize_ops'):
                optimizer = self.qf_optimizer(self.qf_lr)
                if self.grad_norm_clipping is not None:
                    gradients = optimizer.compute_gradients(
                        loss, var_list=self.qf.get_trainable_vars())
                    for i, (grad, var) in enumerate(gradients):
                        if grad is not None:
                            gradients[i] = (tf.clip_by_norm(
                                grad, self.grad_norm_clipping), var)
                    optimize_loss = optimizer.apply_gradients(gradients)
                else:
                    optimize_loss = optimizer.minimize(
                        loss, var_list=self.qf.get_trainable_vars())

            self._train_qf = tensor_utils.compile_function(
                inputs=[
                    self.qf.input, action_t_ph, reward_t_ph, done_t_ph,
                    self.target_qf.input, jole_obs, jole_actions_discrete,
                    use_jole, jole_clip_return_max, jole_clip_return_min
                ],
                outputs=[loss, optimize_loss, q_selected, target_q_values])

            for variable in tf.trainable_variables():
                print(variable)
Example #9
    def init_opt(self):
        """Build the loss function and init the optimizer."""
        with tf.name_scope(self.name, 'JoLeDDPG'):
            # Create target policy and qf network
            self.target_policy_f_prob_online = tensor_utils.compile_function(
                inputs=[self.target_policy.model.networks['default'].input],
                outputs=self.target_policy.model.networks['default'].outputs)
            self.target_qf_f_prob_online = tensor_utils.compile_function(
                inputs=self.target_qf.model.networks['default'].inputs,
                outputs=self.target_qf.model.networks['default'].outputs)

            # Set up target init and update function
            with tf.name_scope('setup_target'):
                ops = tensor_utils.get_target_ops(
                    self.policy.get_global_vars(),
                    self.target_policy.get_global_vars(), self.tau)
                policy_init_ops, policy_update_ops = ops
                qf_init_ops, qf_update_ops = tensor_utils.get_target_ops(
                    self.qf.get_global_vars(),
                    self.target_qf.get_global_vars(), self.tau)
                target_init_op = policy_init_ops + qf_init_ops
                target_update_op = policy_update_ops + qf_update_ops

            f_init_target = tensor_utils.compile_function(
                inputs=[], outputs=target_init_op)
            f_update_target = tensor_utils.compile_function(
                inputs=[], outputs=target_update_op)

            with tf.name_scope('inputs'):
                if self.input_include_goal:
                    obs_dim = self.env_spec.observation_space.\
                        flat_dim_with_keys(['observation', 'desired_goal'])
                else:
                    obs_dim = self.env_spec.observation_space.flat_dim
                y = tf.compat.v1.placeholder(tf.float32,
                                             shape=(None, 1),
                                             name='input_y')
                obs = tf.compat.v1.placeholder(tf.float32,
                                               shape=(None, obs_dim),
                                               name='input_observation')
                actions = tf.compat.v1.placeholder(
                    tf.float32,
                    shape=(None, self.env_spec.action_space.flat_dim),
                    name='input_action')
                next_obs = tf.compat.v1.placeholder(tf.float32,
                                                    shape=(None, obs_dim),
                                                    name='next_observation')
                reward = tf.compat.v1.placeholder(tf.float32,
                                                  shape=(None, 1),
                                                  name='reward')

                jole_obs = tf.compat.v1.placeholder(
                    tf.float32,
                    shape=(None, obs_dim),
                    name='jole_input_observation')
                jole_actions = tf.compat.v1.placeholder(
                    tf.float32,
                    shape=(None, self.env_spec.action_space.flat_dim),
                    name='jole_input_action')

                jole_clip_return_min = tf.compat.v1.placeholder(
                    tf.float32, shape=(), name="jole_clip_return_min")
                jole_clip_return_max = tf.compat.v1.placeholder(
                    tf.float32, shape=(), name="jole_clip_return_max")
                use_jole = tf.compat.v1.placeholder(tf.float32,
                                                    shape=(),
                                                    name="use_jole")

                reguzs = tf.compat.v1.placeholder(tf.float32,
                                                  shape=(self.num_z, None,
                                                         self.dim_z),
                                                  name='reguzs')
                eps = tf.compat.v1.placeholder(tf.float32,
                                               shape=(None, self.dim_z),
                                               name='eps')

            # Set up policy training function
            next_action = self.policy.get_action_sym(obs, name='policy_action')
            next_qval = self.qf.get_qval_sym(obs,
                                             next_action,
                                             name='policy_action_qval')
            with tf.name_scope('action_loss'):
                action_loss = -tf.reduce_mean(next_qval)
                if self.policy_weight_decay > 0.:
                    policy_reg = tc.layers.apply_regularization(
                        tc.layers.l2_regularizer(self.policy_weight_decay),
                        weights_list=self.policy.get_regularizable_vars())
                    action_loss += policy_reg

            with tf.name_scope('minimize_action_loss'):
                policy_train_op = self.policy_optimizer(
                    self.policy_lr, name='PolicyOptimizer').minimize(
                        action_loss, var_list=self.policy.get_trainable_vars())

            f_train_policy = tensor_utils.compile_function(
                inputs=[obs], outputs=[policy_train_op, action_loss])

            # JOLE loss
            jole_qval = self.qf.get_qval_sym(
                jole_obs, jole_actions, name='jole_q_value')  #[batch_size]
            jole_ys_samples_before_clip = []
            jole_ys_samples = []
            for i in range(self.num_z):
                regu_z = reguzs[i]
                jole_predicted_next_obs = self.obs_model_generator.get_fval_sym(
                    jole_obs,
                    jole_actions,
                    regu_z,
                    name="jole_obs_value{}".format(i))
                jole_predicted_reward = self.reward_model_generator.get_fval_sym(
                    jole_obs,
                    jole_actions,
                    regu_z,
                    name="jole_reward{}".format(i))
                jole_predicted_next_action = self.target_policy.get_action_sym(
                    jole_predicted_next_obs,
                    name='jole_policy_action{}'.format(i))
                jole_ys_before_clip = (
                    jole_predicted_reward +
                    self.discount * self.target_qf.get_qval_sym(
                        jole_predicted_next_obs,
                        jole_predicted_next_action,
                        name="jole_ys_{}".format(i)))
                # Clipping to [jole_clip_return_min, jole_clip_return_max] is
                # currently disabled.
                jole_ys_sample = jole_ys_before_clip
                jole_ys_samples_before_clip.append(jole_ys_before_clip)
                jole_ys_samples.append(jole_ys_sample)
            jole_ys = tf.reduce_mean(jole_ys_samples, axis=0)  #[batch_size]
            jole_ys_before_clip = tf.reduce_mean(jole_ys_samples_before_clip,
                                                 axis=0)
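            # JOLE target: for each of the num_z latent samples, roll the CVAE
            # models forward one step and bootstrap with the target networks,
            # y_jole = r_hat(s, a, z) + discount * Q'(s_hat', pi'(s_hat')),
            # then average over the latent samples. The JOLE loss is the mean
            # squared error between Q(s, a) and this target.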
            with tf.name_scope('jole_loss'):
                jole_loss = tf.reduce_mean(
                    tf.compat.v1.squared_difference(jole_qval, jole_ys))

            f_cal_jole_loss = tensor_utils.compile_function(
                inputs=[
                    jole_obs, jole_actions, jole_clip_return_min,
                    jole_clip_return_max, use_jole, reguzs
                ],
                outputs=[jole_loss, jole_qval, jole_ys, jole_ys_before_clip])

            # Set up qf training function
            qval = self.qf.get_qval_sym(obs, actions, name='q_value')
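            # Critic loss: standard TD regression towards y, plus the JOLE
            # consistency term weighted by a hard-coded coefficient (0.2) and
            # gated by the use_jole flag.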
            with tf.name_scope('qval_loss'):
                qval_loss = tf.reduce_mean(
                    tf.compat.v1.squared_difference(y, qval))
                if self.qf_weight_decay > 0.:
                    qf_reg = tc.layers.apply_regularization(
                        tc.layers.l2_regularizer(self.qf_weight_decay),
                        weights_list=self.qf.get_regularizable_vars())
                    qval_loss += qf_reg
                qval_loss += use_jole * 0.2 * jole_loss

            with tf.name_scope('minimize_qf_loss'):
                qf_train_op = self.qf_optimizer(
                    self.qf_lr, name='QFunctionOptimizer').minimize(
                        qval_loss, var_list=self.qf.get_trainable_vars())

            f_train_qf = tensor_utils.compile_function(
                inputs=[
                    y, obs, actions, jole_obs, jole_actions,
                    jole_clip_return_min, jole_clip_return_max, use_jole,
                    reguzs
                ],
                outputs=[qf_train_op, qval_loss, qval])

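            # Conditional VAE for the observation model: the recognition
            # network infers q(z | s, a, s') as a diagonal Gaussian, z is drawn
            # with the reparameterization trick, and the generator reconstructs
            # s'. The cost is the reconstruction error plus the KL divergence
            # to a standard normal prior.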
            # Observation CVAE model
            z_obs_mean, z_obs_log_sigma = self.obs_model_recognition.get_fval_sym(
                obs, actions, next_obs, "obs_model_recoginition")
            z_obs_sample = z_obs_mean + tf.multiply(
                tf.sqrt(tf.exp(z_obs_log_sigma)), eps)
            obs_recontr_mean = self.obs_model_generator.get_fval_sym(
                obs, actions, z_obs_sample, "obs_model_generator")
            reconstr_loss_obs = tf.reduce_mean(
                tf.reduce_sum(tf.square(next_obs - obs_recontr_mean), 1))
            latent_loss_obs = tf.reduce_mean(-0.5 * tf.reduce_sum(
                1. + z_obs_log_sigma - tf.square(z_obs_mean) -
                tf.exp(z_obs_log_sigma), 1))
            cvae_cost_obs = reconstr_loss_obs + latent_loss_obs

            # Reward CVAE model
            z_reward_mean, z_reward_log_sigma = self.reward_model_recognition.get_fval_sym(
                obs, actions, reward, "reward_model_recoginition")
            z_reward_sample = z_reward_mean + tf.multiply(
                tf.sqrt(tf.exp(z_reward_log_sigma)), eps)
            reward_recontr_mean = self.reward_model_generator.get_fval_sym(
                obs, actions, z_reward_sample, "reward_model_generator")
            reconstr_loss_reward = tf.reduce_mean(
                tf.reduce_sum(tf.square(reward - reward_recontr_mean), 1))
            latent_loss_reward = tf.reduce_mean(-0.5 * tf.reduce_sum(
                1. + z_reward_log_sigma - tf.square(z_reward_mean) -
                tf.exp(z_reward_log_sigma), 1))
            cvae_cost_reward = reconstr_loss_reward + latent_loss_reward

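            # The environment models are trained jointly with the Q-function:
            # each CVAE cost is augmented with a lightly weighted JOLE term
            # (hard-coded coefficients 1e-4 and 1e-6), gated by use_jole.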
            with tf.name_scope('model_loss'):
                # TODO: change to predict the delta of s
                obs_model_loss = cvae_cost_obs + use_jole * 0.0001 * jole_loss
                reward_model_loss = cvae_cost_reward + use_jole * 0.000001 * jole_loss

            with tf.name_scope('minimize_obs_model_loss'):
                obs_train_op = self.obs_model_optimizer(
                    self.obs_model_lr, name='ObsModelOptimizer').minimize(
                        obs_model_loss,
                        var_list=self.obs_model_generator.get_trainable_vars()
                        + self.obs_model_recognition.get_trainable_vars())
                reward_train_op = self.reward_model_optimizer(
                    self.reward_model_lr, name='RewardModelOptimizer'
                ).minimize(
                    reward_model_loss,
                    var_list=self.reward_model_generator.get_trainable_vars() +
                    self.reward_model_recognition.get_trainable_vars())

            self.f_train_obs_model = tensor_utils.compile_function(
                inputs=[
                    next_obs, obs, actions, jole_obs, jole_actions,
                    jole_clip_return_min, jole_clip_return_max, use_jole,
                    reguzs, eps
                ],
                outputs=[obs_train_op, cvae_cost_obs, reconstr_loss_obs])
            self.f_train_reward_model = tensor_utils.compile_function(
                inputs=[
                    reward, obs, actions, jole_obs, jole_actions,
                    jole_clip_return_min, jole_clip_return_max, use_jole,
                    reguzs, eps
                ],
                outputs=[
                    reward_train_op, cvae_cost_reward, reconstr_loss_reward
                ])

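            # Separate environment models trained with the plain CVAE objective
            # only (no JOLE term); presumably they serve as a reference whose
            # parameters can be copied into the jointly trained models below.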
            # Set up the separate environment model training functions
            # Separate observation CVAE model
            sz_obs_mean, sz_obs_log_sigma = self.sepe_obs_model_recognition.get_fval_sym(
                obs, actions, next_obs, "obs_model_recoginition")
            sz_obs_sample = sz_obs_mean + tf.multiply(
                tf.sqrt(tf.exp(sz_obs_log_sigma)), eps)
            sobs_recontr_mean = self.sepe_obs_model_generator.get_fval_sym(
                obs, actions, sz_obs_sample, "obs_model_generator")
            sreconstr_loss_obs = tf.reduce_mean(
                tf.reduce_sum(tf.square(next_obs - sobs_recontr_mean), 1))
            slatent_loss_obs = tf.reduce_mean(-0.5 * tf.reduce_sum(
                1. + sz_obs_log_sigma - tf.square(sz_obs_mean) -
                tf.exp(sz_obs_log_sigma), 1))
            scvae_cost_obs = sreconstr_loss_obs + slatent_loss_obs

            # Separate reward CVAE model
            sz_reward_mean, sz_reward_log_sigma = self.sepe_reward_model_recognition.get_fval_sym(
                obs, actions, reward, "reward_model_recoginition")
            sz_reward_sample = sz_reward_mean + tf.multiply(
                tf.sqrt(tf.exp(sz_reward_log_sigma)), eps)
            sreward_recontr_mean = self.sepe_reward_model_generator.get_fval_sym(
                obs, actions, sz_reward_sample, "reward_model_generator")
            sreconstr_loss_reward = tf.reduce_mean(
                tf.reduce_sum(tf.square(reward - sreward_recontr_mean), 1))
            slatent_loss_reward = tf.reduce_mean(-0.5 * tf.reduce_sum(
                1. + sz_reward_log_sigma - tf.square(sz_reward_mean) -
                tf.exp(sz_reward_log_sigma), 1))
            scvae_cost_reward = sreconstr_loss_reward + slatent_loss_reward

            with tf.name_scope('separate_model_loss'):
                # TODO: change to predict the delta of s
                sobs_model_loss = scvae_cost_obs
                sreward_model_loss = scvae_cost_reward

            with tf.name_scope('minimize_separate_model_loss'):
                sepe_obs_train_op = self.obs_model_optimizer(
                    self.obs_model_lr, name='SepeObsModelOptimizer').minimize(
                        sobs_model_loss,
                        var_list=self.sepe_obs_model_generator.
                        get_trainable_vars() +
                        self.sepe_obs_model_recognition.get_trainable_vars())
                sepe_reward_train_op = self.reward_model_optimizer(
                    self.reward_model_lr, name='SepeRewardModelOptimizer'
                ).minimize(
                    sreward_model_loss,
                    var_list=self.sepe_reward_model_generator.
                    get_trainable_vars() +
                    self.sepe_reward_model_recognition.get_trainable_vars())

            self.f_train_sepe_obs_model = tensor_utils.compile_function(
                inputs=[next_obs, obs, actions, eps],
                outputs=[
                    sepe_obs_train_op, scvae_cost_obs, sreconstr_loss_obs
                ])
            self.f_train_sepe_reward_model = tensor_utils.compile_function(
                inputs=[reward, obs, actions, eps],
                outputs=[
                    sepe_reward_train_op, scvae_cost_reward,
                    sreconstr_loss_reward
                ])

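            # get_target_ops is used here without a tau argument, which appears
            # to yield plain assignment ops that copy the separately trained
            # model parameters into the jointly trained (JOLE) models.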
            # Copy the parameters of the separate env models when necessary
            with tf.name_scope('copy_sepe_env_models'):
                copy_sepe_obs_model_recognition_ops = tensor_utils.get_target_ops(
                    self.sepe_obs_model_recognition.get_global_vars(),
                    self.obs_model_recognition.get_global_vars())

                copy_sepe_obs_model_generator_ops = tensor_utils.get_target_ops(
                    self.sepe_obs_model_generator.get_global_vars(),
                    self.obs_model_generator.get_global_vars())

                copy_sepe_reward_model_recognition_ops = tensor_utils.get_target_ops(
                    self.sepe_reward_model_recognition.get_global_vars(),
                    self.reward_model_recognition.get_global_vars())

                copy_sepe_reward_model_generator_ops = tensor_utils.get_target_ops(
                    self.sepe_reward_model_generator.get_global_vars(),
                    self.reward_model_generator.get_global_vars())

            f_copy_sepe_obs_model = tensor_utils.compile_function(
                inputs=[],
                outputs=[
                    copy_sepe_obs_model_recognition_ops,
                    copy_sepe_obs_model_generator_ops
                ])
            f_copy_sepe_reward_model = tensor_utils.compile_function(
                inputs=[],
                outputs=[
                    copy_sepe_reward_model_recognition_ops,
                    copy_sepe_reward_model_generator_ops
                ])

            self.f_copy_sepe_reward_model = f_copy_sepe_reward_model
            self.f_copy_sepe_obs_model = f_copy_sepe_obs_model

            self.f_cal_jole_loss = f_cal_jole_loss

            self.f_train_policy = f_train_policy
            self.f_train_qf = f_train_qf
            self.f_init_target = f_init_target
            self.f_update_target = f_update_target