def _build(self, q_func):
    #####################
    # q values, created with the placeholder that holds the CURRENT obs (i.e., t)
    self.q_t_values = q_func(self.obs_t_ph, self.ac_dim, scope='q_func', reuse=False)
    # select Q(s_t, a_t) for the actions actually taken, via a one-hot mask
    self.q_t = tf.reduce_sum(self.q_t_values * tf.one_hot(self.act_t_ph, self.ac_dim), axis=1)

    #####################
    # target q values, created with the placeholder that holds the NEXT obs (i.e., t+1)
    q_tp1_values = q_func(self.obs_tp1_ph, self.ac_dim, scope='target_q_func', reuse=False)

    if self.double_q:
        # Q2 of the Q-learning portion of the homework.
        # In double Q-learning, the best action is selected using the Q-network that
        # is being updated, but the Q-value for this action is obtained from the
        # target Q-network. See page 5 of https://arxiv.org/pdf/1509.06461.pdf for details.
        # Note: the argmax must be taken over the ONLINE network evaluated at the NEXT
        # obs, so q_func is re-applied to obs_tp1_ph with reuse=True (using
        # self.q_t_values here would select the action from the current obs, a bug).
        q_t_values_tp1 = q_func(self.obs_tp1_ph, self.ac_dim, scope='q_func', reuse=True)
        q_tp1 = tf.reduce_sum(
            q_tp1_values * tf.one_hot(tf.argmax(q_t_values_tp1, axis=1), self.ac_dim),
            axis=1)
    else:
        # q values of the next timestep: max_a' Q_target(s', a')
        q_tp1 = tf.reduce_max(q_tp1_values, axis=1)

    #####################
    # targets for the Bellman error:
    #   r_t + gamma * q_tp1 * (1 - done)
    target_q_t = self.rew_t_ph + self.gamma * q_tp1 * (1 - self.done_mask_ph)
    # stop gradients so the target is treated as a constant during backprop
    target_q_t = tf.stop_gradient(target_q_t)

    #####################
    # Bellman error (i.e. TD error between q_t and target_q_t)
    # This scalar-valued tensor later gets passed into the optimizer, to be minimized.
    # huber_loss (from infrastructure/dqn_utils.py) is used instead of squared error.
    self.total_error = tf.reduce_mean(huber_loss(self.q_t - target_q_t))

    #####################
    # all trainable variables of the Q-function network and target network, respectively,
    # collected via tf.get_collection on the scopes used at the top of this function
    q_func_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='q_func')
    target_q_func_vars = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope='target_q_func')

    #####################
    # train_fn will be called in order to train the critic (by minimizing the TD error)
    self.learning_rate = tf.placeholder(tf.float32, (), name="learning_rate")
    optimizer = self.optimizer_spec.constructor(
        learning_rate=self.learning_rate, **self.optimizer_spec.kwargs)
    self.train_fn = minimize_and_clip(optimizer,
                                      self.total_error,
                                      var_list=q_func_vars,
                                      clip_val=self.grad_norm_clipping)

    # update_target_fn will be called periodically to copy the Q network to the target Q network
    update_target_fn = []
    for var, var_target in zip(
            sorted(q_func_vars, key=lambda v: v.name),
            sorted(target_q_func_vars, key=lambda v: v.name)):
        update_target_fn.append(var_target.assign(var))
    self.update_target_fn = tf.group(*update_target_fn)
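# --------------------------------------------------------------------------
# _build relies on two helpers from infrastructure/dqn_utils.py that are not
# shown in this section. The sketch below is an assumption about their
# implementation, modeled on the OpenAI baselines versions this homework code
# is commonly derived from; the actual file may differ in details.

def huber_loss(x, delta=1.0):
    # quadratic for |x| < delta, linear beyond, so large TD errors
    # produce bounded gradients
    return tf.where(
        tf.abs(x) < delta,
        tf.square(x) * 0.5,
        delta * (tf.abs(x) - 0.5 * delta))

def minimize_and_clip(optimizer, objective, var_list, clip_val=10):
    # compute gradients of the objective w.r.t. var_list, clip each
    # gradient's norm to clip_val, then apply the clipped gradients
    gradients = optimizer.compute_gradients(objective, var_list=var_list)
    for i, (grad, var) in enumerate(gradients):
        if grad is not None:
            gradients[i] = (tf.clip_by_norm(grad, clip_val), var)
    return optimizer.apply_gradients(gradients)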
# Variant of _build: double Q implemented with tf.gather_nd, plus an optional
# Polyak (moving-average) target update.
def _build(self, q_func):
    #####################
    # q values, created with the placeholder that holds the CURRENT obs (i.e., t)
    # online network: Q_phi(s, a)
    self.q_t_values = q_func(
        self.obs_t_ph, self.ac_dim, scope='q_func',
        reuse=False)  # reuse=False creates an independent set of variables
    # select Q(s_t, a_t) for the actions actually taken, via a one-hot mask
    self.q_t = tf.reduce_sum(self.q_t_values * tf.one_hot(self.act_t_ph, self.ac_dim), axis=1)

    #####################
    # target q values, created with the placeholder that holds the NEXT obs (i.e., t+1)
    # vector over a': Q_phi'(s', a')
    q_tp1_values = q_func(self.obs_tp1_ph, self.ac_dim, scope='target_q_func', reuse=False)

    if self.double_q:
        # Q2 of the Q-learning portion of the homework.
        # In double Q-learning, the best action is selected using the Q-network that
        # is being updated, but the Q-value for this action is obtained from the
        # target Q-network. See page 5 of https://arxiv.org/pdf/1509.06461.pdf for details.
        # Q_phi'(s', argmax_a'(Q_phi(s', a')))
        q_t_values_for_tp1 = q_func(self.obs_tp1_ph, self.ac_dim, scope='q_func',
                                    reuse=True)  # reuse the online network's variables
        num_sample = tf.shape(self.obs_tp1_ph)[0]
        # build (row, argmax-action) index pairs, then gather the target net's values
        index = tf.stack([
            tf.range(num_sample),
            tf.cast(tf.argmax(q_t_values_for_tp1, axis=1), tf.int32)
        ], axis=1)
        q_tp1 = tf.gather_nd(q_tp1_values, index)
    else:
        # q values of the next timestep:
        # Q_phi'(s', argmax_a'(Q_phi'(s', a'))) = max_a' Q_phi'(s', a')
        q_tp1 = tf.reduce_max(q_tp1_values, axis=1)

    #####################
    # targets for the Bellman error:
    #   r_t + gamma * q_tp1 * (1 - done)
    # Here the target is defined as part of the computation graph; in the
    # actor-critic code, the target was instead computed directly as a numpy value.
    target_q_t = self.rew_t_ph + (1 - self.done_mask_ph) * self.gamma * q_tp1
    target_q_t = tf.stop_gradient(
        target_q_t
    )  # when computing (prediction - target), don't let gradients flow into the target

    #####################
    # Bellman error (i.e. TD error between q_t and target_q_t)
    # This scalar-valued tensor later gets passed into the optimizer, to be minimized.
    # huber_loss (from infrastructure/dqn_utils.py) is used rather than mean squared error.
    self.total_error = tf.reduce_mean(huber_loss(self.q_t - target_q_t))

    #####################
    # all trainable variables of the Q-function network and target network, respectively,
    # collected via tf.get_collection on the scopes used at the top of this function
    q_func_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='q_func')
    target_q_func_vars = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES, scope='target_q_func')

    #####################
    # train_fn will be called in order to train the critic (by minimizing the TD error)
    self.learning_rate = tf.placeholder(tf.float32, (), name="learning_rate")
    optimizer = self.optimizer_spec.constructor(
        learning_rate=self.learning_rate, **self.optimizer_spec.kwargs)
    self.train_fn = minimize_and_clip(optimizer,
                                      self.total_error,
                                      var_list=q_func_vars,
                                      clip_val=self.grad_norm_clipping)

    #####################
    # update_target_fn will be called periodically to copy the Q network to the target Q network
    update_target_fn = []
    for var, var_target in zip(
            sorted(q_func_vars, key=lambda v: v.name),
            sorted(target_q_func_vars, key=lambda v: v.name)):
        if not self.hparams['use_polyak']:
            # hard update: copy the online weights directly
            update_target_fn.append(var_target.assign(var))
        else:
            # Polyak (exponential moving average) update with tau = 0.0001
            update_target_fn.append(
                var_target.assign(0.0001 * var + 0.9999 * var_target))
    self.update_target_fn = tf.group(
        *update_target_fn)  # tf.group combines the individual assign ops into a single op
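# --------------------------------------------------------------------------
# A usage sketch for the ops built above. The placeholder names match the
# ones referenced in _build, but the session, feed values, and update
# schedule are hypothetical illustration, not the homework's actual loop.
#
# sess.run(critic.train_fn, feed_dict={
#     critic.obs_t_ph: obs,          # batch of current observations
#     critic.act_t_ph: acs,          # actions taken
#     critic.obs_tp1_ph: next_obs,   # next observations
#     critic.rew_t_ph: rews,         # rewards
#     critic.done_mask_ph: dones,    # 1.0 where the episode terminated
#     critic.learning_rate: lr,
# })
# sess.run(critic.update_target_fn)
#
# Design note: with a hard update, update_target_fn is typically run every
# N training steps; with the Polyak variant it would instead be run every
# step, since each call moves the target weights only a small fraction
# (tau = 0.0001) toward the online weights.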