def compute_double_q_learning_loss(self, l_obs, l_act, l_rew, l_next_obs, l_done):
    """
    :param l_obs: A chainer variable holding a list of observations. Should be of shape N * |S|.
    :param l_act: A chainer variable holding a list of actions. Should be of shape N.
    :param l_rew: A chainer variable holding a list of rewards. Should be of shape N.
    :param l_next_obs: A chainer variable holding a list of observations at the next time step. Should be of shape N * |S|.
    :param l_done: A chainer variable holding a list of binary values (indicating whether episode ended after this time step). Should be of shape N.
    :return: A chainer variable holding a scalar loss.
    """
    # Hint: You may want to make use of the following fields: self._discount, self._q, self._qt
    # Hint2: Q-function can be called by self._q.forward(argument)
    # Hint3: You might also find https://docs.chainer.org/en/stable/reference/generated/chainer.functions.select_item.html useful
    "*** YOUR CODE HERE ***"
    reward = F.cast(l_rew, np.float32)
    q_forwarded = self._q.forward(l_next_obs)
    qt_forwarded = self._qt.forward(l_next_obs)
    y_non_terminal = reward + self._discount * F.select_item(
        qt_forwarded, F.argmax(q_forwarded, axis=1))
    y_terminal = reward
    y = F.select_item(F.stack([y_non_terminal, y_terminal], axis=1),
                      F.cast(l_done, np.int32))
    Q = F.select_item(self._q.forward(l_obs), l_act)
    return F.mean(F.square(y - Q))
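# A minimal standalone sketch (assumption: only numpy and chainer are
# installed; the array values are toy numbers, not from any snippet here) of
# what chainer.functions.select_item does, since every snippet in this
# section leans on it: given x of shape (N, K) and integer indices t of
# shape (N,), it returns x[i, t[i]] for each row i.
import numpy as np
import chainer.functions as F

q_toy = np.array([[1.0, 2.0],
                  [3.0, 4.0],
                  [5.0, 6.0]], dtype=np.float32)
actions_toy = np.array([1, 0, 1], dtype=np.int32)

picked = F.select_item(q_toy, actions_toy)
print(picked.array)  # [2. 3. 6.]

# Double DQN pairs an argmax under one network with evaluation under
# another; with a single array, the pattern degenerates to a row-wise max.
greedy = F.argmax(q_toy, axis=1)
print(F.select_item(q_toy, greedy).array)  # [2. 4. 6.]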
def _compute_q_y(self, batch_state, batch_action, batch_reward, batch_done, batch_next_state):
    with chainer.no_backprop_mode():
        batch_max = F.argmax(self.network(batch_next_state), axis=1)
        batch_target_q = self.target_network(batch_next_state)
        batch_y = batch_reward + self.discount * (1 - batch_done) * \
            F.select_item(batch_target_q, batch_max)
    batch_q = F.select_item(self.network(batch_state), batch_action)
    return batch_y, batch_q
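# A hedged sketch of how a helper like _compute_q_y above is typically
# consumed (assumption: the method name, self.optimizer, and the batch keys
# here are hypothetical illustrations, not taken from the snippet).
import chainer.functions as F

def _update_from_batch(self, batch):
    batch_y, batch_q = self._compute_q_y(
        batch['state'], batch['action'], batch['reward'],
        batch['done'], batch['next_state'])
    # Elementwise Huber loss averaged over the batch, as DQN
    # implementations commonly do.
    loss = F.mean(F.huber_loss(batch_q, batch_y, delta=1.0, reduce='no'))
    self.network.cleargrads()
    loss.backward()
    self.optimizer.update()
    return float(loss.array)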
def compute_loss(self, input_vocab, output_vocab, window_words, hidden_states):
    g, rnn_distribution, a = self.decode_one_step(input_vocab, window_words, hidden_states)
    # define p_vocab as 0 if output word is not in vocab
    p_vocab = F.select_item(
        rnn_distribution,
        xp.array([self.vocab[output_vocab]], dtype=xp.int32)
    ) if output_vocab in self.vocab else Variable(xp.array([0.0], dtype=xp.float32))
    # compute cross entropy
    indexes = [i for i, x in enumerate(window_words) if x == output_vocab]
    exist_var = Variable(xp.array([0], dtype=xp.float32))
    for idx in indexes:
        exist_var += F.select_item(a, xp.array([idx], dtype=xp.int32))
    p_ptr = F.cast(exist_var, xp.float32) if indexes else Variable(
        xp.array([0.0], dtype=xp.float32))
    cross_entropy = -F.log(
        F.linear_interpolate(g, p_vocab, p_ptr)
        + Variable(xp.array([0.01], dtype=xp.float32)))
    # compute attention loss
    attention_loss = F.cast(-F.log(g + exist_var), xp.float32) if indexes else Variable(
        xp.array([0.0], dtype=xp.float32))
    return cross_entropy + attention_loss
def update(Q, target_Q, opt, samples, gamma=0.99, target_type='double_dqn'):
    """Update a Q-function with given samples and a target Q-function."""
    dtype = chainer.get_dtype()
    xp = Q.xp
    obs = xp.asarray([sample[0] for sample in samples], dtype=dtype)
    action = xp.asarray([sample[1] for sample in samples], dtype=np.int32)
    reward = xp.asarray([sample[2] for sample in samples], dtype=dtype)
    done = xp.asarray([sample[3] for sample in samples], dtype=dtype)
    obs_next = xp.asarray([sample[4] for sample in samples], dtype=dtype)
    # Predicted values: Q(s,a)
    y = F.select_item(Q(obs), action)
    # Target values: r + gamma * max_b Q(s',b)
    with chainer.no_backprop_mode():
        if target_type == 'dqn':
            next_q = F.max(target_Q(obs_next), axis=1)
        elif target_type == 'double_dqn':
            next_q = F.select_item(target_Q(obs_next),
                                   F.argmax(Q(obs_next), axis=1))
        else:
            raise ValueError('Unsupported target_type: {}'.format(target_type))
        target = reward + gamma * (1 - done) * next_q
    loss = mean_clipped_loss(y, target)
    Q.cleargrads()
    loss.backward()
    opt.update()
def compute_double_q_learning_loss(self, l_obs, l_act, l_rew, l_next_obs, l_done):
    """
    :param l_obs: A chainer variable holding a list of observations. Should be of shape N * |S|.
    :param l_act: A chainer variable holding a list of actions. Should be of shape N.
    :param l_rew: A chainer variable holding a list of rewards. Should be of shape N.
    :param l_next_obs: A chainer variable holding a list of observations at the next time step. Should be of shape N * |S|.
    :param l_done: A chainer variable holding a list of binary values (indicating whether episode ended after this time step). Should be of shape N.
    :return: A chainer variable holding a scalar loss.
    """
    # Hint: You may want to make use of the following fields: self._discount, self._q, self._qt
    # Hint2: Q-function can be called by self._q.forward(argument)
    # Hint3: You might also find https://docs.chainer.org/en/stable/reference/generated/chainer.functions.select_item.html useful
    "*** YOUR CODE HERE ***"
    # Compute Target
    action = F.argmax(self._q.forward(l_next_obs), axis=1)
    qt_vals = self._qt.forward(l_next_obs)
    qt_vals = F.select_item(qt_vals, action)
    y = l_rew + (1 - l_done) * self._discount * qt_vals
    # Compute Q
    q = self._q.forward(l_obs)
    q = F.select_item(q, l_act)
    # Compute Loss
    loss = F.mean_squared_error(y, q)
    return loss
def compute_double_q_learning_loss(self, l_obs, l_act, l_rew, l_next_obs, l_done):
    """
    :param l_obs: A chainer variable holding a list of observations. Should be of shape N * |S|.
    :param l_act: A chainer variable holding a list of actions. Should be of shape N.
    :param l_rew: A chainer variable holding a list of rewards. Should be of shape N.
    :param l_next_obs: A chainer variable holding a list of observations at the next time step. Should be of shape N * |S|.
    :param l_done: A chainer variable holding a list of binary values (indicating whether episode ended after this time step). Should be of shape N.
    :return: A chainer variable holding a scalar loss.
    """
    # Hint: You may want to make use of the following fields: self._discount, self._q, self._qt
    # Hint2: Q-function can be called by self._q.forward(argument)
    # Hint3: You might also find https://docs.chainer.org/en/stable/reference/generated/chainer.functions.select_item.html useful
    "*** YOUR CODE HERE ***"
    target_net_out = self._qt.forward(l_next_obs)
    current_q_val = self._q.forward(l_obs)
    current_q_val_a = F.select_item(current_q_val, l_act)
    next_q_val = self._q.forward(l_next_obs)
    max_val_action = F.argmax(next_q_val, 1)
    target_net_out_max = F.select_item(target_net_out, max_val_action)
    target_vals = l_rew + self._discount * target_net_out_max * (1 - l_done)
    loss_vec = target_vals - current_q_val_a
    loss = F.average(F.square(loss_vec))
    return loss
def train(self, x, cluster_array, class_array, partition):
    """
    :param x: instance in a minibatch
    :param cluster_array: the array of instances' cluster
    :param class_array: the array of instances' class (class means the number in the cluster)
    :param partition:
    :return: xp array, xp array
    """
    h = self.model.conv(x)
    cluster_output = F.softmax(self.model.cluster(h))
    cluster_output = F.select_item(cluster_output, cluster_array)
    class_output = None
    for cluster in range(self.num_clusters):
        if partition[cluster] == partition[cluster + 1]:
            continue
        output = F.softmax(self[cluster](h[partition[cluster]:partition[cluster + 1]]))
        output = F.select_item(output, class_array[partition[cluster]:partition[cluster + 1]])
        if class_output is None:
            class_output = output
        else:
            class_output = F.concat((class_output, output), axis=0)
    return cluster_output * class_output, cluster_output, class_output
def compute_double_q_learning_loss(self, l_obs, l_act, l_rew, l_next_obs, l_done):
    """
    :param l_obs: A chainer variable holding a list of observations. Should be of shape N * |S|.
    :param l_act: A chainer variable holding a list of actions. Should be of shape N.
    :param l_rew: A chainer variable holding a list of rewards. Should be of shape N.
    :param l_next_obs: A chainer variable holding a list of observations at the next time step. Should be of shape N * |S|.
    :param l_done: A chainer variable holding a list of binary values (indicating whether episode ended after this time step). Should be of shape N.
    :return: A chainer variable holding a scalar loss.
    """
    # Hint: You may want to make use of the following fields: self._discount, self._q, self._qt
    # Hint2: Q-function can be called by self._q.forward(argument)
    # Hint3: You might also find https://docs.chainer.org/en/stable/reference/generated/chainer.functions.select_item.html useful
    # This is the same as the last exercise, but we take the argmax of Q,
    # then apply it to Qt.
    q_forwarded = self._q.forward(l_next_obs)
    qt_forwarded = self._qt.forward(l_next_obs)
    a_q_forwarded = F.argmax(q_forwarded, axis=1)
    Q_dual = F.select_item(qt_forwarded, a_q_forwarded)
    y = l_rew + (1 - l_done) * (self._discount * Q_dual)
    Q = F.select_item(self._q.forward(l_obs), l_act)
    loss = F.mean(F.square(y - Q))
    return loss
def compute_double_q_learning_loss(self, l_obs, l_act, l_rew, l_next_obs, l_done):
    """
    :param l_obs: A chainer variable holding a list of observations. Should be of shape N * |S|.
    :param l_act: A chainer variable holding a list of actions. Should be of shape N.
    :param l_rew: A chainer variable holding a list of rewards. Should be of shape N.
    :param l_next_obs: A chainer variable holding a list of observations at the next time step. Should be of shape N * |S|.
    :param l_done: A chainer variable holding a list of binary values (indicating whether episode ended after this time step). Should be of shape N.
    :return: A chainer variable holding a scalar loss.
    """
    # Hint: You may want to make use of the following fields: self._discount, self._q, self._qt
    # Hint2: Q-function can be called by self._q.forward(argument)
    # Hint3: You might also find https://docs.chainer.org/en/stable/reference/generated/chainer.functions.select_item.html useful
    obs_q_value = F.select_item(self._q.forward(l_obs), l_act)
    # Build the per-sample targets one transition at a time (slow but explicit).
    # Note: l_rew and l_done are chainer variables, so their raw values are
    # read through .data when filling the numpy target array.
    target_q_value = np.zeros(l_done.shape[0])
    for i in range(l_done.shape[0]):
        if l_done.data[i]:
            target_q_value[i] = l_rew.data[i]
        else:
            q_value_next = self._q.forward(
                F.expand_dims(l_next_obs[i], axis=0))
            max_idx = F.argmax(q_value_next)
            target_value = self._qt.forward(
                F.expand_dims(l_next_obs[i], axis=0))
            max_value = F.select_item(target_value, np.array([max_idx.data]))
            target_q_value[i] = l_rew.data[i] + self._discount * max_value.data
    loss = F.mean_squared_error(F.cast(target_q_value, np.float32),
                                F.cast(obs_q_value, np.float32))
    return loss
def __update(self, _q, _qTarget, optimiser, samples, gamma=0.99):
    """Update a Q-function with given samples and a target Q-function."""
    # self.__debug("Running update...")
    currentStates = _q.xp.asarray(samples["states"], dtype=np.float32)
    actions = _q.xp.asarray(samples["actions"], dtype=np.int32)
    rewards = _q.xp.asarray(samples["rewards"], dtype=np.float32)
    completes = _q.xp.asarray(samples["completes"], dtype=np.float32)
    nextState = _q.xp.asarray(samples["nextStates"], dtype=np.float32)

    # Predicted values: Q(s,a)
    predictions = functions.select_item(_q(currentStates), actions)

    # Target values: r + gamma * max_b Q(s',b)
    with chainer.no_backprop_mode():
        if self.__doubleDQN:
            _qNext = functions.select_item(
                _qTarget(nextState),
                functions.argmax(_q(nextState), axis=1))
        else:
            _qNext = functions.max(_qTarget(nextState), axis=1)
        target = rewards + gamma * (1 - completes) * _qNext

    loss = functions.mean(
        functions.huber_loss(predictions, target, delta=1.0, reduce='no'))
    _q.cleargrads()
    loss.backward()
    optimiser.update()
def compute_double_q_learning_loss(self, l_obs, l_act, l_rew, l_next_obs, l_done):
    """
    :param l_obs: A chainer variable holding a list of observations. Should be of shape N * |S|.
    :param l_act: A chainer variable holding a list of actions. Should be of shape N.
    :param l_rew: A chainer variable holding a list of rewards. Should be of shape N.
    :param l_next_obs: A chainer variable holding a list of observations at the next time step. Should be of shape N * |S|.
    :param l_done: A chainer variable holding a list of binary values (indicating whether episode ended after this time step). Should be of shape N.
    :return: A chainer variable holding a scalar loss.
    """
    # Hint: You may want to make use of the following fields: self._discount, self._q, self._qt
    # Hint2: Q-function can be called by self._q.forward(argument)
    # Hint3: You might also find https://docs.chainer.org/en/stable/reference/generated/chainer.functions.select_item.html useful
    "*** YOUR CODE HERE ***"
    # This is the same as the last exercise, but we take the argmax of Q,
    # then apply it to Qt.
    # First the normal Q
    Q_s_a = F.select_item(self._q.forward(l_obs), l_act)  # current quality, current state, current best action
    Q_sn = self._q.forward(l_next_obs)  # current quality, next state
    a_q_sn = F.argmax(Q_sn, axis=1)  # action that's ideal on current Q, next state
    Qt_sn = self._qt.forward(l_next_obs)  # next quality, next state
    # Now the dual Q: the next quality, for the next state, but taking the
    # action that was best for the current quality in the next state.
    # Can be hard to keep track of!
    Q_dual = F.select_item(Qt_sn, a_q_sn)
    y = l_rew + (1 - l_done) * (self._discount * Q_dual)
    loss = F.mean((y - Q_s_a) ** 2)
    return loss
def compute_double_q_learning_loss(self, l_obs, l_act, l_rew, l_next_obs, l_done):
    """
    :param l_obs: A chainer variable holding a list of observations. Should be of shape N * |S|.
    :param l_act: A chainer variable holding a list of actions. Should be of shape N.
    :param l_rew: A chainer variable holding a list of rewards. Should be of shape N.
    :param l_next_obs: A chainer variable holding a list of observations at the next time step. Should be of shape N * |S|.
    :param l_done: A chainer variable holding a list of binary values (indicating whether episode ended after this time step). Should be of shape N.
    :return: A chainer variable holding a scalar loss.
    """
    # Hint: You may want to make use of the following fields: self._discount, self._q, self._qt
    # Hint2: Q-function can be called by self._q.forward(argument)
    # Hint3: You might also find https://docs.chainer.org/en/stable/reference/generated/chainer.functions.select_item.html useful
    feed_forward_learner = self._q.forward(l_obs)
    q_learner = F.select_item(feed_forward_learner, l_act)
    action_q_values = self._q.forward(l_next_obs)
    best_action = F.argmax(action_q_values, axis=1)
    feed_forward_target = self._qt.forward(l_next_obs)
    q_target = F.select_item(feed_forward_target, best_action)
    terminate = F.cast(l_done, bool)
    l_rew = F.cast(l_rew, "float32")
    final_target = F.where(terminate, l_rew,
                           l_rew + self._discount * q_target).data
    loss = F.mean_squared_error(final_target, q_learner)
    return loss
def compute_double_q_learning_loss(self, l_obs, l_act, l_rew, l_next_obs, l_done):
    """
    :param l_obs: A chainer variable holding a list of observations. Should be of shape N * |S|.
    :param l_act: A chainer variable holding a list of actions. Should be of shape N.
    :param l_rew: A chainer variable holding a list of rewards. Should be of shape N.
    :param l_next_obs: A chainer variable holding a list of observations at the next time step. Should be of shape N * |S|.
    :param l_done: A chainer variable holding a list of binary values (indicating whether episode ended after this time step). Should be of shape N.
    :return: A chainer variable holding a scalar loss.
    """
    # Hint: You may want to make use of the following fields: self._discount, self._q, self._qt
    # Hint2: Q-function can be called by self._q.forward(argument)
    # Hint3: You might also find https://docs.chainer.org/en/stable/reference/generated/chainer.functions.select_item.html useful
    "*** YOUR CODE HERE ***"
    q_values = self._q.forward(l_next_obs)
    maxes = F.argmax(q_values, axis=1)
    q_values = self._qt.forward(l_next_obs)
    Qs = F.select_item(q_values, maxes)
    target = l_rew + (1 - l_done) * self._discount * Qs
    Q_s = F.select_item(self._q.forward(l_obs), l_act)
    loss = F.mean_squared_error(target, Q_s)
    return loss
def compute_double_q_learning_loss(self, l_obs, l_act, l_rew, l_next_obs, l_done):
    """
    :param l_obs: A chainer variable holding a list of observations. Should be of shape N * |S|.
    :param l_act: A chainer variable holding a list of actions. Should be of shape N.
    :param l_rew: A chainer variable holding a list of rewards. Should be of shape N.
    :param l_next_obs: A chainer variable holding a list of observations at the next time step. Should be of shape N * |S|.
    :param l_done: A chainer variable holding a list of binary values (indicating whether episode ended after this time step). Should be of shape N.
    :return: A chainer variable holding a scalar loss.
    """
    # Hint: You may want to make use of the following fields: self._discount, self._q, self._qt
    # Hint2: Q-function can be called by self._q.forward(argument)
    # Hint3: You might also find
    # https://docs.chainer.org/en/stable/reference/generated/chainer.functions.select_item.html
    # useful
    # compute target q value named y
    q_next = self._qt.forward(l_next_obs)
    a_next = F.argmax(self._q.forward(l_next_obs), axis=1)
    q_act_next = F.select_item(q_next, a_next)
    y = l_rew + self._discount * q_act_next * (1 - l_done)
    # compute mean square loss function
    q = self._q.forward(l_obs)
    q_act = F.select_item(q, l_act)
    loss = F.mean(F.square(q_act - y))
    return loss
def compute_double_q_learning_loss(self, l_obs, l_act, l_rew, l_next_obs, l_done):
    """
    :param l_obs: A chainer variable holding a list of observations. Should be of shape N * |S|.
    :param l_act: A chainer variable holding a list of actions. Should be of shape N.
    :param l_rew: A chainer variable holding a list of rewards. Should be of shape N.
    :param l_next_obs: A chainer variable holding a list of observations at the next time step. Should be of shape N * |S|.
    :param l_done: A chainer variable holding a list of binary values (indicating whether episode ended after this time step). Should be of shape N.
    :return: A chainer variable holding a scalar loss.
    """
    # Hint: You may want to make use of the following fields: self._discount, self._q, self._qt
    # Hint2: Q-function can be called by self._q.forward(argument)
    # Hint3: You might also find https://docs.chainer.org/en/stable/reference/generated/chainer.functions.select_item.html useful
    "*** YOUR CODE HERE ***"
    ####################################################################################################
    target_action = F.argmax(self._q.forward(l_next_obs), axis=1)
    y = l_rew + (1 - l_done) * self._discount * F.select_item(
        self._qt.forward(l_next_obs), target_action)
    q = F.select_item(self._q.forward(l_obs), l_act)  # same as before
    loss = F.mean_squared_error(y, q)  # same as before
    # Based on the performance of the Double DQN algorithm, we agree that the performance gain is
    # not obvious (refer to "Results.pdf"). We would even add that its improvement in performance is
    # less stable.
    ####################################################################################################
    return loss
def compute_double_q_learning_loss(self, l_obs, l_act, l_rew, l_next_obs, l_done):
    """
    :param l_obs: A chainer variable holding a list of observations. Should be of shape N * |S|.
    :param l_act: A chainer variable holding a list of actions. Should be of shape N.
    :param l_rew: A chainer variable holding a list of rewards. Should be of shape N.
    :param l_next_obs: A chainer variable holding a list of observations at the next time step. Should be of shape N * |S|.
    :param l_done: A chainer variable holding a list of binary values (indicating whether episode ended after this time step). Should be of shape N.
    :return: A chainer variable holding a scalar loss.
    """
    # Hint: You may want to make use of the following fields: self._discount, self._q, self._qt
    # Hint2: Q-function can be called by self._q.forward(argument)
    # Hint3: You might also find https://docs.chainer.org/en/stable/reference/generated/chainer.functions.select_item.html useful
    "*** YOUR CODE HERE ***"
    # Find the quality of this state and this action.
    Q = F.select_item(self._q.forward(l_obs), l_act)
    # Find the quality of the next state using our current model.
    Q_next = self._q.forward(l_next_obs)
    action = F.argmax(Q_next, axis=1)
    # Find the greedy quality values with the next state.
    Q_target = self._qt.forward(l_next_obs)
    # The dual Q: the next quality for the next state, but using the
    # actions that were best for the current quality in the next state.
    dual_q = F.select_item(Q_target, action)
    # Find y.
    y = l_rew + (1 - l_done) * (self._discount * dual_q)
    loss = F.mean((y - Q) ** 2)
    return loss
def compute_double_q_learning_loss(self, l_obs, l_act, l_rew, l_next_obs, l_done):
    """
    :param l_obs: A chainer variable holding a list of observations. Should be of shape N * |S|.
    :param l_act: A chainer variable holding a list of actions. Should be of shape N.
    :param l_rew: A chainer variable holding a list of rewards. Should be of shape N.
    :param l_next_obs: A chainer variable holding a list of observations at the next time step. Should be of shape N * |S|.
    :param l_done: A chainer variable holding a list of binary values (indicating whether episode ended after this time step). Should be of shape N.
    :return: A chainer variable holding a scalar loss.
    """
    # Hint: You may want to make use of the following fields: self._discount, self._q, self._qt
    # Hint2: Q-function can be called by self._q.forward(argument)
    # Hint3: You might also find https://docs.chainer.org/en/stable/reference/generated/chainer.functions.select_item.html useful
    "*** YOUR CODE HERE ***"
    N = l_obs.data.shape[0]
    # set gamma to zero for those states that are terminal
    discounts = np.array(
        [0.0 if l_done.data[i] else self._discount for i in range(N)])
    # get Q value estimate from Q-target network
    q_indices = F.argmax(self._q.forward(l_next_obs), axis=1)
    y = l_rew + discounts * F.select_item(self._qt.forward(l_next_obs), q_indices)
    # compute the loss using the current Q-network on the taken actions
    loss = F.mean((y - F.select_item(self._q.forward(l_obs), l_act)) ** 2)
    return loss
def compute_double_q_learning_loss(self, l_obs, l_act, l_rew, l_next_obs, l_done):
    """
    :param l_obs: A chainer variable holding a list of observations. Should be of shape N * |S|.
    :param l_act: A chainer variable holding a list of actions. Should be of shape N.
    :param l_rew: A chainer variable holding a list of rewards. Should be of shape N.
    :param l_next_obs: A chainer variable holding a list of observations at the next time step. Should be of shape N * |S|.
    :param l_done: A chainer variable holding a list of binary values (indicating whether episode ended after this time step). Should be of shape N.
    :return: A chainer variable holding a scalar loss.
    """
    # Hint: You may want to make use of the following fields: self._discount, self._q, self._qt
    # Hint2: Q-function can be called by self._q.forward(argument)
    # Hint3: You might also find https://docs.chainer.org/en/stable/reference/generated/chainer.functions.select_item.html useful
    l_rew = F.cast(l_rew, np.float32)
    q_future = self._q.forward(l_next_obs)
    qt_future = self._qt.forward(l_next_obs)
    future_rew = l_rew + self._discount * F.select_item(
        qt_future, F.argmax(q_future, axis=1))
    target = F.select_item(F.stack([future_rew, l_rew], axis=1),
                           F.cast(l_done, np.int32))
    y = F.select_item(self._q.forward(l_obs), l_act)
    return F.mean(F.square(y - target))
def _compute_q_loss(self, batch):
    """B(D, r)"""
    batch_reward = self.xp.concatenate([
        self.xp.ones_like(batch['reward'][:self.minibatch_size]),
        self.xp.zeros_like(batch['reward'][self.minibatch_size:])
    ], axis=0)
    batch_state = batch['state']
    batch_next_state = batch['next_state']
    batch_actions = batch['action']
    batch_discount = batch['discount']
    batch_terminal = batch['is_state_terminal']
    batch_absorb = batch['is_state_absorb']
    batch_next_absorb = batch['is_next_state_absorb']

    with chainer.no_backprop_mode(), chainer.using_config('train', False):
        target_next_v = self._calc_target_v(batch_next_state, batch_next_absorb)
        if self.reward_func:
            # reward for gan
            D = F.sigmoid(
                self.reward_func(batch_state, batch_absorb, batch_actions))
            batch_reward = F.flatten(
                F.log(D + 1e-8) - F.log(1 - D + 1e-8)
            )  # + 0.5 * batch_reward / self.temperature / self.lamda
            batch_reward = F.flatten(batch_reward)
            self.reward_demo_record.extend(
                cuda.to_cpu(batch_reward.array[:self.minibatch_size]))
            self.reward_samp_record.extend(
                cuda.to_cpu(batch_reward.array[self.minibatch_size:]))
        target_q = batch_reward + batch_discount * \
            (1.0 - batch_terminal) * target_next_v

    if self.is_discrete:
        predict_q1 = F.flatten(
            F.select_item(self.q_func1(batch_state, batch_absorb),
                          batch_actions))
        predict_q2 = F.flatten(
            F.select_item(self.q_func2(batch_state, batch_absorb),
                          batch_actions))
    else:
        predict_q1 = F.flatten(
            self.q_func1(batch_state, batch_absorb, batch_actions))
        predict_q2 = F.flatten(
            self.q_func2(batch_state, batch_absorb, batch_actions))

    # soft bellman error
    loss1 = 0.5 * F.mean_squared_error(target_q, predict_q1)
    loss2 = 0.5 * F.mean_squared_error(target_q, predict_q2)

    self.q1_record.extend(cuda.to_cpu(predict_q1.array))
    self.q2_record.extend(cuda.to_cpu(predict_q2.array))
    return loss1, loss2
def check_value_check(self, x_data, t_data):
    x = chainer.Variable(x_data)
    t = chainer.Variable(t_data)

    if self.valid:
        # Check if it throws nothing
        functions.select_item(x, t)
    else:
        with self.assertRaises(ValueError):
            functions.select_item(x, t)
def u2a(u):
    # u, a: (N * 1) Variable
    N = len(u.data)
    phi = np.argsort(u.data.reshape(N))  # u.data[phi]: ascending
    a_list = [0] * N
    cumprod = Variable(np.array([[1.0]]).astype(np.float32))
    for i in range(N):
        a_list[phi[i]] = cumprod * (
            1.0 - F.reshape(F.select_item(F.transpose(u), np.array([phi[i]])), (1, 1)))
        cumprod *= F.reshape(F.select_item(F.transpose(u), np.array([phi[i]])), (1, 1))
    return F.concat(a_list, 0)  # concat vertically
def update_q_func(self, batch):
    """Compute loss for a given Q-function."""
    batch_next_state = batch['next_state']
    batch_rewards = batch['reward']
    batch_terminal = batch['is_state_terminal']
    batch_state = batch['state']
    batch_actions = batch['action']
    batch_discount = batch['discount']

    with chainer.no_backprop_mode(), chainer.using_config('train', False):
        next_action_distrib = self.policy(batch_next_state)
        next_actions, next_log_prob = \
            next_action_distrib.sample_with_log_prob()
        entropy_term = self.temperature * next_log_prob
        if self.is_discrete:
            next_q1 = F.select_item(self.target_q_func1(batch_next_state),
                                    next_actions)
            next_q2 = F.select_item(self.target_q_func2(batch_next_state),
                                    next_actions)
        else:
            next_q1 = self.target_q_func1(batch_next_state, next_actions)
            next_q2 = self.target_q_func2(batch_next_state, next_actions)
            entropy_term = entropy_term[..., None]
        next_q = F.minimum(next_q1, next_q2)
        assert next_q.shape == entropy_term.shape
        target_q = batch_rewards + batch_discount * \
            (1.0 - batch_terminal) * F.flatten(next_q - entropy_term)

    if self.is_discrete:
        predict_q1 = F.flatten(
            F.select_item(self.q_func1(batch_state), batch_actions))
        predict_q2 = F.flatten(
            F.select_item(self.q_func2(batch_state), batch_actions))
    else:
        predict_q1 = F.flatten(self.q_func1(batch_state, batch_actions))
        predict_q2 = F.flatten(self.q_func2(batch_state, batch_actions))

    loss1 = 0.5 * F.mean_squared_error(target_q, predict_q1)
    loss2 = 0.5 * F.mean_squared_error(target_q, predict_q2)

    # Update stats
    self.q1_record.extend(cuda.to_cpu(predict_q1.array))
    self.q2_record.extend(cuda.to_cpu(predict_q2.array))
    self.q_func1_loss_record.append(float(loss1.array))
    self.q_func2_loss_record.append(float(loss2.array))

    self.q_func1_optimizer.update(lambda: loss1)
    self.q_func2_optimizer.update(lambda: loss2)
def check_forward(self, x_data, t_data):
    x = chainer.Variable(x_data)
    t = chainer.Variable(t_data)
    y = functions.select_item(x, t)
    y_exp = cuda.to_cpu(x_data)[range(t_data.size), cuda.to_cpu(t_data)]
    numpy.testing.assert_equal(cuda.to_cpu(y.data), y_exp)
def train(self, x, y, actions=None):
    actions = actions.astype(np.int32)
    batch_size = len(actions)
    if self._gpu_device:
        x = cuda.to_gpu(x, self._gpu_device)
        y = cuda.to_gpu(y, self._gpu_device)
        actions = cuda.to_gpu(actions, self._gpu_device)
    q = self._model(x)
    q_subset = F.reshape(F.select_item(q, actions), (batch_size, 1))
    y = y.reshape(batch_size, 1)
    loss = F.sum(F.huber_loss(q_subset, y, 1.0))
    self._model.cleargrads()
    loss.backward()
    self._optimizer.update()
    self._loss_val = np.asscalar(cuda.to_cpu(loss.data))
    # Keeps track of the number of train() calls
    self._steps += 1
    if self._steps % self._target_update_interval == 0:
        # copy weights
        self._target.copyparams(self._model)
def compute_batch_loss(self, batch, weights):
    """Compute gradients on a list of trajectories.

    Args:
    batch -- a TrajectoryBatch
    weights -- a list of weights for trajectories in the batch

    Returns a loss value
    """
    weights = self._xp.array(weights)
    for step, step_batch in batch.step_batches(self._gpu_device):
        policies, values = self._model(step_batch.states)
        values *= (1 - step_batch.terminals).reshape(values.shape)
        logprobs = F.select_item(policies, step_batch.actions)
        batch.set_logprobs_and_values(step, logprobs, values)
    losses = []
    for trajectory, logprobs, values in batch:
        losses.append(
            self.compute_trajectory_loss(trajectory, logprobs, values))
    losses = F.stack(losses)
    loss = F.average(losses * weights)
    loss.backward()
    return np.asscalar(cuda.to_cpu(loss.data))
def compute_loss(self, state, action, reward, next_state, episode_ends):
    batchsize = state.shape[0]
    xp = self.dqn.model.xp
    with chainer.using_config("train", True):
        q = self.dqn.compute_q_value(state)
    with chainer.no_backprop_mode():
        max_target_q_data = self.dqn.compute_target_q_value(next_state).data
        max_target_q_data = xp.amax(max_target_q_data, axis=1)
    t = reward + (1 - episode_ends) * self.discount_factor * max_target_q_data
    t = Variable(xp.reshape(t.astype(xp.float32), (-1, 1)))
    y = functions.reshape(functions.select_item(q, action), (-1, 1))
    if self.clip_loss:
        loss = functions.huber_loss(t, y, delta=1.0)
    else:
        loss = functions.mean_squared_error(t, y) / 2
    loss = functions.sum(loss)
    # check NaN
    loss_value = float(loss.data)
    if loss_value != loss_value:
        import pdb
        pdb.set_trace()
    return loss
def train_batch(self):
    j = np.random.permutation(
        min(self.frame, self.pool_size - self.train_term))[:self.batch_size] % self.pool_size
    j1 = j + 1
    s_j = (Variable(self.xp.asarray(self.state_pool[j].astype(np.float32))) / 127.5) - 1
    s_j1 = (Variable(
        self.xp.asarray(self.state_pool[j + 1].astype(np.float32))) / 127.5) - 1
    Qhat = self.target_q(s_j1, train=False)
    max_Q = cuda.to_cpu(F.max(Qhat, axis=1).data)
    # max_Q = cuda.to_cpu(self.xp.max(Qhat.data, axis=1))
    y_j = Variable(self.xp.asarray(
        self.reward_pool[j] + (1 - self.terminal_pool[j]) * self.gamma * max_Q))
    a_j = Variable(self.xp.asarray(self.action_pool[j]))
    qs = self.action_q(s_j)
    q_preds = F.select_item(qs, a_j)
    loss = F.mean_squared_error(y_j, q_preds)
    self.optimizer.zero_grads()
    loss.backward()
    loss.unchain_backward()
    self.optimizer.update()
    qp_cpu = qs.data
    print "Q", np.mean(q_preds.data)
    print "loss", loss.data
    print np.mean(qp_cpu, axis=0)
def _train_batch(self, j):
    j1 = j + 1
    s_j = (Variable(self.xp.asarray(self.state_pool[j].astype(np.float32))) / 127.5) - 1
    s_j1 = (Variable(
        self.xp.asarray(self.state_pool[j + 1].astype(np.float32))) / 127.5) - 1
    Qhat = self.target_q(s_j1, train=False)
    max_Q = cuda.to_cpu(F.max(Qhat, axis=1).data)
    # max_Q = cuda.to_cpu(self.xp.max(Qhat.data, axis=1))
    y_j = Variable(self.xp.asarray(
        self.reward_pool[j] + (1 - self.terminal_pool[j]) * self.gamma * max_Q))
    a_j = Variable(self.xp.asarray(self.action_pool[j]))
    qs = self.action_q(s_j)
    q_preds = F.select_item(qs, a_j)
    loss = F.mean_squared_error(y_j, q_preds)
    self.optimizer.zero_grads()
    res = loss.backward()
    loss.unchain_backward()
    self.optimizer.update()
    qp_cpu = qs.data
    # print "loss", loss.data
    # print np.mean(qp_cpu, axis=0)
    # print(res)
    return np.mean(cuda.to_cpu(q_preds.data))
def mylog_prob(self, x):
    n_batch, n_actions, h, w = self.all_log_prob.shape
    p_trans = F.transpose(self.all_log_prob, axes=(0, 2, 3, 1))
    p_trans = F.reshape(p_trans, (-1, n_actions))
    x_reshape = F.reshape(x, (1, -1))[0]
    selected_p = F.select_item(p_trans, x_reshape)
    return F.reshape(selected_p, (n_batch, 1, h, w))
def compute_q_learning_loss(self, l_obs, l_act, l_rew, l_next_obs, l_done):
    """
    :param l_obs: A chainer variable holding a list of observations. Should be of shape N * |S|.
    :param l_act: A chainer variable holding a list of actions. Should be of shape N.
    :param l_rew: A chainer variable holding a list of rewards. Should be of shape N.
    :param l_next_obs: A chainer variable holding a list of observations at the next time step. Should be of shape N * |S|.
    :param l_done: A chainer variable holding a list of binary values (indicating whether episode ended after this time step). Should be of shape N.
    :return: A chainer variable holding a scalar loss.
    """
    # Hint: You may want to make use of the following fields: self._discount, self._q, self._qt
    # Hint2: Q-function can be called by self._q.forward(argument)
    # Hint3: You might also find https://docs.chainer.org/en/stable/reference/generated/chainer.functions.select_item.html useful
    "*** YOUR CODE HERE ***"
    # Ideal next action per state, maximizing value.
    Qt_greedy = F.max(self._qt.forward(l_next_obs), -1)
    # Find y.
    y = l_rew + (1 - l_done) * (self._discount * Qt_greedy)
    # Find Q, our current model.
    Q = F.select_item(self._q.forward(l_obs), l_act)
    # Find the total loss from this iteration.
    loss = F.mean((y - Q) ** 2)
    return loss
def update_model(self):
    (s, action, reward, s_next, is_terminal) = self.memory.sample_minibatch(self.minibatch_size)

    # compute Q targets (max_a' Q_hat(s_next, a'))
    Q_hat = self.target_network(s_next)
    Q_hat_max = F.max(Q_hat, axis=1, keepdims=True)
    y = (1 - is_terminal) * self.gamma * Q_hat_max + reward

    # compute Q(s, action)
    Q = self.model_network(s)
    Q_subset = F.reshape(F.select_item(Q, action), (self.minibatch_size, 1))

    # compute Huber loss
    error = y - Q_subset
    loss_clipped = abs(error) * (abs(error.data) > 1) + (error ** 2) * (abs(error.data) <= 1)
    loss = F.sum(loss_clipped) / self.minibatch_size

    # perform model update
    self.model_network.zerograds()  # zero out the accumulated gradients in all network parameters
    loss.backward()
    self.optimizer.update()

    # target network tracks the model
    for dst, src in zip(self.target_network.params(), self.model_network.params()):
        dst.data = self.tau * src.data + (1 - self.tau) * dst.data

    return loss.data
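# For comparison with the hand-rolled clipped loss in update_model above: a
# toy check (made-up numbers, assuming only numpy and chainer) against the
# built-in F.huber_loss, which computes the smooth standard form
# 0.5 * e**2 for |e| <= delta and delta * (|e| - 0.5 * delta) otherwise.
# The two differ in scale and offset, but both cap the gradient magnitude
# for large errors, which is the point of the construction.
import numpy as np
import chainer.functions as F

error = np.array([[-2.0], [-0.5], [0.3], [3.0]], dtype=np.float32)

# Quadratic inside |e| <= 1, linear outside, exactly as in update_model.
hand_rolled = abs(error) * (abs(error) > 1) + (error ** 2) * (abs(error) <= 1)
huber = F.huber_loss(error, np.zeros_like(error), delta=1.0, reduce='no')
print(hand_rolled.ravel())  # [2.    0.25  0.09  3.  ]
print(huber.array.ravel())  # [1.5   0.125 0.045 2.5 ]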
def __call__(self, x, t, index):
    h = self.predict(x)
    self.history = np.append(self.history, np.array([np.mean(h.data, axis=0)]), axis=0)
    h = F.select_item(h, index)  # choose the action[index] in each column
    error_abs = abs(h - t)
    error = F.concat((F.expand_dims(error_abs ** 2, 1),
                      F.expand_dims(error_abs, 1)), axis=1)
    # 1 < error_abs <=> error_abs ** 2 > error_abs, error_abs < 1 <=> error_abs ** 2 < error_abs
    self.loss = F.sum(F.min(error, axis=1)) / np.float32(len(error_abs))
    return self.loss
def compute_double_q_learning_loss(self, l_obs, l_act, l_rew, l_next_obs, l_done):
    """
    :param l_obs: A chainer variable holding a list of observations. Should be of shape N * |S|.
    :param l_act: A chainer variable holding a list of actions. Should be of shape N.
    :param l_rew: A chainer variable holding a list of rewards. Should be of shape N.
    :param l_next_obs: A chainer variable holding a list of observations at the next time step. Should be of shape N * |S|.
    :param l_done: A chainer variable holding a list of binary values (indicating whether episode ended after this time step). Should be of shape N.
    :return: A chainer variable holding a scalar loss.
    """
    # Hint: You may want to make use of the following fields: self._discount, self._q, self._qt
    # Hint2: Q-function can be called by self._q.forward(argument)
    # Hint3: You might also find https://docs.chainer.org/en/stable/reference/generated/chainer.functions.select_item.html useful
    "*** YOUR CODE HERE ***"
    tar_act = F.argmax(self._q.forward(l_next_obs), axis=1)
    y = l_rew + (1 - l_done) * self._discount * F.select_item(
        self._qt.forward(l_next_obs), tar_act)
    q = F.select_item(self._q.forward(l_obs), l_act)
    loss = F.mean_squared_error(y, q)
    return loss
def read(address):
    # map from the reals to the hypercube of dimension n
    index = F.tanh(address)
    # map from a point to the nearest corner of the hypercube
    f = lambda x: x > 0
    # np.vectorize takes only the function; the data is passed to the
    # resulting callable (the original passed index.data to np.vectorize
    # itself, which is not a valid signature).
    mainIndex = np.vectorize(f, cache=True)(index.data)
    # `array` and `lookup` come from the enclosing scope.
    mainValue = F.select_item(array, lookup(mainIndex))
    # The original referenced an undefined `x` here; `index` is assumed.
    scaleFactor = F.exp(F.sum(F.log(F.absolute(index))))
    return mainValue * scaleFactor
def train():
    max_term_size = args.max_train_term
    current_term_size = args.train_term
    term_increase_rate = 1 + args.train_term_increase
    last_clock = time.clock()
    update_target_iteration = 0
    if use_double_dqn:
        target_q = q.copy()
        target_q.reset_state()
    while True:
        term_size = int(current_term_size)
        if frame < batch_size * term_size:
            continue
        batch_index = np.random.permutation(min(frame - term_size, POOL_SIZE))[:batch_size]
        train_image = Variable(xp.asarray(state_pool[batch_index]))
        y = q(train_image)
        if use_double_dqn and update_target_iteration >= update_target_interval:
            target_q = q.copy()
            target_q.reset_state()
            target_q(Variable(xp.asarray(state_pool[batch_index]), volatile=True))
            update_target_iteration = 0
        for term in range(term_size):
            next_batch_index = (batch_index + 1) % POOL_SIZE
            train_image = Variable(xp.asarray(state_pool[next_batch_index]))
            score = q(train_image)
            if only_result:
                t = Variable(xp.asarray(reward_pool[batch_index]))
            else:
                if use_double_dqn:
                    eval_image = Variable(xp.asarray(state_pool[next_batch_index]), volatile=True)
                    target_score = target_q(eval_image)
                    best_action = cuda.to_cpu(xp.argmax(score.data, axis=1))
                    best_q = cuda.to_cpu(target_score.data)[range(batch_size), best_action]
                else:
                    best_q = cuda.to_cpu(xp.max(score.data, axis=1))
                t = Variable(xp.asarray(reward_pool[batch_index] +
                                        (1 - terminal_pool[batch_index]) * gamma * best_q))
            action_index = chainer.Variable(xp.asarray(action_pool[batch_index]))
            loss = F.mean_squared_error(F.select_item(y, action_index), t)
            y = score
            optimizer.zero_grads()
            loss.backward()
            loss.unchain_backward()
            optimizer.update()
            batch_index = next_batch_index
            print "loss", float(cuda.to_cpu(loss.data))
        clock = time.clock()
        print "train", clock - last_clock
        last_clock = clock
        if use_double_dqn:
            update_target_iteration += 1
        current_term_size = min(current_term_size * term_increase_rate, max_term_size)
        print "current_term_size ", current_term_size
def check_backward(self, x_data, t_data, gy_data):
    x = chainer.Variable(x_data)
    t = chainer.Variable(t_data)
    y = functions.select_item(x, t)
    y.grad = gy_data
    y.backward()
    self.assertEqual(None, t.grad)

    func = y.creator
    f = lambda: func.forward((x.data, t.data))
    gx, = gradient_check.numerical_grad(f, (x.data,), (gy_data,), eps=0.01)
    gradient_check.assert_allclose(gx, x.grad)
def f(x):
    y = functions.select_item(x, t_data)
    return y * y
def __call__(self, X, Y, A, Q):
    P = Q(X)
    P = F.select_item(P, Variable(np.array(A).astype('int32')))
    return F.mean_squared_error(Y, P)
def sampled_actions_log_probs(self):
    return F.select_item(
        self.log_probs,
        chainer.Variable(np.asarray(self.action_indices, dtype=np.int32)))
def compute_loss(self, s, a, r, new_s, done, loss_log=False):
    if self.net_type == "full":
        s = s.reshape(self.batch_size, self.input_slides * self.size * self.size)
        new_s = new_s.reshape(self.batch_size, self.input_slides * self.size * self.size)
    # gpu
    if self.gpu >= 0:
        s = cuda.to_gpu(s)
        new_s = cuda.to_gpu(new_s)
    if chainer.__version__ >= "2.0.0":
        s = Variable(s)
        new_s = Variable(new_s)
    else:
        s = Variable(s, volatile='auto')
        new_s = Variable(new_s, volatile='auto')

    q_value = self.q(s)
    with chainer.no_backprop_mode():
        if self.mode == "regularize":
            tg_q_value = self.q(new_s)
        elif self.mode == "target_mix":
            tg_q_value = (1.0 - self.mix_rate) * self.q(new_s) \
                + self.mix_rate * self.fixed_q(new_s)
        elif self.mode == "default":
            tg_q_value = self.fixed_q(new_s)

    if self.gpu >= 0:
        a = cuda.to_gpu(a)
        r = cuda.to_gpu(r)
        done = cuda.to_gpu(done)
    if chainer.__version__ >= "2.0.0":
        a = Variable(a)
    else:
        a = Variable(a, volatile='auto')

    argmax_a = F.argmax(tg_q_value, axis=1)
    q_action_value = F.select_item(q_value, a)
    target = r + self.discount * (1.0 - done) * F.select_item(tg_q_value, argmax_a)
    # target is float32
    q_action_value = F.reshape(q_action_value, (-1, 1))
    target = F.reshape(target, (-1, 1))
    loss_sum = F.sum(F.huber_loss(q_action_value, target, delta=1.0))
    loss = loss_sum / q_action_value.shape[0]

    if self.mode == "regularize" or loss_log == True:
        if self.penalty_function == "value":
            y = q_value
            with chainer.no_backprop_mode():
                t = self.fixed_q(s)
        if self.penalty_function == "action_value":
            y = q_action_value
            with chainer.no_backprop_mode():
                t = F.select_item(self.fixed_q(s), a)
                t = F.reshape(t, (-1, 1))
        if self.penalty_function == "max_action_value":
            y = F.select_item(self.q(new_s), argmax_a)
            y = F.reshape(y, (-1, 1))
            with chainer.no_backprop_mode():
                t = F.select_item(self.fixed_q(new_s), argmax_a)
                t = F.reshape(t, (-1, 1))
        if self.penalty_type == "huber":
            if self.final_penalty_cut == 1:
                penalty_sum = F.sum((1.0 - done) * F.huber_loss(y, t, delta=1.0))
            else:
                penalty_sum = F.sum(F.huber_loss(y, t, delta=1.0))
            penalty = penalty_sum / (y.shape[0] * y.shape[1])
        if self.penalty_type == "mean_squared":
            penalty = F.mean_squared_error(y, t)
        if loss_log == True:
            return loss, penalty
        if penalty.data > self.threshold:
            loss = loss + self.penalty_weight * penalty
    return loss