def compute_double_q_learning_loss(self, l_obs, l_act, l_rew, l_next_obs, l_done):
    """
    :param l_obs: A chainer variable holding a list of observations. Should be of shape N * |S|.
    :param l_act: A chainer variable holding a list of actions. Should be of shape N.
    :param l_rew: A chainer variable holding a list of rewards. Should be of shape N.
    :param l_next_obs: A chainer variable holding a list of observations at the next time step. Should be of shape N * |S|.
    :param l_done: A chainer variable holding a list of binary values (indicating whether episode ended after this time step). Should be of shape N.
    :return: A chainer variable holding a scalar loss.
    """
    # Hint: You may want to make use of the following fields: self._discount, self._q, self._qt
    # Hint2: Q-function can be called by self._q.forward(argument)
    # Hint3: You might also find https://docs.chainer.org/en/stable/reference/generated/chainer.functions.select_item.html useful
    "*** YOUR CODE HERE ***"
    reward = F.cast(l_rew, np.float32)
    q_forwarded = self._q.forward(l_next_obs)
    qt_forwarded = self._qt.forward(l_next_obs)
    y_non_terminal = reward + self._discount * F.select_item(
        qt_forwarded, F.argmax(q_forwarded, axis=1))
    y_terminal = reward
    y = F.select_item(F.stack([y_non_terminal, y_terminal], axis=1),
                      F.cast(l_done, np.int32))
    Q = F.select_item(self._q.forward(l_obs), l_act)
    return F.mean(F.square(y - Q))
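# A minimal standalone sketch (assumption: only numpy and chainer are
# installed; the array values are toy numbers, not from any snippet here) of
# what chainer.functions.select_item does, since every snippet in this
# section leans on it: given x of shape (N, K) and integer indices t of
# shape (N,), it returns x[i, t[i]] for each row i.
import numpy as np
import chainer.functions as F

q_toy = np.array([[1.0, 2.0],
                  [3.0, 4.0],
                  [5.0, 6.0]], dtype=np.float32)
actions_toy = np.array([1, 0, 1], dtype=np.int32)

picked = F.select_item(q_toy, actions_toy)
print(picked.array)  # [2. 3. 6.]

# Double DQN pairs an argmax under one network with evaluation under
# another; with a single array, the pattern degenerates to a row-wise max.
greedy = F.argmax(q_toy, axis=1)
print(F.select_item(q_toy, greedy).array)  # [2. 4. 6.]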
def _compute_q_y(self, batch_state, batch_action, batch_reward, batch_done, batch_next_state):
    with chainer.no_backprop_mode():
        batch_max = F.argmax(self.network(batch_next_state), axis=1)
        batch_target_q = self.target_network(batch_next_state)
        batch_y = batch_reward + self.discount * (1 - batch_done) * \
            F.select_item(batch_target_q, batch_max)
    batch_q = F.select_item(self.network(batch_state), batch_action)
    return batch_y, batch_q
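# A hedged sketch of how a helper like _compute_q_y above is typically
# consumed (assumption: the method name, self.optimizer, and the batch keys
# here are hypothetical illustrations, not taken from the snippet).
import chainer.functions as F

def _update_from_batch(self, batch):
    batch_y, batch_q = self._compute_q_y(
        batch['state'], batch['action'], batch['reward'],
        batch['done'], batch['next_state'])
    # Elementwise Huber loss averaged over the batch, as DQN
    # implementations commonly do.
    loss = F.mean(F.huber_loss(batch_q, batch_y, delta=1.0, reduce='no'))
    self.network.cleargrads()
    loss.backward()
    self.optimizer.update()
    return float(loss.array)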
def compute_loss(self, input_vocab, output_vocab, window_words, hidden_states):
    g, rnn_distribution, a = self.decode_one_step(input_vocab, window_words, hidden_states)
    # define p_vocab as 0 if output word is not in vocab
    p_vocab = F.select_item(
        rnn_distribution,
        xp.array([self.vocab[output_vocab]], dtype=xp.int32)
    ) if output_vocab in self.vocab else Variable(xp.array([0.0], dtype=xp.float32))
    # compute cross entropy
    indexes = [i for i, x in enumerate(window_words) if x == output_vocab]
    exist_var = Variable(xp.array([0], dtype=xp.float32))
    for idx in indexes:
        exist_var += F.select_item(a, xp.array([idx], dtype=xp.int32))
    p_ptr = F.cast(exist_var, xp.float32) if indexes else Variable(
        xp.array([0.0], dtype=xp.float32))
    cross_entropy = -F.log(
        F.linear_interpolate(g, p_vocab, p_ptr)
        + Variable(xp.array([0.01], dtype=xp.float32)))
    # compute attention loss
    attention_loss = F.cast(-F.log(g + exist_var), xp.float32) if indexes else Variable(
        xp.array([0.0], dtype=xp.float32))
    return cross_entropy + attention_loss
def update(Q, target_Q, opt, samples, gamma=0.99, target_type='double_dqn'):
    """Update a Q-function with given samples and a target Q-function."""
    dtype = chainer.get_dtype()
    xp = Q.xp
    obs = xp.asarray([sample[0] for sample in samples], dtype=dtype)
    action = xp.asarray([sample[1] for sample in samples], dtype=np.int32)
    reward = xp.asarray([sample[2] for sample in samples], dtype=dtype)
    done = xp.asarray([sample[3] for sample in samples], dtype=dtype)
    obs_next = xp.asarray([sample[4] for sample in samples], dtype=dtype)
    # Predicted values: Q(s,a)
    y = F.select_item(Q(obs), action)
    # Target values: r + gamma * max_b Q(s',b)
    with chainer.no_backprop_mode():
        if target_type == 'dqn':
            next_q = F.max(target_Q(obs_next), axis=1)
        elif target_type == 'double_dqn':
            next_q = F.select_item(target_Q(obs_next),
                                   F.argmax(Q(obs_next), axis=1))
        else:
            raise ValueError('Unsupported target_type: {}'.format(target_type))
        target = reward + gamma * (1 - done) * next_q
    loss = mean_clipped_loss(y, target)
    Q.cleargrads()
    loss.backward()
    opt.update()
def compute_double_q_learning_loss(self, l_obs, l_act, l_rew, l_next_obs, l_done):
    """
    :param l_obs: A chainer variable holding a list of observations. Should be of shape N * |S|.
    :param l_act: A chainer variable holding a list of actions. Should be of shape N.
    :param l_rew: A chainer variable holding a list of rewards. Should be of shape N.
    :param l_next_obs: A chainer variable holding a list of observations at the next time step. Should be of shape N * |S|.
    :param l_done: A chainer variable holding a list of binary values (indicating whether episode ended after this time step). Should be of shape N.
    :return: A chainer variable holding a scalar loss.
    """
    # Hint: You may want to make use of the following fields: self._discount, self._q, self._qt
    # Hint2: Q-function can be called by self._q.forward(argument)
    # Hint3: You might also find https://docs.chainer.org/en/stable/reference/generated/chainer.functions.select_item.html useful
    "*** YOUR CODE HERE ***"
    # Compute Target
    action = F.argmax(self._q.forward(l_next_obs), axis=1)
    qt_vals = self._qt.forward(l_next_obs)
    qt_vals = F.select_item(qt_vals, action)
    y = l_rew + (1 - l_done) * self._discount * qt_vals
    # Compute Q
    q = self._q.forward(l_obs)
    q = F.select_item(q, l_act)
    # Compute Loss
    loss = F.mean_squared_error(y, q)
    return loss
def compute_double_q_learning_loss(self, l_obs, l_act, l_rew, l_next_obs, l_done):
    """
    :param l_obs: A chainer variable holding a list of observations. Should be of shape N * |S|.
    :param l_act: A chainer variable holding a list of actions. Should be of shape N.
    :param l_rew: A chainer variable holding a list of rewards. Should be of shape N.
    :param l_next_obs: A chainer variable holding a list of observations at the next time step. Should be of shape N * |S|.
    :param l_done: A chainer variable holding a list of binary values (indicating whether episode ended after this time step). Should be of shape N.
    :return: A chainer variable holding a scalar loss.
    """
    # Hint: You may want to make use of the following fields: self._discount, self._q, self._qt
    # Hint2: Q-function can be called by self._q.forward(argument)
    # Hint3: You might also find https://docs.chainer.org/en/stable/reference/generated/chainer.functions.select_item.html useful
    "*** YOUR CODE HERE ***"
    target_net_out = self._qt.forward(l_next_obs)
    current_q_val = self._q.forward(l_obs)
    current_q_val_a = F.select_item(current_q_val, l_act)
    next_q_val = self._q.forward(l_next_obs)
    max_val_action = F.argmax(next_q_val, 1)
    target_net_out_max = F.select_item(target_net_out, max_val_action)
    target_vals = l_rew + self._discount * target_net_out_max * (1 - l_done)
    loss_vec = target_vals - current_q_val_a
    loss = F.average(F.square(loss_vec))
    return loss
def train(self, x, cluster_array, class_array, partition):
    """
    :param x: instance in a minibatch
    :param cluster_array: the array of instances' cluster
    :param class_array: the array of instances' class (class means the number in the cluster)
    :param partition:
    :return: xp array, xp array
    """
    h = self.model.conv(x)
    cluster_output = F.softmax(self.model.cluster(h))
    cluster_output = F.select_item(cluster_output, cluster_array)
    class_output = None
    for cluster in range(self.num_clusters):
        if partition[cluster] == partition[cluster + 1]:
            continue
        output = F.softmax(self[cluster](h[partition[cluster]:partition[cluster + 1]]))
        output = F.select_item(output, class_array[partition[cluster]:partition[cluster + 1]])
        if class_output is None:
            class_output = output
        else:
            class_output = F.concat((class_output, output), axis=0)
    return cluster_output * class_output, cluster_output, class_output
def compute_double_q_learning_loss(self, l_obs, l_act, l_rew, l_next_obs, l_done):
    """
    :param l_obs: A chainer variable holding a list of observations. Should be of shape N * |S|.
    :param l_act: A chainer variable holding a list of actions. Should be of shape N.
    :param l_rew: A chainer variable holding a list of rewards. Should be of shape N.
    :param l_next_obs: A chainer variable holding a list of observations at the next time step. Should be of shape N * |S|.
    :param l_done: A chainer variable holding a list of binary values (indicating whether episode ended after this time step). Should be of shape N.
    :return: A chainer variable holding a scalar loss.
    """
    # Hint: You may want to make use of the following fields: self._discount, self._q, self._qt
    # Hint2: Q-function can be called by self._q.forward(argument)
    # Hint3: You might also find https://docs.chainer.org/en/stable/reference/generated/chainer.functions.select_item.html useful
    # This is the same as the last exercise, but we take the argmax of Q,
    # then apply it to Qt.
    q_forwarded = self._q.forward(l_next_obs)
    qt_forwarded = self._qt.forward(l_next_obs)
    a_q_forwarded = F.argmax(q_forwarded, axis=1)
    Q_dual = F.select_item(qt_forwarded, a_q_forwarded)
    y = l_rew + (1 - l_done) * (self._discount * Q_dual)
    Q = F.select_item(self._q.forward(l_obs), l_act)
    loss = F.mean(F.square(y - Q))
    return loss
def compute_double_q_learning_loss(self, l_obs, l_act, l_rew, l_next_obs, l_done):
    """
    :param l_obs: A chainer variable holding a list of observations. Should be of shape N * |S|.
    :param l_act: A chainer variable holding a list of actions. Should be of shape N.
    :param l_rew: A chainer variable holding a list of rewards. Should be of shape N.
    :param l_next_obs: A chainer variable holding a list of observations at the next time step. Should be of shape N * |S|.
    :param l_done: A chainer variable holding a list of binary values (indicating whether episode ended after this time step). Should be of shape N.
    :return: A chainer variable holding a scalar loss.
    """
    # Hint: You may want to make use of the following fields: self._discount, self._q, self._qt
    # Hint2: Q-function can be called by self._q.forward(argument)
    # Hint3: You might also find https://docs.chainer.org/en/stable/reference/generated/chainer.functions.select_item.html useful
    obs_q_value = F.select_item(self._q.forward(l_obs), l_act)
    # Build the per-sample targets one transition at a time (slow but explicit).
    # Note: l_rew and l_done are chainer variables, so their raw values are
    # read through .data when filling the numpy target array.
    target_q_value = np.zeros(l_done.shape[0])
    for i in range(l_done.shape[0]):
        if l_done.data[i]:
            target_q_value[i] = l_rew.data[i]
        else:
            q_value_next = self._q.forward(
                F.expand_dims(l_next_obs[i], axis=0))
            max_idx = F.argmax(q_value_next)
            target_value = self._qt.forward(
                F.expand_dims(l_next_obs[i], axis=0))
            max_value = F.select_item(target_value, np.array([max_idx.data]))
            target_q_value[i] = l_rew.data[i] + self._discount * max_value.data
    loss = F.mean_squared_error(F.cast(target_q_value, np.float32),
                                F.cast(obs_q_value, np.float32))
    return loss
def __update(self, _q, _qTarget, optimiser, samples, gamma=0.99):
    """Update a Q-function with given samples and a target Q-function."""
    # self.__debug("Running update...")
    currentStates = _q.xp.asarray(samples["states"], dtype=np.float32)
    actions = _q.xp.asarray(samples["actions"], dtype=np.int32)
    rewards = _q.xp.asarray(samples["rewards"], dtype=np.float32)
    completes = _q.xp.asarray(samples["completes"], dtype=np.float32)
    nextState = _q.xp.asarray(samples["nextStates"], dtype=np.float32)

    # Predicted values: Q(s,a)
    predictions = functions.select_item(_q(currentStates), actions)

    # Target values: r + gamma * max_b Q(s',b)
    with chainer.no_backprop_mode():
        if self.__doubleDQN:
            _qNext = functions.select_item(
                _qTarget(nextState),
                functions.argmax(_q(nextState), axis=1))
        else:
            _qNext = functions.max(_qTarget(nextState), axis=1)
        target = rewards + gamma * (1 - completes) * _qNext

    loss = functions.mean(
        functions.huber_loss(predictions, target, delta=1.0, reduce='no'))
    _q.cleargrads()
    loss.backward()
    optimiser.update()
def compute_double_q_learning_loss(self, l_obs, l_act, l_rew, l_next_obs, l_done):
    """
    :param l_obs: A chainer variable holding a list of observations. Should be of shape N * |S|.
    :param l_act: A chainer variable holding a list of actions. Should be of shape N.
    :param l_rew: A chainer variable holding a list of rewards. Should be of shape N.
    :param l_next_obs: A chainer variable holding a list of observations at the next time step. Should be of shape N * |S|.
    :param l_done: A chainer variable holding a list of binary values (indicating whether episode ended after this time step). Should be of shape N.
    :return: A chainer variable holding a scalar loss.
    """
    # Hint: You may want to make use of the following fields: self._discount, self._q, self._qt
    # Hint2: Q-function can be called by self._q.forward(argument)
    # Hint3: You might also find https://docs.chainer.org/en/stable/reference/generated/chainer.functions.select_item.html useful
    "*** YOUR CODE HERE ***"
    # This is the same as the last exercise, but we take the argmax of Q,
    # then apply it to Qt.
    # First the normal Q
    Q_s_a = F.select_item(self._q.forward(l_obs), l_act)  # current quality, current state, current best action
    Q_sn = self._q.forward(l_next_obs)  # current quality, next state
    a_q_sn = F.argmax(Q_sn, axis=1)  # action that's ideal on current Q, next state
    Qt_sn = self._qt.forward(l_next_obs)  # next quality, next state
    # Now the dual Q: the next quality, for the next state, but taking the
    # action that was best for the current quality in the next state.
    # Can be hard to keep track of!
    Q_dual = F.select_item(Qt_sn, a_q_sn)
    y = l_rew + (1 - l_done) * (self._discount * Q_dual)
    loss = F.mean((y - Q_s_a) ** 2)
    return loss
def compute_double_q_learning_loss(self, l_obs, l_act, l_rew, l_next_obs, l_done):
    """
    :param l_obs: A chainer variable holding a list of observations. Should be of shape N * |S|.
    :param l_act: A chainer variable holding a list of actions. Should be of shape N.
    :param l_rew: A chainer variable holding a list of rewards. Should be of shape N.
    :param l_next_obs: A chainer variable holding a list of observations at the next time step. Should be of shape N * |S|.
    :param l_done: A chainer variable holding a list of binary values (indicating whether episode ended after this time step). Should be of shape N.
    :return: A chainer variable holding a scalar loss.
    """
    # Hint: You may want to make use of the following fields: self._discount, self._q, self._qt
    # Hint2: Q-function can be called by self._q.forward(argument)
    # Hint3: You might also find https://docs.chainer.org/en/stable/reference/generated/chainer.functions.select_item.html useful
    feed_forward_learner = self._q.forward(l_obs)
    q_learner = F.select_item(feed_forward_learner, l_act)
    action_q_values = self._q.forward(l_next_obs)
    best_action = F.argmax(action_q_values, axis=1)
    feed_forward_target = self._qt.forward(l_next_obs)
    q_target = F.select_item(feed_forward_target, best_action)
    terminate = F.cast(l_done, bool)
    l_rew = F.cast(l_rew, "float32")
    final_target = F.where(terminate, l_rew,
                           l_rew + self._discount * q_target).data
    loss = F.mean_squared_error(final_target, q_learner)
    return loss
def compute_double_q_learning_loss(self, l_obs, l_act, l_rew, l_next_obs, l_done):
    """
    :param l_obs: A chainer variable holding a list of observations. Should be of shape N * |S|.
    :param l_act: A chainer variable holding a list of actions. Should be of shape N.
    :param l_rew: A chainer variable holding a list of rewards. Should be of shape N.
    :param l_next_obs: A chainer variable holding a list of observations at the next time step. Should be of shape N * |S|.
    :param l_done: A chainer variable holding a list of binary values (indicating whether episode ended after this time step). Should be of shape N.
    :return: A chainer variable holding a scalar loss.
    """
    # Hint: You may want to make use of the following fields: self._discount, self._q, self._qt
    # Hint2: Q-function can be called by self._q.forward(argument)
    # Hint3: You might also find https://docs.chainer.org/en/stable/reference/generated/chainer.functions.select_item.html useful
    "*** YOUR CODE HERE ***"
    q_values = self._q.forward(l_next_obs)
    maxes = F.argmax(q_values, axis=1)
    q_values = self._qt.forward(l_next_obs)
    Qs = F.select_item(q_values, maxes)
    target = l_rew + (1 - l_done) * self._discount * Qs
    Q_s = F.select_item(self._q.forward(l_obs), l_act)
    loss = F.mean_squared_error(target, Q_s)
    return loss
def compute_double_q_learning_loss(self, l_obs, l_act, l_rew, l_next_obs, l_done):
    """
    :param l_obs: A chainer variable holding a list of observations. Should be of shape N * |S|.
    :param l_act: A chainer variable holding a list of actions. Should be of shape N.
    :param l_rew: A chainer variable holding a list of rewards. Should be of shape N.
    :param l_next_obs: A chainer variable holding a list of observations at the next time step. Should be of shape N * |S|.
    :param l_done: A chainer variable holding a list of binary values (indicating whether episode ended after this time step). Should be of shape N.
    :return: A chainer variable holding a scalar loss.
    """
    # Hint: You may want to make use of the following fields: self._discount, self._q, self._qt
    # Hint2: Q-function can be called by self._q.forward(argument)
    # Hint3: You might also find
    # https://docs.chainer.org/en/stable/reference/generated/chainer.functions.select_item.html
    # useful
    # compute target q value named y
    q_next = self._qt.forward(l_next_obs)
    a_next = F.argmax(self._q.forward(l_next_obs), axis=1)
    q_act_next = F.select_item(q_next, a_next)
    y = l_rew + self._discount * q_act_next * (1 - l_done)
    # compute mean square loss function
    q = self._q.forward(l_obs)
    q_act = F.select_item(q, l_act)
    loss = F.mean(F.square(q_act - y))
    return loss
def compute_double_q_learning_loss(self, l_obs, l_act, l_rew, l_next_obs, l_done):
    """
    :param l_obs: A chainer variable holding a list of observations. Should be of shape N * |S|.
    :param l_act: A chainer variable holding a list of actions. Should be of shape N.
    :param l_rew: A chainer variable holding a list of rewards. Should be of shape N.
    :param l_next_obs: A chainer variable holding a list of observations at the next time step. Should be of shape N * |S|.
    :param l_done: A chainer variable holding a list of binary values (indicating whether episode ended after this time step). Should be of shape N.
    :return: A chainer variable holding a scalar loss.
    """
    # Hint: You may want to make use of the following fields: self._discount, self._q, self._qt
    # Hint2: Q-function can be called by self._q.forward(argument)
    # Hint3: You might also find https://docs.chainer.org/en/stable/reference/generated/chainer.functions.select_item.html useful
    "*** YOUR CODE HERE ***"
    ####################################################################################################
    target_action = F.argmax(self._q.forward(l_next_obs), axis=1)
    y = l_rew + (1 - l_done) * self._discount * F.select_item(
        self._qt.forward(l_next_obs), target_action)
    q = F.select_item(self._q.forward(l_obs), l_act)  # same as before
    loss = F.mean_squared_error(y, q)  # same as before
    # Based on the performance of the Double DQN algorithm, we agree that the performance gain is
    # not obvious (refer to "Results.pdf"). We would even add that its improvement in performance is
    # less stable.
    ####################################################################################################
    return loss
def compute_double_q_learning_loss(self, l_obs, l_act, l_rew, l_next_obs, l_done):
    """
    :param l_obs: A chainer variable holding a list of observations. Should be of shape N * |S|.
    :param l_act: A chainer variable holding a list of actions. Should be of shape N.
    :param l_rew: A chainer variable holding a list of rewards. Should be of shape N.
    :param l_next_obs: A chainer variable holding a list of observations at the next time step. Should be of shape N * |S|.
    :param l_done: A chainer variable holding a list of binary values (indicating whether episode ended after this time step). Should be of shape N.
    :return: A chainer variable holding a scalar loss.
    """
    # Hint: You may want to make use of the following fields: self._discount, self._q, self._qt
    # Hint2: Q-function can be called by self._q.forward(argument)
    # Hint3: You might also find https://docs.chainer.org/en/stable/reference/generated/chainer.functions.select_item.html useful
    "*** YOUR CODE HERE ***"
    # Find the quality of this state and this action.
    Q = F.select_item(self._q.forward(l_obs), l_act)
    # Find the quality of the next state using our current model.
    Q_next = self._q.forward(l_next_obs)
    action = F.argmax(Q_next, axis=1)
    # Find the greedy quality values with the next state.
    Q_target = self._qt.forward(l_next_obs)
    # The dual Q: the next quality for the next state, but using the
    # actions that were best for the current quality in the next state.
    dual_q = F.select_item(Q_target, action)
    # Find y.
    y = l_rew + (1 - l_done) * (self._discount * dual_q)
    loss = F.mean((y - Q) ** 2)
    return loss
def compute_double_q_learning_loss(self, l_obs, l_act, l_rew, l_next_obs, l_done):
    """
    :param l_obs: A chainer variable holding a list of observations. Should be of shape N * |S|.
    :param l_act: A chainer variable holding a list of actions. Should be of shape N.
    :param l_rew: A chainer variable holding a list of rewards. Should be of shape N.
    :param l_next_obs: A chainer variable holding a list of observations at the next time step. Should be of shape N * |S|.
    :param l_done: A chainer variable holding a list of binary values (indicating whether episode ended after this time step). Should be of shape N.
    :return: A chainer variable holding a scalar loss.
    """
    # Hint: You may want to make use of the following fields: self._discount, self._q, self._qt
    # Hint2: Q-function can be called by self._q.forward(argument)
    # Hint3: You might also find https://docs.chainer.org/en/stable/reference/generated/chainer.functions.select_item.html useful
    "*** YOUR CODE HERE ***"
    N = l_obs.data.shape[0]
    # set gamma to zero for those states that are terminal
    discounts = np.array(
        [0.0 if l_done.data[i] else self._discount for i in range(N)])
    # get Q value estimate from Q-target network
    q_indices = F.argmax(self._q.forward(l_next_obs), axis=1)
    y = l_rew + discounts * F.select_item(self._qt.forward(l_next_obs), q_indices)
    # compute the loss using the current Q-network on the taken actions
    loss = F.mean((y - F.select_item(self._q.forward(l_obs), l_act)) ** 2)
    return loss
def compute_double_q_learning_loss(self, l_obs, l_act, l_rew, l_next_obs, l_done):
    """
    :param l_obs: A chainer variable holding a list of observations. Should be of shape N * |S|.
    :param l_act: A chainer variable holding a list of actions. Should be of shape N.
    :param l_rew: A chainer variable holding a list of rewards. Should be of shape N.
    :param l_next_obs: A chainer variable holding a list of observations at the next time step. Should be of shape N * |S|.
    :param l_done: A chainer variable holding a list of binary values (indicating whether episode ended after this time step). Should be of shape N.
    :return: A chainer variable holding a scalar loss.
    """
    # Hint: You may want to make use of the following fields: self._discount, self._q, self._qt
    # Hint2: Q-function can be called by self._q.forward(argument)
    # Hint3: You might also find https://docs.chainer.org/en/stable/reference/generated/chainer.functions.select_item.html useful
    l_rew = F.cast(l_rew, np.float32)
    q_future = self._q.forward(l_next_obs)
    qt_future = self._qt.forward(l_next_obs)
    future_rew = l_rew + self._discount * F.select_item(
        qt_future, F.argmax(q_future, axis=1))
    target = F.select_item(F.stack([future_rew, l_rew], axis=1),
                           F.cast(l_done, np.int32))
    y = F.select_item(self._q.forward(l_obs), l_act)
    return F.mean(F.square(y - target))
def _compute_q_loss(self, batch):
    """B(D, r)"""
    batch_reward = self.xp.concatenate([
        self.xp.ones_like(batch['reward'][:self.minibatch_size]),
        self.xp.zeros_like(batch['reward'][self.minibatch_size:])
    ], axis=0)
    batch_state = batch['state']
    batch_next_state = batch['next_state']
    batch_actions = batch['action']
    batch_discount = batch['discount']
    batch_terminal = batch['is_state_terminal']
    batch_absorb = batch['is_state_absorb']
    batch_next_absorb = batch['is_next_state_absorb']

    with chainer.no_backprop_mode(), chainer.using_config('train', False):
        target_next_v = self._calc_target_v(batch_next_state, batch_next_absorb)
        if self.reward_func:
            # reward for gan
            D = F.sigmoid(
                self.reward_func(batch_state, batch_absorb, batch_actions))
            batch_reward = F.flatten(
                F.log(D + 1e-8) - F.log(1 - D + 1e-8)
            )  # + 0.5 * batch_reward / self.temperature / self.lamda
            batch_reward = F.flatten(batch_reward)
            self.reward_demo_record.extend(
                cuda.to_cpu(batch_reward.array[:self.minibatch_size]))
            self.reward_samp_record.extend(
                cuda.to_cpu(batch_reward.array[self.minibatch_size:]))
        target_q = batch_reward + batch_discount * \
            (1.0 - batch_terminal) * target_next_v

    if self.is_discrete:
        predict_q1 = F.flatten(
            F.select_item(self.q_func1(batch_state, batch_absorb),
                          batch_actions))
        predict_q2 = F.flatten(
            F.select_item(self.q_func2(batch_state, batch_absorb),
                          batch_actions))
    else:
        predict_q1 = F.flatten(
            self.q_func1(batch_state, batch_absorb, batch_actions))
        predict_q2 = F.flatten(
            self.q_func2(batch_state, batch_absorb, batch_actions))

    # soft bellman error
    loss1 = 0.5 * F.mean_squared_error(target_q, predict_q1)
    loss2 = 0.5 * F.mean_squared_error(target_q, predict_q2)

    self.q1_record.extend(cuda.to_cpu(predict_q1.array))
    self.q2_record.extend(cuda.to_cpu(predict_q2.array))
    return loss1, loss2
def check_value_check(self, x_data, t_data):
    x = chainer.Variable(x_data)
    t = chainer.Variable(t_data)

    if self.valid:
        # Check if it throws nothing
        functions.select_item(x, t)
    else:
        with self.assertRaises(ValueError):
            functions.select_item(x, t)
def u2a(u):
    # u, a: (N * 1) Variable
    N = len(u.data)
    phi = np.argsort(u.data.reshape(N))  # u.data[phi]: ascending
    a_list = [0] * N
    cumprod = Variable(np.array([[1.0]]).astype(np.float32))
    for i in range(N):
        a_list[phi[i]] = cumprod * (
            1.0 - F.reshape(F.select_item(F.transpose(u), np.array([phi[i]])), (1, 1)))
        cumprod *= F.reshape(F.select_item(F.transpose(u), np.array([phi[i]])), (1, 1))
    return F.concat(a_list, 0)  # concat vertically
def update_q_func(self, batch):
    """Compute loss for a given Q-function."""
    batch_next_state = batch['next_state']
    batch_rewards = batch['reward']
    batch_terminal = batch['is_state_terminal']
    batch_state = batch['state']
    batch_actions = batch['action']
    batch_discount = batch['discount']

    with chainer.no_backprop_mode(), chainer.using_config('train', False):
        next_action_distrib = self.policy(batch_next_state)
        next_actions, next_log_prob = \
            next_action_distrib.sample_with_log_prob()
        entropy_term = self.temperature * next_log_prob
        if self.is_discrete:
            next_q1 = F.select_item(self.target_q_func1(batch_next_state),
                                    next_actions)
            next_q2 = F.select_item(self.target_q_func2(batch_next_state),
                                    next_actions)
        else:
            next_q1 = self.target_q_func1(batch_next_state, next_actions)
            next_q2 = self.target_q_func2(batch_next_state, next_actions)
            entropy_term = entropy_term[..., None]
        next_q = F.minimum(next_q1, next_q2)
        assert next_q.shape == entropy_term.shape
        target_q = batch_rewards + batch_discount * \
            (1.0 - batch_terminal) * F.flatten(next_q - entropy_term)

    if self.is_discrete:
        predict_q1 = F.flatten(
            F.select_item(self.q_func1(batch_state), batch_actions))
        predict_q2 = F.flatten(
            F.select_item(self.q_func2(batch_state), batch_actions))
    else:
        predict_q1 = F.flatten(self.q_func1(batch_state, batch_actions))
        predict_q2 = F.flatten(self.q_func2(batch_state, batch_actions))

    loss1 = 0.5 * F.mean_squared_error(target_q, predict_q1)
    loss2 = 0.5 * F.mean_squared_error(target_q, predict_q2)

    # Update stats
    self.q1_record.extend(cuda.to_cpu(predict_q1.array))
    self.q2_record.extend(cuda.to_cpu(predict_q2.array))
    self.q_func1_loss_record.append(float(loss1.array))
    self.q_func2_loss_record.append(float(loss2.array))

    self.q_func1_optimizer.update(lambda: loss1)
    self.q_func2_optimizer.update(lambda: loss2)
def check_forward(self, x_data, t_data):
    x = chainer.Variable(x_data)
    t = chainer.Variable(t_data)
    y = functions.select_item(x, t)
    y_exp = cuda.to_cpu(x_data)[range(t_data.size), cuda.to_cpu(t_data)]
    numpy.testing.assert_equal(cuda.to_cpu(y.data), y_exp)
def train(self, x, y, actions=None):
    actions = actions.astype(np.int32)
    batch_size = len(actions)
    if self._gpu_device:
        x = cuda.to_gpu(x, self._gpu_device)
        y = cuda.to_gpu(y, self._gpu_device)
        actions = cuda.to_gpu(actions, self._gpu_device)
    q = self._model(x)
    q_subset = F.reshape(F.select_item(q, actions), (batch_size, 1))
    y = y.reshape(batch_size, 1)
    loss = F.sum(F.huber_loss(q_subset, y, 1.0))
    self._model.cleargrads()
    loss.backward()
    self._optimizer.update()
    self._loss_val = np.asscalar(cuda.to_cpu(loss.data))
    # Keeps track of the number of train() calls
    self._steps += 1
    if self._steps % self._target_update_interval == 0:
        # copy weights
        self._target.copyparams(self._model)
def compute_batch_loss(self, batch, weights):
    """Compute gradients on a list of trajectories.

    Args:
    batch -- a TrajectoryBatch
    weights -- a list of weights for trajectories in the batch

    Returns a loss value
    """
    weights = self._xp.array(weights)
    for step, step_batch in batch.step_batches(self._gpu_device):
        policies, values = self._model(step_batch.states)
        values *= (1 - step_batch.terminals).reshape(values.shape)
        logprobs = F.select_item(policies, step_batch.actions)
        batch.set_logprobs_and_values(step, logprobs, values)
    losses = []
    for trajectory, logprobs, values in batch:
        losses.append(
            self.compute_trajectory_loss(trajectory, logprobs, values))
    losses = F.stack(losses)
    loss = F.average(losses * weights)
    loss.backward()
    return np.asscalar(cuda.to_cpu(loss.data))
def compute_loss(self, state, action, reward, next_state, episode_ends):
    batchsize = state.shape[0]
    xp = self.dqn.model.xp
    with chainer.using_config("train", True):
        q = self.dqn.compute_q_value(state)
    with chainer.no_backprop_mode():
        max_target_q_data = self.dqn.compute_target_q_value(next_state).data
        max_target_q_data = xp.amax(max_target_q_data, axis=1)
    t = reward + (1 - episode_ends) * self.discount_factor * max_target_q_data
    t = Variable(xp.reshape(t.astype(xp.float32), (-1, 1)))
    y = functions.reshape(functions.select_item(q, action), (-1, 1))
    if self.clip_loss:
        loss = functions.huber_loss(t, y, delta=1.0)
    else:
        loss = functions.mean_squared_error(t, y) / 2
    loss = functions.sum(loss)
    # check NaN
    loss_value = float(loss.data)
    if loss_value != loss_value:
        import pdb
        pdb.set_trace()
    return loss
def train_batch(self):
    j = np.random.permutation(
        min(self.frame, self.pool_size - self.train_term))[:self.batch_size] % self.pool_size
    j1 = j + 1
    s_j = (Variable(self.xp.asarray(self.state_pool[j].astype(np.float32))) / 127.5) - 1
    s_j1 = (Variable(
        self.xp.asarray(self.state_pool[j + 1].astype(np.float32))) / 127.5) - 1
    Qhat = self.target_q(s_j1, train=False)
    max_Q = cuda.to_cpu(F.max(Qhat, axis=1).data)
    # max_Q = cuda.to_cpu(self.xp.max(Qhat.data, axis=1))
    y_j = Variable(self.xp.asarray(
        self.reward_pool[j] + (1 - self.terminal_pool[j]) * self.gamma * max_Q))
    a_j = Variable(self.xp.asarray(self.action_pool[j]))
    qs = self.action_q(s_j)
    q_preds = F.select_item(qs, a_j)
    loss = F.mean_squared_error(y_j, q_preds)
    self.optimizer.zero_grads()
    loss.backward()
    loss.unchain_backward()
    self.optimizer.update()
    qp_cpu = qs.data
    print "Q", np.mean(q_preds.data)
    print "loss", loss.data
    print np.mean(qp_cpu, axis=0)
def _train_batch(self, j):
    j1 = j + 1
    s_j = (Variable(self.xp.asarray(self.state_pool[j].astype(np.float32))) / 127.5) - 1
    s_j1 = (Variable(
        self.xp.asarray(self.state_pool[j + 1].astype(np.float32))) / 127.5) - 1
    Qhat = self.target_q(s_j1, train=False)
    max_Q = cuda.to_cpu(F.max(Qhat, axis=1).data)
    # max_Q = cuda.to_cpu(self.xp.max(Qhat.data, axis=1))
    y_j = Variable(self.xp.asarray(
        self.reward_pool[j] + (1 - self.terminal_pool[j]) * self.gamma * max_Q))
    a_j = Variable(self.xp.asarray(self.action_pool[j]))
    qs = self.action_q(s_j)
    q_preds = F.select_item(qs, a_j)
    loss = F.mean_squared_error(y_j, q_preds)
    self.optimizer.zero_grads()
    res = loss.backward()
    loss.unchain_backward()
    self.optimizer.update()
    qp_cpu = qs.data
    # print "loss", loss.data
    # print np.mean(qp_cpu, axis=0)
    # print(res)
    return np.mean(cuda.to_cpu(q_preds.data))
def mylog_prob(self, x):
    n_batch, n_actions, h, w = self.all_log_prob.shape
    p_trans = F.transpose(self.all_log_prob, axes=(0, 2, 3, 1))
    p_trans = F.reshape(p_trans, (-1, n_actions))
    x_reshape = F.reshape(x, (1, -1))[0]
    selected_p = F.select_item(p_trans, x_reshape)
    return F.reshape(selected_p, (n_batch, 1, h, w))
def compute_q_learning_loss(self, l_obs, l_act, l_rew, l_next_obs, l_done):
    """
    :param l_obs: A chainer variable holding a list of observations. Should be of shape N * |S|.
    :param l_act: A chainer variable holding a list of actions. Should be of shape N.
    :param l_rew: A chainer variable holding a list of rewards. Should be of shape N.
    :param l_next_obs: A chainer variable holding a list of observations at the next time step. Should be of shape N * |S|.
    :param l_done: A chainer variable holding a list of binary values (indicating whether episode ended after this time step). Should be of shape N.
    :return: A chainer variable holding a scalar loss.
    """
    # Hint: You may want to make use of the following fields: self._discount, self._q, self._qt
    # Hint2: Q-function can be called by self._q.forward(argument)
    # Hint3: You might also find https://docs.chainer.org/en/stable/reference/generated/chainer.functions.select_item.html useful
    "*** YOUR CODE HERE ***"
    # Ideal next action per state, maximizing value.
    Qt_greedy = F.max(self._qt.forward(l_next_obs), -1)
    # Find y.
    y = l_rew + (1 - l_done) * (self._discount * Qt_greedy)
    # Find Q, our current model.
    Q = F.select_item(self._q.forward(l_obs), l_act)
    # Find the total loss from this iteration.
    loss = F.mean((y - Q) ** 2)
    return loss
def update_model(self):
    (s, action, reward, s_next, is_terminal) = self.memory.sample_minibatch(self.minibatch_size)

    # compute Q targets (max_a' Q_hat(s_next, a'))
    Q_hat = self.target_network(s_next)
    Q_hat_max = F.max(Q_hat, axis=1, keepdims=True)
    y = (1 - is_terminal) * self.gamma * Q_hat_max + reward

    # compute Q(s, action)
    Q = self.model_network(s)
    Q_subset = F.reshape(F.select_item(Q, action), (self.minibatch_size, 1))

    # compute Huber loss
    error = y - Q_subset
    loss_clipped = abs(error) * (abs(error.data) > 1) + (error ** 2) * (abs(error.data) <= 1)
    loss = F.sum(loss_clipped) / self.minibatch_size

    # perform model update
    self.model_network.zerograds()  # zero out the accumulated gradients in all network parameters
    loss.backward()
    self.optimizer.update()

    # target network tracks the model
    for dst, src in zip(self.target_network.params(), self.model_network.params()):
        dst.data = self.tau * src.data + (1 - self.tau) * dst.data

    return loss.data
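# For comparison with the hand-rolled clipped loss in update_model above: a
# toy check (made-up numbers, assuming only numpy and chainer) against the
# built-in F.huber_loss, which computes the smooth standard form
# 0.5 * e**2 for |e| <= delta and delta * (|e| - 0.5 * delta) otherwise.
# The two differ in scale and offset, but both cap the gradient magnitude
# for large errors, which is the point of the construction.
import numpy as np
import chainer.functions as F

error = np.array([[-2.0], [-0.5], [0.3], [3.0]], dtype=np.float32)

# Quadratic inside |e| <= 1, linear outside, exactly as in update_model.
hand_rolled = abs(error) * (abs(error) > 1) + (error ** 2) * (abs(error) <= 1)
huber = F.huber_loss(error, np.zeros_like(error), delta=1.0, reduce='no')
print(hand_rolled.ravel())  # [2.    0.25  0.09  3.  ]
print(huber.array.ravel())  # [1.5   0.125 0.045 2.5 ]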
def __call__(self, x, t, index):
    h = self.predict(x)
    self.history = np.append(self.history, np.array([np.mean(h.data, axis=0)]), axis=0)
    h = F.select_item(h, index)  # choose the action[index] in each column
    error_abs = abs(h - t)
    error = F.concat((F.expand_dims(error_abs ** 2, 1),
                      F.expand_dims(error_abs, 1)), axis=1)
    # 1 < error_abs <=> error_abs ** 2 > error_abs, error_abs < 1 <=> error_abs ** 2 < error_abs
    self.loss = F.sum(F.min(error, axis=1)) / np.float32(len(error_abs))
    return self.loss
def compute_double_q_learning_loss(self, l_obs, l_act, l_rew, l_next_obs, l_done):
    """
    :param l_obs: A chainer variable holding a list of observations. Should be of shape N * |S|.
    :param l_act: A chainer variable holding a list of actions. Should be of shape N.
    :param l_rew: A chainer variable holding a list of rewards. Should be of shape N.
    :param l_next_obs: A chainer variable holding a list of observations at the next time step. Should be of shape N * |S|.
    :param l_done: A chainer variable holding a list of binary values (indicating whether episode ended after this time step). Should be of shape N.
    :return: A chainer variable holding a scalar loss.
    """
    # Hint: You may want to make use of the following fields: self._discount, self._q, self._qt
    # Hint2: Q-function can be called by self._q.forward(argument)
    # Hint3: You might also find https://docs.chainer.org/en/stable/reference/generated/chainer.functions.select_item.html useful
    "*** YOUR CODE HERE ***"
    tar_act = F.argmax(self._q.forward(l_next_obs), axis=1)
    y = l_rew + (1 - l_done) * self._discount * F.select_item(
        self._qt.forward(l_next_obs), tar_act)
    q = F.select_item(self._q.forward(l_obs), l_act)
    loss = F.mean_squared_error(y, q)
    return loss
def read(address):
    # map from the reals to the hypercube of dimension n
    index = F.tanh(address)
    # map from a point to the nearest corner of the hypercube
    f = lambda x: x > 0
    # np.vectorize takes only the function; the data is passed to the
    # resulting callable (the original passed index.data to np.vectorize
    # itself, which is not a valid signature).
    mainIndex = np.vectorize(f, cache=True)(index.data)
    # `array` and `lookup` come from the enclosing scope.
    mainValue = F.select_item(array, lookup(mainIndex))
    # The original referenced an undefined `x` here; `index` is assumed.
    scaleFactor = F.exp(F.sum(F.log(F.absolute(index))))
    return mainValue * scaleFactor
def train():
    max_term_size = args.max_train_term
    current_term_size = args.train_term
    term_increase_rate = 1 + args.train_term_increase
    last_clock = time.clock()
    update_target_iteration = 0
    if use_double_dqn:
        target_q = q.copy()
        target_q.reset_state()
    while True:
        term_size = int(current_term_size)
        if frame < batch_size * term_size:
            continue
        batch_index = np.random.permutation(min(frame - term_size, POOL_SIZE))[:batch_size]
        train_image = Variable(xp.asarray(state_pool[batch_index]))
        y = q(train_image)
        if use_double_dqn and update_target_iteration >= update_target_interval:
            target_q = q.copy()
            target_q.reset_state()
            target_q(Variable(xp.asarray(state_pool[batch_index]), volatile=True))
            update_target_iteration = 0
        for term in range(term_size):
            next_batch_index = (batch_index + 1) % POOL_SIZE
            train_image = Variable(xp.asarray(state_pool[next_batch_index]))
            score = q(train_image)
            if only_result:
                t = Variable(xp.asarray(reward_pool[batch_index]))
            else:
                if use_double_dqn:
                    eval_image = Variable(xp.asarray(state_pool[next_batch_index]), volatile=True)
                    target_score = target_q(eval_image)
                    best_action = cuda.to_cpu(xp.argmax(score.data, axis=1))
                    best_q = cuda.to_cpu(target_score.data)[range(batch_size), best_action]
                else:
                    best_q = cuda.to_cpu(xp.max(score.data, axis=1))
                t = Variable(xp.asarray(reward_pool[batch_index] +
                                        (1 - terminal_pool[batch_index]) * gamma * best_q))
            action_index = chainer.Variable(xp.asarray(action_pool[batch_index]))
            loss = F.mean_squared_error(F.select_item(y, action_index), t)
            y = score
            optimizer.zero_grads()
            loss.backward()
            loss.unchain_backward()
            optimizer.update()
            batch_index = next_batch_index
            print "loss", float(cuda.to_cpu(loss.data))
        clock = time.clock()
        print "train", clock - last_clock
        last_clock = clock
        if use_double_dqn:
            update_target_iteration += 1
        current_term_size = min(current_term_size * term_increase_rate, max_term_size)
        print "current_term_size ", current_term_size
def check_backward(self, x_data, t_data, gy_data):
    x = chainer.Variable(x_data)
    t = chainer.Variable(t_data)
    y = functions.select_item(x, t)
    y.grad = gy_data
    y.backward()
    self.assertEqual(None, t.grad)

    func = y.creator
    f = lambda: func.forward((x.data, t.data))
    gx, = gradient_check.numerical_grad(f, (x.data,), (gy_data,), eps=0.01)
    gradient_check.assert_allclose(gx, x.grad)
def f(x):
    y = functions.select_item(x, t_data)
    return y * y
def __call__(self, X, Y, A, Q):
    P = Q(X)
    P = F.select_item(P, Variable(np.array(A).astype('int32')))
    return F.mean_squared_error(Y, P)
def sampled_actions_log_probs(self):
    return F.select_item(
        self.log_probs,
        chainer.Variable(np.asarray(self.action_indices, dtype=np.int32)))
def compute_loss(self, s, a, r, new_s, done, loss_log=False):
    if self.net_type == "full":
        s = s.reshape(self.batch_size, self.input_slides * self.size * self.size)
        new_s = new_s.reshape(self.batch_size, self.input_slides * self.size * self.size)
    # gpu
    if self.gpu >= 0:
        s = cuda.to_gpu(s)
        new_s = cuda.to_gpu(new_s)
    if chainer.__version__ >= "2.0.0":
        s = Variable(s)
        new_s = Variable(new_s)
    else:
        s = Variable(s, volatile='auto')
        new_s = Variable(new_s, volatile='auto')

    q_value = self.q(s)
    with chainer.no_backprop_mode():
        if self.mode == "regularize":
            tg_q_value = self.q(new_s)
        elif self.mode == "target_mix":
            tg_q_value = (1.0 - self.mix_rate) * self.q(new_s) \
                + self.mix_rate * self.fixed_q(new_s)
        elif self.mode == "default":
            tg_q_value = self.fixed_q(new_s)

    if self.gpu >= 0:
        a = cuda.to_gpu(a)
        r = cuda.to_gpu(r)
        done = cuda.to_gpu(done)
    if chainer.__version__ >= "2.0.0":
        a = Variable(a)
    else:
        a = Variable(a, volatile='auto')

    argmax_a = F.argmax(tg_q_value, axis=1)
    q_action_value = F.select_item(q_value, a)
    target = r + self.discount * (1.0 - done) * F.select_item(tg_q_value, argmax_a)
    # target is float32
    q_action_value = F.reshape(q_action_value, (-1, 1))
    target = F.reshape(target, (-1, 1))
    loss_sum = F.sum(F.huber_loss(q_action_value, target, delta=1.0))
    loss = loss_sum / q_action_value.shape[0]

    if self.mode == "regularize" or loss_log == True:
        if self.penalty_function == "value":
            y = q_value
            with chainer.no_backprop_mode():
                t = self.fixed_q(s)
        if self.penalty_function == "action_value":
            y = q_action_value
            with chainer.no_backprop_mode():
                t = F.select_item(self.fixed_q(s), a)
                t = F.reshape(t, (-1, 1))
        if self.penalty_function == "max_action_value":
            y = F.select_item(self.q(new_s), argmax_a)
            y = F.reshape(y, (-1, 1))
            with chainer.no_backprop_mode():
                t = F.select_item(self.fixed_q(new_s), argmax_a)
                t = F.reshape(t, (-1, 1))
        if self.penalty_type == "huber":
            if self.final_penalty_cut == 1:
                penalty_sum = F.sum((1.0 - done) * F.huber_loss(y, t, delta=1.0))
            else:
                penalty_sum = F.sum(F.huber_loss(y, t, delta=1.0))
            penalty = penalty_sum / (y.shape[0] * y.shape[1])
        if self.penalty_type == "mean_squared":
            penalty = F.mean_squared_error(y, t)
        if loss_log == True:
            return loss, penalty
        if penalty.data > self.threshold:
            loss = loss + self.penalty_weight * penalty
    return loss