def preprocess_for_bounds(self, x):
    """Wrap ``x`` in a Variable and, in verify mode, attach input bounds.

    When ``self.verify`` is truthy, the returned Variable gets two extra
    attributes, ``lower`` and ``upper``, equal to
    ``x -/+ self.epsilon * self.x_scale`` clipped to ``self.x_range``.

    Args:
        x: A ``chainer.Variable`` or something ``chainer.Variable`` accepts.

    Returns:
        The (possibly newly created) ``chainer.Variable``.
    """
    variable = x if isinstance(x, chainer.Variable) else chainer.Variable(x)
    if not self.verify:
        return variable

    radius = self.epsilon * self.x_scale
    variable.lower = F.clip(variable - radius, *self.x_range)
    variable.upper = F.clip(variable + radius, *self.x_range)
    return variable
def softmax_focalloss(x, t, gamma=2, eps=1e-7, class_weight=1.0):
    """Focal loss for inputs that already went through softmax.

    Args:
        x: Predicted probabilities (softmax is NOT applied here).
        t: Target distribution, same shape as ``x``.
        gamma: Exponent of the ``(1 - p)`` modulating factor.
        eps: Clipping margin used for both predictions and targets.
        class_weight: Multiplicative weight applied to every term.

    Returns:
        The average of the weighted focal terms as a Variable.
    """
    # Only clip: the caller is expected to have applied softmax already.
    prob = F.clip(x, x_min=eps, x_max=1 - eps)
    # Clipping the targets to [eps, 1 - eps] acts as label smoothing.
    smoothed_target = F.clip(t, x_min=eps, x_max=1 - eps)
    cross_entropy = -smoothed_target * F.log(prob)
    modulation = (1 - prob) ** gamma
    return F.average(class_weight * cross_entropy * modulation)
def psnr(pred, truth):
    """Per-sample PSNR-style score between ``pred`` and ``truth``.

    Reference: https://qiita.com/yoya/items/510043d836c9f2f0fe2f

    The peak value is the per-sample maximum of ``truth``. Both the peak and
    the error term are clipped to ``[1e-8, 1e+8]`` before the logarithm.

    NOTE(review): the error term is the per-sample *sum* of squared
    differences (it is not divided by the element count) -- confirm whether
    a mean was intended.

    Returns:
        A Variable with one value per batch element.
    """
    n = len(pred)
    flat_error = (pred - truth).reshape(n, -1)
    squared_error = F.clip(F.sum(flat_error ** 2, axis=1), 1e-8, 1e+8)
    peak = F.clip(F.max(truth.reshape(n, -1), axis=1), 1e-8, 1e+8)
    return 20 * F.log10(peak) - 10 * F.log10(squared_error)
def get_bbox_corners(grids, image_size):
    """Return the pixel coordinates of the first and last grid points.

    ``grids`` is a 4-d array/Variable whose channel 0 holds x and channel 1
    holds y coordinates in ``[-1, 1]``. They are rescaled to
    ``[0, image_size.width]`` / ``[0, image_size.height]`` and clipped.

    Returns:
        ``(top_left_x, top_left_y, bottom_right_x, bottom_right_y)``.
    """
    rows, cols = grids.shape[2:]
    unit = (grids + 1) / 2

    xs = F.clip(unit[:, 0, ...] * image_size.width,
                0., float(image_size.width))
    ys = F.clip(unit[:, 1, ...] * image_size.height,
                0., float(image_size.height))

    last_row, last_col = rows - 1, cols - 1
    return (
        F.get_item(xs, [..., 0, 0]),
        F.get_item(ys, [..., 0, 0]),
        F.get_item(xs, [..., last_row, last_col]),
        F.get_item(ys, [..., last_row, last_col]),
    )
def merge_representation(self, index, section, xs, ys):
    """Merge and average context representations for the next layer.

    If the prediction is 'O', the corresponding row of ``xs`` is used as the
    input for the next layer; otherwise the corresponding rows of ``ys`` are
    selected (and averaged over the entity span).

    + index: merge index for predicts
    + xs: context representation, input of bi_word_tag BiLSTM layer
    + ys: context representation, output of bi_word_tag BiLSTM layer

    e.g. predicts: B-Gene, I-Gene, O, B-protein, B-DNA

    index array:
        [ 1, -1, -1, -1
          1, -1, -1, -1
         -1,  0, -1, -1
         -1, -1,  1, -1
         -1, -1, -1,  1 ]

    ys_index (clipped to [0, 1]) array:
        [ 1, 0, 0, 0
          1, 0, 0, 0
          0, 1, 0, 0
          0, 0, 1, 0
          0, 0, 0, 1 ]

    xs index (1 - |index|) array:
        [ 0, 0, 0, 0
          0, 0, 0, 0
          0, 1, 0, 0
          0, 0, 0, 0
          0, 0, 0, 0 ]
    """
    # Rows predicted as part of an entity take their vectors from ``ys``.
    entity_mask = F.clip(index.copy().astype('f'), 0., 1.0)
    entity_part = F.matmul(entity_mask, F.vstack(ys), transa=True)

    # Rows predicted as 'O' (index == 0) take their vectors from ``xs``.
    outside_mask = 1 - Fmat.absolute(index.copy().astype('f'))
    outside_part = F.matmul(outside_mask, F.vstack(xs), transa=True)

    # Sum word vectors.
    merged = Fmat.add(outside_part, entity_part)

    # Average word vectors per entity; the count is floored at 1 so that
    # columns without any entity row are not divided by zero.
    counts = F.clip(F.sum(entity_mask, axis=0), 1.0, 1000000.0)
    counts = F.transpose(F.tile(counts, (merged.shape[1], 1)))
    merged = Fmat.div(merged, counts)

    return F.split_axis(merged, section, axis=0)
def gaussian_diag_logps(mean, logvar, sample=None):
    """Element-wise log density of a diagonal Gaussian.

    Args:
        mean: Mean of the distribution (array or Variable).
        logvar: Log variance, same shape as ``mean``.
        sample: Point at which to evaluate the density. When ``None`` a
            sample is drawn as ``mean + exp(0.5 * logvar) * noise``.

    Returns:
        ``-0.5 * (log(2*pi) + logvar + (sample - mean)^2 / exp(logvar))``
        with the exponent arguments clipped to ``[-100, 100]``.
    """
    xp = cuda.get_array_module(mean)
    if sample is None:
        # standard_normal yields float64; cast so it can be combined with
        # the (typically float32) inputs.
        noise = xp.random.standard_normal(mean.shape).astype(mean.dtype)
        # F.clip returns a Variable, so the exponential must be the Chainer
        # function rather than the array-module ufunc.
        std = F.exp(F.clip(0.5 * logvar, -100., 100.))
        sample = mean + std * noise

    variance = F.exp(F.clip(logvar, -100., 100.))
    output = -0.5 * (xp.log(2.0 * xp.pi) + logvar
                     + F.square(sample - mean) / variance)
    return output
def calc_msssim(self, x0, x1):
    """Multi-scale SSIM between ``x0`` and ``x1``.

    For the first ``self.level - 1`` scales the contrast-structure term
    (``cs_map=True``) is used; the inputs are halved by 2x2 average pooling
    after each scale. The last scale uses the plain ``self.ssim_func``
    result. Each term is clipped to be non-negative and raised to the
    matching entry of ``self.weight`` before being multiplied in.
    """
    result = 1
    for scale in range(self.level - 1):
        contrast = F.clip(self.ssim_func(x0, x1, cs_map=True), 0., np.inf)
        result = result * contrast ** self.weight[scale]
        x0, x1 = F.average_pooling_2d(x0, 2), F.average_pooling_2d(x1, 2)

    coarsest = F.clip(self.ssim_func(x0, x1), 0., np.inf)
    return result * coarsest ** self.weight[-1]
def wrm_attack(cls, x, y=None, steps=5.0, loss_type='cross_entropy',
               c_type='sqaure', gamma=1., alpha=1.0, clip_x=True,
               return_phis=False):
    """Gradient-ascent attack on ``loss - gamma * cost``.

    Args:
        cls: Callable model returning logits for ``x``.
        x: Input Variable to perturb.
        y: Labels; when ``None`` the model's own argmax on the first step is
            used.
        steps: Number of ascent steps. Converted with ``int()`` because the
            default is a float and ``range()`` rejects floats.
        loss_type: Forwarded to ``loss_fun``.
        c_type: Forwarded to ``cost_fun`` as ``type``.
            NOTE(review): the default is spelled 'sqaure'; it is kept as-is
            because ``cost_fun`` is not visible here -- confirm.
        gamma: Weight of the transport cost.
        alpha: Base step size; the effective rate is ``alpha / gamma / (t+1)``.
        clip_x: Clip the result to ``[-1, 1]`` after every step.
        return_phis: Also return the per-step objective arrays.

    Returns:
        The perturbed input, or ``(perturbed_input, phis)`` when
        ``return_phis`` is true.
    """
    x_org = copy.deepcopy(x)
    _alpha = alpha / gamma
    phis = []

    for t in range(int(steps)):
        logit = cls(x)
        if y is None:
            y = F.argmax(logit, axis=1)

        loss = loss_fun(logit, y, loss_type, reduce='sum')
        cost = cost_fun(x1=x, y1=y, x2=x_org, y2=y, type=c_type,
                        reduce='sum')
        phi = loss - gamma * cost
        if return_phis:
            phis.append(phi.array)

        grad = chainer.grad([phi], [x])[0]
        # Decaying step size.
        lr = _alpha / (t + 1)
        x = x + lr * grad.array
        if clip_x:
            x = F.clip(x, -1., 1.)

    if return_phis:
        return x, phis
    return x
def adversarial_attack(cls, x, y=None, steps=1, loss_type='cross_entropy',
                       eps=2.0, clip_x=True, norm_type='L2', alpha=None):
    """Iterative gradient attack projected back around the original input.

    Passing ``y=None`` avoids label leaking: the model's own argmax from the
    first step is used as the label instead.

    Args:
        cls: Callable model returning logits.
        x: Input Variable.
        y: Labels or ``None``.
        steps: Number of attack iterations.
        loss_type: Forwarded to ``loss_fun`` as ``type``.
        eps: Radius handed to ``_projection``.
        clip_x: Clip to ``[-1, 1]`` after each step.
        norm_type: Forwarded to ``_normalize`` and ``_projection``.
        alpha: Step size; defaults to ``eps``.

    Returns:
        The perturbed input Variable.
    """
    xp = cuda.get_array_module(x.array)
    step_size = eps if alpha is None else alpha
    x_org = copy.deepcopy(x)

    for _ in range(steps):
        logit = cls(x)
        if y is None:
            y = F.argmax(logit, 1)
        loss = loss_fun(logit, y, type=loss_type)

        gradient = chainer.grad([loss], [x])[0]
        direction = _normalize(gradient.array, xp, norm_type=norm_type)
        moved = x + step_size * direction

        projected = _projection(x_org.array, moved.array, eps, xp,
                                norm_type=norm_type)
        x = Variable(projected)
        if clip_x:
            x = F.clip(x, -1., 1.)
    return x
def virtual_adversarial_attack(cls, x, steps=1, loss_type='kl', eps=2.0,
                               xi=1e-6, logit=None, clip_x=True):
    """Virtual adversarial perturbation of ``x`` (no labels required).

    Each step probes the model at ``x + xi * d`` for a random normalized
    ``d``, takes the gradient of the divergence between ``logit`` and the
    probed logits, moves ``x`` by ``eps`` along the normalized gradient and
    projects the result back around the original input.

    Args:
        cls: Callable model returning logits.
        x: Input Variable.
        steps: Number of iterations.
        loss_type: Forwarded to ``loss_fun`` as ``type``.
        eps: Step size and projection radius.
        xi: Scale of the random probe.
        logit: Reference logits; computed from ``x`` when ``None``.
        clip_x: Clip to ``[-1, 1]`` after each step.

    Returns:
        The perturbed input Variable.
    """
    xp = cuda.get_array_module(x.array)
    if logit is None:
        logit = cls(x)
    x_org = copy.deepcopy(x)

    for _ in range(steps):
        # One virtual-adversarial probe followed by a projected step.
        probe = _normalize(xp.random.normal(size=x.shape), xp)
        x_probe = x + xi * probe
        logit_probe = cls(x_probe)
        kl_loss = loss_fun(logit, logit_probe, type=loss_type)

        gradient = chainer.grad([kl_loss], [x_probe])[0]
        direction = _normalize(gradient.array, xp)
        stepped = x + eps * direction

        x = Variable(_projection(x_org.array, stepped.array, eps, xp))
        if clip_x:
            x = F.clip(x, -1., 1.)
    return x
def __call__(self, image, generate=False, train=True):
    """Run sender -> receiver once and compute the losses.

    The sender perceives ``image`` and speaks a sentence; the receiver
    listens and paints a canvas. The reconstruction error drives both a
    direct loss and a REINFORCE term using a running-mean baseline stored in
    ``self.baseline`` (updated on every call, including generate mode).

    Args:
        image: Input batch.
        generate: When true, return raw outputs instead of losses.
        train: Forwarded to the sender/receiver calls.

    Returns:
        ``(sentence_data, log_probability_data, clipped_canvas_data)`` when
        ``generate`` is true, otherwise ``(loss, reinforce_loss)``.
    """
    batchsize = image.shape[0]

    # Sender side: perceive the image and produce a sentence.
    hidden_image = self.sender.perceive(image, train=train)
    sentence, log_probability, _p_dists = self.sender.speak(
        hidden_image, n_word=self.n_word, train=train)

    # Receiver side: interpret the sentence and paint.
    sentence_meaning = self.receiver.listen(sentence, train=train)
    canvas = self.receiver.paint(sentence_meaning, train=train)

    # Reconstruction error.
    squared_error = F.sum((canvas - image) ** 2, axis=1)
    loss = F.sum(squared_error) / image.data.size

    # REINFORCE term: the reward is the negated error, detached from the
    # graph.
    reward = -squared_error.data
    if self.baseline is None:
        self.baseline = self.xp.mean(reward)
    advantage = reward - self.baseline
    reinforce = F.sum(log_probability * advantage) / batchsize
    reinforce_loss = -reinforce * 1.

    # Exponential moving average of the reward.
    self.baseline = self.baseline * 0.99 + self.xp.mean(reward) * 0.01

    if not generate:
        return loss, reinforce_loss
    return ([word.data for word in sentence],
            log_probability.data,
            F.clip(canvas, 0., 1.).data)
def _pairwise_distances_l2(embeddings):
    """Compute the 2D matrix of squared L2 distances between embeddings.

    distance(x, y) := ||x - y||^2, evaluated on embeddings scaled by 0.5 and
    finally clipped to [0.0, 1.0].

    Args:
        embeddings: Variable with shape=(batch_size, embed_dim)

    Returns:
        pairwise_distances: Variable with shape=(batch_size, batch_size)
    """
    # Halve the embeddings so the distances are constrained to [0.0, 1.0].
    halved = 0.5 * embeddings

    # Gram matrix of all dot products, shape (batch_size, batch_size).
    gram = F.matmul(halved, halved, transa=False, transb=True)

    # The squared norms are the diagonal of the Gram matrix; reusing it
    # keeps the diagonal of the result at exactly 0. Shape (batch_size,).
    sq_norms = F.diagonal(gram)
    as_row = F.expand_dims(sq_norms, axis=0)
    as_col = F.expand_dims(sq_norms, axis=1)

    # ||a - b||^2 = ||a||^2 - 2 <a, b> + ||b||^2
    raw = as_row - 2.0 * gram + as_col

    # Rounding errors can push some entries slightly below zero.
    return F.clip(raw, 0.0, 1.0)
def stats_pooling(x, mean_only=False):
    """Pool ``x`` over its last axis into mean (and standard deviation).

    Args:
        x: Input Variable.
        mean_only: Return just the mean when true.

    Returns:
        The mean, or the mean and standard deviation concatenated on axis 1.
    """
    mean = F.mean(x, axis=-1)
    if not mean_only:
        second_moment = F.mean(x ** 2, axis=-1)
        variance = second_moment - mean ** 2
        # Guard against tiny negative variances before the square root.
        std = F.sqrt(F.clip(variance, 0, np.inf))
        return F.concat((mean, std), axis=1)
    return mean
def __call__(self, x, t):
    """ArcFace head on top of the ``res5`` features of ``self.base``.

    The pooled ``res5`` activation is stored in ``self.cam`` before pooling.
    The margin constants (``cos_m``, ``sin_m``, ``th``, ``mm``), the scale
    ``s`` and the ``easy_margin`` flag are read from the enclosing scope.

    Args:
        x: Input batch for ``self.base``.
        t: Integer class labels used to index the one-hot matrix.

    Returns:
        Scaled logits where the target class uses ``phi`` (cosine with the
        angular margin) and all other classes use the plain cosine.
    """
    h = self.base(x, layers=['res5'])['res5']
    self.cam = h
    h = _global_average_pooling_2d(h)

    # ---- ArcFace implementation placed after the ResNet50 backbone ----
    # cos(theta) and phi(theta) = cos(theta + m)
    cosine = F.linear(F.normalize(h), F.normalize(self.weight))
    sine = F.sqrt(F.clip((1.0 - F.square(cosine)), 0, 1))
    phi = cosine * cos_m - sine * sin_m
    if easy_margin:
        phi = F.where(cosine.data > 0, phi, cosine)
    else:
        phi = F.where(cosine.data > th, phi, cosine - mm)

    # One-hot labels. The class count follows the width of the logits
    # instead of being hard-coded, so the head works for any number of rows
    # in ``self.weight``.
    n_classes = cosine.shape[1]
    one_hot = Variable(cp.eye(n_classes)[t].astype(cp.float32))

    # Equivalent of where(one_hot, phi, cosine).
    output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
    output *= s
    return output
def _decode_multiple(self, s, z=None, decode_num=10):
    """Decode ``decode_num`` candidates per state.

    Args:
        s: State batch; repeated ``decode_num`` times per row.
        z: Optional latent codes. When ``None`` they are drawn from a
            standard normal with shape
            ``(s.shape[0], decode_num, self._latent_dim)``.
        decode_num: Number of candidates per state.

    Returns:
        ``(tanh(h), h)`` where ``h`` has shape ``(-1, decode_num, out_dim)``.
    """
    # Resolve the array module up front: it is needed for the cast below
    # even when the caller supplies ``z``.
    xp = chainer.backend.get_array_module(s)
    if z is None:
        z = chainer.Variable(
            xp.random.normal(
                0, 1, size=(s.shape[0], decode_num, self._latent_dim)))
    z = F.cast(z, typ=xp.float32)
    z = F.clip(z, -0.5, 0.5)

    # Repeat each state decode_num times: (batch, decode_num, state_dim).
    s = F.expand_dims(s, axis=0)
    s = F.repeat(s, repeats=decode_num, axis=0)
    s = F.transpose(s, axes=(1, 0, 2))

    x = F.concat((s, z), axis=2)
    x = F.reshape(x, shape=(-1, x.shape[-1]))

    h = F.relu(self._linear3(x))
    h = F.relu(self._linear4(h))
    h = self._linear5(h)
    h = F.reshape(h, shape=(-1, decode_num, h.shape[-1]))
    return F.tanh(h), h
def update_model(self):
    """Run ``self.num_train_per_episode`` minibatch updates.

    Each iteration fetches a batch from the train buffer under
    ``self.lock``, evaluates the current and old models, and steps the
    optimizer on the combined clipped-ratio, value and entropy loss.
    Afterwards the old model is replaced by a copy of the current one and
    the buffer is cleared.
    """
    for _ in range(self.num_train_per_episode):
        # Fetch training data while holding the lock.
        with self.lock:
            states, actions, advantages = self.get_data_from_train_buffer()

        policies, values = self.model(states)
        old_policies, _unused_values = self.old_model(states)

        # Value and entropy terms.
        targets = np.array(advantages).astype(np.float32)
        loss_v = F.squared_error(values, targets)
        loss_ent = -policies.entropy()

        # Probability ratio between the new and old policy; the small
        # constant keeps the division finite.
        new_prob = policies.get_prob(actions) + 1.0e-10
        old_prob = old_policies.get_prob(actions) + 1.0e-10
        r = new_prob / old_prob
        clipped_r = F.clip(r, 1.0 - self.eps, 1.0 + self.eps)
        loss_clip = (advantages - values.data) * F.minimum(r, clipped_r)

        loss = F.mean(-loss_clip + loss_v * 0.2 + 0.01 * loss_ent)

        self.model.cleargrads()
        loss.backward()
        self.optimizer.update()

    # Snapshot the updated model as the new reference policy.
    self.old_model = self.copy_model()
    self.clear_buffer()
def optimize_surrogate_loss(iterator, model, optimizer, alpha, args):
    """Take one optimizer step on the clipped surrogate objective.

    Args:
        iterator: Yields batches of
            ``(s_current, action, _, _, log_likelihood, v_target, advantage)``.
        model: Provides ``compute_log_likelihood``, ``value`` and
            ``compute_entropy``.
        optimizer: Optimizer whose target's gradients are cleared and updated.
        alpha: Multiplier applied to ``args.epsilon`` for the clip range.
        args: Supplies ``gpu``, ``epsilon``, ``vf_coeff`` and
            ``entropy_coeff``.
    """
    optimizer.target.cleargrads()

    examples = iterator.next()
    (s_current, action, _, _,
     log_pi_theta_old, v_target, advantage) = concat_examples(
         examples, device=args.gpu)

    # A ratio of probabilities is the exponential of the difference of
    # their logarithms.
    log_pi_theta = model.compute_log_likelihood(s_current, action)
    ratio = F.exp(log_pi_theta - log_pi_theta_old)
    clipped = F.clip(ratio,
                     1 - args.epsilon * alpha,
                     1 + args.epsilon * alpha)
    clip_loss = F.mean(F.minimum(ratio * advantage, clipped * advantage))

    # Value loss against the target reshaped to the prediction's shape.
    value = model.value(s_current)
    xp = chainer.backend.get_array_module(v_target)
    v_target = xp.reshape(v_target, newshape=value.shape)
    value_loss = F.mean_squared_error(value, v_target)

    entropy_loss = F.mean(model.compute_entropy(s_current))

    loss = (-clip_loss
            + args.vf_coeff * value_loss
            - args.entropy_coeff * entropy_loss)

    # Update parameters.
    loss.backward()
    optimizer.update()
    loss.unchain_backward()
def gen(self, nb_samples=1, seed=0, gpu=None):
    """Sample binary vectors one input dimension at a time.

    Dimensions are filled in the order given by ``self.input_order``: for
    each step the model is run on the partially filled samples and the
    selected column is drawn from a Bernoulli distribution.

    NOTE(review): the Bernoulli parameter is ``log(p / (1 - p))`` clipped to
    ``[0, 1]`` (NaNs replaced by 1), with ``p = exp(out)`` -- confirm this is
    the intended probability.

    Args:
        nb_samples: Number of rows to generate.
        seed: Added to ``self.seed`` to seed the NumPy random state.
        gpu: When not ``None`` the working buffer lives on the GPU.

    Returns:
        A CPU float32 array of shape ``(nb_samples, input_size)``.
    """
    order = self.input_order
    n_inputs = self.net[0].W.shape[1]
    on_gpu = gpu is not None

    samples = np.zeros((nb_samples, n_inputs), dtype=np.float32)
    if on_gpu:
        samples = cuda.to_gpu(samples)
    xp = cuda.get_array_module(samples)
    rng = np.random.RandomState(self.seed + seed)

    for i in range(n_inputs):
        column = np.where(order == i)[0][0]
        out = self.forward(samples)

        p = F.exp(out[:, column])
        param = F.clip(F.log(p / (1 - p)), x_min=0., x_max=1.)
        param.data[xp.isnan(param.data)] = 1

        # Sampling happens on the CPU with the NumPy random state.
        prob = cuda.to_cpu(param.data) if on_gpu else param.data
        drawn = rng.binomial(p=prob, n=1)
        if on_gpu:
            drawn = cuda.to_gpu(drawn)
        samples[:, column] = drawn

    if on_gpu:
        samples = cuda.to_cpu(samples)
    return samples
def _lossfun(self, entropy, vs_pred, log_probs, vs_pred_old, log_probs_old,
             advs, vs_teacher):
    """Combine the clipped policy loss, the value loss and the entropy bonus.

    The value loss is a plain mean squared error when ``self.clip_eps_vf``
    is ``None``; otherwise it is the mean of the element-wise maximum of the
    unclipped and clipped squared errors. The value and policy losses are
    appended to ``self.value_loss_record`` / ``self.policy_loss_record``.

    Returns:
        The scalar loss Variable.
    """
    ratio = F.exp(log_probs - log_probs_old)
    clipped_ratio = F.clip(ratio, 1 - self.clip_eps, 1 + self.clip_eps)
    loss_policy = -F.mean(F.minimum(ratio * advs, clipped_ratio * advs))

    if self.clip_eps_vf is not None:
        clipped_vs = _elementwise_clip(vs_pred,
                                       vs_pred_old - self.clip_eps_vf,
                                       vs_pred_old + self.clip_eps_vf)
        loss_value_func = F.mean(
            F.maximum(F.square(vs_pred - vs_teacher),
                      F.square(clipped_vs - vs_teacher)))
    else:
        loss_value_func = F.mean_squared_error(vs_pred, vs_teacher)

    loss_entropy = -F.mean(entropy)

    self.value_loss_record.append(float(loss_value_func.array))
    self.policy_loss_record.append(float(loss_policy.array))

    return (loss_policy
            + self.value_func_coef * loss_value_func
            + self.entropy_coef * loss_entropy)
def __call__(self, x):
    """Map ``x`` to a squashed Gaussian action distribution.

    The network output is split in two along axis 1: the first half is the
    mean, the second half the log scale, which is clipped to ``[-20, 2]``
    before being converted to a variance.
    """
    features = self.out(self.head(x))
    mean, log_scale = F.split_axis(features, 2, axis=1)
    clipped_log_scale = F.clip(log_scale, -20., 2.)
    variance = F.exp(clipped_log_scale * 2)
    return distribution.SquashedGaussianDistribution(mean, var=variance)
def squashed_diagonal_gaussian_head(x):
    """Turn a ``2 * action_size`` wide output into a squashed Gaussian.

    The first half of axis 1 is the mean; the second half is the log scale,
    clipped to ``[-20, 2]`` and converted to a variance.
    """
    assert x.shape[-1] == action_size * 2
    mean, log_scale = F.split_axis(x, 2, axis=1)
    variance = F.exp(F.clip(log_scale, -20., 2.) * 2)
    return chainerrl.distribution.SquashedGaussianDistribution(
        mean, var=variance)
def _compute_loss(self, exp_batch, errors_out=None):
    """Compute a loss of categorical DQN.

    Args:
        exp_batch: Experience batch; an optional ``'weights'`` entry selects
            the weighted loss.
        errors_out: Optional list that is emptied and refilled in place with
            the per-example loss (sum over atoms).

    Returns:
        The batch loss Variable.
    """
    y, t = self._compute_y_and_t(exp_batch)

    # Cross entropy; y is clipped away from zero to avoid log(0).
    eltwise_loss = -t * F.log(F.clip(y, 1e-10, 1.))

    if errors_out is not None:
        # Per-example loss = sum of the atom-wise losses
        # (prioritization by KL-divergence).
        per_example = cuda.to_cpu(F.sum(eltwise_loss, axis=1).array)
        del errors_out[:]
        errors_out.extend(per_example)

    if 'weights' not in exp_batch:
        return compute_value_loss(
            eltwise_loss, batch_accumulator=self.batch_accumulator)
    return compute_weighted_value_loss(
        eltwise_loss, y.shape[0], exp_batch['weights'],
        batch_accumulator=self.batch_accumulator)
def numpy_clip(a, a_min, a_max, out=None):
    """``numpy.clip`` stand-in implemented with ``F.clip``.

    ``a_min`` and ``a_max`` are validated with ``check_attribute_scalar`` and
    ``out`` is checked against ``None`` with ``check_attribute_value``.
    """
    for bound, bound_name in ((a_min, 'a_min'), (a_max, 'a_max')):
        check_attribute_scalar(bound, 'numpy.clip', bound_name)
    check_attribute_value(out, None, 'numpy.clip', 'out')
    return F.clip(a, a_min, a_max)
def clamp(self, lower=-0.01, upper=0.01):
    """Clamp every parameter yielded by ``self.params()`` in place.

    This includes the batch normalization parameters.
    """
    for param in self.params():
        param.data = F.clip(param, lower, upper).data
def __call__(self, y_pred: chainer.Variable, t: np.ndarray, epsilon: float=1e-31) -> chainer.Variable:
    """Focal loss over softmax probabilities.

    Args:
        y_pred: Raw scores; softmax is applied here.
        t: Integer class index per sample.
        epsilon: Clipping margin applied to the probabilities.

    Returns:
        Mean of ``-(1 - p) ** self.gamma * log(p)`` where ``p`` is the
        probability assigned to the target class.
    """
    xp = cuda.get_array_module(t)
    probabilities = functions.clip(
        functions.softmax(y_pred), epsilon, 1 - epsilon)
    # Pick the probability of the target class for each row.
    p = probabilities[xp.arange(t.shape[0]), t]
    focal_weight = (1 - p) ** self.gamma
    loss_by_sample = -focal_weight * functions.log(p)
    return functions.mean(loss_by_sample)
def _encode(self, s, a):
    """Sample a latent code for ``(s, a)``.

    Returns:
        ``(sample, mu, ln_var)`` where ``ln_var`` is the clipped log
        variance that was used for sampling.
    """
    mu, raw_ln_var = self._latent_distribution(s, a)
    # 2 * ln_std = ln_var; the original code was written in ln_std form.
    # Clip for numerical stability.
    ln_var = F.clip(raw_ln_var, x_min=-8, x_max=30)
    sample = F.gaussian(mu, ln_var)
    return sample, mu, ln_var
def train(self, replay_buffer, iterations, d, clip_value, gamma, tau):
    """Run ``iterations`` critic updates with delayed actor/target updates.

    Args:
        replay_buffer: Source of transitions for ``_prepare_iterator``.
        iterations: Number of critic updates.
        d: The actor and the target networks are updated every ``d``-th
            iteration.
        clip_value: Bound for the target-policy smoothing noise.
        gamma: Discount factor.
        tau: Forwarded to ``_update_target_network``.
    """
    if not self._initialized:
        self._initialize_target_networks()
    iterator = self._prepare_iterator(replay_buffer)

    for i in range(iterations):
        batch = iterator.next()
        s_current, action, r, s_next, non_terminal = concat_examples(
            batch, device=self._device)

        # Target policy smoothing: clipped noise added to the target action.
        epsilon = F.clip(
            self._sample_action_noise(shape=(self._batch_size)),
            -clip_value, clip_value)
        # The target action must belong to the NEXT state, because it is
        # fed to the target critics together with s_next below.
        target_pi = self._target_pi(s_next)
        assert target_pi.shape == epsilon.shape
        a_tilde = target_pi + epsilon

        target_q1 = self._target_q1(s_next, a_tilde)
        target_q2 = self._target_q2(s_next, a_tilde)

        r = F.reshape(r, shape=(*r.shape, 1))
        non_terminal = F.reshape(
            non_terminal, shape=(*non_terminal.shape, 1))

        # Clipped double-Q target.
        min_q = F.minimum(target_q1, target_q2)
        y = r + gamma * non_terminal * min_q
        # Cut the graph so that no gradient reaches the target networks.
        y.unchain()

        q1 = self._q1(s_current, action)
        q1_loss = F.mean_squared_error(y, q1)
        q2 = self._q2(s_current, action)
        q2_loss = F.mean_squared_error(y, q2)
        critic_loss = q1_loss + q2_loss

        self._q1_optimizer.target.cleargrads()
        self._q2_optimizer.target.cleargrads()
        critic_loss.backward()
        critic_loss.unchain_backward()
        self._q1_optimizer.update()
        self._q2_optimizer.update()

        if i % d == 0:
            # Delayed policy update followed by the target updates.
            a = self._pi(s_current)
            q1 = self._q1(s_current, a)
            pi_loss = -F.mean(q1)

            self._pi_optimizer.target.cleargrads()
            pi_loss.backward()
            pi_loss.unchain_backward()
            self._pi_optimizer.update()

            self._update_target_network(self._target_q1, self._q1, tau)
            self._update_target_network(self._target_q2, self._q2, tau)
            self._update_target_network(self._target_pi, self._pi, tau)
def __call__(self, state, action):
    """Return ``action`` plus a bounded learned perturbation.

    The perturbation is ``tanh(net(state, action)) * self._phi`` and the sum
    is clipped to ``[-1, 1]``.
    """
    hidden = F.relu(self._linear1(F.concat((state, action))))
    hidden = F.relu(self._linear2(hidden))
    perturbation = F.tanh(self._linear3(hidden)) * self._phi
    perturbed_action = action + perturbation
    return F.clip(perturbed_action, -1, 1)
def quantizer(x, q_num):
    """Quantize ``x`` onto ``q_num`` levels.

    ``x / 4`` is clipped to ``[0, 1]``, scaled to ``[0, q_num - 1]``, passed
    through ``snake`` (``alpha=0.5``) and rounded with ``round_data``.
    """
    unit = F.clip(x / 4, 0., 1.)
    scaled = unit * (q_num - 1)
    return round_data(snake(scaled, alpha=0.5))
def generate(self, sampled_word_idx_seq, shape):
    """Let the listener paint a canvas from a given word sequence.

    For each of ``self.n_turn`` turns the listener perceives the current
    canvas, interprets ``sampled_word_idx_seq``, forms a concept and adds
    the painter's output to the canvas.

    Args:
        sampled_word_idx_seq: Word sequence passed to ``listener.listen``.
        shape: Shape of the zero-initialized canvas.

    Returns:
        A list with one array per turn: the canvas after that turn, clipped
        to ``[0, 1]``.
    """
    n_turn, n_word = self.n_turn, self.n_word
    train = self.train
    batchsize = shape[0]
    # NOTE(review): n_word, batchsize and the three history lists other
    # than canvas_history are never read in this function.
    sentence_history = []
    log_prob_history = []
    canvas_history = []
    p_dists_history = []
    # Initialize canvas of Listener
    canvas = chainer.Variable(self.xp.zeros(shape, np.float32), volatile='auto')
    for turn in range(n_turn):
        # [Listener]
        # Interpret the expression & Paint it into canvas
        # Perceive (only canvas)
        hidden_canvas = self.listener.perceive(canvas, turn, train=train)
        # Interpret the expression with current situation (canvas)
        message_meaning = self.listener.listen(sampled_word_idx_seq, turn, train=train)
        concept = self.listener.think(hidden_canvas, message_meaning, turn, train=train)
        # ZURU ("zuru" is Japanese for cheating -- presumably a shortcut
        # mode; confirm).
        # NOTE(review): `thought` is not assigned anywhere in this function,
        # so this branch raises NameError unless it is a module-level name.
        if self.zuru:
            concept = thought
        # Paint
        # canvas = self.listener.painter(
        #     canvas, concept, turn, train=train)
        canvas += self.listener.painter(concept, turn, train=train)
        # Physical limitations of canvas (leaky to make gradient active)
        canvas = F.clip(canvas, 0., 1.) * 0.9 + canvas * 0.1
        # Save
        canvas_history.append(canvas)
    return [F.clip(cv, 0., 1.).data for cv in canvas_history]
def gaussian_nll_keepbatch(self, x, mean, ln_var, clip=True):
    """Gaussian negative log-likelihood summed over axis 1 only.

    Args:
        x: Observations.
        mean: Predicted mean.
        ln_var: Predicted log variance.
        clip: When true, ``ln_var`` is clipped to
            ``[log(0.001), log(10)]`` first.

    Returns:
        One negative log-likelihood value per batch element.
    """
    if clip:
        ln_var = F.clip(ln_var, math.log(0.001), math.log(10))
    precision = F.exp(-ln_var)
    residual = x - mean
    quadratic = (residual * residual) * precision * 0.5
    log_norm = (math.log(2.0 * math.pi) + ln_var) * 0.5
    return F.sum(log_norm + quadratic, axis=1)
def compute_dists(self, obs):
    """Normalize ``obs`` with the running statistics and query the policy.

    The observation is centred with ``self.running_stat.mean``, divided by
    ``self.running_stat.std + 1e-8`` and, when ``self.clip`` is not
    ``None``, clipped to ``[-self.clip, self.clip]``.
    """
    running_mean = Variable(self.running_stat.mean.astype(np.float32))
    running_std = Variable(self.running_stat.std.astype(np.float32))

    centred = obs - F.broadcast_to(running_mean, obs.shape)
    # The small constant avoids division by a zero standard deviation.
    normalized = centred / (F.broadcast_to(running_std, centred.shape) + 1e-8)

    if self.clip is not None:
        normalized = F.clip(normalized, -self.clip, self.clip)
    return self.policy.compute_dists(normalized)
def check_forward(self, x_data):
    """Check ``functions.clip`` against an element-wise reference.

    The output must be float32 and match ``self.x`` with every element
    outside ``[self.x_min, self.x_max]`` replaced by the nearest bound.
    """
    y = functions.clip(chainer.Variable(x_data), self.x_min, self.x_max)
    self.assertEqual(y.data.dtype, numpy.float32)

    expected = self.x.copy()
    for idx in numpy.ndindex(self.x.shape):
        value = self.x[idx]
        if value < self.x_min:
            expected[idx] = self.x_min
        elif value > self.x_max:
            expected[idx] = self.x_max

    gradient_check.assert_allclose(expected, y.data)
def forward(self, x, l, train, action):
    """One glimpse step: crop, encode, update the core LSTM, predict.

    Args:
        x: Input images.
        l: Glimpse location Variable with values in ``[-1, 1]``.
        train: When true, a location is sampled around the predicted mean
            and its policy term is returned.
        action: When true, the action network output is returned as well.

    Returns:
        ``(location, ln_p, y)``: ``location`` is the sampled location in
        training mode and the predicted mean otherwise; ``ln_p`` is ``None``
        outside training; ``y`` is ``None`` unless ``action`` is true.
    """
    # Convert the location to integer pixel coordinates on the CPU.
    loc = l.data if self.xp == np else self.xp.asnumpy(l.data)
    margin = self.g_size/2
    loc = (loc+1)*0.5*(self.in_size-self.g_size+1) + margin
    loc = np.clip(loc, margin, self.in_size-margin)
    loc = np.floor(loc).astype(np.int32)

    # Retina encoding
    hx = F.relu(self.emb_x(crop(x, loc=loc, size=self.g_size)))
    # Location encoding
    hl = F.relu(self.emb_l(l))
    # Glimpse net
    g = F.relu(self.fc_lg(hl) + self.fc_xg(hx))
    # Core net: LSTM(g + h_t-1)
    h = self.core_lstm(g)
    # Location net
    l = F.tanh(self.fc_hl(h))

    if train:
        # Sample a location around the predicted mean.
        s = F.clip(F.gaussian(mean=l, ln_var=self.ln_var), -1., 1.)

        # Location policy term.
        l1, l2 = F.split_axis(l, indices_or_sections=2, axis=1)
        s1, s2 = F.split_axis(s, indices_or_sections=2, axis=1)
        norm = (s1-l1)*(s1-l1) + (s2-l2)*(s2-l2)
        ln_p = F.reshape(0.5 * norm / self.var, (-1,))

    # Action net
    y = self.fc_ha(h) if action else None

    if train:
        return s, ln_p, y
    return l, None, y
def gaussian_nll_keepbatch(self, x, mean, ln_var, clip=True):
    """Gaussian negative log-likelihood summed over axis 1 only.

    Args:
        x: Observations.
        mean: Predicted mean.
        ln_var: Predicted log variance.
        clip: When true, ``ln_var`` is clipped to
            ``[log(0.01), log(10)]`` first.

    Returns:
        One negative log-likelihood value per batch element.
    """
    if clip:
        lower, upper = math.log(0.01), math.log(10)
        ln_var = F.clip(ln_var, lower, upper)
    precision = F.exp(-ln_var)
    residual = x - mean
    quadratic = (residual * residual) * precision * 0.5
    log_norm = (math.log(2.0 * math.pi) + ln_var) * 0.5
    return F.sum(log_norm + quadratic, axis=1)
def check_forward(self, x_data):
    """Check ``functions.clip`` against an element-wise reference.

    The bounds come from ``self.x_min_max``; the output dtype must equal
    ``self.dtype``.
    """
    lower, upper = self.x_min_max
    y = functions.clip(chainer.Variable(x_data), lower, upper)
    self.assertEqual(y.data.dtype, self.dtype)

    expected = self.x.copy()
    for idx in numpy.ndindex(self.x.shape):
        value = self.x[idx]
        if value < lower:
            expected[idx] = lower
        elif value > upper:
            expected[idx] = upper

    testing.assert_allclose(expected, y.data)
def compute_distance_of_cluster_heads(self):
    """Distances between every pair of one-hot cluster labels.

    All ``nCr(self.ndim_y, 2)`` pairs ``(i, n)`` with ``n < i`` are listed.
    For ``ndim_y == 4`` the two label matrices are::

        a_labels        b_labels
        [0, 1, 0, 0]    [1, 0, 0, 0]
        [0, 0, 1, 0]    [1, 0, 0, 0]
        [0, 0, 1, 0]    [0, 1, 0, 0]
        [0, 0, 0, 1]    [1, 0, 0, 0]
        [0, 0, 0, 1]    [0, 1, 0, 0]
        [0, 0, 0, 1]    [0, 0, 1, 0]

    NOTE(review): the distance is taken between the one-hot labels
    themselves, not between any learned vectors -- confirm this is intended.

    Returns:
        The Euclidean distance per pair, clipped to
        ``[0, self.cluster_head_distance_threshold]``.
    """
    n_pairs = self.nCr(self.ndim_y, 2)
    a_labels = np.zeros((n_pairs, self.ndim_y), dtype=np.float32)
    b_labels = np.zeros((n_pairs, self.ndim_y), dtype=np.float32)

    for i in range(1, self.ndim_y):
        # Rows for the pairs (i, 0) .. (i, i - 1) start at i * (i - 1) / 2.
        first_row = i * (i - 1) // 2
        for n in range(i):
            a_labels[first_row + n, i] = 1
            b_labels[first_row + n, n] = 1

    if self.xp is not np:
        a_labels = cuda.to_gpu(a_labels)
        b_labels = cuda.to_gpu(b_labels)

    squared = functions.sum((a_labels - b_labels) ** 2, axis=1)
    distance = functions.sqrt(squared)
    return functions.clip(
        distance, 0.0, float(self.cluster_head_distance_threshold))
def f(x):
    """Clip ``x`` to the ``[self.x_min, self.x_max]`` range of the test."""
    lower, upper = self.x_min, self.x_max
    return functions.clip(x, lower, upper)
def forward(self, inputs, batch_lengths, initial_state=None):
    """Apply the LSTM with projection over a length-sorted batch.

    Parameters
    ----------
    inputs : array or ``chainer.Variable``, required.
        Shape (batch_size, num_timesteps, input_size). The batch must be
        ordered by sequence length from longest (first) to shortest (last).
    batch_lengths : ``List[int]``, required.
        A list of length batch_size containing the lengths of the sequences
        in the batch.
    initial_state : tuple of two Variables, optional, (default = None)
        A tuple (state, memory) representing the initial hidden state and
        memory of the LSTM. The ``state`` has shape
        (1, batch_size, hidden_size) and the ``memory`` has shape
        (1, batch_size, cell_size). Zeros are used when omitted.

    Returns
    -------
    output_accumulator : ``chainer.Variable``
        The outputs of the LSTM for each timestep, shape
        (batch_size, max_timesteps, hidden_size). For a given batch element,
        all outputs past its sequence length are zero.
    final_state : tuple of two Variables
        A tuple (state, memory) representing the final hidden state and
        memory of the LSTM. The ``state`` has shape
        (1, batch_size, hidden_size) and the ``memory`` has shape
        (1, batch_size, cell_size).
    """
    batch_size = inputs.shape[0]
    total_timesteps = inputs.shape[1]
    output_accumulator_list = []

    if initial_state is None:
        full_batch_previous_memory = chainer.Variable(
            self.xp.zeros((batch_size, self.cell_size), 'f'))
        full_batch_previous_state = chainer.Variable(
            self.xp.zeros((batch_size, self.hidden_size), 'f'))
    else:
        # The first dimension is just (layer * (1 + is_bidirection)), i.e. 1.
        full_batch_previous_state = F.squeeze(initial_state[0], axis=0)
        full_batch_previous_memory = F.squeeze(initial_state[1], axis=0)

    current_length_index = batch_size - 1 if self.go_forward else 0

    # Recurrent dropout is only active in training mode.
    if self.recurrent_dropout_probability > 0.0 and \
            (self.training or chainer.config.train):
        dropout_mask = get_dropout_mask(self.recurrent_dropout_probability,
                                        full_batch_previous_state)
    else:
        dropout_mask = None

    cell = self.cell_size
    for timestep in range(total_timesteps):
        # The index depends on which end we start from.
        index = timestep if self.go_forward \
            else total_timesteps - timestep - 1

        # Find how many batch elements are still active at this timestep.
        # Because the batch is ordered from longest to shortest, the active
        # elements are always a prefix [0, current_length_index].
        if self.go_forward:
            # Going forwards we drop sequences as they finish.
            while batch_lengths[current_length_index] <= index:
                current_length_index -= 1
        else:
            # Going backwards we pick up more sequences.
            # First condition: are we already at the last batch element?
            # Second condition: does the next shorter sequence need this
            # timestep?
            while current_length_index < (len(batch_lengths) - 1) and \
                    batch_lengths[current_length_index + 1] > index:
                current_length_index += 1

        active = current_length_index + 1
        # Shape (active, cell_size)
        previous_memory = full_batch_previous_memory[0: active]
        # Shape (active, hidden_size)
        previous_state = full_batch_previous_state[0: active]
        # Shape (active, input_size)
        timestep_input = inputs[0: active, index]

        # Project input and state for all four gates at once.
        # Both have shape (active, 4 * cell_size).
        projected_input = self.input_linearity(timestep_input)
        projected_state = self.state_linearity(previous_state)

        # Main LSTM equations, one cell_size-wide chunk per gate.
        input_gate = F.sigmoid(
            projected_input[:, (0 * cell):(1 * cell)] +
            projected_state[:, (0 * cell):(1 * cell)])
        forget_gate = F.sigmoid(
            projected_input[:, (1 * cell):(2 * cell)] +
            projected_state[:, (1 * cell):(2 * cell)])
        memory_init = F.tanh(
            projected_input[:, (2 * cell):(3 * cell)] +
            projected_state[:, (2 * cell):(3 * cell)])
        output_gate = F.sigmoid(
            projected_input[:, (3 * cell):(4 * cell)] +
            projected_state[:, (3 * cell):(4 * cell)])
        memory = input_gate * memory_init + forget_gate * previous_memory

        # Non-standard part of this cell: the memory is clipped, then the
        # output is projected to a smaller size and clipped again.
        if self.memory_cell_clip_value:
            memory = F.clip(memory,
                            -self.memory_cell_clip_value,
                            self.memory_cell_clip_value)

        # Shape (active, cell_size)
        pre_projection_timestep_output = output_gate * F.tanh(memory)
        # Shape (active, hidden_size)
        timestep_output = self.state_projection(
            pre_projection_timestep_output)
        if self.state_projection_clip_value:
            timestep_output = F.clip(timestep_output,
                                     -self.state_projection_clip_value,
                                     self.state_projection_clip_value)

        if dropout_mask is not None:
            timestep_output = timestep_output * dropout_mask[0: active]

        # Only the active prefix was recomputed; keep the old values for
        # the remaining batch elements.
        full_batch_previous_memory = F.concat(
            [memory, full_batch_previous_memory[active:]], axis=0)
        full_batch_previous_state = F.concat(
            [timestep_output, full_batch_previous_state[active:]], axis=0)

        output_accumulator_list.append(timestep_output)

    # Return the state as (num_layers * num_directions, batch_size, ...).
    # This cell cannot be stacked, so the first dimension is 1.
    final_state = (F.expand_dims(full_batch_previous_state, 0),
                   F.expand_dims(full_batch_previous_memory, 0))

    if not self.go_forward:
        # Outputs were produced last-timestep-first; restore time order.
        output_accumulator_list = output_accumulator_list[::-1]

    output_accumulator = F.pad_sequence(output_accumulator_list)
    # (batch_size, total_timesteps, self.hidden_size)
    output_accumulator = output_accumulator.transpose((1, 0, 2))
    return output_accumulator, final_state
def f(x):
    """Return the square of ``x`` clipped to ``[self.x_min, self.x_max]``."""
    clipped = functions.clip(x, self.x_min, self.x_max)
    return clipped * clipped
def f(x):
    """Clip ``x`` to the bounds stored in ``self.x_min_max``."""
    lower, upper = self.x_min_max
    clipped = functions.clip(x, lower, upper)
    return clipped
def test_invalid_interval(self):
    """``functions.clip`` must reject a minimum above the maximum."""
    lower, upper = 1.0, -1.0
    with self.assertRaises(AssertionError):
        functions.clip(self.x, lower, upper)
def check_border_grad(self, x, expected):
    """Check the gradient of ``sum(clip(x))`` exactly (zero tolerance)."""
    var = chainer.Variable(x)
    total = functions.sum(functions.clip(var, self.x_min, self.x_max))
    total.backward()
    testing.assert_allclose(var.grad, expected, atol=0, rtol=0)