def __call__(self, d_gen, d=None):
    # NOTE: the original took bs_gen = d_gen[0] (the first row) as the batch size
    # and tested `if d:` on a Variable; using .shape[0] and an explicit None check
    # is assumed to be the intent here.
    bs_gen = d_gen.shape[0]
    if d is not None:
        bs = d.shape[0]
        return F.sum(F.log(d)) / bs + F.sum(F.log(1 - d_gen)) / bs_gen
    else:
        return F.sum(F.log(1 - d_gen)) / bs_gen
def get_nll_gaussian(preds, target, variance, add_const=False):
    neg_log_p = (preds - target) ** 2 / (2 * variance)
    if add_const:
        const = 0.5 * (F.log(2 * preds.xp.array(preds.xp.pi, dtype=preds.dtype)) +
                       F.log(preds.xp.array(variance, preds.dtype)))
        neg_log_p += const
    ret = F.sum(neg_log_p) / (target.shape[0] * target.shape[1])
    return ret
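# A minimal usage sketch (added for illustration; the shapes and variance are
# assumptions, not from the original source): predictions and targets of shape
# (batch, num_atoms, timesteps) give a scalar NLL normalized by batch * atoms.
import numpy as np
import chainer
import chainer.functions as F

preds = chainer.Variable(np.zeros((8, 5, 3), dtype=np.float32))
target = np.ones((8, 5, 3), dtype=np.float32)
nll = get_nll_gaussian(preds, target, variance=5e-5)
print(nll.shape)  # () -- a scalar chainer.Variable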
def __call__(self, d_x_gen, d_x_real=None):
    bs_d_x_gen = d_x_gen.shape[0]
    if d_x_real is not None:
        bs_d_x_real = d_x_real.shape[0]
        loss = F.sum(F.log(d_x_real)) / bs_d_x_real \
            + F.sum(F.log(1 - d_x_gen)) / bs_d_x_gen
        return - loss  # to minimize
    else:
        loss = F.sum(F.log(d_x_gen)) / bs_d_x_gen
        return - loss  # to minimize (reverse trick)
def __call__(self, d_x_gen, d_x=None):
    # TODO: reverse trick
    bs_d_x_gen = d_x_gen.shape[0]
    if d_x is not None:
        bs_d_x = d_x.shape[0]
        loss = F.sum(F.log(F.sigmoid(d_x))) / bs_d_x \
            + F.sum(F.log(1 - F.sigmoid(d_x_gen))) / bs_d_x_gen
        return - loss  # to minimize
    else:
        loss = F.sum(F.log(1 - F.sigmoid(d_x_gen))) / bs_d_x_gen
        return loss
def kld(self, vec_true, vec_compare):
    ind = vec_true.data * vec_compare.data > 0
    ind_var = chainer.Variable(ind)
    include_nan = vec_true * F.log(vec_true / vec_compare)
    z = chainer.Variable(np.zeros((len(ind), 1), dtype=np.float32))
    # return np.nansum(vec_true * np.log(vec_true / vec_compare))
    return F.sum(F.where(ind_var, include_nan, z))
def _log_prob_words(self, context, temperature=1.0):
    """ This calculates a softmax over the vocabulary as a function of the
    dot product of context and word.
    """
    dot = F.matmul(context, F.transpose(self.vocab.W))
    prob = F.softmax(dot / temperature)
    return F.log(prob)
def dirichlet_likelihood(weights, alpha=None):
    """ Calculate the log likelihood of the observed topic proportions.
    The returned value is a negative log likelihood (up to an additive
    constant), so a lower value means the proportions are more probable
    under the Dirichlet prior.

    Args:
        weights (chainer.Variable): Unnormalized weight vector. The vector
            will be passed through a softmax function that will map the input
            onto a probability simplex.
        alpha (float): The Dirichlet concentration parameter. Alpha
            greater than 1.0 results in very dense topic weights such
            that each document belongs to many topics. Alpha < 1.0 results
            in sparser topic weights. The default is to set alpha to
            1.0 / n_topics, effectively enforcing the prior belief that a
            document belongs to very few topics at once.

    Returns:
        ~chainer.Variable: Output loss variable.
    """
    if type(weights) is Variable:
        n_topics = weights.data.shape[1]
    else:
        n_topics = weights.W.data.shape[1]
    if alpha is None:
        alpha = 1.0 / n_topics
    if type(weights) is Variable:
        proportions = F.softmax(weights)
    else:
        proportions = F.softmax(weights.W)
    loss = (alpha - 1.0) * F.log(proportions + 1e-8)
    return -F.sum(loss)
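# A minimal usage sketch (added for illustration; the toy shapes are assumptions):
# a (documents, n_topics) weight matrix gives a scalar loss that can be added to
# a larger objective such as the lda2vec word-prediction loss.
import numpy as np
from chainer import Variable
import chainer.functions as F

doc_weights = Variable(np.random.randn(2, 10).astype(np.float32))  # 2 docs, 10 topics
prior_loss = dirichlet_likelihood(doc_weights)                     # alpha defaults to 1/10
print(prior_loss.shape)  # () -- a scalar chainer.Variable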
def cosine_similarity(x, y, eps=1e-6):
    n1, n2, n3 = x.data.shape
    _, m2, _ = y.data.shape
    z = F.batch_matmul(x, y, transb=True)
    x2 = F.broadcast_to(F.reshape(F.sum(x * x, axis=2), (n1, n2, 1)), (n1, n2, m2))
    y2 = F.broadcast_to(F.reshape(F.sum(y * y, axis=2), (n1, 1, m2)), (n1, n2, m2))
    z /= F.exp(F.log(x2 * y2 + eps) / 2)
    return z
def ordinal_loss(y, mask):
    xp = cuda.get_array_module(y.data)
    volatile = y.volatile
    b, c, n = y.data.shape
    max_y = F.broadcast_to(F.max(y, axis=1, keepdims=True), y.data.shape)
    y = y - max_y
    sum_y = F.broadcast_to(F.expand_dims(F.sum(y, axis=1), 1), y.data.shape)
    down_tri = np.tri(c, dtype=np.float32)
    up_tri = down_tri.T
    w1 = Variable(xp.asarray(down_tri.reshape(c, c, 1, 1)), volatile=volatile)
    w2 = Variable(xp.asarray(up_tri.reshape(c, c, 1, 1)), volatile=volatile)
    h = F.exp(F.expand_dims(y, -1))
    h1 = F.convolution_2d(h, w1)
    h1 = F.convolution_2d(F.log(h1), w1)
    h2 = F.convolution_2d(h, w2)
    h2 = F.convolution_2d(F.log(h2), w2)
    h = F.reshape(h1 + h2, (b, c, n))
    return F.sum((h - sum_y - y) * mask) / b
def read(address):
    # map from the reals to the hypercube of dimension n
    index = F.tanh(address)
    # map from a point to the nearest corner of the hypercube
    # (the original called np.vectorize with the data as an argument, which
    # returns a callable rather than applying it; an elementwise comparison
    # is assumed to be the intent)
    mainIndex = index.data > 0
    # `array` and `lookup` are assumed to be defined elsewhere in the module
    mainValue = F.select_item(array, lookup(mainIndex))
    # the original referenced an undefined `x`; the tanh output is assumed here
    scaleFactor = F.exp(F.sum(F.log(F.absolute(index))))
    return mainValue * scaleFactor
def __call__(self, x_i, x_j, t_i, t_j):
    s_i = self.predictor(x_i)
    s_j = self.predictor(x_j)
    s_diff = s_i - s_j
    if t_i.data > t_j.data:
        S_ij = 1
    elif t_i.data < t_j.data:
        S_ij = -1
    else:
        S_ij = 0
    self.loss = (1 - S_ij) * s_diff / 2. + F.log(1 + F.exp(-s_diff))
    return self.loss
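# A small numerical check (added for illustration, not from the original source):
# for S_ij = 1 the pairwise cost above reduces to the RankNet form
# -log(sigmoid(s_i - s_j)). The toy score difference below is an assumption.
import numpy as np
import chainer.functions as F

s_diff = np.array(0.7, dtype=np.float32)
S_ij = 1
cost = (1 - S_ij) * s_diff / 2. + F.log(1 + F.exp(-s_diff))
assert np.allclose(cost.data, -F.log(F.sigmoid(s_diff)).data)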
def calc_log_posterior(theta, x, n=None):
    """Calculate unnormalized log posterior, ``log p(theta | x) + C``

    Args:
        theta(chainer.Variable): model parameters
        x(numpy.ndarray): sample data
        n(int): total data size
    Returns:
        chainer.Variable: Variable holding the unnormalized log posterior,
        ``log p(theta | x) + C`` of shape ``()``
    """
    theta1, theta2 = F.split_axis(theta, 2, 0)
    log_prior1 = F.sum(F.log(gaussian.gaussian_likelihood(theta1, 0, VAR1)))
    log_prior2 = F.sum(F.log(gaussian.gaussian_likelihood(theta2, 0, VAR2)))
    prob1 = gaussian.gaussian_likelihood(x, theta1, VAR_X)
    prob2 = gaussian.gaussian_likelihood(x, theta1 + theta2, VAR_X)
    log_likelihood = F.sum(F.log(prob1 / 2 + prob2 / 2))
    if n is not None:
        log_likelihood *= n / len(x)
    return log_prior1 + log_prior2 + log_likelihood
def _additional_score(self, y, a, src):
    batch_size = len(y.data)
    vocab_size = self._output
    xp = self._xp
    src_len = len(self.prob_dict)
    # Calculating dict prob
    y_dict = F.reshape(F.batch_matmul(self.prob_dict, a, transa=True),
                       (batch_size, vocab_size))
    is_prob = False
    # Using dict prob
    if self._method == "bias":
        yp = y + F.log(eps + y_dict)
    elif self._method == "linear":
        yp = self.LI(y_dict, F.softmax(y))
        is_prob = True
    else:
        raise ValueError("Unrecognized dictionary method:", self._method)
    return yp, is_prob
def choose_var_of_type(spec, context, scope, type_def):
    compatible_scope = [var for var in scope if var.type_def.can_be(type_def)]
    scope = list(scope)
    var_ndxs = [i for i in range(len(scope)) if scope[i].type_def.can_be(type_def)]
    var_embeddings = [scope[i].vec for i in var_ndxs]
    var_lprobs = [F.matmul(vec, F.transpose(context['state'])) for vec in var_embeddings]
    normalizer = Variable(np.array([[0]], dtype=np.float32))
    for vlp in var_lprobs:
        normalizer = normalizer + F.exp(vlp)
    normalizer = F.log(normalizer)
    var_lprobs = [vlp - normalizer for vlp in var_lprobs]
    vlp_data = np.array([vlp.data for vlp in var_lprobs])[:, 0, 0]
    ps = np.exp(vlp_data)
    ps /= np.sum(ps)
    ndx = np.random.choice(range(len(ps)), p=ps)
    lp = var_lprobs[ndx]
    var = scope[var_ndxs[ndx]]
    context['lp'] += lp[:, 0]
    return var, context
def free_energy(self, v):
    """
    :param Variable (batch_size, in_channels, image_height, image_width) - input data (training data)
    :return: scalar
    """
    batch_size = v.data.shape[0]
    in_channels = self.in_channels
    real = self.real
    if real == 0:
        '''
        visible layer is 0, 1 (bit)
        vbias_term = 1 * SUM(a(i) * v(i))
        '''
        v_sum = F.sum(v, axis=(2, 3))  # sum over image_height & image_width
        # Originally, it should return sum for each batch.
        # but it returns scalar, which is sum over batches, since sum is used at the end anyway.
        vbias_term = F.sum(F.matmul(v_sum, self.conv.a))
        wx_b = self.conv(v)
    else:
        '''
        visible layer takes real value
        vbias_term = 0.5 * SUM((v(i)-a(i)) * (v(i) - a(i)))
        '''
        # TODO: check
        # m = Variable(xp.ones((batch_size, 1), dtype=xp.float32))
        n = F.reshape(self.conv.a, (1, in_channels, 1, 1))
        xp = cuda.get_array_module(n.data)
        std_ch = xp.reshape(self.std, (1, in_channels, 1, 1))
        # v_ = v - F.matmul(m, n)
        v_ = (v - F.broadcast_to(n, v.data.shape)) / std_ch
        vbias_term = F.sum(0.5 * v_ * v_)
        wx_b = self.conv(v / std_ch)

    hidden_term = F.sum(F.log(1 + F.exp(wx_b)))
    # print('vbias = ', vbias_term.data, ', hidden = ', hidden_term.data, 'F.exp(wx_b) = ', F.exp(wx_b).data)
    return - vbias_term - hidden_term
def focal_loss(predictions, actual_obj_ids, gamma=2.0, alpha=0.25,
               class_weight=None, xp=numpy):
    pred_probas = F.softmax(predictions)
    actual_probas = xp.eye(predictions.shape[-1])[actual_obj_ids]
    pt_positive = actual_probas * pred_probas
    pt_negative = (1. - actual_probas) * (1. - pred_probas)
    pt = pt_positive + pt_negative
    at_positive = actual_probas * alpha
    at_negative = (1. - actual_probas) * (1. - alpha)
    at = at_positive + at_negative
    fl = -at * (1. - pt)**gamma * F.log(pt)
    if class_weight is not None:
        # print(fl.shape, class_weight.shape)
        weights = xp.array(class_weight.reshape(fl.shape))
        fl = weights * fl
    return F.mean(fl)
def listnet(x, t, nr_docs):
    """
    The Top-1 approximated ListNet loss as in Cao et al (2007), Learning to
    Rank: From Pairwise Approach to Listwise Approach

    :param x: The activation of the previous layer
    :type x: chainer.Variable

    :param t: The target labels
    :type t: chainer.Variable

    :param nr_docs: The number of documents per query
    :type nr_docs: chainer.Variable

    :return: The Top-1 listnet loss
    :rtype: chainer.Variable
    """
    t, nr_docs = as_variable(t), as_variable(nr_docs)
    t = t.data.astype(x.dtype)
    st = cf.softmax(t, axis=1)
    sx = cf.softmax(x, axis=1)
    sce = -cf.mean(st * cf.log(sx), axis=1)
    return cf.mean(sce)
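# A minimal usage sketch (added for illustration; the shapes, scores, and labels
# are assumptions). Each row is one query with 4 candidate documents, so both the
# predicted scores x and the relevance labels t have shape (n_queries, n_docs).
# `as_variable` is taken here from chainer; the original may use its own helper.
import numpy as np
import chainer.functions as cf
from chainer import as_variable

x = as_variable(np.array([[2.0, 1.0, 0.5, 0.1]], dtype=np.float32))  # predicted scores
t = as_variable(np.array([[1.0, 0.0, 1.0, 0.0]], dtype=np.float32))  # relevance labels
loss = listnet(x, t, nr_docs=np.array([4], dtype=np.float32))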
def get_normalized_image_variable(time, wavelength):
    img = get_sun_image(time, wavelength)
    if img is None:
        return None
    img = img[np.newaxis, np.newaxis, :, :]
    img = img.astype(np.float32)
    x = Variable(img)
    if gpuid >= 0:
        x.to_gpu()
    if wavelength == 'hmi':
        ret = x / 300
    elif wavelength == 211:
        ret = F.sigmoid(x / 100)
    elif wavelength == 193:
        ret = F.sigmoid(x / 300)
    elif wavelength == 94:
        ret = F.sigmoid(x / 30)
    else:
        ret = F.log(1 + F.relu(x))
    return ret
def decode(self, input_id, teacher_id, label_id, word_th, train=True):
    """
    :param input_id: batch of word ID by output of decoder
    :param teacher_id: batch of correct ID
    :param label_id:
    :param word_th: batch of correct at label
    :param train: True or False
    :return: decoded embed vector
    """
    batch_word = chainer.Variable(xp.array(input_id, dtype=xp.int32))
    batch_label = chainer.Variable(xp.array(label_id, dtype=xp.int32))
    predict_mat, predict_at, self.c_batch, self.h_batch = self.dec(
        batch_word, batch_label, self.c_batch, self.h_batch, train=train)
    if train:
        t = xp.array(teacher_id, dtype=xp.int32)
        t = chainer.Variable(t)
        predict_ids = xp.argmax(predict_mat.data, axis=1)
        correct_at = xp.zeros((1, predict_ids.shape[0]), dtype=xp.float32)
        for ind in range(predict_ids.shape[0]):
            # right answer
            if predict_ids[ind] < word_th and teacher_id[ind] < word_th:
                correct_at[0, ind] = 1.0
            elif predict_ids[ind] > word_th and teacher_id[ind] > word_th:
                correct_at[0, ind] = 1.0
            # wrong answer
            else:
                correct_at[0, ind] = 0.0
        correct_at = chainer.Variable(
            correct_at.reshape(predict_ids.shape[0], 1))
        at_loss = -F.sum(F.log(predict_at) * correct_at) / self.batch_size
        # if at_loss.data > 0:
        #     print(at_loss.data)
        return F.softmax_cross_entropy(predict_mat, t) + at_loss, predict_mat
    else:
        return predict_mat
def get_dealer_sampling(N_pic=100, imgH=64, imgW=64, N_card=4):
    thres = [0.99995, 0.9999, 0.9998, 0.9995]  # roughly 13, 26, 52, 131 points at x512

    # <generate random dot images>
    # uniform 0-1 random numbers for 100 images
    img_r = xp.random.rand(N_pic, imgW * imgH).astype(np.float32)
    # allocate image memory for 4 * 100 images
    img_p = xp.zeros((N_card, N_pic, imgW * imgH)).astype(np.float32)
    for i, thre in enumerate(thres):
        # set 1 only where the value exceeds the threshold
        img_p[i][img_r >= thre] = 1
    # reshape dot images: (N_card, N_pic, imgW*imgH) -> (N_pic, imgW*imgH, N_card)
    img_p = chainer.Variable(img_p.transpose((1, 2, 0)))

    # <generate sampling coefficients>
    # create 100 ones
    x_one = xp.ones((N_pic, 1), dtype=np.float32)
    # pass the ones through the dealer network, then softmax them into 0-1 probabilities
    card_prob = F.softmax(Md['de'](x_one))
    # sample via gumbel_softmax
    card_gum = F.gumbel_softmax(F.log(card_prob), tau=0.2)
    # broadcast sampling coefficients to image shape: (N_pic, N_card) -> (N_pic, imgW*imgH, N_card)
    card_gum_b = F.broadcast_to(F.reshape(card_gum, (N_pic, 1, N_card)), img_p.shape)

    # <combine the random dot images with the sampling coefficient images>
    # multiply the dot images by the sampling coefficients, sum them, and reshape into 2-D images
    img_p_sum = F.reshape(F.sum(img_p * card_gum_b, axis=2), (N_pic, 1, imgH, imgW))
    # turn the dots into Gaussian blobs
    img_core = Md['decon_core'](img_p_sum) * 255
    img_core = F.broadcast_to(img_core, (N_pic, 3, imgH, imgW))
    return img_core
def regular_graph_output(self, f_A, f_G):
    # f_A is appearance feature shape = (N,D), f_G is geometry feature shape = (N,4)
    assert f_A.shape[0] == f_G.shape[0]
    if self.add_self:
        assert f_A.shape[1] == self.out_size
    N = f_G.shape[0]
    geo_dim = f_G.shape[1]
    f_R = []
    for nr in range(self.num_relations):
        # NOTE: the original overwrote f_G with its tiled version inside this loop,
        # which breaks the shapes after the first relation; tiling into a temporary
        # is assumed to be the intent.
        f_G_tiled = F.tile(f_G, (1, N))  # shape = (N, 4 * N)
        f_G_1 = F.reshape(f_G_tiled, (N * N, geo_dim))  # after tile: N x (4 x N), then N^2 x 4
        f_G_2 = F.tile(f_G, (N, 1))  # shape = (N*N, 4)
        encoded_offset = self.encode_box_offset(f_G_1, f_G_2)  # shape = (N*N, 4)
        # paper formula (5), shape = (N,N)
        w_G = F.relu(getattr(self, self.W_G_lst[nr])(
            self.position_encoding(encoded_offset, self.d_g)))
        w_G = F.reshape(w_G, shape=(N, N))
        # paper formula (4), shape = (N,N)
        w_K_result = getattr(self, self.W_K_lst[nr])(f_A)  # shape = (N, d_k)
        w_Q_transpose_result = F.transpose(
            getattr(self, self.W_Q_lst[nr])(f_A))  # shape = (d_k, N)
        w_A = F.matmul(w_K_result, w_Q_transpose_result)  # shape = (N,N)
        # paper formula (3), shape = (N,N)
        w_A = w_A + F.log(w_G)
        w = F.softmax(w_A, axis=1)
        # w = w_G * F.exp(w_A) / F.sum(w_G * F.exp(w_A), axis=1)  # denominator shape = (N,1), numerator shape = (N,N)
        # paper formula (2), weighted sum = matmul: (N,N) x (N, out_size//nr) = (N, out_size//nr)
        f_R_nr = F.matmul(w, getattr(self, self.W_V_lst[nr])(f_A))
        f_R.append(f_R_nr)
    if self.add_self:
        return f_A + F.concat(f_R, axis=1)
    return F.concat(f_R, axis=1)
def train(self, seq_batch, test=False):
    self.reset_state()
    forward_h, backward_h = self.scan(seq_batch, test=test)
    xp = self.xp
    sum_loss = 0
    seq_batch = seq_batch.T
    for i in xrange(len(forward_h)):
        fh = forward_h[i]
        bh = backward_h[i]
        c = seq_batch[i]
        c = Variable(xp.asanyarray(c, dtype=np.int32))
        if fh is None:
            out = bh
        elif bh is None:
            out = fh
        else:
            h = F.concat((fh, bh))
            forget = self.forget(h, test=test)
            forget = F.softmax(forget)
            out = apply_attention(fh, forget, 0) + apply_attention(bh, forget, 1)
        if self.fc is not None:
            out = self.fc(out, test=test)
        entropy = 0
        if fh is not None and bh is not None:
            entropy = -forget * F.log(forget + 1e-6)
            entropy = F.sum(entropy)
        loss = F.softmax_cross_entropy(out, c) + entropy
        sum_loss += loss
    self.zero_grads()
    sum_loss.backward()
    self.update()
    if self.gpu:
        sum_loss.to_cpu()
    return sum_loss.data
def lf(z_t, z_t_plus_1, action, done_label, reset=True):
    k = self.k
    output_dim = self.output_dim
    if reset:
        self.reset_state()
    output = self.fprop(F.concat((z_t, action)))
    if self.predict_done:
        coef, mu, ln_var, done = output
    else:
        coef, mu, ln_var = output
    coef = F.reshape(coef, (-1, output_dim, k))
    coef = F.softmax(coef, axis=2)
    mu = F.reshape(mu, (-1, output_dim, k))
    ln_var = F.reshape(ln_var, (-1, output_dim, k))
    z_t_plus_1 = F.repeat(z_t_plus_1, k, 1).reshape(-1, output_dim, k)
    normals = F.sum(
        coef * F.exp(-F.gaussian_nll(z_t_plus_1, mu, ln_var, reduce='no')),
        axis=2)
    densities = F.sum(normals, axis=1)
    nll = -F.log(densities)
    loss = F.sum(nll)
    if self.predict_done:
        done_loss = F.sigmoid_cross_entropy(done.reshape(-1, 1), done_label,
                                            reduce="no")
        done_loss *= (1. + done_label.astype("float32") * 9.)
        done_loss = F.mean(done_loss)
        loss = loss + done_loss
    return loss
def compute_logits(self, new_states, concatenated, attn):
    new_output_state = new_states[-1]
    all_concatenated = F.concat((concatenated, new_output_state))
    logits = self.decoder_chain.lin_o(self.decoder_chain.maxo(all_concatenated))
    if self.lexicon_probability_matrix is not None:
        current_mb_size = new_output_state.data.shape[0]
        assert self.mb_size is None or current_mb_size <= self.mb_size
        lexicon_probability_matrix = self.lexicon_probability_matrix[:current_mb_size]

        # Just making sure data shape is as expected
        attn_mb_size, max_source_length_attn = attn.data.shape
        assert attn_mb_size == current_mb_size
        lex_mb_size, max_source_length_lexicon, v_size_lexicon = lexicon_probability_matrix.shape
        assert max_source_length_lexicon == max_source_length_attn
        assert logits.data.shape == (current_mb_size, v_size_lexicon)

        if self.demux:
            assert lex_mb_size == 1
            weighted_lex_probs = F.reshape(
                matmul_constant(attn, lexicon_probability_matrix.reshape(
                    lexicon_probability_matrix.shape[1],
                    lexicon_probability_matrix.shape[2])),
                logits.data.shape)
        else:
            assert lex_mb_size == current_mb_size
            # weighted_lex_probs = F.reshape(
            #     F.batch_matmul(attn, ConstantFunction(lexicon_probability_matrix)(), transa=True),
            #     logits.data.shape)
            weighted_lex_probs = F.reshape(
                batch_matmul_constant(attn, lexicon_probability_matrix, transa=True),
                logits.data.shape)
        logits += F.log(weighted_lex_probs + self.lex_epsilon)
    return logits
def _compute_loss(self, exp_batch, errors_out=None):
    """Compute a loss of categorical DQN."""
    y, t = self._compute_y_and_t(exp_batch)
    # Minimize the cross entropy
    # y is clipped to avoid log(0)
    eltwise_loss = -t * F.log(F.clip(y, 1e-10, 1.))

    if errors_out is not None:
        del errors_out[:]
        delta = F.sum(eltwise_loss, axis=1)
        delta = cuda.to_cpu(delta.array)
        for e in delta:
            errors_out.append(e)

    if 'weights' in exp_batch:
        return compute_weighted_value_loss(
            y, t, exp_batch['weights'],
            batch_accumulator=self.batch_accumulator)
    else:
        return compute_value_loss(y, t, batch_accumulator=self.batch_accumulator)
import numpy as np
import chainer
from chainer import functions as F

a = chainer.Variable(np.array(6.))
b = chainer.Variable(np.array(4.))
c = chainer.Variable(np.array(2.))

r = (a + b) * (F.log(b) + c)
# r = (a + b) * (F.log(np.array(4.)) + c)

r.grad = np.array(5.0)
r.backward()
print(a.grad, b.grad, c.grad)
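# Hand-computed check (added for illustration): with r = (a + b) * (log(b) + c)
# and upstream gradient 5, the partials are
#   dr/da = log(b) + c           = log(4) + 2         ~= 3.3863  ->  a.grad ~= 16.93
#   dr/db = log(b) + c + (a+b)/b = 3.3863 + 10/4      ~= 5.8863  ->  b.grad ~= 29.43
#   dr/dc = a + b                = 10                            ->  c.grad  = 50.0
# so the script above should print roughly 16.93 29.43 50.0.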
def concat_losses(p, e):
    loss_x = -F.sum(F.log(sum_axis(p))) / numpy.float32(p.data.shape[0])
    loss_e = F.sigmoid_cross_entropy(*e)
    return loss_x + loss_e
def _loss(self, context, target, weight):
    _context = F.dropout(context, ratio=self.dropout_ratio)
    _word = F.dropout(self.vocab(target), ratio=self.dropout_ratio)
    dot = -F.log(F.sigmoid(F.sum(_context * _word, axis=1)) + 1e-9)
    return F.sum(dot * weight)
def log():
    x = rand((1, 8, 8, 8), bias=1e-10)
    y = F.log(x)
    return {'input': x}, {'out': y}
def __init__(self, mean, var):
    self.mean = _wrap_by_variable(mean)
    self.var = _wrap_by_variable(var)
    self.ln_var = F.log(var)
def all_log_prob(self):
    with chainer.force_backprop_mode():
        if self.min_prob > 0:
            return F.log(self.all_prob)
        else:
            return F.log_softmax(self.beta * self.logits)
def softplus(self, x):
    # naive softplus; overflows for large x (F.softplus is the stable built-in)
    return F.log(F.exp(x) + 1)
def __call__(self, x):
    x = F.log(x) + 13.0
    h = F.leaky_relu(self.l1(x))
    h = F.leaky_relu(self.l2(h))
    h = F.leaky_relu(self.l3(h))
    return F.exp(self.l9(h) - 13.0)
def myCrossEntropyError(m, y):
    DELTA = 1e-7  # add a tiny value so that log never produces negative infinity
    return -F.sum(y * F.log(m + DELTA) + (1 - y) * F.log(1 - m + DELTA))
def logsumexp(x, mask, zero_pad, axis):
    x_exp = F.where(mask, F.exp(x), zero_pad)
    return F.log(F.sum(x_exp, axis=axis))
def _encode(self, xs):
    exs = self.embed_mat(xs)
    h = F.tanh(self.l1(exs))
    logits = F.softplus(self.l2(h))
    logits = F.log(logits + 1e-10).reshape(-1, self.M, self.K)
    return logits, exs
def compute_lower_bound(self, x_l_cpu_data, y_l_cpu_data, x_u_cpu_data, test=False):
    assert(isinstance(x_l_cpu_data, np.ndarray))

    def lower_bound(log_px, log_py, log_pa, log_pz, log_qz, log_qa):
        return log_px + log_py + log_pa + log_pz - log_qz - log_qa

    # _l: labeled
    # _u: unlabeled
    batchsize_l = x_l_cpu_data.shape[0]
    batchsize_u = x_u_cpu_data.shape[0]
    ndim_x = x_u_cpu_data.shape[1]
    n_types_of_label = y_l_cpu_data.shape[1]
    num_mc_samples = self.config.num_mc_samples
    xp = self.xp

    ### lower bound of labeled data ###
    # repeat num_mc_samples times
    if num_mc_samples == 1:
        x_l = self.to_variable(x_l_cpu_data)
        y_l = self.to_variable(y_l_cpu_data)
    else:
        x_l = self.to_variable(np.repeat(x_l_cpu_data, num_mc_samples, axis=0))
        y_l = self.to_variable(np.repeat(y_l_cpu_data, num_mc_samples, axis=0))

    a_mean_l, a_ln_var_l = self.q_a_x(x_l, test=test)
    a_l = F.gaussian(a_mean_l, a_ln_var_l)
    z_mean_l, z_ln_var_l = self.q_z_axy(a_l, x_l, y_l, test=test)
    z_l = F.gaussian(z_mean_l, z_ln_var_l)

    # compute lower bound
    log_pa_l = self.log_pa(a_l, x_l, y_l, z_l, test=test)
    log_px_l = self.log_px(a_l, x_l, y_l, z_l, test=test)
    log_py_l = self.log_py(y_l)
    log_pz_l = self.log_pz(z_l)
    # 'gaussian_nll_keepbatch' returns the negative log-likelihood
    log_qa_l = -self.gaussian_nll_keepbatch(a_l, a_mean_l, a_ln_var_l)
    log_qz_l = -self.gaussian_nll_keepbatch(z_l, z_mean_l, z_ln_var_l)
    lower_bound_l = lower_bound(log_px_l, log_py_l, log_pa_l, log_pz_l, log_qz_l, log_qa_l)

    # take the average
    if num_mc_samples > 1:
        lower_bound_l /= num_mc_samples

    ### lower bound of unlabeled data ###
    if batchsize_u > 0:
        # To marginalize y, we repeat unlabeled x, and construct a target
        # (batchsize_u * n_types_of_label) x n_types_of_label.
        # Example of n-dimensional x and target matrix for a 3 class problem and batch_size=2.
        #  x_u                              y_repeat
        # [[x0[0], x0[1], ..., x0[n]]      [[1, 0, 0]
        #  [x1[0], x1[1], ..., x1[n]]       [1, 0, 0]
        #  [x0[0], x0[1], ..., x0[n]]       [0, 1, 0]
        #  [x1[0], x1[1], ..., x1[n]]       [0, 1, 0]
        #  [x0[0], x0[1], ..., x0[n]]       [0, 0, 1]
        #  [x1[0], x1[1], ..., x1[n]]]      [0, 0, 1]]

        # marginalize x and y
        x_u_marg = np.broadcast_to(x_u_cpu_data, (n_types_of_label, batchsize_u, ndim_x)).reshape((batchsize_u * n_types_of_label, ndim_x))
        y_u_marg = np.repeat(np.identity(n_types_of_label, dtype=np.float32), batchsize_u, axis=0)

        # repeat num_mc_samples times
        x_u = x_u_marg
        y_u = y_u_marg
        if num_mc_samples > 1:
            n_rows_marg = x_u_marg.shape[0]
            n_rows = n_rows_marg * num_mc_samples
            x_u = np.repeat(x_u_marg, num_mc_samples, axis=0)
            y_u = np.repeat(y_u_marg, num_mc_samples, axis=0)

        x_u = self.to_variable(x_u)
        y_u = self.to_variable(y_u)

        a_mean_u, a_ln_var_u = self.q_a_x(x_u, test=test)
        a_u = F.gaussian(a_mean_u, a_ln_var_u)
        z_mean_u, z_ln_var_u = self.q_z_axy(a_u, x_u, y_u, test=test)
        z_u = F.gaussian(z_mean_u, z_ln_var_u)

        # compute lower bound
        log_pa_u = self.log_pa(a_u, x_u, y_u, z_u, test=test)
        log_px_u = self.log_px(a_u, x_u, y_u, z_u, test=test)
        log_py_u = self.log_py(y_u)
        log_pz_u = self.log_pz(z_u)
        # 'gaussian_nll_keepbatch' returns the negative log-likelihood
        log_qa_u = -self.gaussian_nll_keepbatch(a_u, a_mean_u, a_ln_var_u)
        log_qz_u = -self.gaussian_nll_keepbatch(z_u, z_mean_u, z_ln_var_u)
        lower_bound_u = lower_bound(log_px_u, log_py_u, log_pa_u, log_pz_u, log_qz_u, log_qa_u)

        # Compute sum_y{q(y|x){-L(x,y) + H(q(y|x))}}
        # Let LB(xn, y) be the lower bound for an input image xn and a label y (y = 0, 1, ..., 9).
        # Let bs be the batchsize.
        #
        # lower_bound_u is a vector and it looks like...
        # [LB(x0,0), LB(x1,0), ..., LB(x_bs,0), LB(x0,1), LB(x1,1), ..., LB(x_bs,1), ..., LB(x0,9), LB(x1,9), ..., LB(x_bs,9)]
        #
        # After reshaping. (axis 1 corresponds to label, axis 2 corresponds to batch)
        # [[LB(x0,0), LB(x1,0), ..., LB(x_bs,0)],
        #  [LB(x0,1), LB(x1,1), ..., LB(x_bs,1)],
        #   ...
        #  [LB(x0,9), LB(x1,9), ..., LB(x_bs,9)]]
        #
        # After transposing. (axis 1 corresponds to batch)
        # [[LB(x0,0), LB(x0,1), ..., LB(x0,9)],
        #  [LB(x1,0), LB(x1,1), ..., LB(x1,9)],
        #   ...
        #  [LB(x_bs,0), LB(x_bs,1), ..., LB(x_bs,9)]]
        if num_mc_samples == 1:
            lower_bound_u = F.transpose(F.reshape(lower_bound_u, (n_types_of_label, -1)))
        else:
            lower_bound_u = F.reshape(lower_bound_u, (n_types_of_label, num_mc_samples * batchsize_u))
            lower_bound_u = F.transpose(lower_bound_u)

        # take expectations w.r.t y
        if num_mc_samples == 1:
            x_u = self.to_variable(x_u_cpu_data)
        else:
            x_u = self.to_variable(np.repeat(x_u_cpu_data, num_mc_samples, axis=0))
        a_mean_u, a_ln_var_u = self.q_a_x(x_u, test=test)
        a_u = F.gaussian(a_mean_u, a_ln_var_u)
        y_distribution = F.softmax(self.q_y_ax(a_u, x_u, test=test))
        lower_bound_u = y_distribution * (lower_bound_u - F.log(y_distribution + 1e-6))

        # take the average
        if num_mc_samples > 1:
            lower_bound_u /= num_mc_samples

        lb_labeled = F.sum(lower_bound_l) / batchsize_l
        lb_unlabeled = F.sum(lower_bound_u) / batchsize_u
        lower_bound = lb_labeled + lb_unlabeled
    else:
        lb_unlabeled = None
        lb_labeled = F.sum(lower_bound_l) / batchsize_l
        lower_bound = lb_labeled

    return lower_bound, lb_labeled, lb_unlabeled
def forward(self, x):
    y1 = F.log(x)
    return y1
def get_kl_categorical_uniform(preds, num_atoms, num_edge_types,
                               add_const=False, eps=1e-16):
    kl_div = preds * F.log(preds + eps)
    if add_const:
        const = F.log(preds.xp.array(num_edge_types, dtype=preds.dtype))
        kl_div += const
    return F.sum(kl_div) / (num_atoms * preds.shape[0])
# setup optimizers
optimizerG = optimizers.Adam(alpha=0.001)
optimizerG.setup(G)
optimizerG.add_hook(chainer.optimizer.WeightDecay(0.0005))
optimizerD = optimizers.Adam(alpha=0.001)
optimizerD.setup(D)
optimizerD.add_hook(chainer.optimizer.WeightDecay(0.0005))

batch_size = args.batch
for i in xrange(args.iter):
    for k in xrange(args.ksteps):
        optimizerD.zero_grads()
        z_batch = gen.sample(batch_size)
        x_batch = data.sample(batch_size)
        # check the sign
        loss_d = F.sum(-F.log(D(x_batch))
                       - F.log(np.ones([batch_size, 1]) - D(G(z_batch)))) / batch_size
        loss_d.backward()
        optimizerD.update()
    # generator update (assumed to run once per iteration, after the k discriminator steps)
    optimizerG.zero_grads()
    z_batch = gen.sample(batch_size)
    x_batch = data.sample(batch_size)
    # loss_g = F.sum(F.log(np.ones([batch_size, 1]) - D(G(z_batch)))) / batch_size
    loss_g = -F.sum(F.log(D(G(z_batch)))) / batch_size
    loss_g.backward()
    optimizerG.update()
    if i != 0 and i % 100 == 0:
def sigmoid_cross_entropy(x, z):
    return F.relu(x) - x * z + F.log(1 + F.exp(-abs(x)))
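# A small check (added for illustration, not part of the original source): the
# max(x, 0) - x*z + log(1 + exp(-|x|)) form above is the numerically stable
# rewrite of the naive binary cross-entropy -z*log(sigmoid(x)) - (1-z)*log(1-sigmoid(x)).
# The toy logits and targets below are assumptions.
import numpy as np
import chainer.functions as F

x = np.array([-3.0, 0.5, 4.0], dtype=np.float32)   # logits
z = np.array([0.0, 1.0, 1.0], dtype=np.float32)    # binary targets
stable = sigmoid_cross_entropy(x, z)
p = F.sigmoid(x)
naive = -(F.log(p) * z + F.log(1 - p) * (1 - z))
assert np.allclose(stable.data, naive.data, atol=1e-5)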
def square_norm(x, y):
    return F.sum((F.log(x) - F.log(y)) ** 2) / batchsize
def entropy(p):
    return -F.sum(F.log(p) * p)
def soft_cross_entropy_loss(self, x, p):
    # add a small value inside log() to guard against numerical instability
    loss = -F.mean(F.sum(p * F.log(1e-30 + self.predict_proba(x)), axis=1))
    # reporting loss
    reporter.report({'loss': loss}, self)
    return loss
def compute_entropy(self, p):
    if p.ndim == 2:
        return -F.sum(p * F.log(p + 1e-16), axis=1)
    return -F.sum(p * F.log(p + 1e-16))
def __init__(self, mean, var):
    self.mean = distribution._wrap_by_variable(mean)
    self.var = distribution._wrap_by_variable(var)
    self.ln_var = F.log(var)
def compute_kld(self, p, q):
    assert self.get_batchsize(p) == self.get_batchsize(q)
    return F.reshape(
        F.sum(p * (F.log(p + 1e-16) - F.log(q + 1e-16)), axis=1), (-1, 1))
def oneplus(x):
    return 1 + F.log(1 + F.exp(x))
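# Note (added for illustration): oneplus maps any real x into (1, inf) and equals
# 1 + softplus(x). A sketch of an overflow-safe equivalent using Chainer's
# built-in softplus, which applies the max-shift trick internally:
def oneplus_stable(x):
    return 1 + F.softplus(x)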
def all_log_prob(self):
    with chainer.force_backprop_mode():
        return F.log(self.all_prob)
def kl_divergence(y, t):
    entropy = -F.sum(t[t.nonzero()] * F.log(t[t.nonzero()]))
    cross_entropy = -F.sum(t * F.log_softmax(y))
    return (cross_entropy - entropy) / y.shape[0]
def distance(y0, y1):
    p0 = F.sigmoid(y0)
    p1 = F.sigmoid(y1)
    return F.sum(p0 * F.log((p0 + 1e-8) / (p1 + 1e-8))
                 + (1 - p0) * F.log((1 - p0 + 1e-8) / (1 - p1 + 1e-8))) / \
        p0.data.shape[0]
def test_parameterized_softmax_distr(self):
    import numpy as np

    """test output dimension"""
    segs = (tuple(), (12, 3, 11), (12, ))
    x = np.arange(39, dtype=np.float32)
    x2 = 2 * x
    logits = F.stack((x, x2))
    beta = 1.0
    parameterized_distr = ParamSoftmaxDistribution(logits, segs, beta)
    p = parameterized_distr.all_prob
    log_p = parameterized_distr.all_log_prob
    # sanity check
    assert p.shape[1] == 1 + 12 * 3 * 11 + 12
    assert log_p.shape[1] == 1 + 12 * 3 * 11 + 12

    """test probabilities and log probabilities calculated"""
    x = np.arange(15, dtype=np.float32)
    x2 = 2 * x
    logits = F.stack((x, x2))
    num_of_cols = 6

    def probs():
        action_type = F.softmax(beta * logits[:, :3])
        back_prob = action_type[:, 0:1]
        filter_prob = action_type[:, 1:2]
        group_prob = action_type[:, 2:3]
        filter_col_prob = F.softmax(beta * logits[:, 3:3 + num_of_cols])
        filter_col_prob = F.broadcast_to(
            filter_prob, filter_col_prob.shape) * filter_col_prob
        group_col_prob = F.softmax(
            beta * logits[:, 3 + num_of_cols:3 + num_of_cols + num_of_cols])
        group_col_prob = F.broadcast_to(
            group_prob, group_col_prob.shape) * group_col_prob
        res = F.concat((back_prob, filter_col_prob, group_col_prob))
        # sanity check, sum(result, axis=1) == 1
        assert np.all(F.sum(res, axis=1).data == 1.0)
        return res

    def log_probs():
        action_type = F.log_softmax(beta * logits[:, :3])
        back_prob = action_type[:, 0:1]
        filter_prob = action_type[:, 1:2]
        group_prob = action_type[:, 2:3]
        filter_col_prob = F.log_softmax(beta * logits[:, 3:3 + num_of_cols])
        filter_col_prob = F.broadcast_to(
            filter_prob, filter_col_prob.shape) + filter_col_prob
        group_col_prob = F.log_softmax(
            beta * logits[:, 3 + num_of_cols:3 + num_of_cols + num_of_cols])
        group_col_prob = F.broadcast_to(
            group_prob, group_col_prob.shape) + group_col_prob
        res = F.concat((back_prob, filter_col_prob, group_col_prob))
        return res

    segs = (tuple(), (num_of_cols, ), (num_of_cols, ))
    parameterized_distr = ParamSoftmaxDistribution(logits, segs, beta)
    p = probs()
    p2 = parameterized_distr.all_prob
    # sanity check 2: probs() == generic_probs()
    assert np.allclose(p.data, p2.data)
    log_p = log_probs()
    log_p2 = parameterized_distr.all_log_prob
    # sanity check 3: log_probs() == generic_log_probs()
    assert np.allclose(log_p.data, log_p2.data)
    # sanity check 4: log(p) == log_p
    assert np.allclose(F.log(p).data, log_p.data)
def loss_softmax_cross_entropy(self, predict, ground_truth):
    eps = 1e-16
    cross_entropy = -F.mean(F.log(predict + eps) * ground_truth)
    return cross_entropy
def forward(model, batch, num_samples, word_keep_rate, UNK, train=True):
    batch_size = batch.shape[0]
    xp = model.xp
    use_gpu = (xp == cuda.cupy)
    if use_gpu:
        batch = cuda.to_gpu(batch)
    model.reset_state()
    model.zerograds()

    # encode
    batch_length = len(batch[0]) - 1
    for i in range(batch_length):
        w = Variable(batch[:, i])
        model.encode(w, train=train)

    # infer q(z|x)
    model.infer(train=train)

    # compute KL
    KL = 0
    for i in range(model.num_layers):
        # h
        mu, sigma = model.hmus[i], model.hsigmas[i]
        KL += -F.sum((1 + 2 * F.log(sigma) - sigma * sigma - mu * mu) / 2)
        # c
        mu, sigma = model.cmus[i], model.csigmas[i]
        KL += -F.sum((1 + 2 * F.log(sigma) - sigma * sigma - mu * mu) / 2)
    KL /= batch_size

    # draw and decode
    cross_entropies = []
    if not train:
        ys, ts = [], []
    UNKs = np.array([UNK for _ in range(batch_size)], dtype=np.int32)
    if use_gpu:
        UNKs = cuda.to_gpu(UNKs)
    for _ in range(num_samples):
        cross_entropies.append(0)
        if not train:
            ys.append([])
            ts.append([])
        if train == True:
            model.set_by_sample(train=train)
        else:
            model.set_by_MLE(train=train)
        last_w = None
        for i in range(batch_length):
            w, next_w = Variable(batch[:, i]), Variable(batch[:, i + 1])
            # word dropout
            masked_w = batch[:, i]
            if np.random.uniform() > word_keep_rate:
                enable = (masked_w != -1)
                masked_w = F.where(enable, masked_w, UNKs)
            y = model.decode(masked_w, train=train)
            cross_entropies[-1] += F.softmax_cross_entropy(y, next_w)
            if not train:
                ys[-1].append(xp.argmax(y.data, axis=1))
                ts[-1].append(next_w.data)
            last_w = next_w
        if not train:
            ys[-1] = xp.vstack(ys[-1]).T
            ts[-1] = xp.vstack(ts[-1]).T
            if use_gpu:
                ys[-1] = cuda.to_cpu(ys[-1])
                ts[-1] = cuda.to_cpu(ts[-1])
    if train:
        return (KL, cross_entropies)
    else:
        assert(len(cross_entropies) == 1 and len(ys) == 1 and len(ts) == 1)
        return (KL, (cross_entropies, ys, ts))
def _safe_log(x):
    """Logarithm function that won't backprop inf to input."""
    return F.log(F.where(x.data > 0, x, x.data))
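# A small sketch of the effect (added for illustration; the toy values are
# assumptions): entries that are exactly zero still give -inf in the forward
# pass, but no gradient is routed back to them, because non-positive entries
# go through the constant x.data branch of F.where.
import numpy as np
from chainer import Variable
import chainer.functions as F

x = Variable(np.array([0.0, 1.0, 2.0], dtype=np.float32))
y = _safe_log(x)
y.grad = np.ones(3, dtype=np.float32)
y.backward()
print(x.grad)  # [0. 1. 0.5] -- the zero entry gets gradient 0 instead of inf/nan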
def __call__(self, *args):
    density = self.predictor(*args)
    nll = -F.sum(F.log(density))
    report({'nll': nll}, self)
    return nll
def compute_lower_bound_loss(self, labeled_x, labeled_y, label_ids, unlabeled_x, test=False):

    def lower_bound(log_px_zy, log_py, log_pz, log_qz_xy):
        lb = log_px_zy + log_py + log_pz - log_qz_xy
        return lb

    # _l: labeled
    # _u: unlabeled
    batchsize_l = labeled_x.data.shape[0]
    batchsize_u = unlabeled_x.data.shape[0]
    num_types_of_label = labeled_y.data.shape[1]
    xp = self.xp

    ### Lower bound for labeled data ###
    # Compute eq.6 -L(x,y)
    z_mean_l, z_ln_var_l = self.encoder_xy_z(labeled_x, labeled_y, test=test, apply_f=False)
    z_l = F.gaussian(z_mean_l, z_ln_var_l)
    log_px_zy_l = self.log_px_zy(labeled_x, z_l, labeled_y, test=test)
    log_py_l = self.log_py(labeled_y, test=test)
    if False:
        log_pz_l = self.log_pz(z_l, z_mean_l, z_ln_var_l, test=test)
        log_qz_xy_l = self.log_qz_xy(z_l, z_mean_l, z_ln_var_l, test=test)
        lower_bound_l = lower_bound(log_px_zy_l, log_py_l, log_pz_l, log_qz_xy_l)
    else:
        lower_bound_l = log_px_zy_l + log_py_l - self.gaussian_kl_divergence_keepbatch(z_mean_l, z_ln_var_l)

    if batchsize_u > 0:
        ### Lower bound for unlabeled data ###
        # To marginalize y, we repeat unlabeled x, and construct a target
        # (batchsize_u * num_types_of_label) x num_types_of_label.
        # Example of n-dimensional x and target matrix for a 3 class problem and batch_size=2.
        #  unlabeled_x_ext                  y_ext
        # [[x0[0], x0[1], ..., x0[n]]      [[1, 0, 0]
        #  [x1[0], x1[1], ..., x1[n]]       [1, 0, 0]
        #  [x0[0], x0[1], ..., x0[n]]       [0, 1, 0]
        #  [x1[0], x1[1], ..., x1[n]]       [0, 1, 0]
        #  [x0[0], x0[1], ..., x0[n]]       [0, 0, 1]
        #  [x1[0], x1[1], ..., x1[n]]]      [0, 0, 1]]
        unlabeled_x_ext = xp.zeros((batchsize_u * num_types_of_label, unlabeled_x.data.shape[1]), dtype=xp.float32)
        y_ext = xp.zeros((batchsize_u * num_types_of_label, num_types_of_label), dtype=xp.float32)
        for n in xrange(num_types_of_label):
            y_ext[n * batchsize_u:(n + 1) * batchsize_u, n] = 1
            unlabeled_x_ext[n * batchsize_u:(n + 1) * batchsize_u] = unlabeled_x.data
        y_ext = Variable(y_ext)
        unlabeled_x_ext = Variable(unlabeled_x_ext)

        # Compute eq.6 -L(x,y) for unlabeled data
        z_mean_u_ext, z_mean_ln_var_u_ext = self.encoder_xy_z(unlabeled_x_ext, y_ext, test=test, apply_f=False)
        z_u_ext = F.gaussian(z_mean_u_ext, z_mean_ln_var_u_ext)
        log_px_zy_u = self.log_px_zy(unlabeled_x_ext, z_u_ext, y_ext, test=test)
        log_py_u = self.log_py(y_ext, test=test)
        if False:
            log_pz_u = self.log_pz(z_u_ext, z_mean_u_ext, z_mean_ln_var_u_ext, test=test)
            log_qz_xy_u = self.log_qz_xy(z_u_ext, z_mean_u_ext, z_mean_ln_var_u_ext, test=test)
            lower_bound_u = lower_bound(log_px_zy_u, log_py_u, log_pz_u, log_qz_xy_u)
        else:
            lower_bound_u = log_px_zy_u + log_py_u - self.gaussian_kl_divergence_keepbatch(z_mean_u_ext, z_mean_ln_var_u_ext)

        # Compute eq.7 sum_y{q(y|x){-L(x,y) + H(q(y|x))}}
        # Let LB(xn, y) be the lower bound for an input image xn and a label y (y = 0, 1, ..., 9).
        # Let bs be the batchsize.
        #
        # lower_bound_u is a vector and it looks like...
        # [LB(x0,0), LB(x1,0), ..., LB(x_bs,0), LB(x0,1), LB(x1,1), ..., LB(x_bs,1), ..., LB(x0,9), LB(x1,9), ..., LB(x_bs,9)]
        #
        # After reshaping. (axis 1 corresponds to label, axis 2 corresponds to batch)
        # [[LB(x0,0), LB(x1,0), ..., LB(x_bs,0)],
        #  [LB(x0,1), LB(x1,1), ..., LB(x_bs,1)],
        #   ...
        #  [LB(x0,9), LB(x1,9), ..., LB(x_bs,9)]]
        #
        # After transposing. (axis 1 corresponds to batch)
        # [[LB(x0,0), LB(x0,1), ..., LB(x0,9)],
        #  [LB(x1,0), LB(x1,1), ..., LB(x1,9)],
        #   ...
        #  [LB(x_bs,0), LB(x_bs,1), ..., LB(x_bs,9)]]
        lower_bound_u = F.transpose(F.reshape(lower_bound_u, (num_types_of_label, batchsize_u)))
        y_distribution = self.encoder_x_y(unlabeled_x, test=test, softmax=True)
        lower_bound_u = y_distribution * (lower_bound_u - F.log(y_distribution + 1e-6))

        loss_labeled = -F.sum(lower_bound_l) / batchsize_l
        loss_unlabeled = -F.sum(lower_bound_u) / batchsize_u
        loss = loss_labeled + loss_unlabeled
    else:
        loss_unlabeled = None
        loss_labeled = -F.sum(lower_bound_l) / batchsize_l
        loss = loss_labeled

    return loss, loss_labeled, loss_unlabeled