def call(self, x, reconstruction=False):
    self.reconstruction = reconstruction
    x_t = x[:, :self.d_dim, :]
    d = x[:, 2*self.d_dim:3*self.d_dim, :]
    if reconstruction:
        output_dim = self.time_stamp
        m = x[:, 3*self.d_dim:, :]
        ref_t = K.tile(d[:, :, None, :], (1, 1, output_dim, 1))
    else:
        m = x[:, self.d_dim: 2*self.d_dim, :]
        ref_t = np.linspace(0, self.hours_look_ahead, self.ref_points)
        output_dim = self.ref_points
        ref_t.shape = (1, ref_t.shape[0])
    #x_t = x_t*m
    d = K.tile(d[:, :, :, None], (1, 1, 1, output_dim))
    mask = K.tile(m[:, :, :, None], (1, 1, 1, output_dim))
    x_t = K.tile(x_t[:, :, :, None], (1, 1, 1, output_dim))
    norm = (d - ref_t)*(d - ref_t)
    a = K.ones((self.d_dim, self.time_stamp, output_dim))
    pos_kernel = K.log(1 + K.exp(self.kernel))
    alpha = a*pos_kernel[:, np.newaxis, np.newaxis]
    w = K.logsumexp(-alpha*norm + K.log(mask), axis=2)
    w1 = K.tile(w[:, :, None, :], (1, 1, self.time_stamp, 1))
    w1 = K.exp(-alpha*norm + K.log(mask) - w1)
    y = K.sum(w1*x_t, axis=2)
    if reconstruction:
        rep1 = tf.concat([y, w], 1)
    else:
        w_t = K.logsumexp(-10.0*alpha*norm + K.log(mask),
                          axis=2)  # kappa = 10
        w_t = K.tile(w_t[:, :, None, :], (1, 1, self.time_stamp, 1))
        w_t = K.exp(-10.0*alpha*norm + K.log(mask) - w_t)
        y_trans = K.sum(w_t*x_t, axis=2)
        rep1 = tf.concat([y, w, y_trans], 1)
    return rep1
def sparse_multilabel_categorical_crossentropy(y_true, y_pred, mask_zero=False):
    """Sparse version of the multi-label categorical crossentropy.
    Notes:
    1. y_true.shape = [..., num_positive],
       y_pred.shape = [..., num_classes];
    2. make sure the values of y_pred range over all real numbers; in other
       words, y_pred generally should not go through an activation, and in
       particular must not go through sigmoid or softmax;
    3. at prediction time, output the classes whose y_pred is greater than 0;
    4. see https://kexue.fm/archives/7359 for details.
    """
    zeros = K.zeros_like(y_pred[..., :1])
    y_pred = K.concatenate([y_pred, zeros], axis=-1)
    if mask_zero:
        infs = zeros + K.infinity()
        y_pred = K.concatenate([infs, y_pred[..., 1:]], axis=-1)
    y_pos_2 = batch_gather(y_pred, y_true)
    y_pos_1 = K.concatenate([y_pos_2, zeros], axis=-1)
    if mask_zero:
        y_pred = K.concatenate([-infs, y_pred[..., 1:]], axis=-1)
        y_pos_2 = batch_gather(y_pred, y_true)
    pos_loss = K.logsumexp(-y_pos_1, axis=-1)
    all_loss = K.logsumexp(y_pred, axis=-1)
    aux_loss = K.logsumexp(y_pos_2, axis=-1) - all_loss
    aux_loss = K.clip(1 - K.exp(aux_loss), K.epsilon(), 1)
    neg_loss = all_loss + K.log(aux_loss)
    return pos_loss + neg_loss
def weighted_sum(first, second, sigma,
                 first_threshold=-np.inf, second_threshold=np.inf):
    first_normalized = first - kb.logsumexp(first, axis=-1)[..., None]
    second_normalized = second - kb.logsumexp(second, axis=-1)[..., None]
    # sigma.shape = (1,), first_normalized.shape = (T1, ..., Tm, d)
    # logit_probs.shape = (T1, ..., Tm, d)
    logit_probs = first_normalized * sigma + second_normalized * (1.0 - sigma)
    # logit_probs = kb.batch_dot(first_normalized, sigma) + kb.batch_dot(second_normalized, 1.0 - sigma)
    first_mask = (first_normalized < first_threshold).nonzero()
    logit_probs = kb.T.set_subtensor(logit_probs[first_mask], -np.inf)
    second_mask = (second_normalized < second_threshold).nonzero()
    logit_probs = kb.T.set_subtensor(logit_probs[second_mask], -np.inf)
    return logit_probs
def multilabel_categorical_crossentropy(y_true, y_pred):
    """Multi-label categorical crossentropy.
    Notes: y_true and y_pred have the same shape, and the elements of y_true
    are either 0 or 1, where 1 marks the corresponding class as a target
    class and 0 marks it as a non-target class.
    """
    y_pred = (1 - 2 * y_true) * y_pred
    y_pred_neg = y_pred - y_true * 1e12
    y_pred_pos = y_pred - (1 - y_true) * 1e12
    zeros = K.zeros_like(y_pred[..., :1])
    y_pred_neg = K.concatenate([y_pred_neg, zeros], axis=-1)
    y_pred_pos = K.concatenate([y_pred_pos, zeros], axis=-1)
    neg_loss = K.logsumexp(y_pred_neg, axis=-1)
    pos_loss = K.logsumexp(y_pred_pos, axis=-1)
    return neg_loss + pos_loss
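# A minimal usage sketch for the loss above, assuming a tf.keras setup in
# which K is tensorflow.keras.backend; the layer sizes and the toy data are
# illustrative only, not part of the original snippet.
import numpy as np
import tensorflow as tf
from tensorflow.keras import backend as K

num_classes = 10
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(32,)),
    tf.keras.layers.Dense(num_classes),  # linear output: no sigmoid/softmax
])
model.compile(optimizer='adam', loss=multilabel_categorical_crossentropy)

x = np.random.randn(8, 32).astype('float32')
y = (np.random.rand(8, num_classes) > 0.7).astype('float32')  # multi-hot labels
model.fit(x, y, epochs=1, verbose=0)
pred = (model.predict(x) > 0).astype(int)  # predict the classes whose score > 0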
def step(self, input_energy_t, states, return_logZ=True):
    prev_target_val, i, chain_energy = states[:3]
    t = K.cast(i[0, 0], dtype='int32')
    if len(states) > 3:
        if K.backend() == 'theano':
            m = states[3][:, t:(t + 2)]
        else:
            m = K.tf.slice(states[3], [0, t], [-1, 2])
        input_energy_t = input_energy_t * K.expand_dims(m[:, 0])
        chain_energy = chain_energy * K.expand_dims(
            K.expand_dims(
                m[:, 0] * m[:, 1]))  # (1, F, F)*(B, 1, 1) -> (B, F, F)
    if return_logZ:
        energy = chain_energy + K.expand_dims(
            input_energy_t - prev_target_val,
            2)  # shapes: (1, B, F) + (B, F, 1) -> (B, F, F)
        new_target_val = K.logsumexp(-energy, 1)  # shapes: (B, F)
        return new_target_val, [new_target_val, i + 1]
    else:
        energy = chain_energy + K.expand_dims(
            input_energy_t + prev_target_val, 2)
        min_energy = K.min(energy, 1)
        argmin_table = K.cast(K.argmin(energy, 1),
                              K.floatx())  # cast for tf-version `K.rnn`
        return argmin_table, [min_energy, i + 1]
def loss(self, y_true, y_pred):
    """Negative log pdf. Uses the log-sum-exp trick for numerical stability."""
    mixture_weights, mu, sigma = self.split_param_types(y_pred)
    norm = 1. / (np.sqrt(2. * np.pi) * sigma)
    exponent = -(K.square(y_true - mu) / (2. * K.square(sigma))
                 - K.log(mixture_weights) - K.log(norm))
    return -K.logsumexp(exponent, axis=-1)
def _gmd_log_likelihood(y_true, y_pred):
    """Log-likelihood loss for Gaussian Mixture Densities.

    Currently only supports the TensorFlow backend.

    Args:
        y_true (tensor): A tensor of shape (samples, c) with the target values.
        y_pred (tensor): Tensor of shape (samples, m*(c + 2)), where m is the
            number of Gaussians. The second dimension encodes the following
            parameters (in that order):
                1) m log-priors (outputs of a log-softmax activation layer)
                2) m variances (outputs of a ShiftedELU activation layer)
                3) m*c means (outputs of a linear activation layer)

    Returns:
        Average negative log-likelihood of each sample.
    """
    splits = [m, m, m * c]
    # Get the GMD parameters.
    # Parameters are concatenated along the second axis;
    # tf.split expects sizes, not locations.
    log_prior, sigma_sq, mu = K.tf.split(y_pred, num_or_size_splits=splits, axis=1)
    y_true = K.expand_dims(y_true, axis=2)
    mu = K.reshape(mu, [-1, c, m])  # -1 is for the sample dimension
    dist = K.sum(K.square(y_true - mu), axis=1)
    exponent = log_prior - c * HALF_LOG_TWOPI - (c / 2.0) * K.log(sigma_sq) \
        - (1 / 2.0) * dist / sigma_sq
    return -K.logsumexp(exponent, axis=1)
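# A plain NumPy sketch of the same mixture negative log-likelihood, assuming
# m components, c output dimensions, and isotropic per-component variances;
# the array shapes mirror what the Keras version splits out of y_pred, and
# the toy data below is illustrative only.
import numpy as np
from scipy.special import logsumexp

def gmd_nll_numpy(y_true, log_prior, sigma_sq, mu):
    # y_true: (n, c), log_prior: (n, m), sigma_sq: (n, m), mu: (n, c, m)
    n, c = y_true.shape
    half_log_twopi = 0.5 * np.log(2.0 * np.pi)
    dist = np.sum((y_true[:, :, None] - mu) ** 2, axis=1)               # (n, m)
    exponent = (log_prior - c * half_log_twopi
                - (c / 2.0) * np.log(sigma_sq) - 0.5 * dist / sigma_sq)
    return -logsumexp(exponent, axis=1)                                  # (n,)

rng = np.random.default_rng(0)
n, m, c = 4, 3, 2
nll = gmd_nll_numpy(rng.normal(size=(n, c)),
                    np.log(np.full((n, m), 1.0 / m)),
                    np.ones((n, m)),
                    rng.normal(size=(n, c, m)))
print(nll.shape)  # (4,)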
def step(self, input_energy_t, states, return_logZ=True):
    # Note: in the following, `prev_target_val` has shape = (B, F)
    # where B = batch_size, F = output feature dim
    # Note: `i` is of float32, due to the behavior of `K.rnn`
    prev_target_val, i, chain_energy = states[:3]
    t = K.cast(i[0, 0], dtype='int32')
    if len(states) > 3:
        if K.backend() == 'theano':
            m = states[3][:, t:(t + 2)]
        else:
            m = K.tf.slice(states[3], [0, t], [-1, 2])
        input_energy_t = input_energy_t * K.expand_dims(m[:, 0])
        chain_energy = chain_energy * K.expand_dims(
            K.expand_dims(
                m[:, 0] * m[:, 1]))  # (1, F, F)*(B, 1, 1) -> (B, F, F)
    if return_logZ:
        energy = chain_energy + K.expand_dims(
            input_energy_t - prev_target_val,
            2)  # shapes: (1, B, F) + (B, F, 1) -> (B, F, F)
        new_target_val = K.logsumexp(-energy, 1)  # shapes: (B, F)
        return new_target_val, [new_target_val, i + 1]
    else:
        energy = chain_energy + K.expand_dims(
            input_energy_t + prev_target_val, 2)
        min_energy = K.min(energy, 1)
        argmin_table = K.cast(K.argmin(energy, 1),
                              K.floatx())  # cast for tf-version `K.rnn`
        return argmin_table, [min_energy, i + 1]
def call(self, x):
    # Construct the pairwise distance matrix
    D = pairwise_dists(x, x, epsilon=self.epsilon)
    J = []
    # We need to loop through all positive pairs. Since we know
    # the structure of the batch, this is not too difficult.
    for c in range(self.p):  # Loop through classes
        for i in range(self.k):
            for j in range(i + 1, self.k):
                row_i = c * self.k + i
                row_j = c * self.k + j
                rows = K.gather(
                    D, K.constant([row_i, row_j], dtype=K.tf.int32))
                rows = K.concatenate([
                    K.tf.slice(rows, begin=[0, 0], size=[2, c * self.k]),
                    K.tf.slice(rows,
                               begin=[0, (c + 1) * self.k],
                               size=[2, (self.p - c - 1) * self.k])
                ], axis=1)
                rows = K.flatten(rows)
                J.append(K.logsumexp(self.margin - rows) + D[row_i, row_j])
    J = K.stack(J)
    return K.mean(K.square(K.relu(J))) / 2.0
def discriminate_real(y_output, batch_size=batch_size):
    # logD(x) = logZ(x) - log(Z(x) + 1) where Z(x) = sum_{k=1}^K exp(l_k(x))
    log_zx = K.logsumexp(y_output, axis=1)
    log_dx = log_zx - K.softplus(log_zx)
    dx = K.sum(K.exp(log_dx)) / batch_size
    loss = -K.sum(log_dx) / batch_size
    return loss, dx
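# A quick numerical check of the identity in the comment above: with
# Z(x) = sum_k exp(l_k(x)),
# log D(x) = log Z(x) - log(Z(x) + 1) = log Z(x) - softplus(log Z(x)),
# since softplus(t) = log(1 + exp(t)). The toy logits are illustrative only.
import numpy as np
from scipy.special import logsumexp

logits = np.array([2.0, -1.0, 0.5])
log_z = logsumexp(logits)
lhs = log_z - np.log(np.exp(log_z) + 1.0)
rhs = log_z - np.logaddexp(0.0, log_z)  # softplus via logaddexp
print(np.isclose(lhs, rhs))  # True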
def test_logsumexp(self, x_np, axis, keepdims, K):
    '''
    Check if K.logsumexp works properly for values close to one.
    '''
    x = K.variable(x_np)
    assert_allclose(K.eval(K.logsumexp(x, axis=axis, keepdims=keepdims)),
                    np.log(np.sum(np.exp(x_np), axis=axis, keepdims=keepdims)),
                    rtol=1e-5)
def test_logsumexp_optim(self, K):
    '''
    Check if optimization works.
    '''
    x_np = np.array([1e+4, 1e-4])
    assert_allclose(K.eval(K.logsumexp(K.variable(x_np), axis=0)),
                    1e4,
                    rtol=1e-5)
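# Why the test above expects K.logsumexp([1e4, 1e-4]) to come out as 1e4:
# the naive formula overflows, while the usual max-shift trick stays finite.
# A NumPy sketch of both, independent of any backend:
import numpy as np

def logsumexp_naive(x):
    return np.log(np.sum(np.exp(x)))  # exp(1e4) overflows to inf

def logsumexp_stable(x):
    m = np.max(x)
    return m + np.log(np.sum(np.exp(x - m)))

x = np.array([1e4, 1e-4])
print(logsumexp_naive(x))   # inf (with an overflow warning)
print(logsumexp_stable(x))  # 10000.0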
def loss(self, y_true, y_pred):  # the target labels must be one-hot encoded
    mask = 1 - y_true[:, 1:, -1] if self.ignore_last_label else None
    y_true, y_pred = y_true[:, :, :self.num_labels], y_pred[:, :, :self.num_labels]
    init_states = [y_pred[:, 0]]  # initial state
    log_norm, _, _ = K.rnn(self.log_norm_step, y_pred[:, 1:], init_states,
                           mask=mask)  # compute the Z vector (in log space)
    log_norm = K.logsumexp(log_norm, 1, keepdims=True)  # compute Z (in log space)
    path_score = self.path_score(y_pred, y_true)  # compute the numerator (in log space)
    return log_norm - path_score  # i.e. log(numerator / denominator)
def free_energy0(x, U, mask=None):
    """Free energy without boundary potential handling."""
    initial_states = [x[:, 0, :]]
    last_alpha, _ = _forward(x,
                             lambda B: [K.logsumexp(B, axis=1)],
                             initial_states,
                             U,
                             mask)
    return last_alpha[:, 0]
def get_loss(self, args):
    logits, action, weights = args
    action = tf.reshape(action, [-1])
    mask = tf.one_hot(action, depth=self.action_size, dtype=tf.float32)
    logpi = tf.reduce_sum(
        (logits - tf.transpose([K.logsumexp(logits, axis=-1)])) * mask,
        axis=-1)
    logpi_w = tf.transpose([logpi]) * weights
    return logpi_w
def log_norm_step(self, inputs, states):
    """Recursively compute the normalization factor.
    Key points: 1. compute it recursively; 2. use logsumexp to avoid overflow.
    Trick: align the tensors with expand_dims.
    """
    states = K.expand_dims(states[0], 2)     # (batch_size, output_dim, 1)
    trans = K.expand_dims(self.trans, 0)     # (1, output_dim, output_dim)
    output = K.logsumexp(states + trans, 1)  # (batch_size, output_dim)
    return output + inputs, [output + inputs]
def consensus_categorical_crossentropy(y_true, y_pred):
    # y_pred = tf.nn.softmax(y_pred, axis=-1)
    y_pred /= tf.reduce_sum(y_pred, len(y_pred.get_shape()) - 1, True)
    # print y_pred.shape
    y_pred = K.clip(y_pred, K.epsilon(), 1.0 - K.epsilon())
    # print y_true
    # print K.sum(y_true * (y_pred - K.logsumexp(y_pred)), axis=-1)
    return -tf.reduce_sum(y_true * (y_pred - K.logsumexp(y_pred)),
                          len(y_pred.get_shape()) - 1)
def log_norm_step(self, inputs, states):
    """Recursively compute the normalization factor.
    Key points: 1. compute it recursively; 2. use logsumexp to avoid overflow.
    Trick: align the tensors with expand_dims. The original formulation takes
    exp first and then does a matrix multiplication; to prevent overflow, the
    matrices are added first and the exp is taken afterwards, and that exp is
    carried out with logsumexp, which effectively prevents overflow.
    """
    states = K.expand_dims(states[0], 2)     # (batch_size, output_dim, 1)
    trans = K.expand_dims(self.trans, 0)     # (1, output_dim, output_dim)
    output = K.logsumexp(states + trans, 1)  # (batch_size, output_dim)
    return output + inputs, [output + inputs]
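# A NumPy sketch of the same forward recursion for a single sequence,
# assuming trans[i, j] is the score of moving from tag i to tag j and
# emissions has shape (seq_len, num_tags); the toy data is illustrative only.
import numpy as np
from scipy.special import logsumexp

def crf_log_norm(emissions, trans):
    log_alpha = emissions[0]                                          # (num_tags,)
    for emit_t in emissions[1:]:
        # log_alpha[i] + trans[i, j], reduced over the previous tag i
        log_alpha = logsumexp(log_alpha[:, None] + trans, axis=0) + emit_t
    return logsumexp(log_alpha)                                       # log Z

rng = np.random.default_rng(0)
print(crf_log_norm(rng.normal(size=(5, 4)), rng.normal(size=(4, 4))))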
def call(self, logits):
    norm_logits = logits - K.tile(
        K.logsumexp(logits, axis=-1, keepdims=True),
        (1, K.shape(logits)[1]))
    categorical = K.softmax(logits)
    kl = -K.sum(categorical * (norm_logits - K.log(data['pis'])), axis=-1)
    ll = K.transpose(K.stack([mode.log_prob(self._input) for mode in modes]))
    ll = K.sum(categorical * ll, axis=-1)
    elbo = ll - kl
    self.add_loss(-elbo, inputs=logits)
    return logits
def euclidean_distance(self, args):
    a, b = args
    N, D = K.shape(a)[0], K.shape(a)[1]
    M = K.shape(b)[0]
    a = K.expand_dims(a, axis=1)
    b = K.expand_dims(b, axis=0)
    a = K.tile(a, [1, M, 1])
    b = K.tile(b, [N, 1, 1])
    dist = K.mean(K.square(a - b), axis=2)
    return -dist - K.logsumexp(-dist)  # tf.nn.log_softmax(-dist)
def _get_weights(self):
    log_likelihood = -self.nll
    log_p = K.sum([q.prior.log_prob(q.samples) for q in self.latents], axis=0)
    log_q = K.sum([q.log_prob(q.samples) for q in self.latents], axis=0)
    log_weights = log_likelihood + log_p - log_q
    log_weights -= K.logsumexp(log_weights, axis=-1, keepdims=True)
    weights_unnormalized = K.exp(log_weights)
    return weights_unnormalized / K.sum(
        weights_unnormalized, axis=-1, keepdims=True)
def entropy_estimator_kl(x, var):
    # KL-based upper bound on the entropy of a mixture of Gaussians with
    # covariance matrix var * I.
    # See Kolchinsky and Tracey, Estimating Mixture Entropy with Pairwise
    # Distances, Entropy, 2017, Section 4,
    # and Kolchinsky and Tracey, Nonlinear Information Bottleneck, 2017, Eq. 10.
    dims, N = get_shape(x)
    dists = Kget_dists(x)
    dists2 = dists / (2 * var)
    normconst = (dims / 2.0) * K.log(2 * np.pi * var)
    lprobs = K.logsumexp(-dists2, axis=1) - K.log(N) - normconst
    h = -K.mean(lprobs)
    return dims / 2 + h
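# A NumPy sketch of the same pairwise-distance entropy bound, assuming x
# holds N samples of dimension d and var is the per-component variance of
# the Gaussian mixture; the toy data is illustrative only.
import numpy as np
from scipy.special import logsumexp

def entropy_upper_numpy(x, var):
    n, d = x.shape
    sq_dists = np.sum((x[:, None, :] - x[None, :, :]) ** 2, axis=-1)  # (n, n)
    normconst = (d / 2.0) * np.log(2 * np.pi * var)
    lprobs = logsumexp(-sq_dists / (2 * var), axis=1) - np.log(n) - normconst
    return d / 2.0 - np.mean(lprobs)

x = np.random.default_rng(0).normal(size=(100, 3))
print(entropy_upper_numpy(x, var=0.1))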
def multilabel_categorical_crossentropy(y_true, y_pred):
    """Multi-label categorical crossentropy.
    Notes:
    1. y_true and y_pred have the same shape, and the elements of y_true are
       either 0 or 1, where 1 marks the corresponding class as a target class
       and 0 marks it as a non-target class;
    2. make sure the values of y_pred range over all real numbers; in other
       words, y_pred generally should not go through an activation, and in
       particular must not go through sigmoid or softmax;
    3. at prediction time, output the classes whose y_pred is greater than 0;
    4. see https://kexue.fm/archives/7359 for details.
    """
    y_pred = (1 - 2 * y_true) * y_pred
    y_neg = y_pred - y_true * K.infinity()
    y_pos = y_pred - (1 - y_true) * K.infinity()
    zeros = K.zeros_like(y_pred[..., :1])
    y_neg = K.concatenate([y_neg, zeros], axis=-1)
    y_pos = K.concatenate([y_pos, zeros], axis=-1)
    neg_loss = K.logsumexp(y_neg, axis=-1)
    pos_loss = K.logsumexp(y_pos, axis=-1)
    return neg_loss + pos_loss
def entropy_upper(data, noise_variance):
    pairwise_dists = get_dists_backend(data)
    pairwise_dists /= (2 * noise_variance)
    N = K.cast(K.shape(data)[0], K.floatx())
    dims = K.cast(K.shape(data)[1], K.floatx())
    normconst = (dims / 2.0) * K.log(2 * np.pi * noise_variance)
    term1 = K.logsumexp(-pairwise_dists, axis=1) - K.log(N) - normconst
    return -K.mean(term1) + dims / 2
def get_mdn_coef(output):
    # the first axis is the batch dimension
    assert output.shape[1] % 3 == 0
    num_components = int(int(output.shape[1]) / 3)
    logmix = output[:, :num_components]
    mean = output[:, num_components:2 * num_components]
    logstd = output[:, 2 * num_components:]
    logmix = logmix - K.logsumexp(logmix, axis=1, keepdims=True)
    return logmix, mean, logstd
def for_each_batch(args):
    y_true, y_pred = args
    # y_pred_ = tf.boolean_mask(y_pred, tf.not_equal(y_true, -1))
    # y_true_ = tf.boolean_mask(y_true, tf.not_equal(y_true, -1))
    match = tf.cast(tf.equal(comb, y_true), tf.float32)
    num_matches_needed = len(
        tf.boolean_mask(y_true, tf.not_equal(y_true, -1)))
    y_true_combs = tf.boolean_mask(
        comb, K.sum(match, axis=1) == num_matches_needed)
    # yp = K.sum(K.log(y_true_*y_pred_ + (1-y_true_)*(1-y_pred_)))
    yp = K.logsumexp(
        K.sum(-K.binary_crossentropy(y_true_combs, y_pred), axis=1))
    # certain_combs = tf.numpy_function(lambda x: np.unique(x, axis=0), [tf.boolean_mask(self.comb, tf.not_equal(y_true, -1), axis=1)], tf.float32)
    # certain_combs = tf.Print(certain_combs, [certain_combs], 'Combs ')
    # yp -= K.logsumexp(K.sum(K.log(y_pred*self.comb + (1-y_pred)*(1-self.comb)), axis=1))
    yp -= K.logsumexp(
        K.sum(-K.binary_crossentropy(self.comb, y_pred), axis=1))
    return yp
def log_norm_step(self, inputs, states):
    """Recursively compute the normalization factor.
    Key points: 1. compute it recursively; 2. use logsumexp to avoid overflow.
    Trick: align the tensors with expand_dims.
    """
    inputs, mask = inputs[:, :-1], inputs[:, -1:]
    states = K.expand_dims(states[0], 2)      # (batch_size, output_dim, 1)
    trans = K.expand_dims(self.trans, 0)      # (1, output_dim, output_dim)
    outputs = K.logsumexp(states + trans, 1)  # (batch_size, output_dim)
    outputs = outputs + inputs
    outputs = mask * outputs + (1 - mask) * states[:, :, 0]
    return outputs, [outputs]
def loss(self, y_true, y_pred):  # the target labels must be one-hot encoded
    if self.ignore_last_label:
        mask = 1 - y_true[:, :, -1:]
    else:
        mask = K.ones_like(y_pred[:, :, :1])
    y_true, y_pred = y_true[:, :, :self.num_labels], y_pred[:, :, :self.num_labels]
    path_score = self.path_score(y_pred, y_true)  # compute the numerator (in log space)
    init_states = [y_pred[:, 0]]  # initial state
    y_pred = K.concatenate([y_pred, mask])
    log_norm, _, _ = K.rnn(self.log_norm_step, y_pred[:, 1:],
                           init_states)  # compute the Z vector (in log space)
    log_norm = K.logsumexp(log_norm, 1, keepdims=True)  # compute Z (in log space)
    return log_norm - path_score  # i.e. log(numerator / denominator)
def log_norm_pre(self, inputs, states):
    '''
    Expand previous states and inputs, then sum with trans.
    :param inputs: (batch_size, num_label), current word emission scores
    :param states: (batch_size, num_label), all-paths score of the previous word
    :return:
    '''
    states = K.expand_dims(states[0], 2)
    inputs = K.expand_dims(inputs, 1)
    trans = K.expand_dims(self.trans, 0)
    scores = states + trans + inputs
    output = K.logsumexp(scores, 1)
    return output, [output]
def call(self, inputs):
    inputs, labels = inputs  # the input is "predictions + targets (one-hot)"
    mask = 1 - labels[:, 1:, -1] if self.ignore_last_label else None
    inputs, labels = inputs[:, :, :self.num_labels], labels[:, :, :self.num_labels]
    init_states = [inputs[:, 0]]  # initial state
    log_norm, _, _ = K.rnn(self.log_norm_step, inputs[:, 1:], init_states,
                           mask=mask)  # compute the Z vector (in log space)
    log_norm = K.logsumexp(log_norm, 1, keepdims=True)  # compute Z (in log space)
    path_score = self.path_score(inputs, labels)  # compute the numerator (in log space)
    return log_norm - path_score  # i.e. log(numerator / denominator)
def sparse_amsoftmax_loss(y_true, y_pred, scale=30, margin=0.35):
    y_true = K.expand_dims(y_true[:, 0], 1)  # make sure y_true has shape (None, 1)
    y_true = K.cast(y_true, 'int32')  # make sure y_true has dtype int32
    batch_idxs = K.arange(0, K.shape(y_true)[0])
    batch_idxs = K.expand_dims(batch_idxs, 1)
    idxs = K.concatenate([batch_idxs, y_true], 1)
    y_true_pred = K.tf.gather_nd(y_pred, idxs)  # target scores, extracted with tf.gather_nd
    y_true_pred = K.expand_dims(y_true_pred, 1)
    y_true_pred_margin = y_true_pred - margin  # subtract the margin
    _Z = K.concatenate([y_pred, y_true_pred_margin], 1)  # for computing the partition function
    _Z = _Z * scale  # rescale, mainly because pred is a cosine value in [-1, 1]
    logZ = K.logsumexp(_Z, 1, keepdims=True)  # use logsumexp so the gradient does not vanish
    logZ = logZ + K.log(1 - K.exp(scale * y_true_pred - logZ))  # remove exp(scale * y_true_pred) from Z
    return - y_true_pred_margin * scale + logZ
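# A NumPy sanity check of the partition-function manipulation used above:
# appending the margined target score to Z and then removing
# exp(scale * y_true_pred) leaves -log softmax with the margined target logit.
# The cosine scores are illustrative; scale and margin follow the defaults.
import numpy as np
from scipy.special import logsumexp

scale, margin = 30.0, 0.35
cos_scores = np.array([0.7, 0.1, -0.2])  # cosine scores; class 0 is the target
target = cos_scores[0] - margin

logZ = logsumexp(scale * np.append(cos_scores, target))
logZ = logZ + np.log(1 - np.exp(scale * cos_scores[0] - logZ))
loss = -scale * target + logZ

direct = -np.log(np.exp(scale * target) /
                 (np.exp(scale * target) + np.sum(np.exp(scale * cos_scores[1:]))))
print(np.isclose(loss, direct))  # True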
def sparse_simpler_asoftmax_loss(y_true, y_pred, scale=30):
    y_true = K.expand_dims(y_true[:, 0], 1)  # make sure y_true has shape (None, 1)
    y_true = K.cast(y_true, 'int32')  # make sure y_true has dtype int32
    batch_idxs = K.arange(0, K.shape(y_true)[0])
    batch_idxs = K.expand_dims(batch_idxs, 1)
    idxs = K.concatenate([batch_idxs, y_true], 1)
    y_true_pred = K.tf.gather_nd(y_pred, idxs)  # target scores, extracted with tf.gather_nd
    y_true_pred = K.expand_dims(y_true_pred, 1)
    # expand cos(4*theta) with the quadruple-angle formula
    y_true_pred_margin = 1 - 8 * K.square(y_true_pred) + 8 * K.square(K.square(y_true_pred))
    # the following is equivalent to min(y_true_pred, y_true_pred_margin)
    y_true_pred_margin = y_true_pred_margin - K.relu(y_true_pred_margin - y_true_pred)
    _Z = K.concatenate([y_pred, y_true_pred_margin], 1)  # for computing the partition function
    _Z = _Z * scale  # rescale, mainly because pred is a cosine value in [-1, 1]
    logZ = K.logsumexp(_Z, 1, keepdims=True)  # use logsumexp so the gradient does not vanish
    logZ = logZ + K.log(1 - K.exp(scale * y_true_pred - logZ))  # remove exp(scale * y_true_pred) from Z
    return - y_true_pred_margin * scale + logZ
def logsumexp(x):
    return K.logsumexp(x, axis=1, keepdims=False)