def sub_mean(x):
    """Divide ``x`` by 255 and subtract the mean of the scaled values.

    The mean is computed by ``backend.mean`` without an axis argument, i.e.
    over every element of the scaled input.
    """
    scaled = x / 255
    return scaled - backend.mean(scaled)
def contrastive_loss(label, ED):
    """Contrastive loss over distances ``ED`` with a fixed margin of 1.

    Elements are weighted two ways and summed:

    * ``(1 - label) * 0.5 * ED ** 2``
    * ``label * 0.5 * max(margin - ED, 0) ** 2``

    The mean over all elements is returned.
    """
    # Original author's note: the images are scaled between 0 and 1.
    margin = 1
    squared_distance_term = (1 - label) * 0.5 * K.square(ED)
    hinge_term = label * 0.5 * K.square(K.maximum(margin - ED, 0))
    return K.mean(squared_distance_term + hinge_term)
def quantile_loss(q, y_true, y_pred):
    """Quantile (pinball) loss for level ``q``, averaged over the last axis.

    For the residual ``r = y_true - y_pred`` each element contributes
    ``max(q * r, (q - 1) * r)``.
    """
    residual = y_true - y_pred
    weighted_up = q * residual
    weighted_down = (q - 1) * residual
    return K.mean(K.maximum(weighted_up, weighted_down), axis=-1)
def l2_loss(y_true: NDArray, y_pred: NDArray):
    """Weighted sum-of-squares loss.

    Squared errors are summed over axes 1, 2 and 3, the result is averaged
    over axis 0, and the mean is multiplied by ``weight``. ``weight`` is not
    a parameter; it is resolved from the surrounding scope.
    """
    squared = K.square(y_true - y_pred)
    summed = K.sum(squared, axis=(1, 2, 3))
    averaged = K.mean(summed, axis=0)
    return averaged * weight
def sparse_crossentropy_masked(y_true, y_pred, pad_idx=0):
    """Sparse categorical cross-entropy that skips padded positions.

    Positions where ``y_true`` equals ``pad_idx`` are removed from both
    ``y_true`` and ``y_pred`` before the cross-entropy is computed; the mean
    of the remaining per-position losses is returned.
    """
    keep = tf.not_equal(y_true, pad_idx)
    kept_labels = tf.boolean_mask(y_true, keep)
    kept_predictions = tf.boolean_mask(y_pred, keep)
    per_position = K.sparse_categorical_crossentropy(kept_labels, kept_predictions)
    return K.mean(per_position)
def call(self, y_true, y_pred, tau=0.1):
    """Pinball loss for quantile level ``tau``, averaged over the last axis.

    Each element contributes ``max(tau * d, (tau - 1) * d)`` where
    ``d = y_true - y_pred``.
    """
    diff = y_true - y_pred
    pinball = kb.maximum(tau * diff, (tau - 1) * diff)
    return kb.mean(pinball, axis=-1)
def logloss(y_true, y_pred):
    """Binary log loss with predictions clipped to ``[P_MIN, P_MAX]``.

    Clipping happens before the logarithms are taken; the negated mean over
    all elements is returned.
    """
    clipped = tf.clip_by_value(y_pred, P_MIN, P_MAX)
    positive_part = y_true * backend.log(clipped)
    negative_part = (1 - y_true) * backend.log(1 - clipped)
    return -backend.mean(positive_part + negative_part)
def call(self, inputs):
    """
    Build the dense graph-attention layer as a Keras graph.

    The inputs are tensors with a batch dimension of 1: Keras requires a
    batch dimension, and for full-batch methods there is only a single
    "batch". Three inputs are required: the node features, the output
    indices (the nodes to select on the final layer) and the graph
    adjacency matrix.

    Notes:
        This method does not add self loops to the adjacency matrix.

        The output indices are only used when ``final_layer=True``.

    Args:
        inputs (list): list of inputs with 3 items:
            node features (size 1 x N x F),
            output indices (size 1 x M),
            graph adjacency matrix (size N x N),
            where N is the number of nodes in the graph,
            F is the dimensionality of node features,
            M is the number of output nodes.

    Returns:
        The activated, head-aggregated node features with a leading batch
        dimension of 1 added back; restricted to ``out_indices`` when
        ``self.final_layer`` is true.

    Raises:
        ValueError: if the static batch dimension of the node features is
            not 1.
    """
    X = inputs[0]  # Node features (1 x N x F)
    out_indices = inputs[1]  # Output indices (1 x M)
    A = inputs[2]  # Adjacency matrix (N x N)
    # Static size of the last axis of A; used to build ones/eye in the saliency branch.
    N = K.int_shape(A)[-1]

    # n_nodes is unpacked here but not used further in this method.
    batch_dim, n_nodes, _ = K.int_shape(X)
    if batch_dim != 1:
        raise ValueError(
            "Currently full-batch methods only support a batch dimension of one"
        )
    else:
        # Remove singleton batch dimension
        X = K.squeeze(X, 0)
        out_indices = K.squeeze(out_indices, 0)

    outputs = []
    for head in range(self.attn_heads):
        kernel = self.kernels[head]  # W in the paper (F x F')
        attention_kernel = self.attn_kernels[
            head
        ]  # Attention kernel a in the paper (2F' x 1)

        # Compute inputs to attention network
        features = K.dot(X, kernel)  # (N x F')

        # Compute feature combinations
        # Note: [[a_1], [a_2]]^T [[Wh_i], [Wh_j]] = [a_1]^T [Wh_i] + [a_2]^T [Wh_j]
        attn_for_self = K.dot(
            features, attention_kernel[0]
        )  # (N x 1), [a_1]^T [Wh_i]
        attn_for_neighs = K.dot(
            features, attention_kernel[1]
        )  # (N x 1), [a_2]^T [Wh_j]

        # Attention head a(Wh_i, Wh_j) = a^T [[Wh_i], [Wh_j]]
        dense = attn_for_self + K.transpose(
            attn_for_neighs
        )  # (N x N) via broadcasting

        # Add nonlinearity
        dense = LeakyReLU(alpha=0.2)(dense)

        # Mask values before activation (Vaswani et al., 2017)
        # YT: this only works for 'binary' A, not for 'weighted' A!
        # YT: if A does not have self-loops, the node itself will be masked,
        #     so A should have self-loops.
        # NOTE(review): nothing in this method sets the diagonal of A, so the
        # self-loops must already be present in the A passed in -- confirm
        # against the caller.
        if not self.saliency_map_support:
            # A large negative value on non-edges drives their softmax weight to ~0.
            mask = -10e9 * (1.0 - A)
            dense += mask
            dense = K.softmax(dense)  # (N x N), Eq. 3 of the paper

        else:
            # GAT with support for saliency calculations: a hand-written
            # softmax. Both exponentials subtract the row-wise max (axis=1)
            # for numerical stability; self.non_exist_edge blends between the
            # two weightings of A, and the result is normalised per row.
            W = (self.delta * A) * K.exp(
                dense - K.max(dense, axis=1, keepdims=True)
            ) * (1 - self.non_exist_edge) + self.non_exist_edge * (
                A + self.delta * (tf.ones((N, N)) - A) + tf.eye(N)
            ) * K.exp(
                dense - K.max(dense, axis=1, keepdims=True)
            )
            dense = W / K.sum(W, axis=1, keepdims=True)

        # Apply dropout to features and attention coefficients
        dropout_feat = Dropout(self.in_dropout_rate)(features)  # (N x F')
        dropout_attn = Dropout(self.attn_dropout_rate)(dense)  # (N x N)

        # Linear combination with neighbors' features [YT: see Eq. 4]
        node_features = K.dot(dropout_attn, dropout_feat)  # (N x F')

        if self.use_bias:
            node_features = K.bias_add(node_features, self.biases[head])

        # Add output of attention head to final output
        outputs.append(node_features)

    # Aggregate the heads' output according to the reduction method
    if self.attn_heads_reduction == "concat":
        output = K.concatenate(outputs)  # (N x KF')
    else:
        # Any value other than "concat" averages the heads.
        output = K.mean(K.stack(outputs), axis=0)  # (N x F')

    # Nonlinear activation function
    output = self.activation(output)

    # On the final layer we gather the nodes referenced by the indices
    if self.final_layer:
        output = K.gather(output, out_indices)

    # Add batch dimension back if we removed it.
    # (Always true here: any other batch_dim raised ValueError above.)
    if batch_dim == 1:
        output = K.expand_dims(output, 0)

    return output
def wasserstein(self, y_true, y_pred):
    """Return the mean of ``y_true * y_pred`` taken over the last axis."""
    product = y_true * y_pred
    return K.mean(product, axis=-1)
def mae_loss(y_true, y_pred):
    """Mean absolute error scaled by the global ``percentage_MAE``.

    ``mean_absolute_error`` is applied first, its result is averaged over
    all elements, and the mean is multiplied by ``percentage_MAE``.
    """
    # The global declaration is kept so the name always resolves at module level.
    global percentage_MAE
    averaged = K.mean(mean_absolute_error(y_true, y_pred))
    return percentage_MAE * averaged
def squared_area_between(y_true, y_pred):
    """Mean squared difference between the cumulative sums of the inputs.

    Cumulative sums are taken along the last axis; the mean is over all
    elements.
    """
    cumulative_true = K.cumsum(y_true, axis=-1)
    cumulative_pred = K.cumsum(y_pred, axis=-1)
    return K.mean(K.square(cumulative_true - cumulative_pred))
def area_between(y_true, y_pred):
    """Mean absolute difference between the cumulative sums of the inputs.

    Cumulative sums are taken along the last axis; the mean is over all
    elements.
    """
    cumulative_true = K.cumsum(y_true, axis=-1)
    cumulative_pred = K.cumsum(y_pred, axis=-1)
    return K.mean(K.abs(cumulative_true - cumulative_pred))
def rmse(a, b):
    """Root of the mean squared difference between ``a`` and ``b``.

    The mean is taken over all elements, so the result is a scalar tensor.
    """
    mean_squared = K.mean(K.square(a - b))
    return K.sqrt(mean_squared)
def weightedMSE(self, y_true, y_pred):
    """Squared error weighted element-wise by ``max(y_pred, y_true)``.

    ``y_true`` is first cast to the dtype of ``y_pred``; the weighted
    squared errors are averaged over the last axis.
    """
    target = K.cast(y_true, y_pred.dtype)
    squared_error = K.square(target - y_pred)
    elementwise_weight = K.maximum(y_pred, target)
    return K.mean(squared_error * elementwise_weight, axis=-1)
def class_loss_cls(y_true, y_pred):
    """Categorical cross-entropy on the first batch entry, scaled.

    Only index 0 along the first axis of both tensors is used. The mean
    cross-entropy is multiplied by ``lambda_cls_class``, which is read from
    the surrounding scope.
    """
    first_true = y_true[0, :, :]
    first_pred = y_pred[0, :, :]
    cross_entropy = categorical_crossentropy(first_true, first_pred)
    return lambda_cls_class * K.mean(cross_entropy)
def __call__(self, loss, seed_input, penultimate_layer=-1, seek_penultimate_conv_layer=True, activation_modifier=lambda cam: K.relu(cam), normalize_gradient=True, expand_cam=True): """Generate a gradient based class activation map (CAM) by using positive gradient of penultimate_layer with respect to loss. For details on Grad-CAM, see the paper: [Grad-CAM: Why did you say that? Visual Explanations from Deep Networks via Gradient-based Localization](https://arxiv.org/pdf/1610.02391v1.pdf). # Arguments loss: A loss function. If the model has multiple outputs, you can use a different loss on each output by passing a list of losses. seed_input: An N-dim Numpy array. If the model has multiple inputs, you have to pass a list of N-dim Numpy arrays. penultimate_layer: A number of integer or a tf.keras.layers.Layer object. seek_penultimate_conv_layer: True to seek the penultimate layter that is a subtype of `keras.layers.convolutional.Conv` class. If False, the penultimate layer is that was elected by penultimate_layer index. normalize_gradient: True to normalize gradients. activation_modifier: A function to modify gradients. expand_cam: True to expand cam to same as input image size. ![Note] Even if the model has multiple inputs, this function return only one cam value (That's, when `expand_cam` is True, multiple cam images are generated from a model that has multiple inputs). # Returns The heatmap image or a list of their images that indicate the `seed_input` regions whose change would most contribute the loss value, # Raises ValueError: In case of invalid arguments for `loss`, or `penultimate_layer`. 
""" # Preparing losses = self._get_losses_for_multiple_outputs(loss) seed_inputs = self._get_seed_inputs_for_multiple_inputs(seed_input) penultimate_output_tensor = self._find_penultimate_output( penultimate_layer, seek_penultimate_conv_layer) # Processing gradcam model = tf.keras.Model(inputs=self.model.inputs, outputs=self.model.outputs + [penultimate_output_tensor]) with tf.GradientTape() as tape: tape.watch(seed_inputs) outputs = model(seed_inputs) outputs, penultimate_output = outputs[:-1], outputs[-1] loss_values = [loss(y) for y, loss in zip(outputs, losses)] grads = tape.gradient(loss_values, penultimate_output) if normalize_gradient: grads = K.l2_normalize(grads) weights = K.mean(grads, axis=tuple(range(grads.ndim)[1:-1]), keepdims=True) cam = np.sum(penultimate_output * weights, axis=-1) if activation_modifier is not None: cam = activation_modifier(cam) if not expand_cam: return cam # Visualizing cam = self._zoom_for_visualizing(seed_inputs, cam) if len(self.model.inputs) == 1 and not isinstance(seed_input, list): cam = cam[0] return cam
def root_mean_square(x, axis=None, keepdims=False):
    """Root mean square of ``x``; a variant of the vector norm.

    ``axis`` and ``keepdims`` are forwarded unchanged to ``K.mean``.
    """
    mean_of_squares = K.mean(K.square(x), axis=axis, keepdims=keepdims)
    return K.sqrt(mean_of_squares)
def call(self, inputs, **kwargs):
    """
    Build the sparse graph-attention layer as a Keras graph.

    Notes:
        This method does not add self loops to the adjacency matrix.

        The output indices are only used when `final_layer=True`.

        Dropout is applied to the raw attention values before the sparse
        softmax.

    Args:
        inputs (list): list of inputs with 3 items:
            node features (size b x N x F),
            output indices (size b x M),
            sparse graph adjacency matrix (size N x N),
            where N is the number of nodes in the graph,
            F is the dimensionality of node features,
            M is the number of output nodes.
            Only b = 1 is accepted.

    Returns:
        The activated, head-aggregated node features with a leading batch
        dimension of 1 added back; restricted to ``out_indices`` when
        ``self.final_layer`` is true.

    Raises:
        TypeError: if the adjacency input is not a ``tf.SparseTensor``.
        ValueError: if the static batch dimension of the node features is
            not 1.
    """
    X = inputs[0]  # Node features (1 x N x F)
    out_indices = inputs[1]  # Output indices (1 x M)
    # Sparse adjacency matrix. NOTE(review): the original comment said
    # (1 x N x N), but the docstring and the [n_nodes, n_nodes] dense_shape
    # used below imply a 2-D (N x N) tensor -- confirm against the caller.
    A_sparse = inputs[2]

    if not isinstance(A_sparse, tf.SparseTensor):
        raise TypeError("A is not sparse")

    # Indices of the stored entries of A; columns 0 and 1 are used below as
    # the two node indices of each edge (E x 2 for a 2-D sparse tensor).
    A_indices = A_sparse.indices

    batch_dim, n_nodes, _ = K.int_shape(X)
    if batch_dim != 1:
        raise ValueError(
            "Currently full-batch methods only support a batch dimension of one"
        )
    else:
        # Remove singleton batch dimension
        out_indices = K.squeeze(out_indices, 0)
        X = K.squeeze(X, 0)

    outputs = []
    for head in range(self.attn_heads):
        kernel = self.kernels[head]  # W in the paper (F x F')
        attention_kernel = self.attn_kernels[
            head
        ]  # Attention kernel a in the paper (2F' x 1)

        # Compute inputs to attention network
        features = K.dot(X, kernel)  # (N x F')

        # Compute feature combinations
        # Note: [[a_1], [a_2]]^T [[Wh_i], [Wh_j]] = [a_1]^T [Wh_i] + [a_2]^T [Wh_j]
        attn_for_self = K.dot(
            features, attention_kernel[0]
        )  # (N x 1), [a_1]^T [Wh_i]
        attn_for_neighs = K.dot(
            features, attention_kernel[1]
        )  # (N x 1), [a_2]^T [Wh_j]

        # Create sparse attention vector: one value per stored entry of A,
        # looked up by the entry's first and second index respectively.
        sparse_attn_self = tf.gather(
            K.reshape(attn_for_self, [-1]), A_indices[:, 0], axis=0
        )
        sparse_attn_neighs = tf.gather(
            K.reshape(attn_for_neighs, [-1]), A_indices[:, 1], axis=0
        )
        attn_values = sparse_attn_self + sparse_attn_neighs

        # Add nonlinearity
        attn_values = LeakyReLU(alpha=0.2)(attn_values)

        # Apply dropout to features and attention coefficients
        dropout_feat = Dropout(self.in_dropout_rate)(features)  # (N x F')
        dropout_attn = Dropout(self.attn_dropout_rate)(
            attn_values
        )  # one value per stored entry of A (not a dense N x N matrix)

        # Convert to sparse matrix
        sparse_attn = tf.sparse.SparseTensor(
            A_indices, values=dropout_attn, dense_shape=[n_nodes, n_nodes]
        )

        # Apply softmax to get attention coefficients
        sparse_attn = tf.sparse.softmax(
            sparse_attn
        )  # (N x N), Eq. 3 of the paper

        # Linear combination with neighbors' features [YT: see Eq. 4]
        node_features = tf.sparse.sparse_dense_matmul(
            sparse_attn, dropout_feat
        )  # (N x F')

        if self.use_bias:
            node_features = K.bias_add(node_features, self.biases[head])

        # Add output of attention head to final output
        outputs.append(node_features)

    # Aggregate the heads' output according to the reduction method
    if self.attn_heads_reduction == "concat":
        output = K.concatenate(outputs)  # (N x KF')
    else:
        # Any value other than "concat" averages the heads.
        output = K.mean(K.stack(outputs), axis=0)  # (N x F')

    output = self.activation(output)

    # On the final layer we gather the nodes referenced by the indices
    if self.final_layer:
        output = K.gather(output, out_indices)

    # Add batch dimension back if we removed it.
    # (Always true here: any other batch_dim raised ValueError above.)
    if batch_dim == 1:
        output = K.expand_dims(output, 0)
    return output
def r2_score(y_true, y_pred):
    """Coefficient of determination computed over all elements.

    ``K.epsilon()`` is added to the total sum of squares so the division
    stays finite when ``y_true`` is constant.
    """
    residual_ss = K.sum(K.square(y_true - y_pred))
    centered = y_true - K.mean(y_true)
    total_ss = K.sum(K.square(centered))
    return 1 - residual_ss / (total_ss + K.epsilon())
def acc(y_true, y_pred):
    """Fraction of rows whose arg-max prediction matches every label.

    ``y_true`` is reshaped to ``(-1, max_seq_len)`` and cast to int64, then
    compared with the arg-max of ``y_pred`` over its last axis. A row counts
    only if all of its positions match.
    """
    labels = tf.cast(K.reshape(y_true, (-1, max_seq_len)), tf.int64)
    predicted = K.argmax(y_pred, axis=-1)
    row_matches = K.all(K.equal(labels, predicted), axis=-1)
    return K.mean(row_matches)
def pTLossTF(y_true, y_pred):
    """Squared relative error weighted piecewise by ``y_true``.

    The weight is the sum of three terms:

    * ``y_true`` where ``y_true < 80``
    * ``2.4 * y_true`` where ``80 <= y_true < 250``
    * ``10`` where ``y_true >= 160``

    The mean of ``weight * ((y_pred - y_true) / y_true) ** 2`` is divided
    by 250.
    """
    dtype = K.dtype(y_true)
    low_term = K.cast(y_true < 80, dtype) * y_true
    mid_term = K.cast(y_true >= 80, dtype) * K.cast(y_true < 250, dtype) * y_true * 2.4
    high_term = K.cast(y_true >= 160, dtype) * 10
    y_t = low_term + mid_term + high_term
    relative_error = (y_pred - y_true) / y_true
    return K.mean(y_t * K.pow(relative_error, 2)) / 250
def call(self, x):
    """Normalise ``x`` over its last axis, then scale and shift.

    Computes ``a_2 * (x - mean) / (std + eps) + b_2`` with the mean and
    standard deviation taken over the last axis (dimensions kept).
    """
    mu = K.mean(x=x, axis=-1, keepdims=True)
    sigma = K.std(x=x, axis=-1, keepdims=True)
    # Multiply before dividing, matching the original evaluation order.
    scaled = self.a_2 * (x - mu)
    return scaled / (sigma + self.eps) + self.b_2
def metr(y_true, y_pred):
    """Custom Keras metric: mean squared error after exponentiating both inputs."""
    difference = K.exp(y_pred) - K.exp(y_true)
    return K.mean(K.square(difference))
def policy_loss_with_metrics(self, Adv, A=None):
    """
    Construct the policy loss as a scalar-valued Tensor, together with a
    dictionary of metrics (also scalars).

    This method may be overridden to construct a custom policy loss and/or
    to change the accompanying metrics.

    Parameters
    ----------
    Adv : 1d Tensor, shape: [batch_size]

        A batch of advantages. A 2d Tensor of shape ``[batch_size, 1]`` is
        also accepted and is squeezed to 1d.

    A : nd Tensor, shape: [batch_size, ...]

        A batch of actions taken under the behavior policy. For some
        choices of policy loss, e.g. ``update_strategy='sac'`` this input
        is ignored.

    Returns
    -------
    loss, metrics : (Tensor, dict of Tensors)

        The policy loss along with some metrics, which is a dict of type
        ``{name <str>: metric <Tensor>}``. The loss and each of the metrics
        (dict values) are scalar Tensors, i.e. Tensors with ``ndim=0``.

        The ``loss`` is passed to a keras Model using
        ``train_model.add_loss(loss)``. Similarly, each metric in the
        metric dict is passed to the model using
        ``train_model.add_metric(metric, name=name, aggregation='mean')``.

    Raises
    ------
    AssertionError
        If ``A`` is None while ``update_strategy`` is ``'vanilla'`` or
        ``'ppo'``.
    NotImplementedError
        If ``update_strategy`` is ``'cross_entropy'``.
    ValueError
        If ``update_strategy`` is not one of the known strategies.

    """
    if K.ndim(Adv) == 2:
        check_tensor(Adv, axis_size=1, axis=1)
        Adv = K.squeeze(Adv, axis=1)
    check_tensor(Adv, ndim=1)

    if self.update_strategy == 'vanilla':
        assert A is not None

        log_pi = self.dist.log_proba(A)
        check_tensor(log_pi, same_as=Adv)

        entropy = K.mean(self.dist.entropy())

        # Flip sign to get loss from objective. The entropy term is a bonus
        # in the objective, so it must be negated together with the
        # policy-gradient term (as the 'ppo' branch does). Adding it to the
        # loss would penalise entropy instead of encouraging it.
        loss = -(K.mean(Adv * log_pi) + self.entropy_beta * entropy)

        # no metrics related to behavior_dist since its not used in loss
        metrics = {'policy/entropy': entropy}

    elif self.update_strategy == 'ppo':
        assert A is not None

        log_pi = self.dist.log_proba(A)
        # The target distribution acts as the fixed "old" policy.
        log_pi_old = K.stop_gradient(self.target_dist.log_proba(A))
        check_tensor(log_pi, same_as=Adv)
        check_tensor(log_pi_old, same_as=Adv)

        eps = self.ppo_clip_eps
        ratio = K.exp(log_pi - log_pi_old)
        ratio_clip = K.clip(ratio, 1 - eps, 1 + eps)
        check_tensor(ratio, same_as=Adv)
        check_tensor(ratio_clip, same_as=Adv)

        clip_objective = K.mean(K.minimum(Adv * ratio, Adv * ratio_clip))
        entropy = K.mean(self.dist.entropy())
        kl_div = K.mean(self.target_dist.kl_divergence(self.dist))

        # flip sign to get loss from objective
        loss = -(clip_objective + self.entropy_beta * entropy)
        metrics = {'policy/entropy': entropy, 'policy/kl_div': kl_div}

    elif self.update_strategy == 'sac':
        self.logger.debug("using update_strategy 'sac'")
        loss = -K.mean(Adv)
        metrics = {'policy/entropy': K.mean(self.dist.entropy())}

    elif self.update_strategy == 'cross_entropy':
        raise NotImplementedError('cross_entropy')

    else:
        raise ValueError(
            "unknown update_strategy '{}'".format(self.update_strategy))

    # rename
    check_tensor(loss, ndim=0)
    loss = tf.identity(loss, name='policy/loss')

    return loss, metrics
def loss_gt_(y_true, y_pred):
    """Negated Dice-style overlap score.

    Sums are taken over the last three axes; ``e`` (read from the
    surrounding scope) is added to the denominator. The ratio is averaged
    over axes 0 and 1 and negated.
    """
    reduce_axes = [-3, -2, -1]
    overlap = K.sum(K.abs(y_true * y_pred), axis=reduce_axes)
    denominator = K.sum(K.square(y_true) + K.square(y_pred), axis=reduce_axes) + e
    score = K.mean(2 * overlap / denominator, axis=[0, 1])
    return -score
def l1_loss(y_true: NDArray, y_pred: NDArray):
    """Sum-of-absolute-errors loss.

    Absolute errors are summed over axes 1, 2 and 3 and the result is
    averaged over axis 0.
    """
    absolute = K.abs(y_true - y_pred)
    summed = K.sum(absolute, axis=(1, 2, 3))
    return K.mean(summed, axis=0)
def dice_coefficient(y_true, y_pred):
    """Dice-style overlap score.

    Sums are taken over the last three axes; ``1e-8`` is added to the
    denominator to avoid division by zero. The ratio is averaged over
    axes 0 and 1.
    """
    reduce_axes = [-3, -2, -1]
    overlap = K.sum(K.abs(y_true * y_pred), axis=reduce_axes)
    denominator = K.sum(K.square(y_true) + K.square(y_pred), axis=reduce_axes) + 1e-8
    return K.mean(2 * overlap / denominator, axis=[0, 1])
def wasserstein(self, y_true, y_pred):
    """Return the negated mean of ``y_true * y_pred`` over all elements."""
    product = y_true * y_pred
    return -K.mean(product)
def vae_reconstruction_loss(y_true, y_predict):
    """Mean squared error over axes 1, 2 and 3, scaled by a factor of 1000."""
    reconstruction_loss_factor = 1000
    squared = K.square(y_true - y_predict)
    mean_squared = K.mean(squared, axis=[1, 2, 3])
    return reconstruction_loss_factor * mean_squared
def rmse(y_true, y_pred):
    """Root mean squared error taken over the last axis.

    The Keras backend is imported inside the function, as in the original.
    """
    import tensorflow.keras.backend as K

    squared = K.square(y_pred - y_true)
    return K.sqrt(K.mean(squared, axis=-1))
def critic_PPO2_loss(self, y_true, y_pred):
    """Value loss: mean squared difference between ``y_true`` and ``y_pred``.

    This is the standard PPO critic loss, averaged over all elements.
    """
    difference = y_true - y_pred
    return K.mean(difference ** 2)