def train_loop(self, iter, beta, anneal=True):
    beta = tf.convert_to_tensor(beta, tf.float32)
    beta_conv = tf.cast(beta, tf.float32)
    history = {
        'step': [],
        'Free energy mean': [],
        'Free energy std': [],
        'Energy mean': [],
        'Energy std': [],
        'Train time': []
    }
    interval = 20
    t1 = time()
    for step in tqdm(range(iter)):
        if anneal:
            beta = beta_conv * (1 - self.beta_anneal**step)
        loss, energy = self.backprop(beta)  # type: ignore
        if (step % interval) == interval - 1:
            t2 = time()
            history['step'].append(step + 1)
            history['Free energy mean'].append(tfm.reduce_mean(loss))
            history['Free energy std'].append(tfm.reduce_std(loss))
            history['Energy mean'].append(tfm.reduce_mean(energy))
            history['Energy std'].append(tfm.reduce_std(energy))
            history['Train time'].append((t2 - t1) / interval)
            t1 = time()
    return history
def var_train_loop(self, iter, anneal=True, mean=0.5, delta=0.1):
    history = {
        'step': [],
        'Free energy mean': [],
        'Free energy std': [],
        'Energy mean': [],
        'Energy std': [],
        'Train time': []
    }
    interval = 20
    t1 = time()
    for step in tqdm(range(iter)):
        if anneal:
            mean_beta = mean * (1 - self.beta_anneal**step)
        else:
            mean_beta = mean
        beta = tf.random.normal([], mean_beta, delta)
        sample = self.model.graph_sampler(self.batch_size, beta, self.seed)
        loss, energy = self.var_backprop(sample, beta)  # type: ignore
        if (step % interval) == interval - 1:
            t2 = time()
            history['step'].append(step + 1)
            history['Free energy mean'].append(tfm.reduce_mean(loss))
            history['Free energy std'].append(tfm.reduce_std(loss))
            history['Energy mean'].append(tfm.reduce_mean(energy))
            history['Energy std'].append(tfm.reduce_std(energy))
            history['Train time'].append((t2 - t1) / interval)
            t1 = time()
    return history
def backprop(self, beta):
    """Performs backpropagation on the calculated loss function.

    Args:
        beta (float): Inverse temperature

    Returns:
        loss (tf.Tensor): The current loss (free energy estimate) for the sampled batch
        energy (tf.Tensor): The energy of each sample in the batch
    """
    sample = self.model.graph_sampler(self.batch_size, self.seed)
    energy = ising.energy(sample)
    beta = tf.cast(beta, tf.float32)
    with tf.GradientTape(True, False) as tape:
        tape.watch(self.model.trainable_weights)
        log_prob = self.model.log_prob(sample)
        with tape.stop_recording():
            loss = (log_prob + beta * energy) / (self.model.L**2)  # type: ignore
            # regularizer = tfm.reduce_euclidean_norm(self.model(sample) +
            #                                         self.model(-sample) - 1)
            # regularizer = tfm.divide(regularizer, self.model.L**2)
        # REINFORCE estimator: the baseline-subtracted loss acts as a constant weight
        loss_reinforce = tfm.reduce_mean((loss - tfm.reduce_mean(loss)) * log_prob)
        # loss_reinforce = tfm.add(loss_reinforce, regularizer)
    grads = tape.gradient(loss_reinforce, self.model.trainable_weights)
    self.optimizer.apply_gradients(zip(grads, self.model.trainable_weights))
    return loss / beta, energy
def dual_cvae_cost(x, x2, y, encoder, decoder, encoder_c, wu_c=0.0, constrain=True):
    '''
    Cost function for a conditional VAE with two conditions.

    INPUTS:
        x - inputs/conditions
        x2 - second inputs/conditions
        y - outputs to be reconstructed
        encoder - the neural network used for mapping x, x2 and y to mu_z and log_sig_sq_z
        decoder - the neural network used for mapping z, x and x2 to mu_y and log_sig_sq_y
        encoder_c - the neural network used for mapping x and x2 to mu_cz and log_sig_sq_cz
                    (conditional prior distribution in the latent space)
        wu_c - constant for bottleneck warmup. wu_c=0 means no bottleneck, wu_c=1 means
               completely imposing the bottleneck through the latent space

    OUTPUTS:
        cost - the cost function (negative ELBO)
    '''
    x = tf.cast(x, tf.float32)
    x2 = tf.cast(x2, tf.float32)
    y = tf.cast(y, tf.float32)

    # compute moments of p(z|x,x2)
    mu_cz, log_sig_sq_cz = encoder_c.compute_moments(x, x2)

    # compute moments of q(z|x,x2,y)
    mu_z, log_sig_sq_z = encoder.compute_moments(y, x, x2)

    # sample from q(z|x,x2,y)
    z = reparameterisation_trick(mu_z, log_sig_sq_z)

    # bottleneck warmup
    x_wu = (1.0 - wu_c) * x + wu_c * tf.random.uniform(tf.shape(x))
    x2_wu = (1.0 - wu_c) * x2 + wu_c * tf.random.uniform(tf.shape(x2))

    # compute moments of p(y|z,x,x2)
    mu_y, log_sig_sq_y = decoder.compute_moments(z, x_wu, x2_wu, constrain=constrain)

    # KL(q(z|x,y) || p(z|x))
    KLe = kl_normal(mu_z, log_sig_sq_z, mu_cz, log_sig_sq_cz)
    KLc = tfm.reduce_sum(KLe, 1)
    KL = tfm.reduce_mean(tf.cast(KLc, tf.float32))

    # -E_q(z|y,x) log(p(y|z,x))
    reconstr_loss = -tfm.reduce_sum(gaussian_log_likelihood(y, mu_y, log_sig_sq_y), 1)
    cost_R = tfm.reduce_mean(reconstr_loss)

    # -ELBO
    cost = cost_R + KL
    return cost
def abs_negcos_angle(y_reco, y_true, re=False):
    # Energy loss
    loss_energy = reduce_mean(abs(subtract(y_reco[:, 0], y_true[:, 0])))  # this works well but could maybe be improved

    # Angle loss
    loss_angle = reduce_mean(1 - cos_angle(y_reco, y_true))

    if not re:
        return loss_energy + loss_angle
    else:
        return float(loss_energy + loss_angle), [float(loss_energy), float(loss_angle)]
def compute_covariance(x):
    """
    Compute the covariance cov(x) = E[x*x^T] - E[x]E[x]^T.
    Based on the Locatello et al. implementation
    (https://github.com/google-research/disentanglement_lib).

    :param x: a matrix of size N*M
    :return: the covariance of x, a matrix of size M*M
    """
    e_x = tfm.reduce_mean(x, axis=0)
    e_x_e_xt = tf.expand_dims(e_x, 1) * tf.expand_dims(e_x, 0)
    e_xxt = tfm.reduce_mean(tf.expand_dims(x, 2) * tf.expand_dims(x, 1), axis=0)
    return tfm.subtract(e_xxt, e_x_e_xt)
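# Illustrative usage sketch for compute_covariance (not part of the original code).
# It assumes the aliases `import tensorflow as tf` and `from tensorflow import math as tfm`
# used throughout these snippets, and checks the result against the population
# covariance from numpy on random data.
import numpy as np
import tensorflow as tf
from tensorflow import math as tfm

x = tf.random.normal([1000, 4])                    # N=1000 samples, M=4 features
cov = compute_covariance(x)                        # shape (4, 4)
ref = np.cov(x.numpy(), rowvar=False, bias=True)   # population covariance (divide by N)
print(np.allclose(cov.numpy(), ref, atol=1e-3))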
def call(self, x):
    input_shape = x.shape.as_list()
    axis = tuple(range(0 if self.batch else 1, len(input_shape) if input_shape else 4))
    if self.mean:
        x = x - tfm.reduce_mean(x, axis=axis, keepdims=True)
        if self.std:
            std = tfm.sqrt(tfm.reduce_mean(tfm.square(x), axis=axis, keepdims=True))
    elif self.std:
        std = tfm.reduce_std(x, axis=axis, keepdims=True)
    if self.std:
        return x / ((self.eps + std) if self.eps else std)
    return x
def sqr_vonMises23D_angle(y_reco, y_true, re=False):
    # energy (squared error)
    loss_energy = reduce_mean(tf.math.squared_difference(y_reco[:, 0], y_true[:, 0]))

    polar_k = abs(y_reco[:, 3]) + eps
    zenth_k = abs(y_reco[:, 4]) + eps

    cos_azi = cos(subtract(y_true[:, 2], y_reco[:, 2]))
    cos_zenth = cos(subtract(y_true[:, 1], y_reco[:, 1]))

    # log I0(k) approximations for the von Mises normalisation constants
    lnI0_azi = (polar_k + tf.math.log(1 + tf.math.exp(-2 * polar_k))
                - 0.25 * tf.math.log(1 + 0.25 * tf.square(polar_k))
                + tf.math.log(1 + 0.24273 * tf.square(polar_k))
                - tf.math.log(1 + 0.43023 * tf.square(polar_k)))
    lnI0_zenth = (zenth_k + tf.math.log(1 + tf.math.exp(-2 * zenth_k))
                  - 0.25 * tf.math.log(1 + 0.25 * tf.square(zenth_k))
                  + tf.math.log(1 + 0.24273 * tf.square(zenth_k))
                  - tf.math.log(1 + 0.43023 * tf.square(zenth_k)))

    llh_azi = polar_k * cos_azi - lnI0_azi
    llh_zenith = zenth_k * cos_zenth - lnI0_zenth

    loss_azi = reduce_mean(-llh_azi)
    loss_zenith = reduce_mean(-llh_zenith)

    kappa = tf.math.abs(y_reco[:, 5]) + eps
    cos_alpha = cos_angle(y_reco, y_true)
    # tf.debugging.assert_less_equal(tf.math.abs(cos_alpha), 1, message='cos_alpha problem', summarize=None, name=None)
    tf.debugging.assert_all_finite(tf.math.abs(cos_alpha),
                                   message='cos_alpha problem infinite/nan',
                                   name=None)

    nlogC = -tf.math.log(kappa) + kappa + tf.math.log(1 - tf.math.exp(-2 * kappa))
    tf.debugging.assert_all_finite(nlogC, 'log kappa problem', name=None)

    loss_angle = tf.reduce_mean(-kappa * cos_alpha + nlogC)

    if not re:
        return loss_azi + loss_zenith + loss_energy + loss_angle
    if re:
        return float(loss_azi + loss_zenith + loss_energy + loss_angle), [
            float(loss_energy),
            float(loss_zenith),
            float(loss_azi),
            float(loss_angle)
        ]
def abs_linear_unit(y_reco, y_true, re=False):
    from tensorflow.math import sin, cos, acos, abs, reduce_mean, subtract

    # energy loss
    loss_energy = reduce_mean(abs(subtract(y_reco[:, 0], y_true[:, 0])))

    # angle loss
    cos_alpha = cos_unit(y_reco, y_true)
    loss_angle = reduce_mean(tf.math.acos(cos_alpha))

    if not re:
        return loss_energy + loss_angle
    else:
        return float(loss_energy + loss_angle), [float(loss_energy), float(loss_angle)]
def loss_funcxpos2(y_reco, y_true, re=False):
    from tensorflow.math import sin, cos, acos, abs, reduce_mean, subtract, square

    # Energy loss
    loss_energy = reduce_mean(abs(subtract(y_reco[:, 0], y_true[:, 0])))  # this works well but could maybe be improved

    # Angle loss: compare (cos, sin) components of zenith and azimuth
    zeni = [cos(y_true[:, 1]) - y_reco[:, 1], sin(y_true[:, 1]) - y_reco[:, 2]]
    azi = [cos(y_true[:, 2]) - y_reco[:, 3], sin(y_true[:, 2]) - y_reco[:, 4]]
    loss_angle = (reduce_mean(square(azi[0])) + reduce_mean(square(azi[1]))
                  + reduce_mean(square(zeni[0])) + reduce_mean(square(zeni[1])))

    if not re:
        return loss_energy + loss_angle
    else:
        return float(loss_energy + loss_angle), [float(loss_energy), float(loss_angle)]
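# Illustrative usage sketch for loss_funcxpos2 (not part of the original code).
# The column layout below is inferred from the indexing in the function: y_true
# columns appear to be (energy, zenith, azimuth), while y_reco columns appear to
# be (energy, cos(zenith), sin(zenith), cos(azimuth), sin(azimuth)). Random
# tensors are used just to show the call and the expected shapes.
import tensorflow as tf

y_true = tf.random.uniform([32, 3])   # (energy, zenith, azimuth), inferred
y_reco = tf.random.uniform([32, 5])   # (energy, cos z, sin z, cos a, sin a), inferred
print(float(loss_funcxpos2(y_reco, y_true)))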
def call(self, ensemble_logits, logits):
    '''
    ensemble_logits are the outputs from our ensemble (batch x ensembles x classes)
    logits are the predicted outputs from our model (batch x classes)
    '''
    if self.temp is None:
        self.temp = self.init_temp

    # Convert values to appropriate type
    logits = tf.cast(logits, dtype=tf.float64)
    ensemble_logits = tf.cast(ensemble_logits, dtype=tf.float64)

    # Calculate probabilities by softmax over classes, adjusted for temperature
    ensemble_probs = softmax(ensemble_logits / self.temp, axis=2)
    PN_probs = softmax(logits / self.temp, axis=1)

    # Aggregate teacher predictions over the ensemble
    ensemble_probs_mean = reduce_sum(ensemble_probs, axis=1)

    # Calculate cost (cross-entropy), scaled by temperature squared
    cost = reduce_mean(-ensemble_probs_mean * log(PN_probs)) * (self.temp**2)
    return cost
def cross_entropy_loss(logits: tf.Tensor, labels: tf.Tensor) -> tf.Tensor:
    one_hot_labels = tf.one_hot(labels, logits.shape[3])
    batch_loss = tf.nn.softmax_cross_entropy_with_logits(one_hot_labels, logits)
    loss_value = math.reduce_mean(tf.reduce_sum(batch_loss, 1))
    return loss_value
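# Illustrative usage sketch for the per-pixel cross-entropy above (not part of the
# original code). The shapes are an assumption inferred from `logits.shape[3]`:
# logits look like dense (batch, H, W, classes) predictions and labels like integer
# class maps. Assumes `math` is the alias `from tensorflow import math`.
import tensorflow as tf
from tensorflow import math

logits = tf.random.normal([2, 8, 8, 5])                           # 5 classes per pixel
labels = tf.random.uniform([2, 8, 8], maxval=5, dtype=tf.int32)   # class ids
print(float(cross_entropy_loss(logits, labels)))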
def g_loss(d_scores_fake):
    """
    `d_scores_fake` is the output of the discriminator model applied to a batch of fake data

    NOTE: we always define objectives as if we were minimizing them
    (remember that maximize = negate and minimize)
    """
    if TASK == 1:
        return tm.reduce_mean(tm.log(1 - d_scores_fake))
    elif TASK == 2:
        return -tm.reduce_mean(tm.log(d_scores_fake))
    elif TASK == 3 or TASK == 4:
        # tries to maximize the score so that it becomes positive
        # (similar to the discriminator score)
        return -tm.reduce_mean(d_scores_fake)
    elif TASK == 5:
        # the INN has no generator loss
        return None
def cross_entropy_loss(logits: tf.Tensor, labels: tf.Tensor) -> tf.Tensor:
    # import pdb; pdb.set_trace()
    # why is loss 0 sometimes!?
    one_hot_labels = tf.one_hot(labels, logits.shape[1])
    batch_loss = tf.nn.softmax_cross_entropy_with_logits(one_hot_labels, logits)
    loss_value = math.reduce_mean(batch_loss)
    return loss_value
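# Illustrative usage sketch for the classification cross-entropy above (not part of
# the original code). Here logits are (batch, classes) and labels are integer class
# ids; `math` is assumed to be the alias `from tensorflow import math`.
import tensorflow as tf
from tensorflow import math

logits = tf.random.normal([8, 10])                                # batch of 8, 10 classes
labels = tf.random.uniform([8], maxval=10, dtype=tf.int32)        # integer class ids
loss = cross_entropy_loss(logits, labels)                         # scalar tf.Tensor
print(float(loss))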
def var_backprop(self, sample, beta):
    energy = ising.energy(sample, pbc=True)
    beta = tf.cast(beta, tf.float32)
    with tf.GradientTape(True, False) as tape:
        tape.watch(self.model.trainable_weights)
        log_prob = self.model.log_prob(sample, beta)
        with tape.stop_recording():
            beta = tf.squeeze(beta)
            loss = (log_prob + beta * energy) / (self.model.L**2)
            # regularizer = tfm.reduce_euclidean_norm(self.model(sample, beta) +
            #                                         self.model(-sample, beta) - 1)
            # regularizer = tfm.divide(regularizer, self.model.L**2)
        loss_reinforce = tfm.reduce_mean((loss - tfm.reduce_mean(loss)) * log_prob)
        # loss_reinforce = tfm.add(loss_reinforce, regularizer)
    grads = tape.gradient(loss_reinforce, self.model.trainable_weights)
    self.optimizer.apply_gradients(zip(grads, self.model.trainable_weights))
    return loss / beta, energy
def abs_vM2D_KDE_weak(y_reco, y_true, kdet, re=False):
    # energy (mae again)
    loss_energy = reduce_mean(abs(subtract(y_reco[:, 0], y_true[:, 0])))

    polar_k = abs(y_reco[:, 3]) + eps
    zenth_k = abs(y_reco[:, 4]) + eps

    cos_azi = cos(subtract(y_true[:, 2], y_reco[:, 2]))
    cos_zenth = cos(subtract(y_true[:, 1], y_reco[:, 1]))

    lnI0_azi = (polar_k + tf.math.log(1 + tf.math.exp(-2 * polar_k))
                - 0.25 * tf.math.log(1 + 0.25 * tf.square(polar_k))
                + tf.math.log(1 + 0.24273 * tf.square(polar_k))
                - tf.math.log(1 + 0.43023 * tf.square(polar_k)))
    lnI0_zenth = (zenth_k + tf.math.log(1 + tf.math.exp(-2 * zenth_k))
                  - 0.25 * tf.math.log(1 + 0.25 * tf.square(zenth_k))
                  + tf.math.log(1 + 0.24273 * tf.square(zenth_k))
                  - tf.math.log(1 + 0.43023 * tf.square(zenth_k)))

    llh_azi = polar_k * cos_azi - lnI0_azi
    llh_zenith = zenth_k * cos_zenth - lnI0_zenth

    loss_azi = reduce_mean(-llh_azi)
    loss_zenith = reduce_mean(-llh_zenith)

    kder = kde(tf.cast(y_reco[:, 1], tf.float32))
    kdeloss = tf.reduce_mean(tf.math.abs(kdet.log_prob(xkde) - kder.log_prob(xkde))) / 10

    if not re:
        return loss_azi + loss_zenith + loss_energy + tf.cast(kdeloss, tf.float32)
    if re:
        return float(loss_azi + loss_zenith + loss_energy + kdeloss), [
            float(loss_energy),
            float(loss_zenith),
            float(loss_azi),
            float(kdeloss)
        ]
def call(self, inputs, training=None, mask=None):
    hidden = inputs
    for layer in self.layers:
        hidden = layer(hidden, training=training)
    axes = [1, 2]
    hidden = math.reduce_mean(hidden, axes, keepdims=True)
    hidden = squeeze(hidden, axes)
    return hidden
def call(self, inputs):
    mean = reduce_mean(inputs, axis=0)
    std = reduce_std(inputs, axis=0) + 1e-6
    InputBatchNormalization.temp += 1
    InputBatchNormalization.mean += mean
    InputBatchNormalization.std += std
    inputs = divide(subtract(inputs, mean), std)
    return tf.squeeze(inputs, 0)
def __call__(self, y_true, y_pred):
    from tensorflow.math import square, reduce_mean
    from tensorflow import reshape, concat, expand_dims
    from tensorflow.keras import backend as K

    assert K.int_shape(y_true)[-1] == 3 * 2
    assert K.int_shape(y_pred)[-1] == 3 * 2

    y_true = reshape(y_true, [-1, 3])
    y_pred = reshape(y_pred, [-1, 3])

    if self._use_pxyz:
        y_true = self._convert_to_pxyz(y_true)
        y_pred = self._convert_to_pxyz(y_pred)
        return reduce_mean(square(y_true - y_pred))
    else:
        d_sq = square(y_true - y_pred)
        d_phi = square(_delta_phi_tf(y_true[:, 2], y_pred[:, 2]))
        d_phi = expand_dims(d_phi, axis=-1)
        diff = concat([d_sq[:, 0:2], d_phi], axis=-1)
        return reduce_mean(diff)
def adaptive_wing_loss(labels, output):
    alpha = 2.1
    omega = 14
    epsilon = 1
    theta = 0.5
    with tf.name_scope('adaptive_wing_loss'):
        x = output - labels
        theta_over_epsilon_tensor = tf.fill(tf.shape(labels), theta / epsilon)
        A = (omega * (1 / (1 + pow(theta_over_epsilon_tensor, alpha - labels)))
             * (alpha - labels) * pow(theta_over_epsilon_tensor, alpha - labels - 1)
             * (1 / epsilon))
        C = theta * A - omega * log(1 + pow(theta_over_epsilon_tensor, alpha - labels))
        absolute_x = abs(x)
        losses = tf.where(greater(theta, absolute_x),
                          omega * log(1 + pow(absolute_x / epsilon, alpha - labels)),
                          A * absolute_x - C)
        loss = reduce_mean(reduce_sum(losses, axis=[1, 2]), axis=0)
        return loss
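# Illustrative usage sketch for adaptive_wing_loss (not part of the original code).
# Assumption: the bare names used above come from
# `from tensorflow.math import log, abs, pow, greater, reduce_mean, reduce_sum`.
# The loss operates on heatmaps of shape (batch, H, W); random tensors in [0, 1]
# are used here just to show the call.
import tensorflow as tf
from tensorflow.math import log, abs, pow, greater, reduce_mean, reduce_sum

labels = tf.random.uniform([4, 64, 64])   # ground-truth heatmaps in [0, 1]
output = tf.random.uniform([4, 64, 64])   # predicted heatmaps
loss = adaptive_wing_loss(labels, output) # summed over H, W and averaged over the batch
print(float(loss))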
def sqr_vonMises2D_angle(y_reco, y_true, re=False):
    # energy (squared error)
    loss_energy = reduce_mean(tf.math.squared_difference(y_reco[:, 0], y_true[:, 0]))

    polar_k = abs(y_reco[:, 3]) + eps
    zenth_k = abs(y_reco[:, 4]) + eps

    cos_azi = cos(subtract(y_true[:, 2], y_reco[:, 2]))
    cos_zenth = cos(subtract(y_true[:, 1], y_reco[:, 1]))

    lnI0_azi = (polar_k + tf.math.log(1 + tf.math.exp(-2 * polar_k))
                - 0.25 * tf.math.log(1 + 0.25 * tf.square(polar_k))
                + tf.math.log(1 + 0.24273 * tf.square(polar_k))
                - tf.math.log(1 + 0.43023 * tf.square(polar_k)))
    lnI0_zenth = (zenth_k + tf.math.log(1 + tf.math.exp(-2 * zenth_k))
                  - 0.25 * tf.math.log(1 + 0.25 * tf.square(zenth_k))
                  + tf.math.log(1 + 0.24273 * tf.square(zenth_k))
                  - tf.math.log(1 + 0.43023 * tf.square(zenth_k)))

    llh_azi = polar_k * cos_azi - lnI0_azi
    llh_zenith = zenth_k * cos_zenth - lnI0_zenth

    loss_azi = reduce_mean(-llh_azi)
    loss_zenith = reduce_mean(-llh_zenith)

    if not re:
        return loss_azi + loss_zenith + loss_energy
    if re:
        return float(loss_azi + loss_zenith + loss_energy), [
            float(loss_energy),
            float(loss_zenith),
            float(loss_azi)
        ]
def call(self, ensemble_logits, logits):
    '''
    ensemble_logits are the outputs from our ensemble (batch x ensembles x classes)
    logits are the predicted outputs from our model (batch x classes)
    '''
    logits = tf.cast(logits, dtype=tf.float64)
    ensemble_logits = tf.cast(ensemble_logits, dtype=tf.float64)

    alphas = exp(logits / self.temp)
    precision = reduce_sum(alphas, axis=1)  # sum over classes

    ensemble_probs = softmax(ensemble_logits / self.temp, axis=2)  # softmax over classes

    # Smooth for num. stability:
    probs_mean = 1 / (tf.shape(ensemble_probs)[2])  # divide by nr of classes
    # Subtract mean, scale down, add mean back
    ensemble_probs = self.tp_scaling * (ensemble_probs - probs_mean) + probs_mean

    log_ensemble_probs_geo_mean = reduce_mean(log(ensemble_probs + self.smooth_val),
                                              axis=1)  # mean over ensembles

    # sum over lgamma of classes - lgamma(precision)
    target_independent_term = (reduce_sum(lgamma(alphas + self.smooth_val), axis=1)
                               - lgamma(precision + self.smooth_val))

    # -sum over classes
    target_dependent_term = -reduce_sum((alphas - 1.) * log_ensemble_probs_geo_mean,
                                        axis=1)

    cost = target_dependent_term + target_independent_term
    # tf.print(self.temp)
    return reduce_mean(cost) * (self.temp**2)  # mean over all batches
def compute_batch_tc(z, z_mean, z_log_var):
    """
    Estimates the total correlation over a batch.
    Based on the Locatello et al. implementation
    (https://github.com/google-research/disentanglement_lib).

    Compute E_j[log(q(z(x_j))) - log(prod_l q(z(x_j)_l))] where j indexes the
    batch and l indexes the latent factors.

    :param z: the sampled values
    :param z_mean: the mean of the Gaussian
    :param z_log_var: the log variance of the Gaussian
    :return: the total correlation estimated over the batch
    """
    log_qz = compute_gaussian_log_pdf(tf.expand_dims(z, 1),
                                      tf.expand_dims(z_mean, 0),
                                      tf.expand_dims(z_log_var, 0))
    prod_log_qz = tfm.reduce_sum(tfm.reduce_logsumexp(log_qz, axis=1, keepdims=False),
                                 axis=1,
                                 keepdims=False)
    log_sum_qz = tfm.reduce_logsumexp(tfm.reduce_sum(log_qz, axis=2, keepdims=False),
                                      axis=1,
                                      keepdims=False)
    return tfm.reduce_mean(log_sum_qz - prod_log_qz)
def train_bound(t):
    """Trains the model to equalize values and spatial derivatives at the
    boundaries x=5 and x=-5 to enforce the periodic boundary condition.

    Args:
        t : A tf.Tensor of shape (batch_size,).
    """
    x1 = 5 * tf.ones(t.shape)
    x2 = -5 * tf.ones(t.shape)
    with tf.GradientTape(True, False) as tape:
        tape.watch(PINN.trainable_weights)
        with tf.GradientTape(True, False) as grtape1:
            grtape1.watch([t, x1, x2])
            # Automatic differentiation of complex functions is weird in tensorflow,
            # so we differentiate real and imaginary parts separately
            h_real_1 = tfm.real(PINN(tf.stack([t, x1], -1)))
            h_imag_1 = tfm.imag(PINN(tf.stack([t, x1], -1)))
            h_real_2 = tfm.real(PINN(tf.stack([t, x2], -1)))
            h_imag_2 = tfm.imag(PINN(tf.stack([t, x2], -1)))
        # First order derivatives
        h_x1_real = grtape1.gradient(h_real_1, x1)
        h_x1_imag = grtape1.gradient(h_imag_1, x1)
        h_x2_real = grtape1.gradient(h_real_2, x2)
        h_x2_imag = grtape1.gradient(h_imag_2, x2)
        # h1_real and h1_imag have shape (batch_size, 2)
        del grtape1
        h1 = tf.complex(h_real_1, h_imag_1)
        h1_x = tf.complex(h_x1_real, h_x1_imag)
        h2 = tf.complex(h_real_2, h_imag_2)
        h2_x = tf.complex(h_x2_real, h_x2_imag)
        MSE = tfm.reduce_mean(tfm.pow(tfm.abs(h1 - h2), 2)
                              + tfm.pow(tfm.abs(h1_x - h2_x), 2))
    grads = tape.gradient(MSE, PINN.trainable_weights)
    sgd_opt.apply_gradients(zip(grads, PINN.trainable_weights))
    return MSE
def call(self, x):
    """Calculates the probability of sample x.

    Args:
        x (int32): Value of input lattice
    """

    def SplDense(x, n):
        """We use this "layer" instead of a regular keras Dense layer to
        facilitate the use of a common kernel and bias."""
        kernel = tf.stack(self.kernel[:n])
        return tfk.activations.sigmoid(tf.matmul(x, kernel) + self.bias)

    x = self.flatten(x)
    p = tf.TensorArray(tf.float32, size=0, dynamic_size=True)
    Id = tf.ones([x.shape[0]])
    p = p.write(0, Id)  # The first lattice point is fixed at +1
    for i in range(1, self.D):
        x1 = tf.gather(x, tf.range(i), axis=1)
        h = SplDense(x1, i)
        y = tf.squeeze(self.output_layer[i - 1](h))
        x2 = tf.gather(x, tf.constant(i), axis=1)
        p = p.write(i, 0.5 * (Id - x2) + (x2 * y))
    return tfm.reduce_mean(p.stack(), axis=0)
def d_loss(d_scores_fake, d_scores_real):
    """
    `d_scores_fake` is the output of the discriminator model applied to a batch of fake data
    `d_scores_real` is the output of the discriminator model applied to a batch of real data

    NOTE: we always define objectives as if we were minimizing them
    (remember that maximize = negate and minimize)
    """
    if TASK == 1:
        return -tm.reduce_mean(tm.log(d_scores_real) + tm.log(1 - d_scores_fake))
    elif TASK == 2:
        return -tm.reduce_mean(tm.log(d_scores_real) + tm.log(1 - d_scores_fake))
    elif TASK == 3 or TASK == 4:
        # Maximize the critic score:
        # push the real-sample mean to large positive values,
        # and the fake-sample mean to large negative values
        return -(tm.reduce_mean(d_scores_real) - tm.reduce_mean(d_scores_fake))
    elif TASK == 5:
        return -(tm.reduce_mean(d_scores_real) - tm.reduce_mean(d_scores_fake))
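# Illustrative usage sketch for d_loss/g_loss (not part of the original code).
# Assumptions: TASK is a module-level constant, `tm` is the alias
# `from tensorflow import math as tm`, and for TASK 1/2 the discriminator scores
# are probabilities in (0, 1). Random scores are used just to show the calls.
import tensorflow as tf
from tensorflow import math as tm

TASK = 2  # hypothetical setting: non-saturating GAN objectives
d_scores_real = tf.random.uniform([16], minval=0.01, maxval=0.99)  # D(x)
d_scores_fake = tf.random.uniform([16], minval=0.01, maxval=0.99)  # D(G(z))
print(float(d_loss(d_scores_fake, d_scores_real)), float(g_loss(d_scores_fake)))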
def nrmse(y_true, y_pred):
    return tfm.sqrt(tfm.reduce_mean(tfm.squared_difference(y_true, y_pred))
                    / tfm.reduce_mean(y_true**2))
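# Illustrative usage sketch for nrmse (not part of the original code).
# Assumption: `tfm` is the alias `from tensorflow import math as tfm`. The metric
# normalises the RMSE by the root-mean-square of the targets, so an all-zero
# prediction gives exactly 1.
import tensorflow as tf
from tensorflow import math as tfm

y_true = tf.random.normal([1000])
y_pred = tf.zeros_like(y_true)
print(float(nrmse(y_true, y_pred)))  # 1.0 for an all-zero prediction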