def testNegativeBinomialSample(self):
  """Checks NegativeBinomial samples against scipy's analytic moments."""
  with self.cached_session() as sess:
    probs = [.3, .9]
    total_count = [4., 11.]
    n = int(100e3)
    negbinom = negative_binomial.NegativeBinomial(
        total_count=total_count, probs=probs)

    samples = negbinom.sample(n, seed=12345)
    self.assertEqual([n, 2], samples.get_shape())

    # Empirical mean and (biased) variance along the sample axis.
    sample_mean = math_ops.reduce_mean(samples, axis=0)
    sample_var = math_ops.reduce_mean(
        (samples - sample_mean[array_ops.newaxis, ...])**2., axis=0)
    sample_min = math_ops.reduce_min(samples)
    [sample_mean_, sample_var_, sample_min_] = sess.run([
        sample_mean, sample_var, sample_min])

    # No sample may be negative. The builtin `bool` is used as the dtype
    # because the `np.bool` alias was deprecated in NumPy 1.20 and removed in
    # NumPy 1.24.
    self.assertAllEqual(np.ones(sample_min_.shape, dtype=bool),
                        sample_min_ >= 0.0)

    for i in range(2):
      # scipy is queried with the complementary probability `1 - probs[i]`.
      self.assertAllClose(sample_mean_[i],
                          stats.nbinom.mean(total_count[i], 1 - probs[i]),
                          atol=0., rtol=.02)
      self.assertAllClose(sample_var_[i],
                          stats.nbinom.var(total_count[i], 1 - probs[i]),
                          atol=0., rtol=.02)
def _sliced_wasserstein(a, b, random_sampling_count, random_projection_dim):
  """Approximates the sliced Wasserstein distance between two sample sets.

  Args:
    a: (matrix) Samples of distribution "a", laid out as (row, col).
    b: (matrix) Samples of distribution "b", laid out as (row, col).
    random_sampling_count: (int) How many random projections are averaged.
    random_projection_dim: (int) Dimension of the random projection space.

  Returns:
    Float holding the approximate distance between "a" and "b".
  """
  shape_a = array_ops.shape(a)
  per_projection_distances = []
  for _ in range(random_sampling_count):
    # Draw a random projection matrix and scale every column by the
    # reciprocal square root of its sum of squares.
    projection = random_ops.random_normal(
        [array_ops.shape(a)[1], random_projection_dim])
    column_sq_sums = math_ops.reduce_sum(
        math_ops.square(projection), 0, keepdims=True)
    projection = projection * math_ops.rsqrt(column_sq_sums)
    # Project both sample sets, then sort the projected rows.
    projected_a = math_ops.matmul(a, projection)
    projected_b = math_ops.matmul(b, projection)
    projected_a = _sort_rows(projected_a, shape_a[0])
    projected_b = _sort_rows(projected_b, shape_a[0])
    # Mean absolute difference between the sorted projections.
    abs_gap = math_ops.abs(projected_a - projected_b)
    per_projection_distances.append(math_ops.reduce_mean(abs_gap))
  return math_ops.reduce_mean(per_projection_distances)
def _potential_scale_reduction_single_state(state, independent_chain_ndims):
  """Computes potential_scale_reduction for a single state `Tensor`."""
  # Exactly one leading dimension is assumed to index the (e.g. correlated)
  # samples drawn from each Markov chain.
  state = ops.convert_to_tensor(state, name="state")
  sample_ndims = 1

  sample_axis = math_ops.range(0, sample_ndims)
  chain_axis = math_ops.range(sample_ndims,
                              sample_ndims + independent_chain_ndims)
  sample_and_chain_axis = math_ops.range(
      0, sample_ndims + independent_chain_ndims)

  n = _axis_size(state, sample_axis)
  m = _axis_size(state, chain_axis)

  # In the language of [2],
  #   B / n is the between chain variance: the variance of the chain means.
  #   W is the within sequence variance: the mean of the chain variances.
  chain_means = math_ops.reduce_mean(state, sample_axis, keepdims=True)
  b_div_n = _reduce_variance(chain_means, sample_and_chain_axis, biased=False)

  chain_variances = _reduce_variance(
      state, sample_axis, keepdims=True, biased=True)
  w = math_ops.reduce_mean(chain_variances, sample_and_chain_axis)

  # sigma^2_+ estimates the true variance, and would be unbiased if every
  # chain was drawn from the target. c.f. "law of total variance."
  sigma_2_plus = w + b_div_n

  leading_term = ((m + 1.) / m) * sigma_2_plus / w
  correction = (n - 1.) / (m * n)
  return leading_term - correction
def _statistics(x, axes):
  """Computes the mean and the mean of squares of `x` over `axes`.

  Adapted from the implementation of `tf.nn.moments`.

  Args:
    x: A `Tensor`.
    axes: Array of ints. Axes along which the statistics are reduced.

  Returns:
    Two `Tensor` objects: `mean` and `square mean`.
  """
  # fp16 has too little dynamic range to accumulate sufficient statistics, so
  # half-precision input is processed as float32 and the two results are
  # converted back to fp16 at the end.
  is_half = x.dtype == dtypes.float16
  y = math_ops.cast(x, dtypes.float32) if is_half else x

  # The shift is the true mean, with reduced dims kept for broadcasting and
  # gradients blocked.
  shift = array_ops.stop_gradient(math_ops.reduce_mean(y, axes, keepdims=True))
  shifted_mean = math_ops.reduce_mean(y - shift, axes, keepdims=True)
  mean = shifted_mean + shift
  mean_squared = math_ops.reduce_mean(math_ops.square(y), axes, keepdims=True)

  # Drop the reduced dimensions that were kept above.
  mean = array_ops.squeeze(mean, axes)
  mean_squared = array_ops.squeeze(mean_squared, axes)

  if is_half:
    return (math_ops.cast(mean, dtypes.float16),
            math_ops.cast(mean_squared, dtypes.float16))
  return (mean, mean_squared)
def center_bias(self, center_bias_var, gradients, hessians):
  """Centers the bias using batch-averaged gradients and hessians.

  Args:
    center_bias_var: Forwarded unchanged to `self._center_bias_fn`.
    gradients: Gradients for the batch; averaged along axis 0.
    hessians: Hessians for the batch; averaged along axis 0.

  Returns:
    Whatever `self._center_bias_fn` returns.
  """
  # In the in-memory case a full batch of gradients and hessians is already
  # available, so a mean over axis 0 is all that is needed before centering.
  # The reduced axis is re-added so each mean keeps a leading dimension of 1.
  gradient_mean = math_ops.reduce_mean(gradients, 0)
  mean_gradients = array_ops.expand_dims(gradient_mean, 0)
  hessian_mean = math_ops.reduce_mean(hessians, 0)
  mean_hessians = array_ops.expand_dims(hessian_mean, 0)
  return self._center_bias_fn(center_bias_var, mean_gradients, mean_hessians)
def test_docstring_example(self):
  """Integrates x_1 * x_2^2 * x_3^3 with Halton samples, twice."""
  # Use the first 1000 members of the Halton sequence in 3 dimensions.
  num_results = 1000
  dim = 3
  with self.test_session():
    sample = halton.sample(dim, num_results=num_results, randomized=False)

    # Estimate the integral of x_1 * x_2^2 * x_3^3 over the three dimensional
    # hypercube as the mean of the integrand at the sample points.
    powers = math_ops.range(1.0, limit=dim + 1)
    integrand = math_ops.reduce_prod(sample ** powers, axis=-1)
    integral = math_ops.reduce_mean(integrand)
    true_value = 1.0 / math_ops.reduce_prod(powers + 1.0)

    # This gives a relative absolute error of 1.7%.
    self.assertAllClose(integral.eval(), true_value.eval(), rtol=0.02)

    # Skip the first 1000 samples and redo the integral with the following
    # thousand; the sequence_indices argument selects which members to use.
    sequence_indices = math_ops.range(
        start=1000, limit=1000 + num_results, dtype=dtypes.int32)
    sample_leaped = halton.sample(
        dim, sequence_indices=sequence_indices, randomized=False)

    integrand_leaped = math_ops.reduce_prod(sample_leaped ** powers, axis=-1)
    integral_leaped = math_ops.reduce_mean(integrand_leaped)
    self.assertAllClose(
        integral_leaped.eval(), true_value.eval(), rtol=0.05)
def npairs_loss(labels, embeddings_anchor, embeddings_positive,
                reg_lambda=0.002, print_losses=False):
  """Computes the npairs loss.

  Npairs loss expects paired data: each pair is made of samples sharing a
  label, and the pairs within a minibatch carry different labels.
  The loss is the sum of two components. The first is an L2 regularizer on
  the embedding vectors. The second is a cross entropy loss that uses each row
  of the pair-wise similarity matrix as logits and the remapped one-hot labels
  as labels.

  See:
  http://www.nec-labs.com/uploads/images/Department-Images/MediaAnalytics/papers/nips16_npairmetriclearning.pdf

  Args:
    labels: 1-D tf.int32 `Tensor` of shape [batch_size/2].
    embeddings_anchor: 2-D Tensor of shape [batch_size/2, embedding_dim] with
      the embedding vectors of the anchor images. Embeddings should not be
      l2 normalized.
    embeddings_positive: 2-D Tensor of shape [batch_size/2, embedding_dim] with
      the embedding vectors of the positive images. Embeddings should not be
      l2 normalized.
    reg_lambda: Float. L2 regularization term on the embedding vectors.
    print_losses: Boolean. Whether to print the xent and l2loss.

  Returns:
    npairs_loss: tf.float32 scalar.
  """
  # pylint: enable=line-too-long
  # Regularizer: mean squared row norm of each embedding matrix.
  anchor_sq_norms = math_ops.reduce_sum(math_ops.square(embeddings_anchor), 1)
  reg_anchor = math_ops.reduce_mean(anchor_sq_norms)
  positive_sq_norms = math_ops.reduce_sum(
      math_ops.square(embeddings_positive), 1)
  reg_positive = math_ops.reduce_mean(positive_sq_norms)
  l2loss = math_ops.multiply(
      0.25 * reg_lambda, reg_anchor + reg_positive, name='l2loss')

  # Similarity of every anchor with every positive.
  similarity_matrix = math_ops.matmul(
      embeddings_anchor, embeddings_positive, transpose_a=False,
      transpose_b=True)

  # Turn the 1-D label tensor into a single-column 2-D tensor.
  lshape = array_ops.shape(labels)
  assert lshape.shape == 1
  labels = array_ops.reshape(labels, [lshape[0], 1])

  # Entry (i, j) marks whether rows i and j share a label; every row is then
  # normalized to sum to one.
  same_label = math_ops.equal(labels, array_ops.transpose(labels))
  labels_remapped = math_ops.to_float(same_label)
  labels_remapped /= math_ops.reduce_sum(labels_remapped, 1, keepdims=True)

  # Softmax cross entropy, averaged over rows.
  xent_loss = nn.softmax_cross_entropy_with_logits(
      logits=similarity_matrix, labels=labels_remapped)
  xent_loss = math_ops.reduce_mean(xent_loss, name='xentropy')

  if print_losses:
    xent_loss = logging_ops.Print(
        xent_loss, ['cross entropy:', xent_loss, 'l2loss:', l2loss])

  return l2loss + xent_loss
def training_loss(self, logits, target, features, name="training_loss"):
  """Returns training loss tensor for this head.

  Training loss differs from the loss reported on the tensorboard because the
  example weights have to be respected when computing the gradient.

  L = sum_{i} w_{i} * l_{i} / B

  where B is the number of examples in the batch, and l_{i}, w_{i} are the
  individual losses and example weights.

  Args:
    logits: logits, a float tensor.
    target: either a tensor for labels or, in the multihead case, a dict of
      string to target tensor.
    features: features dict.
    name: Op name.

  Returns:
    Loss tensor.
  """
  # In the multihead case this head's target is stored under its own name.
  if isinstance(target, dict):
    target = target[self.name]

  loss = self._loss_fn(logits, target)

  # Example weights are applied only when the head provides them.
  weight_tensor = self.get_weight_tensor(features)
  if weight_tensor is not None:
    loss = self._weighted_loss(loss, weight_tensor)
  return math_ops.reduce_mean(loss, name=name)
def loss_wrapper(labels, logits, weight_tensor):
  """Returns (average loss, average loss divided by the mean weight).

  Args:
    labels: Labels forwarded to `loss_fn`.
    logits: Logits forwarded to `loss_fn`.
    weight_tensor: Example weights, or None to weight every example by one.

  Returns:
    A pair: the mean of the weighted loss, and that mean divided by the mean
    of the weights.
  """
  if weight_tensor is None:
    # Default to a column of ones with one row per label row.
    num_rows = array_ops.shape(labels)[0]
    weight_tensor = array_ops.ones(shape=[num_rows, 1], dtype=dtypes.float32)
  weighted_loss, _ = loss_fn(labels, weight_tensor, logits)
  average_loss = math_ops.reduce_mean(weighted_loss)
  mean_weight = math_ops.reduce_mean(weight_tensor)
  return average_loss, average_loss / mean_weight
def testSampleConsistentStats(self):
  """Compares sample statistics of `Independent` with its analytic ones."""
  loc = np.float32([[-1., 1], [1, -1]])
  scale = np.float32([1., 0.5])
  num_samples = int(1e4)
  with self.test_session() as sess:
    base = mvn_diag_lib.MultivariateNormalDiag(
        loc=loc, scale_identity_multiplier=scale)
    ind = independent_lib.Independent(
        distribution=base, reduce_batch_ndims=1)

    x = ind.sample(num_samples, seed=42)
    sample_mean = math_ops.reduce_mean(x, axis=0)
    squared_dev = math_ops.squared_difference(x, sample_mean)
    sample_var = math_ops.reduce_mean(squared_dev, axis=0)
    sample_std = math_ops.sqrt(sample_var)
    # Entropy estimate: negated mean log-probability of the samples.
    sample_entropy = -math_ops.reduce_mean(ind.log_prob(x), axis=0)

    fetches = [
        sample_mean,
        sample_var,
        sample_std,
        sample_entropy,
        ind.mean(),
        ind.variance(),
        ind.stddev(),
        ind.entropy(),
        ind.mode(),
    ]
    (sample_mean_, sample_var_, sample_std_, sample_entropy_,
     actual_mean_, actual_var_, actual_std_, actual_entropy_,
     actual_mode_) = sess.run(fetches)

    self.assertAllClose(sample_mean_, actual_mean_, rtol=0.02, atol=0.)
    self.assertAllClose(sample_var_, actual_var_, rtol=0.04, atol=0.)
    self.assertAllClose(sample_std_, actual_std_, rtol=0.02, atol=0.)
    self.assertAllClose(sample_entropy_, actual_entropy_, rtol=0.01, atol=0.)
    # The mode is compared against `loc` directly.
    self.assertAllClose(loc, actual_mode_, rtol=1e-6, atol=0.)
def testCovarianceFromSampling(self):
  """Compares sampled Dirichlet moments with the analytic ones."""
  alpha = np.array([[1., 2, 3],
                    [2.5, 4, 0.01]], dtype=np.float32)
  with self.test_session() as sess:
    # batch_shape=[2], event_shape=[3]
    dist = dirichlet_lib.Dirichlet(alpha)
    x = dist.sample(int(250e3), seed=1)

    sample_mean = math_ops.reduce_mean(x, 0)
    x_centered = x - sample_mean[None, ...]
    # Outer product of each centered sample with itself, averaged over the
    # samples.
    outer_products = math_ops.matmul(
        x_centered[..., None], x_centered[..., None, :])
    sample_cov = math_ops.reduce_mean(outer_products, 0)
    sample_var = array_ops.matrix_diag_part(sample_cov)
    sample_stddev = math_ops.sqrt(sample_var)

    fetches = [
        sample_mean,
        sample_cov,
        sample_var,
        sample_stddev,
        dist.mean(),
        dist.covariance(),
        dist.variance(),
        dist.stddev(),
    ]
    (sample_mean_, sample_cov_, sample_var_, sample_stddev_,
     analytic_mean, analytic_cov, analytic_var,
     analytic_stddev) = sess.run(fetches)

    self.assertAllClose(sample_mean_, analytic_mean, atol=0., rtol=0.04)
    self.assertAllClose(sample_cov_, analytic_cov, atol=0., rtol=0.06)
    self.assertAllClose(sample_var_, analytic_var, atol=0., rtol=0.03)
    self.assertAllClose(sample_stddev_, analytic_stddev, atol=0., rtol=0.02)
def _reduce_variance(x, axis=None, biased=True, keepdims=False):
  """Computes the variance of `x` along `axis`.

  Args:
    x: Value convertible to a `Tensor`.
    axis: Axes to reduce over, passed on to `reduce_mean`.
    biased: If True, return the mean squared deviation from the mean;
      otherwise rescale it by n / (n - 1), with n from `_axis_size(x, axis)`.
    keepdims: Whether the reduced dimensions are kept in the result.

  Returns:
    The (optionally rescaled) mean squared deviation from the mean.
  """
  with ops.name_scope("reduce_variance"):
    x = ops.convert_to_tensor(x, name="x")
    # The mean keeps its reduced dims so it broadcasts against `x`.
    center = math_ops.reduce_mean(x, axis=axis, keepdims=True)
    squared_dev = math_ops.squared_difference(x, center)
    variance = math_ops.reduce_mean(squared_dev, axis=axis, keepdims=keepdims)
    if not biased:
      n = _axis_size(x, axis)
      variance = (n / (n - 1.)) * variance
    return variance
def mean_only_frechet_classifier_distance_from_activations(
    real_activations, generated_activations):
  """Classifier distance for evaluating a generative model from activations.

  Given two Gaussian distributions with means m and m_w and covariance
  matrices C and C_w, this function calculates

                                |m - m_w|^2

  which captures how different the distributions of real images and generated
  images (or more accurately, their visual features) are. Note that unlike the
  Inception score, this is a true distance and utilizes information about real
  world images.

  Note that when computed using sample means and sample covariance matrices,
  Frechet distance is biased. It is more biased for small sample sizes (e.g.
  even if the two distributions are the same, for a small sample size, the
  expected Frechet distance is large). It is important to use the same sample
  size to compute frechet classifier distance when comparing two generative
  models.

  This variant only computes the difference between the means of the fitted
  Gaussians. The computation leads to O(n) vs. O(n^2) memory usage, yet still
  retains much of the same information as FID.

  Args:
    real_activations: 2D array of activations of real images of size
      [num_images, num_dims] to use to compute Frechet Inception distance.
    generated_activations: 2D array of activations of generated images of size
      [num_images, num_dims] to use to compute Frechet Inception distance.

  Returns:
    The mean-only Frechet Inception distance. A floating-point scalar of the
    same type as the output of the activations.
  """
  real_activations.shape.assert_has_rank(2)
  generated_activations.shape.assert_has_rank(2)

  # Compute in float64, remembering the input dtype so the result can be
  # converted back at the end.
  activations_dtype = real_activations.dtype
  needs_cast = activations_dtype != dtypes.float64
  if needs_cast:
    real_activations = math_ops.cast(real_activations, dtypes.float64)
    generated_activations = math_ops.cast(generated_activations, dtypes.float64)

  # Per-dimension means over the images.
  m = math_ops.reduce_mean(real_activations, 0)
  m_w = math_ops.reduce_mean(generated_activations, 0)

  # Squared distance between the means; equivalent to L2 but more stable.
  mofid = math_ops.reduce_sum(math_ops.squared_difference(m, m_w))

  if needs_cast:
    mofid = math_ops.cast(mofid, activations_dtype)
  return mofid
def _loss(loss_unweighted, weight, name):
  """Returns a (loss, weighted_average_loss) pair.

  Args:
    loss_unweighted: Unweighted loss tensor.
    weight: Weights passed to `_weighted_loss`, or None for no weighting.
    name: Name given to the mean-loss op.

  Returns:
    `(loss, weighted_average_loss)`. Without weights both entries are the
    mean of `loss_unweighted`.
  """
  if weight is not None:
    loss_weighted = _weighted_loss(loss_unweighted, weight)
    # Sum of the weighted loss divided by the sum of the weights.
    total_loss = math_ops.reduce_sum(loss_weighted)
    total_weight = math_ops.to_float(math_ops.reduce_sum(weight))
    weighted_average_loss = math_ops.div(
        total_loss, total_weight, name="weighted_average_loss")
    loss = math_ops.reduce_mean(loss_weighted, name=name)
    return loss, weighted_average_loss

  loss = math_ops.reduce_mean(loss_unweighted, name=name)
  return loss, loss
def classifier_score(images, classifier_fn, num_batches=1):
  """Classifier score for evaluating a conditional generative model.

  This is based on the Inception Score, but for an arbitrary classifier.

  This technique is described in detail in https://arxiv.org/abs/1606.03498. In
  summary, this function calculates

      exp( E[ KL(p(y|x) || p(y)) ] )

  which captures how different the network's classification prediction is from
  the prior distribution over classes.

  Args:
    images: Images to calculate the classifier score for.
    classifier_fn: A function that takes images and produces logits based on a
      classifier.
    num_batches: Number of batches to split `images` in to in order to
      efficiently run them through the classifier network.

  Returns:
    The classifier score. A floating-point scalar of the same type as the
    logits produced by `classifier_fn`.
  """
  generated_images_list = array_ops.split(
      images, num_or_size_splits=num_batches)

  # Compute the classifier splits using the memory-efficient `map_fn`.
  logits = functional_ops.map_fn(
      fn=classifier_fn,
      elems=array_ops.stack(generated_images_list),
      parallel_iterations=1,
      back_prop=False,
      swap_memory=True,
      name='RunClassifier')
  # Merge the per-batch logits back into one tensor.
  logits = array_ops.concat(array_ops.unstack(logits), 0)
  logits.shape.assert_has_rank(2)

  # Use maximum precision for best results.
  logits_dtype = logits.dtype
  if logits_dtype != dtypes.float64:
    logits = math_ops.cast(logits, dtypes.float64)

  p = nn_ops.softmax(logits)
  q = math_ops.reduce_mean(p, axis=0)
  kl = _kl_divergence(p, logits, q)
  kl.shape.assert_has_rank(1)
  log_score = math_ops.reduce_mean(kl)
  final_score = math_ops.exp(log_score)

  # Convert the score back to the dtype the classifier produced. The previous
  # code cast to float64 here, which was a no-op because the score is already
  # float64 at this point.
  if logits_dtype != dtypes.float64:
    final_score = math_ops.cast(final_score, logits_dtype)
  return final_score
def _loss(loss_unweighted, weight, name):
  """Returns a tuple of (loss, weighted_average_loss).

  Args:
    loss_unweighted: Unweighted loss tensor.
    weight: Weights passed to `_weighted_loss`, or None for no weighting.
    name: Name of the enclosing name scope; the resulting scope name is also
      used as the name of the mean-loss op.

  Returns:
    `(loss, weighted_average_loss)`. Without weights both entries are the
    mean of `loss_unweighted`.
  """
  with ops.name_scope(name, values=(loss_unweighted, weight)) as name_scope:
    if weight is not None:
      loss_weighted = _weighted_loss(loss_unweighted, weight)
      # Sum of the weighted loss divided by the sum of the weights.
      total_loss = math_ops.reduce_sum(loss_weighted)
      total_weight = math_ops.to_float(math_ops.reduce_sum(weight))
      weighted_average_loss = math_ops.div(
          total_loss, total_weight, name="weighted_average_loss")
      loss = math_ops.reduce_mean(loss_weighted, name=name_scope)
      return loss, weighted_average_loss

    loss = math_ops.reduce_mean(loss_unweighted, name=name_scope)
    return loss, loss
def testSample(self):
  """Checks Wishart sampling consistency and the first two moments."""
  with self.test_session():
    scale = make_pd(1., 2)
    df = 4

    chol_w = distributions.WishartCholesky(
        df, chol(scale), cholesky_input_output_matrices=False)
    x = chol_w.sample(1, seed=42).eval()
    chol_x = [chol(x[0])]

    # The full parameterization must give the same sample for the same seed.
    full_w = distributions.WishartFull(
        df, scale, cholesky_input_output_matrices=False)
    full_sample = full_w.sample(1, seed=42).eval()
    self.assertAllClose(x, full_sample)

    # With Cholesky output, the sample must equal the Cholesky factor of `x`,
    # and the diagonals of many such samples must be strictly positive.
    chol_w_chol = distributions.WishartCholesky(
        df, chol(scale), cholesky_input_output_matrices=True)
    self.assertAllClose(chol_x, chol_w_chol.sample(1, seed=42).eval())
    diagonals = array_ops.matrix_diag_part(
        chol_w_chol.sample(1000, seed=42))
    np.testing.assert_array_less(0., diagonals.eval())

    # Same two checks for the full parameterization.
    full_w_chol = distributions.WishartFull(
        df, scale, cholesky_input_output_matrices=True)
    self.assertAllClose(chol_x, full_w_chol.sample(1, seed=42).eval())
    diagonals = array_ops.matrix_diag_part(
        full_w_chol.sample(1000, seed=42))
    np.testing.assert_array_less(0., diagonals.eval())

    # Check first and second moments.
    df = 4.
    chol_w = distributions.WishartCholesky(
        df=df,
        scale=chol(make_pd(1., 3)),
        cholesky_input_output_matrices=False)
    x = chol_w.sample(10000, seed=42)
    self.assertAllEqual((10000, 3, 3), x.get_shape())

    moment1_estimate = math_ops.reduce_mean(x, reduction_indices=[0]).eval()
    self.assertAllClose(chol_w.mean().eval(), moment1_estimate, rtol=0.05)

    # The variance estimate is built from element-wise squares rather than
    # outer-products because Wishart.Variance is the diagonal of the Wishart
    # covariance matrix.
    moment2_estimate = math_ops.reduce_mean(
        math_ops.square(x), reduction_indices=[0])
    variance_estimate = (
        moment2_estimate - math_ops.square(moment1_estimate)).eval()
    self.assertAllClose(
        chol_w.variance().eval(), variance_estimate, rtol=0.05)
def run_test_sample_consistent_mean_variance(
    self, sess_run_fn, dist,
    num_samples=int(1e5), seed=24,
    rtol=1e-2, atol=0.):
  """Tests that sample/mean/variance are consistent with each other.

  "Consistency" means that `sample`, `mean`, `variance`, etc all correspond
  to the same distribution.

  Args:
    sess_run_fn: Python `callable` taking `list`-like of `Tensor`s and
      returning a list of results after running one "step" of TensorFlow
      computation, typically set to `sess.run`.
    dist: Distribution instance or object which implements `sample`,
      `log_prob`, `event_shape_tensor` and `batch_shape_tensor`.
    num_samples: Python `int` scalar indicating the number of Monte-Carlo
      samples to draw from `dist`.
    seed: Python `int` indicating the seed to use when sampling from `dist`.
      In general it is not recommended to use `None` during a test as this
      increases the likelihood of spurious test failure.
    rtol: Python `float`-type indicating the admissible relative error between
      analytical and sample statistics.
    atol: Python `float`-type indicating the admissible absolute error between
      analytical and sample statistics.
  """
  # Samples are cast to float32 before any statistic is computed.
  draws = math_ops.cast(dist.sample(num_samples, seed=seed), dtypes.float32)
  sample_mean = math_ops.reduce_mean(draws, axis=0)
  squared_dev = math_ops.square(draws - sample_mean)
  sample_variance = math_ops.reduce_mean(squared_dev, axis=0)
  sample_stddev = math_ops.sqrt(sample_variance)

  fetches = [
      sample_mean,
      sample_variance,
      sample_stddev,
      dist.mean(),
      dist.variance(),
      dist.stddev(),
  ]
  (sample_mean_, sample_variance_, sample_stddev_,
   mean_, variance_, stddev_) = sess_run_fn(fetches)

  self.assertAllClose(mean_, sample_mean_, rtol=rtol, atol=atol)
  self.assertAllClose(variance_, sample_variance_, rtol=rtol, atol=atol)
  self.assertAllClose(stddev_, sample_stddev_, rtol=rtol, atol=atol)
def testSampleUnbiasedScalarBatch(self):
  """Compares sampled mean/covariance with the analytic ones (no batch)."""
  with self.test_session() as sess:
    concentration = 1. + 2. * self._rng.rand(4).astype(np.float32)
    dist = ds.DirichletMultinomial(
        total_count=5., concentration=concentration)
    n = int(5e3)
    x = dist.sample(n, seed=0)

    sample_mean = math_ops.reduce_mean(x, 0)
    # The sample axis already leads, so `adjoint_a=True` contracts over it.
    x_centered = x - sample_mean
    scatter = math_ops.matmul(x_centered, x_centered, adjoint_a=True)
    sample_covariance = scatter / n

    fetches = [
        sample_mean,
        sample_covariance,
        dist.mean(),
        dist.covariance(),
    ]
    (sample_mean_, sample_covariance_,
     actual_mean_, actual_covariance_) = sess.run(fetches)

    self.assertAllEqual([4], sample_mean.get_shape())
    self.assertAllClose(actual_mean_, sample_mean_, atol=0., rtol=0.05)
    self.assertAllEqual([4, 4], sample_covariance.get_shape())
    self.assertAllClose(
        actual_covariance_, sample_covariance_, atol=0., rtol=0.15)
def testReuse(self):
  """Checks that re-running rev_block with reuse=True adds no variables."""

  def f(x):
    return core_layers.dense(x, self.CHANNELS // 2)

  def g(x):
    return core_layers.dense(x, self.CHANNELS // 2)

  def build_block(first_half, second_half):
    # Builds the reversible block under the variable scope active at call time.
    return rev_block_lib.rev_block(
        first_half, second_half, f, g, num_layers=self.NUM_LAYERS)

  x = random_ops.random_uniform(
      [self.BATCH_SIZE, self.CHANNELS], dtype=dtypes.float32)
  x1, x2 = array_ops.split(x, 2, axis=-1)

  # First build creates the variables.
  with variable_scope.variable_scope("test"):
    y1, y2 = build_block(x1, x2)
  num_vars_before = len(variables.global_variables())

  # Second build, with reuse, must leave the variable count unchanged.
  with variable_scope.variable_scope("test", reuse=True):
    y1, y2 = build_block(x1, x2)
  num_vars_after = len(variables.global_variables())
  self.assertEqual(num_vars_before, num_vars_after)

  # Construct gradients, then build once more and re-check the count.
  loss = math_ops.reduce_mean(y1 + y2)
  _ = gradients_impl.gradients(loss, [x] + variables.trainable_variables())

  with variable_scope.variable_scope("test", reuse=True):
    y1, y2 = build_block(x1, x2)
  num_vars_after = len(variables.global_variables())
  self.assertEqual(num_vars_before, num_vars_after)
def loss_fn(labels, logits, weights=None):
  """Returns the mean of the first output of `per_example_maxent_loss`.

  Args:
    labels: Labels passed to the loss.
    logits: Logits passed to the loss.
    weights: Optional example weights passed to the loss.

  Returns:
    The mean of element 0 of the `per_example_maxent_loss` result.
  """
  maxent_result = losses.per_example_maxent_loss(
      labels=labels,
      logits=logits,
      weights=weights,
      num_classes=n_classes)
  first_output = maxent_result[0]
  return math_ops.reduce_mean(first_output)
def monte_carlo_hypersphere_volume(dist, num_samples, radius, center):
  """Estimates a hypersphere volume by importance sampling from `dist`.

  See https://en.wikipedia.org/wiki/Importance_sampling

  Args:
    dist: Object providing `sample` and `log_prob`.
    num_samples: Number of samples to draw.
    radius: Radius forwarded to `is_in_ball`.
    center: Center forwarded to `is_in_ball`.

  Returns:
    Mean over axis 0 of `exp(-log_prob(x)) * is_in_ball(x, radius, center)`.
  """
  x = dist.sample(num_samples, seed=seed)
  x = array_ops.identity(x)  # Invalidate bijector caching.
  inverse_density = math_ops.exp(-dist.log_prob(x))
  weighted_indicator = inverse_density * is_in_ball(x, radius, center)
  return math_ops.reduce_mean(weighted_indicator, axis=0)
def testQuadraticLoss(self):
  """Statistical test for the gradient.

  Equation (5) of https://arxiv.org/abs/1805.08498 states

  d/dalpha E_{sample ~ Gamma(alpha, 1)} f(sample)
    = E_{sample ~ Gamma(alpha, 1)} df(sample)/dalpha.

  With the quadratic loss f(sample) = (sample - t)^2 the left-hand side has a
  closed form:

  d/dalpha E_{sample ~ Gamma(alpha, 1)} f(sample)
    = d/dalpha [ (alpha + alpha^2) - 2 * t * alpha + t^2 ]
    = 1 + 2 * alpha - 2 * t.

  The Monte-Carlo estimate of the expectation is compared with this true
  gradient.
  """
  num_samples = 1000
  t = 0.3
  alpha_value = 0.5
  expected = 1 + 2 * alpha_value - 2 * t

  alpha = constant_op.constant(alpha_value)
  sample = random_ops.random_gamma([num_samples], alpha, 1.0)
  squared_error = math_ops.square(sample - t)
  loss = math_ops.reduce_mean(squared_error)
  dloss_dalpha = gradients_impl.gradients(loss, alpha)[0]

  dloss_dalpha_val = self.evaluate(dloss_dalpha)
  self.assertAllClose(expected, dloss_dalpha_val, atol=1e-1, rtol=1e-1)
def create_loss(self, features, mode, logits, labels):
  """See `Head`."""
  del mode  # Unused for this head.
  logits = ops.convert_to_tensor(logits)
  processed_labels = self._process_labels(labels)
  processed_labels = head_lib._check_dense_labels_match_logits_and_reshape(  # pylint:disable=protected-access
      labels=processed_labels,
      logits=logits,
      expected_labels_dimension=self.logits_dimension)

  if not self._loss_fn:
    per_class_loss = losses.sigmoid_cross_entropy(
        multi_class_labels=processed_labels,
        logits=logits,
        reduction=losses.Reduction.NONE)
    # Averages loss over classes.
    unweighted_loss = math_ops.reduce_mean(
        per_class_loss, axis=-1, keepdims=True)
  else:
    # A user-supplied loss function takes precedence over the default.
    unweighted_loss = head_lib._call_loss_fn(  # pylint:disable=protected-access
        loss_fn=self._loss_fn,
        labels=processed_labels,
        logits=logits,
        features=features,
        expected_loss_dim=1)

  weights = head_lib._get_weights_and_check_match_logits(  # pylint:disable=protected-access,
      features=features, weight_column=self._weight_column, logits=logits)
  training_loss = losses.compute_weighted_loss(
      unweighted_loss, weights=weights, reduction=self._loss_reduction)
  return head_lib.LossSpec(
      training_loss=training_loss,
      unreduced_loss=unweighted_loss,
      weights=weights,
      processed_labels=processed_labels)
def testGradient(self):
  """Checks shapes of gradients taken through an imported graph."""
  # Build the graph that will be exported, using only placeholders.
  with ops.Graph().as_default() as g:
    inputs = array_ops.placeholder(
        dtypes.float32, shape=[None, 100], name="input")
    weights = array_ops.placeholder(
        dtypes.float32, shape=[100, 10], name="weights")
    biases = array_ops.placeholder(dtypes.float32, shape=[10], name="biases")
    pre_activations = math_ops.matmul(inputs, weights) + biases
    activations = nn_ops.relu(pre_activations, name="activations")
    loss = math_ops.reduce_mean(activations, name="loss")
  gdef = g.as_graph_def()

  # Import it into a fresh graph, remapping the placeholders onto a
  # fully-shaped input and two variables.
  with ops.Graph().as_default() as g:
    input_placeholder = array_ops.placeholder(dtypes.float32, shape=[32, 100])
    weights_var = variables.Variable(
        random_ops.truncated_normal([100, 10]), name="weights")
    biases_var = variables.Variable(array_ops.zeros([10]), name="biases")
    remapping = {
        "input:0": input_placeholder,
        "weights:0": weights_var,
        "biases:0": biases_var
    }
    activations, loss = importer.import_graph_def(
        gdef,
        input_map=remapping,
        return_elements=["activations:0", "loss:0"])
    self.assertEqual([32, 10], activations.get_shape())
    self.assertEqual([], loss.get_shape())

    weights_grad, biases_grad = gradients_impl.gradients(
        loss, [weights_var, biases_var])
    self.assertEqual([100, 10], weights_grad.get_shape())
    self.assertEqual([10], biases_grad.get_shape())
def zero_fraction(value, name=None):
  """Returns the fraction of zeros in `value`.

  If `value` is empty, the result is `nan`.

  This is useful in summaries to measure and report sparsity.  For example,

  ```python
      z = tf.Relu(...)
      summ = tf.contrib.deprecated.scalar_summary('sparsity',
      tf.nn.zero_fraction(z))
  ```

  Args:
    value: A tensor of numeric type.
    name: A name for the operation (optional).

  Returns:
    The fraction of zeros in `value`, with type `float32`.
  """
  with ops.name_scope(name, "zero_fraction", [value]):
    value = ops.convert_to_tensor(value, name="value")
    zero = constant_op.constant(0, dtype=value.dtype, name="zero")
    # Mean of the 0/1 indicator "element equals zero".
    is_zero = math_ops.equal(value, zero)
    indicator = math_ops.cast(is_zero, dtypes.float32)
    return math_ops.reduce_mean(indicator)
def _train(self, checkpoint_path, layout_optimizer=False, restore=False):
  """Runs two training steps of a small two-layer conv net.

  Args:
    checkpoint_path: Checkpoint location to restore from or save to.
    layout_optimizer: Forwarded to `get_config` to build the session config.
    restore: If True, start from `checkpoint_path` and return the variable
      values after training; otherwise initialize the variables and save a
      checkpoint after training.

  Returns:
    A list with the values of all global variables when `restore` is True,
    otherwise None.
  """
  ops.reset_default_graph()
  graph = ops.get_default_graph()
  config = get_config(layout_optimizer)
  with session.Session(config=config, graph=graph) as sess:
    batch, height, width, input_channels = 2, 6, 7, 3
    shape = [batch, height, width, input_channels]
    image = array_ops.placeholder(dtype='float32', shape=shape)
    conv1 = conv_layers.conv2d(image, 32, [3, 3])
    conv2 = conv_layers.conv2d(conv1, 32, [3, 3])
    optimizer = gradient_descent.GradientDescentOptimizer(0.01)
    loss = math_ops.reduce_mean(conv2)
    train_op = optimizer.minimize(loss)
    saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2)

    if not restore:
      sess.run(variables.global_variables_initializer())
    else:
      saver.restore(sess, checkpoint_path)

    # A fixed NumPy seed makes both the save and restore runs feed the same
    # images.
    np.random.seed(0)
    for _ in range(2):
      image_val = np.random.rand(*shape).astype(np.float32)
      sess.run([loss, train_op], feed_dict={image: image_val})

    if not restore:
      saver.save(sess, checkpoint_path)
      return None

    all_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
    return [var.eval(session=sess) for var in all_vars]
def testGradient(self):
  """Checks that layout optimization rewrites the conv ops to NCHW."""
  if not test.is_gpu_available(cuda_only=True):
    self.skipTest('GPU required')

  random_seed.set_random_seed(0)
  x = random_ops.truncated_normal([1, 200, 200, 3], seed=0)
  y = conv_layers.conv2d(x, 32, [3, 3])
  z = conv_layers.conv2d(y, 32, [3, 3])
  optimizer = gradient_descent.GradientDescentOptimizer(1e-4)
  loss = math_ops.reduce_mean(z)
  train_op = optimizer.minimize(loss)
  graph = ops.get_default_graph()
  graph.add_to_collection('train_op', train_op)
  meta_graph = saver_lib.export_meta_graph(graph_def=graph.as_graph_def())

  rewrite_options = rewriter_config_pb2.RewriterConfig(
      optimize_tensor_layout=True)
  optimized_graph = tf_optimizer.OptimizeGraph(rewrite_options, meta_graph)

  conv_ops = ('Conv2D', 'Conv2DBackpropFilter', 'Conv2DBackpropInput')
  found = 0
  for node in optimized_graph.node:
    if node.op in conv_ops:
      found += 1
      # `.s` is a protobuf bytes field, so it must be compared with a bytes
      # literal; on Python 3 a text literal would never compare equal.
      self.assertEqual(node.attr['data_format'].s, b'NCHW')
  self.assertEqual(found, 5)
def testSampleUnbiasedNonScalarBatch(self):
  """Compares sampled mean/covariance with the analytic ones (batched)."""
  with self.test_session() as sess:
    concentration = 1. + 2. * self._rng.rand(4, 3, 2).astype(np.float32)
    dist = ds.DirichletMultinomial(
        total_count=5., concentration=concentration)
    n = int(3e3)
    x = dist.sample(n, seed=0)

    sample_mean = math_ops.reduce_mean(x, 0)
    # Cyclically rotate the dims left so the sample axis ends up last.
    x_centered = array_ops.transpose(x - sample_mean, [1, 2, 3, 0])
    scatter = math_ops.matmul(x_centered, x_centered, adjoint_b=True)
    sample_covariance = scatter / n

    fetches = [
        sample_mean,
        sample_covariance,
        dist.mean(),
        dist.covariance(),
    ]
    (sample_mean_, sample_covariance_,
     actual_mean_, actual_covariance_) = sess.run(fetches)

    self.assertAllEqual([4, 3, 2], sample_mean.get_shape())
    self.assertAllClose(actual_mean_, sample_mean_, atol=0., rtol=0.15)
    self.assertAllEqual([4, 3, 2, 2], sample_covariance.get_shape())
    self.assertAllClose(
        actual_covariance_, sample_covariance_, atol=0., rtol=0.20)
def testCustomGrad(self):
  """Checks that a custom gradient function replaces the real gradients."""

  def fn(a, b, c):
    return core_layers.dense(a, 10, use_bias=False) + math_ops.matmul(b, c)

  def grad_fn(inputs, trainable_variables, unused_outputs,
              unused_grad_outputs):
    # Every gradient is a constant tensor: inputs get 1, 2, ... and the
    # trainable variables continue the count after the inputs.
    grad_inputs = [
        array_ops.ones_like(tensor) * (position + 1.)
        for position, tensor in enumerate(inputs)
    ]
    grad_vars = [
        array_ops.ones_like(tensor) * (position + len(inputs) + 1.)
        for position, tensor in enumerate(trainable_variables)
    ]
    return grad_inputs, grad_vars

  a = random_ops.random_uniform([11, 6])
  b = random_ops.random_uniform([11, 7])
  c = random_ops.random_uniform([7, 10])
  # `w` only supplies the shape of the expected gradient for the variable.
  w = random_ops.random_uniform([6, 10])

  wrapped_fn = rev_block_lib._fn_with_custom_grad(grad_fn)(fn)
  out = wrapped_fn(a, b, c)
  loss = math_ops.reduce_mean(out)
  grads = gradients_impl.gradients(
      loss, [a, b, c, variables.trainable_variables()[0]])
  expected_grads = [
      array_ops.ones_like(tensor) * (position + 1.)
      for position, tensor in enumerate([a, b, c, w])
  ]

  with self.test_session() as sess:
    sess.run(variables.global_variables_initializer())
    g_val, eg_val = sess.run([grads, expected_grads])
    for actual, expected in zip(g_val, eg_val):
      self.assertAllClose(actual, expected)
class RaggedMapOpTest(ragged_test_util.RaggedTensorTestCase,
                      parameterized.TestCase):
  """Tests for `ragged.map_fn` over ragged, dense, nested and sparse inputs."""

  # Each parameter set below is annotated with the shape transformation it
  # exercises; a parenthesised dimension such as (d2) denotes a ragged one.
  @parameterized.parameters([
      # The following test sets map over a RaggedTensor and apply a
      # transformation that returns with shape:
      # [d1, (d2)] -> [d1]
      dict(
          fn=mo.reduce_mean,
          elems=[[1, 2, 3], [4, 5], [6, 7]],
          expected_output=[2, 4, 6],
      ),
      dict(
          fn=string_ops.reduce_join,
          elems=[['foo', 'bar', 'baz'], ['a'], ['b', 'c']],
          expected_output=[b'foobarbaz', b'a', b'bc'],
          dtype=dtypes.string,
      ),
      # [d1, (d2)] -> [d1, 2]
      dict(
          fn=lambda x: array_ops.stack([mo.reduce_mean(x), mo.reduce_sum(x)]),
          # fn=self.stack_mean_and_sum,
          elems=[[1, 2, 3], [4, 5], [6, 7]],
          expected_output=[[2, 6], [4.5, 9], [6.5, 13]],
          dtype=dtypes.float32,
          expected_ragged_rank=0,
      ),
      # [d1, (d2)] -> [d1, (d2)]
      dict(
          fn=lambda x: x + np.int64(1),
          elems=[[1, 2, 3], [4, 5], [6, 7]],
          expected_output=[[2, 3, 4], [5, 6], [7, 8]],
          dtype=dtypes.int64,
          result_dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
                                               ragged_rank=1),
      ),
      # [d1, (d2), d3] -> [d1, (d2), d3]
      dict(
          fn=lambda x: x + np.int64(1),
          elems=[[[1, 2], [3, 4]], [], [[5, 6], [7, 8], [9, 0]]],
          elems_ragged_rank=1,
          expected_ragged_rank=1,
          result_dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
                                               ragged_rank=1),
          expected_output=[[[2, 3], [4, 5]], [], [[6, 7], [8, 9], [10, 1]]],
      ),
      # [d1, (d2)] -> [d1, (d2), (d3)]
      dict(
          fn=lambda x: ragged.RaggedTensor.from_row_starts(x, [0]),
          elems=[[1, 2, 3], [4, 5], [6, 7]],
          expected_output=[[[1, 2, 3]], [[4, 5]], [[6, 7]]],
          result_dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
                                               ragged_rank=2),
      ),
      # [d1, (d2), (d3)] -> [d1, (d2), (d3)]
      dict(
          fn=lambda x: ragged.map_flat_values(mo.add, x, 1),
          elems=[[[1, 2, 3]], [[4, 5], [6, 7]]],
          expected_output=[[[2, 3, 4]], [[5, 6], [7, 8]]],
          result_dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
                                               ragged_rank=2),
      ),
      # [d1, (d2), (d3)] -> [d1, (d2)]
      dict(
          fn=lambda x: ragged.reduce_sum(x, axis=1),
          elems=[[[1, 2, 3]], [[4, 5], [6, 7]]],
          expected_output=[[6], [9, 13]],
          result_dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
                                               ragged_rank=1),
      ),
      # [d1, (d2), (d3)] -> [d1, (d3)]
      dict(
          fn=lambda x: ragged.reduce_sum(x, axis=0),
          elems=[[[1, 2, 3]], [[4, 5], [6, 7]]],
          expected_output=[[1, 2, 3], [10, 12]],
          result_dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
                                               ragged_rank=1),
      ),
      # [d1, (d2), (d3)] -> [d1]
      dict(
          fn=ragged.reduce_sum,
          elems=[[[1, 2, 3]], [[4, 5], [6, 7]]],
          expected_output=[6, 22],
          result_dtype=dtypes.int64,
      ),
      # [d1] -> [d1, (d2)]
      dict(
          fn=mo.range,
          elems=[4, 0, 2],
          expected_output=[[0, 1, 2, 3], [], [0, 1]],
          result_dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
                                               ragged_rank=1),
      ),
      # [d1] -> [d1, (d2), (d3)]
      dict(
          fn=lambda x: ragged.range(mo.range(x)),
          elems=[5, 0, 3],
          expected_output=[[[], [0], [0, 1], [0, 1, 2], [0, 1, 2, 3]], [],
                           [[], [0], [0, 1]]],
          result_dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
                                               ragged_rank=2),
      ),
      # [d1, (d2), (d3), (d4a), (d5)] -> [d1, (d2), (d3), (d4b), (d5)]
      dict(
          fn=lambda x: x + np.int64(1),
          elems=[[[[[1, 2, 3]], [[4], [5]]]], [[[[6, 7]]], [[[8], []]]]],
          expected_output=[[[[[2, 3, 4]], [[5], [6]]]],
                           [[[[7, 8]]], [[[9], []]]]],
          result_dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
                                               ragged_rank=4),
      ),
  ])
  def testRaggedMap(
      self,
      fn,
      elems,
      expected_output,
      expected_ragged_rank=None,
      result_ragged_rank=None,
      elems_ragged_rank=None,
      dtype=dtypes.int64,
      result_dtype=None,
      infer_shape=False,
  ):
    # Maps `fn` over a ragged constant built from `elems` and compares the
    # result with a ragged constant built from `expected_output`.
    # NOTE(review): `result_ragged_rank` is accepted but never read in this
    # body -- confirm whether it can be dropped from the parameter sets.
    elems = ragged.constant(elems, dtype, elems_ragged_rank)
    output = ragged.map_fn(fn=fn, elems=elems, dtype=result_dtype,
                           infer_shape=infer_shape)

    expected_rt = ragged.constant(expected_output,
                                  ragged_rank=expected_ragged_rank)
    self.assertRaggedEqual(expected_rt, output)

  # Test mapping over a dict of RTs where `fn` returns one scalar per row.
  def testRaggedMapOnStructure(self):
    batman = ragged.constant([[1, 2, 3], [4], [5, 6, 7]])
    # [[10, 20, 30], [40], [50, 60, 70]]
    robin = ragged.map_flat_values(mo.multiply, batman, 10)

    features = {'batman': batman, 'robin': robin}

    def _reduce_sum_from_all(f):
      return mo.reduce_sum(f['batman']) + mo.reduce_sum(f['robin'])

    output = ragged.map_fn(
        fn=_reduce_sum_from_all,
        elems=features,
        dtype=dtypes.int32,
    )

    # Row sums: (6 + 60), (4 + 40), (18 + 180).
    self.assertRaggedEqual(output, [66, 44, 198])

  # Test mapping over a dict of RTs can produce a dict of RTs.
  def testRaggedMapOnStructure_RaggedOutputs(self):
    batman = ragged.constant([[1, 2, 3], [4], [5, 6, 7]])
    # [[10, 20, 30], [40], [50, 60, 70]]
    robin = ragged.map_flat_values(mo.multiply, batman, 10)

    features = {'batman': batman, 'robin': robin}

    def _increment(f):
      return {
          'batman': f['batman'] + 1,
          'robin': f['robin'] + 1,
      }

    # The `dtype` structure mirrors the dict returned by `_increment`.
    output = ragged.map_fn(
        fn=_increment,
        elems=features,
        infer_shape=False,
        dtype={
            'batman':
                ragged.RaggedTensorType(dtype=dtypes.int32, ragged_rank=1),
            'robin':
                ragged.RaggedTensorType(dtype=dtypes.int32, ragged_rank=1)
        },
    )

    self.assertRaggedEqual(output['batman'], [[2, 3, 4], [5], [6, 7, 8]])
    self.assertRaggedEqual(output['robin'], [[11, 21, 31], [41], [51, 61, 71]])

  # Test mapping over a (dense, ragged) tuple: each ragged value is paired
  # with the index of the row it came from.
  def testZip(self):
    x = ragged.constant(
        [[10, 20], [30, 40], [50, 60], [70], [80, 90, 100]], dtypes.int64)
    y = array_ops.expand_dims(mo.range(x.nrows(), dtype=dtypes.int64), axis=1)

    def _zip(foo):
      y_val, x_val = foo
      # Repeat the row index once per element of the ragged row.
      bar = backend.tile(y_val, array_ops.shape(x_val))
      return array_ops.stack([bar, x_val], axis=1)

    output = ragged.map_fn(
        _zip, (y, x),
        dtype=ragged.RaggedTensorType(dtype=dtypes.int64, ragged_rank=1),
        infer_shape=False)

    self.assertRaggedEqual(
        output, [[[0, 10], [0, 20]], [[1, 30], [1, 40]], [[2, 50], [2, 60]],
                 [[3, 70]], [[4, 80], [4, 90], [4, 100]]])

  # Test a per-row gather where both the values and the indices are ragged.
  def testBatchGather(self):
    tokens = ragged.constant([['hello', '.', 'there'], ['merhaba'],
                              ['bonjour', '.', 'ca va', '?']])
    indices = ragged.constant([[0, 2], [0], [0, 2]])

    def gather(x):
      tokens_val, indices_val = x
      return array_ops.gather(tokens_val, indices_val)

    data = tokens, indices
    out = ragged.map_fn(
        gather,
        data,
        dtype=ragged.RaggedTensorType(dtype=dtypes.string, ragged_rank=1),
        infer_shape=False)

    self.assertRaggedEqual(
        out, [[b'hello', b'there'], [b'merhaba'], [b'bonjour', b'ca va']])

  # Declaring a ragged rank that differs from what `fn` returns must raise.
  def testMismatchRaggedRank(self):
    elems = ragged.constant([[[1, 2, 3]], [[4, 5], [6, 7]]])
    fn = lambda x: ragged.reduce_sum(x, axis=0)
    with self.assertRaisesWithLiteralMatch(
        ValueError, r'The declared ragged rank (23) mismatches the result (1)'):
      _ = ragged.map_fn(
          fn,
          elems,
          dtype=ragged.RaggedTensorType(dtype=dtypes.int64, ragged_rank=23))

  # Same check as above, with a `fn` that returns a RaggedTensor per row.
  def testMismatchRaggedRank2(self):
    elems = ragged.constant([[1, 2, 3], [4, 5], [6, 7]])
    fn = lambda x: ragged.RaggedTensor.from_row_starts(x, [0])
    with self.assertRaisesWithLiteralMatch(
        ValueError, r'The declared ragged rank (10) mismatches the result (1)'):
      _ = ragged.map_fn(
          fn,
          elems,
          dtype=ragged.RaggedTensorType(dtype=dtypes.int64, ragged_rank=10))

  # An identity map over a RaggedTensor built from a SparseTensor must
  # return the same values.
  def testMapOnSparseTensor(self):
    s = sparse_tensor.SparseTensor(
        indices=[[0, 0], [0, 1], [1, 0], [1, 1]],
        values=[0, 5, 0, 4],
        dense_shape=[2, 2],
    )
    t2 = ragged.RaggedTensor.from_sparse(s)
    id_t2 = ragged.map_fn(
        lambda x: x,
        t2,
    )
    self.assertRaggedEqual(id_t2, [[0, 5], [0, 4]])
def frechet_classifier_distance_from_activations(real_activations,
                                                 generated_activations):
  """Computes the Frechet classifier distance between two activation sets.

  This works directly on precomputed activations of real and generated
  images, so it can be used independently of frechet_classifier_distance(),
  for instance when large evaluation batches make it preferable to precompute
  all activations first. The technique is described in
  https://arxiv.org/abs/1706.08500.

  Given two Gaussian distributions with means m and m_w and covariance
  matrices C and C_w, the value computed is

                |m - m_w|^2 + Tr(C + C_w - 2(C * C_w)^(1/2))

  which captures how different the distributions of real and generated images
  (or, more accurately, of their visual features) are. Unlike the Inception
  score, this is a true distance and uses information about real-world
  images.

  When computed from sample means and sample covariance matrices the Frechet
  distance is biased, and more so for small sample sizes (even for identical
  distributions the expected distance is large when few samples are used).
  Always use the same sample size when comparing two generative models.

  Args:
    real_activations: 2D Tensor containing activations of real data. Shape is
      [batch_size, activation_size].
    generated_activations: 2D Tensor containing activations of generated data.
      Shape is [batch_size, activation_size].

  Returns:
    The Frechet Inception distance. A floating-point scalar of the same type
    as the output of the activations.
  """
  real_activations.shape.assert_has_rank(2)
  generated_activations.shape.assert_has_rank(2)

  # All arithmetic runs in float64; the result is cast back at the end.
  output_dtype = real_activations.dtype
  needs_cast = output_dtype != dtypes.float64
  if needs_cast:
    real_activations = math_ops.cast(real_activations, dtypes.float64)
    generated_activations = math_ops.cast(generated_activations,
                                          dtypes.float64)

  def _mean_and_covariance(activations):
    """Returns the per-feature mean and the (n - 1)-normalized covariance."""
    mu = math_ops.reduce_mean(activations, 0)
    count = math_ops.cast(array_ops.shape(activations)[0], dtypes.float64)
    centered = activations - mu
    cov = math_ops.matmul(
        centered, centered, transpose_a=True) / (count - 1)
    return mu, cov

  m, sigma = _mean_and_covariance(real_activations)
  m_w, sigma_w = _mean_and_covariance(generated_activations)

  # Covariance term: trace(A + B) - 2 * Tr(sqrt(sigma sigma_w)).
  covariance_term = (
      math_ops.trace(sigma + sigma_w) -
      2.0 * trace_sqrt_product(sigma, sigma_w))

  # Mean term: squared L2 distance, written as a sum of squared differences.
  mean_term = math_ops.reduce_sum(math_ops.squared_difference(m, m_w))

  fid = covariance_term + mean_term
  if needs_cast:
    fid = math_ops.cast(fid, output_dtype)
  return fid
def kernel_classifier_distance_and_std_from_activations(
    real_activations, generated_activations, max_block_size=1024, dtype=None):
  """Kernel "classifier" distance for evaluating a generative model.

  This method computes the kernel classifier distance from activations of
  real images and generated images. This can be used independently of the
  kernel_classifier_distance() method, especially in the case of using large
  batches during evaluation where we would like to precompute all of the
  activations before computing the classifier distance, or if we want to
  compute multiple metrics based on the same images. It also returns a rough
  estimate of the standard error of the estimator.

  This technique is described in detail in https://arxiv.org/abs/1801.01401.
  Given two distributions P and Q of activations, this function calculates

      E_{X, X' ~ P}[k(X, X')] + E_{Y, Y' ~ Q}[k(Y, Y')]
        - 2 E_{X ~ P, Y ~ Q}[k(X, Y)]

  where k is the polynomial kernel

      k(x, y) = ( x^T y / dimension + 1 )^3.

  This captures how different the distributions of real and generated images'
  visual features are. Like the Frechet distance (and unlike the Inception
  score), this is a true distance and incorporates information about the
  target images. Unlike the Frechet score, this function computes an
  *unbiased* and asymptotically normal estimator, which makes comparing
  estimates across models much more intuitive.

  The estimator used takes time quadratic in max_block_size. Larger values of
  max_block_size will decrease the variance of the estimator but increase the
  computational cost. This differs slightly from the estimator used by the
  original paper; it is the block estimator of
  https://arxiv.org/abs/1307.1954. The estimate of the standard error will
  also be more reliable when there are more blocks, i.e. when max_block_size
  is smaller.

  NOTE: the blocking code assumes that real_activations and
  generated_activations are both in random order. If either is sorted in a
  meaningful order, the estimator will behave poorly.

  Args:
    real_activations: 2D Tensor containing activations of real data. Shape is
      [batch_size, activation_size].
    generated_activations: 2D Tensor containing activations of generated data.
      Shape is [batch_size, activation_size].
    max_block_size: integer, default 1024. The distance estimator splits
      samples into blocks for computational efficiency. Larger values are more
      computationally expensive but decrease the variance of the distance
      estimate. Having a smaller block size also gives a better estimate of
      the standard error.
    dtype: If not None, coerce activations to this dtype before computations.
      If None, the dtype of `real_activations` is used and
      `generated_activations` is asserted to have the same dtype.

  Returns:
    The Kernel Inception Distance. A floating-point scalar of the same type
    as the output of the activations.
    An estimate of the standard error of the distance estimator (a scalar of
    the same type). This is NaN when only a single block is used.
  """
  real_activations.shape.assert_has_rank(2)
  generated_activations.shape.assert_has_rank(2)
  real_activations.shape[1].assert_is_compatible_with(
      generated_activations.shape[1])

  if dtype is None:
    dtype = real_activations.dtype
    assert generated_activations.dtype == dtype
  else:
    real_activations = math_ops.cast(real_activations, dtype)
    generated_activations = math_ops.cast(generated_activations, dtype)

  # Figure out how to split the activations into blocks of approximately
  # equal size, with none larger than max_block_size.
  n_r = array_ops.shape(real_activations)[0]
  n_g = array_ops.shape(generated_activations)[0]

  # The larger of the two sample counts determines the number of blocks; both
  # sets are then split into that same number of blocks.
  n_bigger = math_ops.maximum(n_r, n_g)
  n_blocks = math_ops.cast(math_ops.ceil(n_bigger / max_block_size),
                           dtypes.int32)

  # Base block size for each set ...
  v_r = n_r // n_blocks
  v_g = n_g // n_blocks

  # ... and the remainder, i.e. how many blocks receive one extra sample.
  n_plusone_r = n_r - v_r * n_blocks
  n_plusone_g = n_g - v_g * n_blocks

  # Per-block sizes: the base-sized blocks come first, then the larger ones.
  sizes_r = array_ops.concat([
      array_ops.fill([n_blocks - n_plusone_r], v_r),
      array_ops.fill([n_plusone_r], v_r + 1),
  ], 0)
  sizes_g = array_ops.concat([
      array_ops.fill([n_blocks - n_plusone_g], v_g),
      array_ops.fill([n_plusone_g], v_g + 1),
  ], 0)

  # Block boundaries: block i covers rows inds[i] (inclusive) to inds[i + 1]
  # (exclusive).
  zero = array_ops.zeros([1], dtype=dtypes.int32)
  inds_r = array_ops.concat([zero, math_ops.cumsum(sizes_r)], 0)
  inds_g = array_ops.concat([zero, math_ops.cumsum(sizes_g)], 0)

  # Activation size, used to normalize the inner product inside the kernel.
  dim = math_ops.cast(real_activations.shape[1], dtype)

  def compute_kid_block(i):
    """Computes the ith block of the KID estimate."""
    r_s = inds_r[i]
    r_e = inds_r[i + 1]
    r = real_activations[r_s:r_e]
    m = math_ops.cast(r_e - r_s, dtype)

    g_s = inds_g[i]
    g_e = inds_g[i + 1]
    g = generated_activations[g_s:g_e]
    n = math_ops.cast(g_e - g_s, dtype)

    # Polynomial kernel matrices for real/real, real/generated and
    # generated/generated pairs.
    k_rr = (math_ops.matmul(r, r, transpose_b=True) / dim + 1)**3
    k_rg = (math_ops.matmul(r, g, transpose_b=True) / dim + 1)**3
    k_gg = (math_ops.matmul(g, g, transpose_b=True) / dim + 1)**3
    # The within-set terms subtract the trace so that only pairs of distinct
    # samples contribute, and divide by the number of such ordered pairs.
    return (-2 * math_ops.reduce_mean(k_rg) +
            (math_ops.reduce_sum(k_rr) - math_ops.trace(k_rr)) / (m * (m - 1)) +
            (math_ops.reduce_sum(k_gg) - math_ops.trace(k_gg)) / (n * (n - 1)))

  ests = map_fn.map_fn(compute_kid_block, math_ops.range(n_blocks),
                       dtype=dtype, back_prop=False)

  # The distance estimate is the mean of the per-block estimates.
  mn = math_ops.reduce_mean(ests)

  # nn_impl.moments doesn't use the Bessel correction, which we want here
  n_blocks_ = math_ops.cast(n_blocks, dtype)
  # With a single block the sample variance is undefined, so report NaN.
  var = control_flow_ops.cond(
      math_ops.less_equal(n_blocks, 1),
      lambda: array_ops.constant(float('nan'), dtype=dtype),
      lambda: math_ops.reduce_sum(math_ops.square(ests - mn)) / (n_blocks_ - 1))

  # Standard error of the mean across blocks.
  return mn, math_ops.sqrt(var / n_blocks_)
def _tf_reduce(self, x, reduction_axes, keepdims):
  """Returns the mean of `x` over `reduction_axes` via `reduce_mean`."""
  reduced = math_ops.reduce_mean(x, reduction_axes, keepdims)
  return reduced
def worker_train_fn():
  """Returns the mean of `v` multiplied by a fresh uniform (10, 2) matrix.

  `v` is not defined here; it is read from the enclosing scope.
  """
  random_matrix = random_ops.random_uniform((10, 2))
  product = math_ops.matmul(v, random_matrix)
  return math_ops.reduce_mean(product)
def _BatchNormGrad(grad_y,
                   x,
                   scale,
                   pop_mean,
                   pop_var,
                   epsilon,
                   data_format,
                   is_training=True):
  """Returns the gradients for the 3 inputs of BatchNorm.

  Args:
    grad_y: A `Tensor` of 4 dimensions for gradient for y.
    x: A `Tensor` of 4 dimensions for x.
    scale: A `Tensor` of 1 dimension for scaling.
    pop_mean: A `Tensor` of 1 dimension for the population mean. Only used when
      is_training=False.
    pop_var: A `Tensor` of 1 dimension for the population variance. Only used
      when is_training=False.
    epsilon: A small float number added to the variance of x.
    data_format: The data format for input. Either b"NHWC" or b"NCHW". Any
      value other than b"NHWC" is reduced over axes [0, 2, 3].
    is_training: A bool value to indicate the operation is for training
      (default) or inference.

  Returns:
    A tuple (grad_x, grad_scale, grad_offset), where grad_x is the gradient
    for x, grad_scale the gradient for scale, and grad_offset the gradient
    for offset. grad_x is cast back to the dtype of `x`.
  """
  x_dtype = x.dtype.base_dtype
  if x_dtype == dtypes.float16:
    # float16 math is too imprecise, so we do the batch norm gradient
    # computations in float32.
    x = math_ops.cast(x, dtypes.float32)
    grad_y = math_ops.cast(grad_y, dtypes.float32)
  if is_training:
    if data_format == b"NHWC":
      # Channels are last, so reduced statistics broadcast without keepdims.
      keepdims = False
      reduce_axis = [0, 1, 2]
    else:
      # Channels are on axis 1: keep the reduced axes and reshape `scale` to
      # [1, C, 1, 1] so that everything broadcasts against x.
      keepdims = True
      reduce_axis = [0, 2, 3]
      shape = [1, array_ops.size(scale), 1, 1]
      scale = array_ops.reshape(scale, shape)
    mean_grad_y = math_ops.reduce_mean(grad_y, reduce_axis, keepdims=keepdims)
    mean_x = math_ops.reduce_mean(x, reduce_axis, keepdims=keepdims)
    var_x = math_ops.reduce_mean(
        math_ops.squared_difference(x, array_ops.stop_gradient(mean_x)),
        reduce_axis,
        keepdims=keepdims)
    grad_y_offset = grad_y - mean_grad_y
    x_offset = x - mean_x
    mean = math_ops.reduce_mean(
        grad_y * x_offset, axis=reduce_axis, keepdims=keepdims)
    grad_x = scale * math_ops.rsqrt(var_x + epsilon) * (
        grad_y_offset - math_ops.reciprocal(var_x + epsilon) * mean * x_offset)
    grad_scale = math_ops.rsqrt(var_x + epsilon) * math_ops.reduce_sum(
        grad_y * x_offset, axis=reduce_axis, keepdims=keepdims)
    if data_format == b"NCHW":
      # grad_scale is [1, C, 1, 1] here. Squeeze only the reduced axes: an
      # unrestricted squeeze would also drop the channel axis when C == 1 and
      # return a scalar instead of a 1-D tensor matching `scale`.
      grad_scale = array_ops.squeeze(grad_scale, axis=[0, 2, 3])
    grad_offset = math_ops.reduce_sum(grad_y, axis=reduce_axis)
    return math_ops.cast(grad_x, x_dtype), grad_scale, grad_offset
  else:
    if data_format == b"NHWC":
      reduce_axis = [0, 1, 2]
    else:
      # Reshape the per-channel vectors to [1, C, 1, 1] for broadcasting.
      reduce_axis = [0, 2, 3]
      shape = [1, array_ops.size(pop_mean), 1, 1]
      pop_mean = array_ops.reshape(pop_mean, shape)
      pop_var = array_ops.reshape(pop_var, shape)
      scale = array_ops.reshape(scale, shape)

    # In inference mode the statistics are constants, so the gradients only
    # involve the population mean and variance.
    grad_offset = math_ops.reduce_sum(grad_y, axis=reduce_axis)
    var_rsqrt = math_ops.rsqrt(pop_var + epsilon)
    grad_scale = math_ops.reduce_sum(
        grad_y * (x - pop_mean) * var_rsqrt, axis=reduce_axis)
    grad_x = grad_y * scale * var_rsqrt
    return math_ops.cast(grad_x, x_dtype), grad_scale, grad_offset
def regularizer2(v):
  """Returns the mean of `v` plus a constant offset of 0.2."""
  mean_value = math_ops.reduce_mean(v)
  return mean_value + 0.2
def resnet_v2(inputs,
              blocks,
              num_classes=None,
              is_training=True,
              global_pool=True,
              output_stride=None,
              include_root_block=True,
              reuse=None,
              scope=None):
  """Generator for v2 (preactivation) ResNet models.

  This function generates a family of ResNet v2 models. See the resnet_v2_*()
  methods for specific model instantiations, obtained by selecting different
  block instantiations that produce ResNets of various depths.

  Training for image classification on Imagenet is usually done with [224, 224]
  inputs, resulting in [7, 7] feature maps at the output of the last ResNet
  block for the ResNets defined in [1] that have nominal stride equal to 32.
  However, for dense prediction tasks we advise that one uses inputs with
  spatial dimensions that are multiples of 32 plus 1, e.g., [321, 321]. In
  this case the feature maps at the ResNet output will have spatial shape
  [(height - 1) / output_stride + 1, (width - 1) / output_stride + 1]
  and corners exactly aligned with the input image corners, which greatly
  facilitates alignment of the features to the image. Using as input [225, 225]
  images results in [8, 8] feature maps at the output of the last ResNet block.

  For dense prediction tasks, the ResNet needs to run in fully-convolutional
  (FCN) mode and global_pool needs to be set to False. The ResNets in [1, 2] all
  have nominal stride equal to 32 and a good choice in FCN mode is to use
  output_stride=16 in order to increase the density of the computed features at
  small computational and memory overhead, cf. http://arxiv.org/abs/1606.00915.

  A dropout layer (second positional argument 0.5) is applied to the features
  before the optional logits layer.

  Args:
    inputs: A tensor of size [batch, height_in, width_in, channels].
    blocks: A list of length equal to the number of ResNet blocks. Each element
      is a resnet_utils.Block object describing the units in the block.
    num_classes: Number of predicted classes for classification tasks. If None
      we return the features before the logit layer.
    is_training: whether batch_norm and dropout layers are in training mode.
    global_pool: If True, we perform global average pooling before computing the
      logits. Set to True for image classification, False for dense prediction.
    output_stride: If None, then the output will be computed at the nominal
      network stride. If output_stride is not None, it specifies the requested
      ratio of input to output spatial resolution.
    include_root_block: If True, include the initial convolution followed by
      max-pooling, if False excludes it. If excluded, `inputs` should be the
      results of an activation-less convolution.
    reuse: whether or not the network and its variables should be reused. To be
      able to reuse 'scope' must be given.
    scope: Optional variable_scope.

  Returns:
    net: A rank-4 tensor of size [batch, height_out, width_out, channels_out].
      If global_pool is False, then height_out and width_out are reduced by a
      factor of output_stride compared to the respective height_in and width_in,
      else both height_out and width_out equal one. If num_classes is None, then
      net is the output of the last ResNet block, potentially after global
      average pooling. If num_classes is not None, net contains the pre-softmax
      activations.
    end_points: A dictionary from components of the network to the corresponding
      activation.

  Raises:
    ValueError: If the target output_stride is not valid.
  """
  with variable_scope.variable_scope(
      scope, 'resnet_v2', [inputs], reuse=reuse) as sc:
    end_points_collection = sc.original_name_scope + '_end_points'
    with arg_scope(
        [layers_lib.conv2d, bottleneck, resnet_utils.stack_blocks_dense],
        outputs_collections=end_points_collection):
      with arg_scope([layers.batch_norm], is_training=is_training):
        net = inputs
        if include_root_block:
          if output_stride is not None:
            if output_stride % 4 != 0:
              raise ValueError('The output_stride needs to be a multiple of 4.')
            # The root block (stride-2 conv + stride-2 pool) already accounts
            # for a factor of 4. Floor division is exact here because of the
            # check above, and keeps the stride an integer under true division.
            output_stride //= 4
          # We do not include batch normalization or activation functions in
          # conv1 because the first ResNet unit will perform these. Cf.
          # Appendix of [2].
          with arg_scope(
              [layers_lib.conv2d], activation_fn=None, normalizer_fn=None):
            net = resnet_utils.conv2d_same(net, 64, 7, stride=2, scope='conv1')
          net = layers.max_pool2d(net, [3, 3], stride=2, scope='pool1')
        net = resnet_utils.stack_blocks_dense(net, blocks, output_stride)
        # This is needed because the pre-activation variant does not have batch
        # normalization or activation functions in the residual unit output. See
        # Appendix of [2].
        net = layers.batch_norm(
            net, activation_fn=nn_ops.relu, scope='postnorm')
        if global_pool:
          # Global average pooling.
          net = math_ops.reduce_mean(net, [1, 2], name='pool5', keepdims=True)
        net = layers_lib.dropout(
            net, 0.5, is_training=is_training, scope='dropout')
        if num_classes is not None:
          net = layers_lib.conv2d(
              net,
              num_classes, [1, 1],
              activation_fn=None,
              normalizer_fn=None,
              scope='logits')
        # Convert end_points_collection into a dictionary of end_points.
        end_points = utils.convert_collection_to_dict(end_points_collection)
        if num_classes is not None:
          end_points['predictions'] = layers.softmax(net, scope='predictions')
        return net, end_points
def bottleneck(inputs,
               depth,
               depth_bottleneck,
               stride,
               rate=1,
               outputs_collections=None,
               scope=None):
  """Bottleneck residual unit variant with BN before convolutions.

  This is the full preactivation residual unit variant proposed in [2]. See
  Fig. 1(b) of [2] for its definition. Note that we use here the bottleneck
  variant which has an extra bottleneck layer.

  In this version the residual branch is additionally re-weighted inside a
  'CBAM' variable scope: first per channel (a shared two-layer fully connected
  network applied to the spatial max and the spatial mean), then per spatial
  position (a 1x1 convolution over the channel-wise mean and max).

  When putting together two consecutive ResNet blocks that use this unit, one
  should use stride = 2 in the last unit of the first block.

  Args:
    inputs: A tensor of size [batch, height, width, channels].
    depth: The depth of the ResNet unit output.
    depth_bottleneck: The depth of the bottleneck layers.
    stride: The ResNet unit's stride. Determines the amount of downsampling of
      the units output compared to its input.
    rate: An integer, rate for atrous convolution.
    outputs_collections: Collection to add the ResNet unit output.
    scope: Optional variable_scope.

  Returns:
    The ResNet unit's output.
  """
  with variable_scope.variable_scope(scope, 'bottleneck_v2', [inputs]) as sc:
    depth_in = utils.last_dimension(inputs.get_shape(), min_rank=4)
    preact = layers.batch_norm(
        inputs, activation_fn=nn_ops.relu, scope='preact')
    if depth == depth_in:
      # Same depth: the shortcut is the (possibly subsampled) input itself.
      shortcut = resnet_utils.subsample(inputs, stride, 'shortcut')
    else:
      # Different depth: project the pre-activated input with a 1x1 conv.
      shortcut = layers_lib.conv2d(
          preact,
          depth, [1, 1],
          stride=stride,
          normalizer_fn=None,
          activation_fn=None,
          scope='shortcut')

    residual = layers_lib.conv2d(
        preact, depth_bottleneck, [1, 1], stride=1, scope='conv1')
    residual = resnet_utils.conv2d_same(
        residual, depth_bottleneck, 3, stride, rate=rate, scope='conv2')
    residual = layers_lib.conv2d(
        residual,
        depth, [1, 1],
        stride=1,
        normalizer_fn=None,
        activation_fn=None,
        scope='conv3')

    with variable_scope.variable_scope('CBAM'):
      # Channel attention. The max-pooled and average-pooled descriptors go
      # through the same two fully connected layers ('share1', 'share2'); the
      # second pass reuses the variables created by the first.
      max_c = math_ops.reduce_max(residual, axis=[1, 2], name='max_c')
      max_c = layers_lib.fully_connected(
          max_c,
          int(depth / 8),
          nn_ops.relu,
          normalizer_fn=None,
          scope='share1',
          weights_regularizer=layers_lib.l2_regularizer(0.0001))
      max_c = layers_lib.fully_connected(
          max_c,
          depth,
          None,
          normalizer_fn=None,
          scope='share2',
          weights_regularizer=layers_lib.l2_regularizer(0.0001))
      avg_c = math_ops.reduce_mean(residual, axis=[1, 2], name='avg_c')
      avg_c = layers_lib.fully_connected(
          avg_c,
          int(depth / 8),
          nn_ops.relu,
          normalizer_fn=None,
          scope='share1',
          reuse=True)
      avg_c = layers_lib.fully_connected(
          avg_c, depth, None, normalizer_fn=None, scope='share2', reuse=True)
      channel_mask = math_ops.sigmoid(max_c + avg_c)
      # Insert the two spatial axes so the mask broadcasts over height/width.
      channel_mask = array_ops.expand_dims(channel_mask, 1)
      channel_mask = array_ops.expand_dims(channel_mask, 1)
      residual = residual * channel_mask

      # Spatial attention. `keepdims` (rather than the deprecated `keep_dims`
      # alias) keeps the channel axis with size 1 so the maps can be
      # concatenated on it.
      max_s = math_ops.reduce_max(
          residual, axis=-1, name='max_s', keepdims=True)
      avg_s = math_ops.reduce_mean(
          residual, axis=-1, name='avg_s', keepdims=True)
      spatial_mask = array_ops.concat([avg_s, max_s], axis=-1)
      spatial_mask = layers_lib.conv2d(
          spatial_mask,
          1, [1, 1],
          activation_fn=math_ops.sigmoid,
          normalizer_fn=None)
      residual = residual * spatial_mask

    output = shortcut + residual

    return utils.collect_named_outputs(outputs_collections, sc.name, output)
def _testRevBlock(self,
                  x=None,
                  f=None,
                  g=None,
                  f_side_input=None,
                  g_side_input=None):
  """Checks that `rev_block` matches between its two `is_training` modes.

  The block is built twice on the same variables: once with default
  arguments and once with `is_training=False`. Outputs and gradients (with
  respect to the input, the side inputs and the block's trainable variables)
  of the two builds must agree.

  Args:
    x: Optional input tensor; defaults to a random
      [BATCH_SIZE, CHANNELS] float32 tensor.
    f: Optional first residual function; defaults to a dense layer.
    g: Optional second residual function; defaults to a dense layer.
    f_side_input: Optional list of side inputs for `f`.
    g_side_input: Optional list of side inputs for `g`.
  """
  random_seed.set_random_seed(1234)

  if f is None:

    def f(x):  # pylint: disable=function-redefined
      return core_layers.dense(x, self.CHANNELS // 2, use_bias=True)

  if g is None:

    def g(x):  # pylint: disable=function-redefined
      return core_layers.dense(x, self.CHANNELS // 2, use_bias=True)

  f_side_input = [] if f_side_input is None else f_side_input
  g_side_input = [] if g_side_input is None else g_side_input

  if x is None:
    x = random_ops.random_uniform([self.BATCH_SIZE, self.CHANNELS],
                                  dtype=dtypes.float32)
  x1, x2 = array_ops.split(x, 2, axis=-1)

  # Arguments shared by both builds of the block.
  block_kwargs = dict(
      f_side_input=f_side_input,
      g_side_input=g_side_input,
      num_layers=self.NUM_LAYERS)

  with variable_scope.variable_scope("rev_test") as vs:
    y1_rev, y2_rev = rev_block_lib.rev_block(x1, x2, f, g, **block_kwargs)
    y_rev = array_ops.concat([y1_rev, y2_rev], axis=1)
    fg_vars = vs.trainable_variables()

  vars_before_reuse = len(variables.global_variables())
  with variable_scope.variable_scope(vs, reuse=True):
    y1, y2 = rev_block_lib.rev_block(
        x1, x2, f, g, is_training=False, **block_kwargs)
    y = array_ops.concat([y1, y2], axis=1)
  # Ensure no new vars were created - full reuse
  assert len(variables.global_variables()) == vars_before_reuse

  loss_rev = math_ops.reduce_mean(y_rev + 10.)
  loss = math_ops.reduce_mean(y + 10.)
  wrt = [x] + f_side_input + g_side_input + fg_vars
  grads_rev = gradients_impl.gradients(loss_rev, wrt)
  grads = gradients_impl.gradients(loss, wrt)

  with self.test_session() as sess:
    sess.run(variables.global_variables_initializer())
    fetches = [y, y_rev, grads_rev, grads]
    y_val, yd_val, gd_val, g_val = sess.run(fetches)
    self.assertAllClose(y_val, yd_val)
    for grad_rev_val, grad_val in zip(gd_val, g_val):
      self.assertAllClose(grad_rev_val, grad_val, rtol=1e-5)
def get_batch_loss(self, features, mode, state):
  """Computes predictions and a loss.

  In TRAIN mode a single window is processed. In EVAL mode the input is
  cropped to a whole number of non-overlapping output windows, every window is
  processed inside a `while_loop`, and the per-window losses are averaged.

  Args:
    features: A dictionary (such as is produced by a chunker) with the
      following key/value pairs (shapes are given as required for training):
        TrainEvalFeatures.TIMES: A [batch size, self.window_size] integer
          Tensor with times for each observation. To train on longer
          sequences, the data should first be chunked.
        TrainEvalFeatures.VALUES: A [batch size, self.window_size,
          self.num_features] Tensor with values for each observation.
      When evaluating, `TIMES` and `VALUES` must have a window size of at least
      self.window_size, but it may be longer, in which case the last
      window_size - self.input_window_size times (or fewer if this is not
      divisible by self.output_window_size) will be evaluated on with
      non-overlapping output windows (and will have associated
      predictions). This is primarily to support qualitative
      evaluation/plotting, and is not a recommended way to compute evaluation
      losses (since there is no overlap in the output windows, which for
      window-based models is an undesirable bias).
    mode: The tf.estimator.ModeKeys mode to use (TRAIN or EVAL).
    state: Not read in TRAIN mode. In EVAL mode, a tuple of (times, values,
      exogenous regressors) from previous windows; the new inputs are appended
      to it to build the returned end state.

  Returns:
    A model.ModelOutputs object.

  Raises:
    ValueError: If `mode` is not TRAIN or EVAL, or if static shape information
      is incorrect.
  """
  features = {feature_name: ops.convert_to_tensor(feature_value)
              for feature_name, feature_value in features.items()}
  times = features[TrainEvalFeatures.TIMES]
  # Every feature other than times, values and the state tuple is treated as
  # an exogenous feature.
  exogenous_regressors = self._process_exogenous_features(
      times=times,
      features={key: value for key, value in features.items()
                if key not in [TrainEvalFeatures.TIMES,
                               TrainEvalFeatures.VALUES,
                               PredictionFeatures.STATE_TUPLE]})
  if mode == estimator_lib.ModeKeys.TRAIN:
    # For training, we require the window size to be self.window_size as
    # iterating sequentially on larger windows could introduce a bias.
    return self._process_window(
        features, mode=mode, exogenous_regressors=exogenous_regressors)
  elif mode == estimator_lib.ModeKeys.EVAL:
    # For evaluation, we allow the user to pass in a larger window, in which
    # case we try to cover as much of the window as possible without
    # overlap. Quantitative evaluation is more efficient/correct with fixed
    # windows matching self.window_size (as with training), but this looping
    # allows easy plotting of "in-sample" predictions.
    times.get_shape().assert_has_rank(2)
    static_window_size = times.get_shape().dims[1].value
    # The size check is only possible when the window size is known statically.
    if (static_window_size is not None
        and static_window_size < self.window_size):
      raise ValueError(
          ("ARModel requires a window of at least input_window_size + "
           "output_window_size to evaluate on (input_window_size={}, "
           "output_window_size={}, and got shape {} for feature '{}' (batch "
           "size, window size)).").format(
               self.input_window_size, self.output_window_size,
               times.get_shape(), TrainEvalFeatures.TIMES))
    num_iterations = ((array_ops.shape(times)[1] - self.input_window_size)
                      // self.output_window_size)
    output_size = num_iterations * self.output_window_size
    # Rather than dealing with overlapping windows of output, discard a bit at
    # the beginning if output windows don't cover evenly.
    crop_length = output_size + self.input_window_size
    features = {feature_name: feature_value[:, -crop_length:]
                for feature_name, feature_value in features.items()}
    # Note that, unlike the ARModel's predict() while_loop, each iteration
    # here can run in parallel, since we are not feeding predictions or state
    # from previous iterations.
    def _while_condition(iteration_number, loss_ta, mean_ta, covariance_ta):
      del loss_ta, mean_ta, covariance_ta  # unused
      return iteration_number < num_iterations

    def _while_body(iteration_number, loss_ta, mean_ta, covariance_ta):
      """Perform a processing step on a single window of data."""
      # Consecutive windows start one output window apart.
      base_offset = iteration_number * self.output_window_size
      # NOTE(review): `features` were cropped to the last `crop_length` steps
      # above, but `exogenous_regressors` was computed before the crop and is
      # sliced with the same offsets -- confirm the two stay aligned when the
      # crop actually discards leading steps.
      model_outputs = self._process_window(
          features={
              feature_name:
              feature_value[:, base_offset:base_offset + self.window_size]
              for feature_name, feature_value in features.items()},
          mode=mode,
          exogenous_regressors=exogenous_regressors[
              :, base_offset:base_offset + self.window_size])
      # This code needs to be updated if new predictions are added in
      # self._process_window
      assert len(model_outputs.predictions) == 3
      assert "mean" in model_outputs.predictions
      assert "covariance" in model_outputs.predictions
      assert "observed" in model_outputs.predictions
      return (iteration_number + 1,
              loss_ta.write(
                  iteration_number, model_outputs.loss),
              mean_ta.write(
                  iteration_number, model_outputs.predictions["mean"]),
              covariance_ta.write(
                  iteration_number, model_outputs.predictions["covariance"]))
    # One TensorArray slot per window for the loss, mean and covariance.
    _, loss_ta, mean_ta, covariance_ta = control_flow_ops.while_loop(
        _while_condition, _while_body,
        [0,
         tensor_array_ops.TensorArray(dtype=self.dtype, size=num_iterations),
         tensor_array_ops.TensorArray(dtype=self.dtype, size=num_iterations),
         tensor_array_ops.TensorArray(dtype=self.dtype, size=num_iterations)])
    values = math_ops.cast(features[TrainEvalFeatures.VALUES],
                           dtype=self.dtype)
    batch_size = array_ops.shape(times)[0]
    prediction_shape = [batch_size, self.output_window_size * num_iterations,
                        self.num_features]
    (previous_state_times,
     previous_state_values,
     previous_state_exogenous_regressors) = state
    # Make sure returned state always has windows of self.input_window_size,
    # even if we were passed fewer than self.input_window_size points this
    # time.
    if self.input_window_size > 0:
      new_state_times = array_ops.concat(
          [previous_state_times,
           math_ops.cast(times, dtype=dtypes.int64)],
          axis=1)[:, -self.input_window_size:]
      new_state_times.set_shape((None, self.input_window_size))
      new_state_values = array_ops.concat(
          [previous_state_values,
           self._scale_data(values)], axis=1)[:, -self.input_window_size:, :]
      new_state_values.set_shape((None, self.input_window_size,
                                  self.num_features))
      new_exogenous_regressors = array_ops.concat(
          [previous_state_exogenous_regressors,
           exogenous_regressors], axis=1)[:, -self.input_window_size:, :]
      new_exogenous_regressors.set_shape(
          (None,
           self.input_window_size,
           self.exogenous_size))
    else:
      # There is no state to keep, and the strided slices above do not handle
      # input_window_size=0.
      new_state_times = previous_state_times
      new_state_values = previous_state_values
      new_exogenous_regressors = previous_state_exogenous_regressors
    # The stacked TensorArrays have the window index first; the transpose
    # moves the batch axis in front of it before the windows are merged into
    # one time axis by the reshape.
    return model.ModelOutputs(
        loss=math_ops.reduce_mean(loss_ta.stack(), axis=0),
        end_state=(new_state_times,
                   new_state_values,
                   new_exogenous_regressors),
        predictions={
            "mean": array_ops.reshape(
                array_ops.transpose(mean_ta.stack(), [1, 0, 2, 3]),
                prediction_shape),
            "covariance": array_ops.reshape(
                array_ops.transpose(covariance_ta.stack(), [1, 0, 2, 3]),
                prediction_shape),
            "observed": values[:, -output_size:]},
        prediction_times=times[:, -output_size:])
  else:
    raise ValueError(
        "Unknown mode '{}' passed to get_batch_loss.".format(mode))
def _RoutingFunctionGradient(op, grad):
  """Back-propagates `grad` through a RoutingFunction op.

  Each chain-rule factor is defined over one or two of the three dimensions
  (batch_size, num_nodes, num_features) and is expanded to three dimensions
  so that the products below broadcast correctly.

  Args:
    op: The RoutingFunction op.
    grad: Gradient with respect to the output of the RoutingFunction op.

  Returns:
    A list `[dl_dx, dl_dt, dl_db]` of gradients with respect to the three
    inputs of the RoutingFunction op, in input order.
  """
  compute_du_df = _training_ops.routing_gradient
  data, weights, thresholds = op.inputs[0], op.inputs[1], op.inputs[2]
  routing_output = op.outputs[0]

  # dl / du: derivative of the loss with respect to the routing function
  # output, as provided by tensorflow.
  # (batch_size, num_nodes) -> (batch_size, num_nodes, 1).
  loss_wrt_routing = array_ops.expand_dims(grad, 2)

  # du / df: derivative of the routing function output with respect to the
  # decision function at each node, computed by routing_gradient_op.cc.
  # (batch_size, num_nodes) -> (batch_size, num_nodes, 1).
  raw_routing_wrt_decision = compute_du_df(
      data,
      weights,
      thresholds,
      routing_output,
      max_nodes=op.get_attr('max_nodes'))
  routing_wrt_decision = array_ops.expand_dims(raw_routing_wrt_decision, 2)

  # df / dx: f_i(x) = (-t_i * x + b_i), so df_i / dx = -t_i.
  # (num_nodes, num_features) -> (1, num_nodes, num_features).
  decision_wrt_input = -array_ops.expand_dims(weights, 0)

  # df / dt: f_i(x) = (-t_i * x + b_i), so df_i / d t_i = -x.
  # (batch_size, num_features) -> (batch_size, 1, num_features).
  decision_wrt_weights = -array_ops.expand_dims(data, 1)

  # df / db: derivative with respect to the bias parameter, which is 1.
  # (num_nodes) -> (1, num_nodes, 1).
  unit_bias = array_ops.ones_like(thresholds)
  decision_wrt_bias = array_ops.expand_dims(
      array_ops.expand_dims(unit_bias, 0), 2)

  def _chain(local_derivative):
    # dl/du * du/df * (df/d<param>), i.e. one application of the chain rule.
    return loss_wrt_routing * routing_wrt_decision * local_derivative

  return [
      math_ops.reduce_mean(_chain(decision_wrt_input), 1),
      math_ops.reduce_mean(_chain(decision_wrt_weights), 0),
      math_ops.reduce_mean(
          array_ops.squeeze(_chain(decision_wrt_bias), [2]), 0),
  ]
def _KFeatureRoutingFunctionGradient(op, grad):
  """Back-propagates `grad` through the k-feature routing function op.

  The raw partial derivatives come from `_training_ops.k_feature_gradient`;
  this function expands them to three dimensions, (batch_size, num_nodes,
  num_features), and combines them with the chain rule.

  Args:
    op: The routing function op being differentiated.
    grad: Gradient with respect to the output of that op.

  Returns:
    A list `[dl_dx, dl_dt, dl_db]` of gradients with respect to the three
    inputs of the op, in input order.
  """
  compute_partials = _training_ops.k_feature_gradient
  data, weights, thresholds = op.inputs[0], op.inputs[1], op.inputs[2]
  routing_output = op.outputs[0]

  raw_du_df, raw_df_dx, raw_df_dt = compute_partials(
      data,
      weights,
      thresholds,
      routing_output,
      layer_num=op.get_attr('layer_num'),
      random_seed=op.get_attr('random_seed'))

  # dl / du: derivative of the loss with respect to the routing function
  # output, as provided by tensorflow.
  # (batch_size, num_nodes) -> (batch_size, num_nodes, 1).
  loss_wrt_routing = array_ops.expand_dims(grad, 2)

  # du / df: derivative of the routing output with respect to the decision
  # function at each node (single_feature_routing_gradient_op.cc).
  # (batch_size, num_nodes) -> (batch_size, num_nodes, 1).
  routing_wrt_decision = array_ops.expand_dims(raw_du_df, 2)

  # df / dx: f(x) = (-t * x + b), so df / dx = -t for the selected features
  # and zero elsewhere.
  # (num_nodes, num_features) -> (1, num_nodes, num_features).
  decision_wrt_input = array_ops.expand_dims(raw_df_dx, 0)

  # df / dt: f(x) = (-t * x + b), so df / dt = -x[feature].
  # Already (batch_size, num_nodes, num_features).
  decision_wrt_weights = -raw_df_dt

  # df / db: derivative with respect to the bias parameter, which is 1.
  # (num_nodes) -> (1, num_nodes, 1).
  unit_bias = array_ops.ones_like(thresholds)
  decision_wrt_bias = array_ops.expand_dims(
      array_ops.expand_dims(unit_bias, 0), 2)

  def _chain(local_derivative):
    # dl/du * du/df * (df/d<param>), i.e. one application of the chain rule.
    return loss_wrt_routing * routing_wrt_decision * local_derivative

  return [
      math_ops.reduce_mean(_chain(decision_wrt_input), 1),
      math_ops.reduce_mean(_chain(decision_wrt_weights), 0),
      math_ops.reduce_mean(
          array_ops.squeeze(_chain(decision_wrt_bias), [2]), 0),
  ]
def compute(x):
  """Returns the mean of `x` over axis 0, keeping the reduced axis."""
  mean_over_first_axis = math_ops.reduce_mean(x, axis=0, keepdims=True)
  return mean_over_first_axis
def forward_pass(value):
  """Increments `count` and returns half the mean squared residual.

  Args:
    value: The value compared against the enclosing-scope `model`.

  Returns:
    A `(loss, count)` pair, where `loss` is
    `0.5 * mean((value - model) ** 2)`.
  """
  count.assign_add(1)
  squared_residuals = math_ops.pow(value - model, 2)
  half_mse = 0.5 * math_ops.reduce_mean(squared_residuals)
  # Note: count is an integer, so its doutput will be None
  return half_mse, count
def _testMVN(self,
             base_distribution_class,
             base_distribution_kwargs,
             batch_shape=(),
             event_shape=(),
             not_implemented_message=None):
  """Checks an Affine-transformed base distribution against a reference MVN.

  Two distributions are built with `self._cls()`: one with statically known
  batch/event shape overrides and one whose overrides are fed through
  placeholders. Both are compared, via sample moments, log_prob, prob and
  entropy, with `stats.multivariate_normal` using `self._shift` as mean and
  `self._tril @ self._tril^T` as covariance.

  Args:
    base_distribution_class: Class of the base distribution to transform.
    base_distribution_kwargs: Keyword arguments for
      `base_distribution_class` (besides `validate_args`).
    batch_shape: Batch shape override passed to the distribution.
    event_shape: Event shape override passed to the distribution.
    not_implemented_message: Regex expected in the `NotImplementedError`
      raised by the cdf / survival-function family.
  """
  with self.test_session() as sess:
    # Overriding shapes must be compatible w/bijector; most bijectors are
    # batch_shape agnostic and only care about event_ndims.
    # In the case of `Affine`, if we got it wrong then it would fire an
    # exception due to incompatible dimensions.
    batch_shape_pl = array_ops.placeholder(
        dtypes.int32, name="dynamic_batch_shape")
    event_shape_pl = array_ops.placeholder(
        dtypes.int32, name="dynamic_event_shape")
    feed_dict = {
        batch_shape_pl: np.array(batch_shape, dtype=np.int32),
        event_shape_pl: np.array(event_shape, dtype=np.int32)
    }
    fake_mvn_dynamic = self._cls()(
        distribution=base_distribution_class(
            validate_args=True, **base_distribution_kwargs),
        bijector=bs.Affine(shift=self._shift, scale_tril=self._tril),
        batch_shape=batch_shape_pl,
        event_shape=event_shape_pl,
        validate_args=True)

    fake_mvn_static = self._cls()(
        distribution=base_distribution_class(
            validate_args=True, **base_distribution_kwargs),
        bijector=bs.Affine(shift=self._shift, scale_tril=self._tril),
        batch_shape=batch_shape,
        event_shape=event_shape,
        validate_args=True)

    actual_mean = np.tile(self._shift, [2, 1])  # Affine elided this tile.
    actual_cov = np.matmul(self._tril, np.transpose(self._tril, [0, 2, 1]))

    def actual_mvn_log_prob(x):
      # Reference log-density: one multivariate normal per batch member,
      # evaluated on that member's slice of `x`.
      return np.concatenate([[
          stats.multivariate_normal(actual_mean[i],
                                    actual_cov[i]).logpdf(x[:, i, :])
      ] for i in range(len(actual_cov))]).T

    actual_mvn_entropy = np.concatenate([[
        stats.multivariate_normal(actual_mean[i], actual_cov[i]).entropy()
    ] for i in range(len(actual_cov))])

    self.assertAllEqual([3], fake_mvn_static.event_shape)
    self.assertAllEqual([2], fake_mvn_static.batch_shape)

    # With placeholder overrides the static shapes are unknown.
    self.assertAllEqual(
        tensor_shape.TensorShape(None), fake_mvn_dynamic.event_shape)
    self.assertAllEqual(
        tensor_shape.TensorShape(None), fake_mvn_dynamic.batch_shape)

    x = fake_mvn_static.sample(5, seed=0).eval()
    for unsupported_fn in (fake_mvn_static.log_cdf,
                           fake_mvn_static.cdf,
                           fake_mvn_static.survival_function,
                           fake_mvn_static.log_survival_function):
      with self.assertRaisesRegexp(NotImplementedError,
                                   not_implemented_message):
        unsupported_fn(x)

    num_samples = 5e3
    # The loop variable `feed_dict` shadows the dict built above; the tuple of
    # pairs is constructed before the first rebinding, so the dynamic case
    # still receives the original feed.
    for fake_mvn, feed_dict in ((fake_mvn_static, {}),
                                (fake_mvn_dynamic, feed_dict)):
      # Ensure sample works by checking first, second moments.
      y = fake_mvn.sample(int(num_samples), seed=0)
      x = y[0:5, ...]
      sample_mean = math_ops.reduce_mean(y, 0)
      centered_y = array_ops.transpose(y - sample_mean, [1, 2, 0])
      sample_cov = math_ops.matmul(
          centered_y, centered_y, transpose_b=True) / num_samples
      [
          sample_mean_,
          sample_cov_,
          x_,
          fake_event_shape_,
          fake_batch_shape_,
          fake_log_prob_,
          fake_prob_,
          fake_entropy_,
      ] = sess.run([
          sample_mean,
          sample_cov,
          x,
          fake_mvn.event_shape_tensor(),
          fake_mvn.batch_shape_tensor(),
          fake_mvn.log_prob(x),
          fake_mvn.prob(x),
          fake_mvn.entropy(),
      ], feed_dict=feed_dict)

      self.assertAllClose(actual_mean, sample_mean_, atol=0.1, rtol=0.1)
      self.assertAllClose(actual_cov, sample_cov_, atol=0., rtol=0.1)

      # Ensure all other functions work as intended.
      self.assertAllEqual([5, 2, 3], x_.shape)
      self.assertAllEqual([3], fake_event_shape_)
      self.assertAllEqual([2], fake_batch_shape_)
      self.assertAllClose(actual_mvn_log_prob(x_), fake_log_prob_,
                          atol=0., rtol=1e-6)
      self.assertAllClose(np.exp(actual_mvn_log_prob(x_)), fake_prob_,
                          atol=0., rtol=1e-5)
      self.assertAllClose(actual_mvn_entropy, fake_entropy_,
                          atol=0., rtol=1e-6)
def average(a, axis=None, weights=None, returned=False):
  """Computes the (optionally weighted) average of `a` along `axis`.

  Args:
    a: Array-like input; converted with `np_array_ops.array`.
    axis: `None` (reduce over everything) or a single integer axis.
    weights: Optional array-like weights. When given, it must either have the
      same shape as `a`, or be rank 1 when `axis` is an integer.
    returned: If true, also return the sum of weights, broadcast to the shape
      of the average.

  Returns:
    The average, or an `(average, weights_sum)` pair when `returned` is true.

  Raises:
    ValueError: If `axis` is neither `None` nor an integer.
  """
  if not (axis is None or isinstance(axis, six.integer_types)):
    # TODO(wangpeng): Support tuple of ints as `axis`
    raise ValueError('Argument `axis` must be an integer. '
                     f'Received axis={axis} (of type {type(axis)})')
  a = np_array_ops.array(a)
  input_is_inexact = np.issubdtype(a.dtype.as_numpy_dtype, np.inexact)

  if weights is not None:
    # Non-float inputs are additionally promoted with the default float type.
    if input_is_inexact:
      promote_args = (a.dtype, weights)
    else:
      promote_args = (a.dtype, weights, np_dtypes.default_float_type())
    out_dtype = np_utils.result_type(*promote_args)
    a = np_array_ops.array(a, out_dtype)
    weights = np_array_ops.array(weights, out_dtype)

    def _same_rank():
      # `weights` is expected to match `a` element for element.
      control_flow_ops.Assert(
          math_ops.reduce_all(array_ops.shape(a) == array_ops.shape(weights)),
          [array_ops.shape(a), array_ops.shape(weights)])
      total_weight = math_ops.reduce_sum(weights, axis=axis)
      weighted_avg = math_ops.reduce_sum(a * weights, axis=axis) / total_weight
      return weighted_avg, total_weight

    def _vector_weights():
      # `weights` is expected to be a vector applied along `axis`.
      control_flow_ops.Assert(
          array_ops.rank(weights) == 1, [array_ops.rank(weights)])
      total_weight = math_ops.reduce_sum(weights)
      contraction_axes = ops.convert_to_tensor([[axis], [0]])
      weighted_avg = (
          math_ops.tensordot(a, weights, contraction_axes) / total_weight)
      return weighted_avg, total_weight

    if axis is None:
      avg, weights_sum = _same_rank()
    else:
      # We condition on rank rather than shape equality, because if we do the
      # latter, when the shapes are partially unknown but the ranks are known
      # and different, np_utils.cond will run shape checking on the true
      # branch, which will raise a shape-checking error.
      ranks_match = math_ops.equal(array_ops.rank(a), array_ops.rank(weights))
      avg, weights_sum = np_utils.cond(
          ranks_match, _same_rank, _vector_weights)
  else:
    # Treat all weights as 1
    if not input_is_inexact:
      float_dtype = np_utils.result_type(
          a.dtype, np_dtypes.default_float_type())
      a = a.astype(float_dtype)
    avg = math_ops.reduce_mean(a, axis=axis)
    if returned:
      element_count = (
          array_ops.size(a) if axis is None else array_ops.shape(a)[axis])
      weights_sum = math_ops.cast(element_count, a.dtype)

  avg = np_array_ops.array(avg)
  if not returned:
    return avg
  weights_sum = np_array_ops.broadcast_to(weights_sum, array_ops.shape(avg))
  return avg, weights_sum
def _single_op_with_attrs():
  """Builds a model graph with one attribute-carrying op and a Dense layer.

  Returns:
    An `(inputs, outputs)` pair: a Keras input of shape `(10,)` and the
    result of a `Dense(10)` layer applied to its mean over axis 1.
  """
  model_input = keras.Input(shape=(10,))
  row_mean = math_ops.reduce_mean(model_input, axis=1, keepdims=True)
  dense_layer = keras.layers.Dense(10)
  model_output = dense_layer(row_mean)
  return model_input, model_output
def computation(x):
  """Returns the mean of all elements of `x`."""
  overall_mean = math_ops.reduce_mean(x)
  return overall_mean
def testSampleLarge(self):
  """Compares sample and analytical statistics of MultivariateNormalTriL.

  Draws 10,000 seeded samples and checks mean, covariance, variance, stddev,
  (log-)determinant of the covariance and the scale matrix against values
  computed with NumPy, and checks a sample-based KL estimate against
  `ds.kl`.
  """
  mu = np.array([-1., 1], dtype=np.float32)
  scale_tril = np.array([[3., 0], [1, -2]], dtype=np.float32) / 3.

  # Reference values computed directly with NumPy.
  true_mean = mu
  true_scale = scale_tril
  true_covariance = np.matmul(true_scale, true_scale.T)
  true_variance = np.diag(true_covariance)
  true_stddev = np.sqrt(true_variance)
  true_det_covariance = np.linalg.det(true_covariance)
  true_log_det_covariance = np.log(true_det_covariance)

  with self.test_session() as sess:
    dist = ds.MultivariateNormalTriL(
        loc=mu,
        scale_tril=scale_tril,
        validate_args=True)

    # The following distributions will test the KL divergence calculation.
    mvn_chol = ds.MultivariateNormalTriL(
        loc=np.array([0.5, 1.2], dtype=np.float32),
        scale_tril=np.array([[3., 0], [1, 2]], dtype=np.float32),
        validate_args=True)

    n = int(10e3)
    samps = dist.sample(n, seed=0)
    sample_mean = math_ops.reduce_mean(samps, 0)
    x = samps - sample_mean
    sample_covariance = math_ops.matmul(x, x, transpose_a=True) / n

    # Monte Carlo estimate of KL(dist || mvn_chol) from samples of `dist`.
    sample_kl_chol = math_ops.reduce_mean(
        dist.log_prob(samps) - mvn_chol.log_prob(samps), 0)
    analytical_kl_chol = ds.kl(dist, mvn_chol)

    scale = dist.scale.to_dense()

    [
        sample_mean_,
        analytical_mean_,
        sample_covariance_,
        analytical_covariance_,
        analytical_variance_,
        analytical_stddev_,
        analytical_log_det_covariance_,
        analytical_det_covariance_,
        sample_kl_chol_,
        analytical_kl_chol_,
        scale_,
    ] = sess.run([
        sample_mean,
        dist.mean(),
        sample_covariance,
        dist.covariance(),
        dist.variance(),
        dist.stddev(),
        dist.log_det_covariance(),
        dist.det_covariance(),
        sample_kl_chol,
        analytical_kl_chol,
        scale,
    ])

    # Statistics derived from the sample covariance.
    sample_variance_ = np.diag(sample_covariance_)
    sample_stddev_ = np.sqrt(sample_variance_)
    sample_det_covariance_ = np.linalg.det(sample_covariance_)
    sample_log_det_covariance_ = np.log(sample_det_covariance_)

    logging.vlog(2, "true_mean:\n{}  ".format(true_mean))
    logging.vlog(2, "sample_mean:\n{}".format(sample_mean_))
    logging.vlog(2, "analytical_mean:\n{}".format(analytical_mean_))

    logging.vlog(2, "true_covariance:\n{}".format(true_covariance))
    logging.vlog(2, "sample_covariance:\n{}".format(sample_covariance_))
    logging.vlog(
        2, "analytical_covariance:\n{}".format(analytical_covariance_))

    logging.vlog(2, "true_variance:\n{}".format(true_variance))
    logging.vlog(2, "sample_variance:\n{}".format(sample_variance_))
    logging.vlog(
        2, "analytical_variance:\n{}".format(analytical_variance_))

    logging.vlog(2, "true_stddev:\n{}".format(true_stddev))
    logging.vlog(2, "sample_stddev:\n{}".format(sample_stddev_))
    logging.vlog(2, "analytical_stddev:\n{}".format(analytical_stddev_))

    logging.vlog(
        2, "true_log_det_covariance:\n{}".format(true_log_det_covariance))
    logging.vlog(
        2, "sample_log_det_covariance:\n{}".format(
            sample_log_det_covariance_))
    logging.vlog(
        2, "analytical_log_det_covariance:\n{}".format(
            analytical_log_det_covariance_))

    logging.vlog(
        2, "true_det_covariance:\n{}".format(true_det_covariance))
    logging.vlog(
        2, "sample_det_covariance:\n{}".format(sample_det_covariance_))
    logging.vlog(
        2, "analytical_det_covariance:\n{}".format(
            analytical_det_covariance_))

    logging.vlog(2, "true_scale:\n{}".format(true_scale))
    logging.vlog(2, "scale:\n{}".format(scale_))

    logging.vlog(
        2, "kl_chol:      analytical:{}  sample:{}".format(
            analytical_kl_chol_, sample_kl_chol_))

    # Sample-based quantities get loose tolerances; analytical quantities
    # must match the NumPy reference to 1e-6.
    self.assertAllClose(true_mean, sample_mean_, atol=0., rtol=0.03)
    self.assertAllClose(true_mean, analytical_mean_, atol=0., rtol=1e-6)

    self.assertAllClose(true_covariance, sample_covariance_,
                        atol=0., rtol=0.03)
    self.assertAllClose(true_covariance, analytical_covariance_,
                        atol=0., rtol=1e-6)

    self.assertAllClose(true_variance, sample_variance_,
                        atol=0., rtol=0.02)
    self.assertAllClose(true_variance, analytical_variance_,
                        atol=0., rtol=1e-6)

    self.assertAllClose(true_stddev, sample_stddev_,
                        atol=0., rtol=0.01)
    self.assertAllClose(true_stddev, analytical_stddev_,
                        atol=0., rtol=1e-6)

    self.assertAllClose(true_log_det_covariance, sample_log_det_covariance_,
                        atol=0., rtol=0.04)
    self.assertAllClose(true_log_det_covariance,
                        analytical_log_det_covariance_,
                        atol=0., rtol=1e-6)

    self.assertAllClose(true_det_covariance, sample_det_covariance_,
                        atol=0., rtol=0.03)
    self.assertAllClose(true_det_covariance, analytical_det_covariance_,
                        atol=0., rtol=1e-6)

    self.assertAllClose(true_scale, scale_, atol=0., rtol=1e-6)

    self.assertAllClose(sample_kl_chol_, analytical_kl_chol_,
                        atol=0., rtol=0.02)
def fn():
  """Returns the mean of two stacked copies of `x.value() + 1.0`."""
  incremented = math_ops.add(x.value(), 1.0)
  # Make sure convert_to_tensor works correctly with list of TensorNodes.
  stacked = array_ops.stack([incremented, incremented], axis=0)
  return math_ops.reduce_mean(stacked)
def call(self, inputs, training=False):
  """Applies batch normalization to `inputs`.

  When `self.fused` is set the work is delegated to `self._fused_batch_norm`.
  Otherwise batch moments are computed with `nn.moments`, optionally combined
  with the `self.adjustment` and renorm transforms, the moving statistics are
  updated, and the result is produced by `nn.batch_normalization`.

  Args:
    inputs: The tensor to normalize.
    training: Whether to normalize with batch statistics (and update the
      moving statistics) or with the moving statistics. Its constant value,
      as reported by `utils.constant_value`, may be True, False, or None.

  Returns:
    The normalized tensor. With virtual batching enabled it is reshaped back
    to the shape `inputs` had on entry (with a -1 batch dimension).
  """
  if self.virtual_batch_size is not None:
    # Virtual batches (aka ghost batches) can be simulated by reshaping the
    # Tensor and reusing the existing batch norm implementation
    original_shape = [-1] + inputs.shape.as_list()[1:]
    expanded_shape = [self.virtual_batch_size, -1] + original_shape[1:]

    # Will cause errors if virtual_batch_size does not divide the batch size
    inputs = array_ops.reshape(inputs, expanded_shape)

    def undo_virtual_batching(outputs):
      outputs = array_ops.reshape(outputs, original_shape)
      return outputs

  if self.fused:
    outputs = self._fused_batch_norm(inputs, training=training)
    if self.virtual_batch_size is not None:
      # Currently never reaches here since fused_batch_norm does not support
      # virtual batching
      return undo_virtual_batching(outputs)
    return outputs

  # Compute the axes along which to reduce the mean / variance
  input_shape = inputs.get_shape()
  ndims = len(input_shape)
  reduction_axes = [i for i in range(ndims) if i not in self.axis]
  if self.virtual_batch_size is not None:
    del reduction_axes[1]     # Do not reduce along virtual batch dim

  # Broadcasting only necessary for single-axis batch norm where the axis is
  # not the last dimension
  broadcast_shape = [1] * ndims
  broadcast_shape[self.axis[0]] = input_shape[self.axis[0]].value
  def _broadcast(v):
    # Reshape `v` to `broadcast_shape` unless it is None, already has `ndims`
    # dimensions, or the reduction covers every axis but the last.
    if (v is not None and
        len(v.get_shape()) != ndims and
        reduction_axes != list(range(ndims - 1))):
      return array_ops.reshape(v, broadcast_shape)
    return v

  scale, offset = _broadcast(self.gamma), _broadcast(self.beta)

  def _compose_transforms(scale, offset, then_scale, then_offset):
    # Composes x -> x * scale + offset with a following
    # x -> x * then_scale + then_offset; either `then_*` may be None.
    if then_scale is not None:
      scale *= then_scale
      offset *= then_scale
    if then_offset is not None:
      offset += then_offset
    return (scale, offset)

  # Determine a boolean value for `training`: could be True, False, or None.
  training_value = utils.constant_value(training)
  if training_value is not False:
    if self.adjustment:
      adj_scale, adj_bias = self.adjustment(array_ops.shape(inputs))
      # Adjust only during training.
      adj_scale = utils.smart_cond(training,
                                   lambda: adj_scale,
                                   lambda: array_ops.ones_like(adj_scale))
      adj_bias = utils.smart_cond(training,
                                  lambda: adj_bias,
                                  lambda: array_ops.zeros_like(adj_bias))
      scale, offset = _compose_transforms(adj_scale, adj_bias, scale, offset)

    # Some of the computations here are not necessary when training==False
    # but not a constant. However, this makes the code simpler.
    keep_dims = self.virtual_batch_size is not None or len(self.axis) > 1
    mean, variance = nn.moments(inputs, reduction_axes, keep_dims=keep_dims)

    moving_mean = self.moving_mean
    moving_variance = self.moving_variance

    # NOTE(review): the lambdas below capture `mean` / `variance`, which are
    # rebound by these very assignments; this presumably relies on
    # `smart_cond` invoking the callables before it returns -- confirm.
    mean = utils.smart_cond(training,
                            lambda: mean,
                            lambda: moving_mean)
    variance = utils.smart_cond(training,
                                lambda: variance,
                                lambda: moving_variance)

    if self.renorm:
      r, d, new_mean, new_variance = self._renorm_correction_and_moments(
          mean, variance, training)
      # When training, the normalized values (say, x) will be transformed as
      # x * gamma + beta without renorm, and (x * r + d) * gamma + beta
      # = x * (r * gamma) + (d * gamma + beta) with renorm.
      r = _broadcast(array_ops.stop_gradient(r, name='renorm_r'))
      d = _broadcast(array_ops.stop_gradient(d, name='renorm_d'))
      scale, offset = _compose_transforms(r, d, scale, offset)
    else:
      new_mean, new_variance = mean, variance

    if self.virtual_batch_size is not None:
      # This isn't strictly correct since in ghost batch norm, you are
      # supposed to sequentially update the moving_mean and moving_variance
      # with each sub-batch. However, since the moving statistics are only
      # used during evaluation, it is more efficient to just update in one
      # step and should not make a significant difference in the result.
      new_mean = math_ops.reduce_mean(new_mean,
                                      axis=1, keep_dims=True)
      new_variance = math_ops.reduce_mean(new_variance,
                                          axis=1, keep_dims=True)

    def _do_update(var, value):
      # Moving-average update of `var` towards `value` using self.momentum.
      return moving_averages.assign_moving_average(
          var, value, self.momentum, zero_debias=False)

    mean_update = utils.smart_cond(
        training,
        lambda: _do_update(self.moving_mean, new_mean),
        lambda: self.moving_mean)
    variance_update = utils.smart_cond(
        training,
        lambda: _do_update(self.moving_variance, new_variance),
        lambda: self.moving_variance)
    # The updates are only registered on the layer when building a graph.
    if context.in_graph_mode():
      self.add_update(mean_update, inputs=inputs)
      self.add_update(variance_update, inputs=inputs)

  else:
    # `training` is statically False: normalize with the moving statistics.
    mean, variance = self.moving_mean, self.moving_variance

  outputs = nn.batch_normalization(inputs,
                                   _broadcast(mean),
                                   _broadcast(variance),
                                   offset,
                                   scale,
                                   self.epsilon)
  # If some components of the shape got lost due to adjustments, fix that.
  outputs.set_shape(input_shape)

  if self.virtual_batch_size is not None:
    return undo_virtual_batching(outputs)

  return outputs
def body(i, x, y):
  """Loop body: adds the mean of one slice of `x` to `y` and advances `i`.

  Args:
    i: Loop counter; also used as the start index on all three axes of `x`.
    x: Tensor that is sliced; passed through unchanged.
    y: Running accumulator.

  Returns:
    The tuple `(i + 1, x, y + mean(slice))`.
  """
  window = array_ops.slice(x, [i, i, i], [1, 1, 2048])
  accumulated = y + math_ops.reduce_mean(window)
  next_i = i + 1
  return (next_i, x, accumulated)
def loss(x, l):
  """Returns softmax cross-entropy of logits `x` vs labels `l`, averaged.

  The reduction axis is passed as a constant tensor `[0]` rather than as a
  Python value.
  """
  cross_entropy = nn_ops.softmax_cross_entropy_with_logits(logits=x, labels=l)
  reduction_axes = constant_op.constant([0])
  return math_ops.reduce_mean(cross_entropy, reduction_axes)
def result(self):
  """Computes the AUC, with an option to skip the final summary reduction.

  The curve is approximated by a Riemann sum over adjacent thresholds. When
  `self._summarize` is false the un-reduced result is returned: the per-label
  AUCs in the multi-label case, the individual Riemann terms otherwise.

  Returns:
    The AUC tensor (or its un-summarized form, see above).
  """
  summation = metrics_utils.AUCSummationMethod
  if (self.curve == metrics_utils.AUCCurve.PR and
      self.summation_method == summation.INTERPOLATION):
    # This use case is different and is handled separately.
    return self.interpolate_pr_auc()

  # Set `x` and `y` values for the curves based on `curve` config.
  recall = math_ops.div_no_nan(
      self.true_positives, self.true_positives + self.false_negatives)
  if self.curve == metrics_utils.AUCCurve.ROC:
    x = math_ops.div_no_nan(
        self.false_positives, self.false_positives + self.true_negatives)
    y = recall
  else:  # curve == 'PR'.
    x = recall
    y = math_ops.div_no_nan(
        self.true_positives, self.true_positives + self.false_positives)

  # Find the rectangle heights based on `summation_method`.
  last = self.num_thresholds - 1
  y_left, y_right = y[:last], y[1:]
  if self.summation_method == summation.INTERPOLATION:
    # Note: the case ('PR', 'interpolation') has been handled above.
    heights = (y_left + y_right) / 2.
  elif self.summation_method == summation.MINORING:
    heights = math_ops.minimum(y_left, y_right)
  else:  # self.summation_method = metrics_utils.AUCSummationMethod.MAJORING:
    heights = math_ops.maximum(y_left, y_right)

  # Areas of all the rectangles: width times height per threshold interval.
  riemann_terms = math_ops.multiply(x[:last] - x[1:], heights)

  if not self.multi_label:
    if self._summarize:
      return math_ops.reduce_sum(riemann_terms, name=self.name)
    return riemann_terms

  by_label_auc = math_ops.reduce_sum(
      riemann_terms, name=self.name + '_by_label', axis=0)
  if not self._summarize:
    return by_label_auc
  if self.label_weights is None:
    # Unweighted average of the label AUCs.
    return math_ops.reduce_mean(by_label_auc, name=self.name)
  # Weighted average of the label AUCs.
  weighted_total = math_ops.reduce_sum(
      math_ops.multiply(by_label_auc, self.label_weights))
  return math_ops.div_no_nan(
      weighted_total, math_ops.reduce_sum(self.label_weights), name=self.name)
def call(self, inputs):
  """Applies `self.bias` to `inputs[0]` and registers two losses.

  Both losses compare `inputs[1]` with the layer output and use `inputs[2]`:
  one passes it as the third argument of an `MAE()` instance, the other
  multiplies it with `mae(...)` and takes the mean.

  Args:
    inputs: A sequence with at least three elements.

  Returns:
    The output of `self.bias(inputs[0])`.
  """
  out = self.bias(inputs[0])
  loss_from_object = MAE()(inputs[1], out, inputs[2])
  self.add_loss(loss_from_object)
  scaled_error = inputs[2] * mae(inputs[1], out)
  self.add_loss(math_ops.reduce_mean(scaled_error))
  return out
def ngrams(data,
           width,
           axis=-1,
           reduction_type=None,
           string_separator=" ",
           name=None):
  """Create a tensor of n-grams based on the input data `data`.

  Windows of `width` adjacent elements are taken along `axis` (via
  `sliding_window`) and each window is collapsed into a single n-gram with
  the reduction selected by `reduction_type`.

  This op is intended to cover basic use cases; more complex combinations can
  be created using the sliding_window op.

  Args:
    data: The data to reduce.
    width: The width of the ngram window. If there is not sufficient data to
      fill out the ngram window, the resulting ngram will be empty.
    axis: The axis to create ngrams along. Note that for string join
      reductions, only axis '-1' is supported; for other reductions, any
      positive or negative axis can be used. Should be a constant.
    reduction_type: A member of the Reduction enum. Should be a constant.
      Currently supports:

      * `Reduction.SUM`: Add values in the window.
      * `Reduction.MEAN`: Average values in the window.
      * `Reduction.STRING_JOIN`: Join strings in the window.
        Note that axis must be -1 here.

    string_separator: The separator string used for `Reduction.STRING_JOIN`.
      Ignored otherwise. Must be a string constant, not a Tensor.
    name: The op name.

  Returns:
    A tensor of ngrams.

  Raises:
    InvalidArgumentError: if `reduction_type` is either None or not a
      Reduction, or if `reduction_type` is STRING_JOIN and `axis` is not -1.
  """
  with ops.name_scope(name, "NGrams", [data, width]):
    if reduction_type is None:
      raise errors.InvalidArgumentError(None, None,
                                        "reduction_type must be specified.")

    if not isinstance(reduction_type, Reduction):
      raise errors.InvalidArgumentError(None, None,
                                        "reduction_type must be a Reduction.")

    # TODO(b/122967921): Lift this restriction after ragged_reduce_join is done.
    joining_strings = reduction_type is Reduction.STRING_JOIN
    if joining_strings and axis != -1:
      raise errors.InvalidArgumentError(
          None, None, "%s requires that ngrams' 'axis' parameter be -1." %
          Reduction.STRING_JOIN.name)

    windows = sliding_window(data, width, axis)

    # A non-negative axis is shifted by one to address the window dimension
    # of the windowed data; a negative axis is used as-is.
    reduction_axis = axis if axis < 0 else axis + 1

    # Ragged reduction ops work on both Tensor and RaggedTensor, so we can
    # use them here regardless of the type of tensor in 'windows'.
    if reduction_type is Reduction.SUM:
      return math_ops.reduce_sum(windows, reduction_axis)
    if reduction_type is Reduction.MEAN:
      return math_ops.reduce_mean(windows, reduction_axis)
    if joining_strings:
      if not isinstance(data, ragged_tensor.RaggedTensor):
        return string_ops.reduce_join(
            windows, axis=axis, separator=string_separator)
      return ragged_functional_ops.map_flat_values(
          string_ops.reduce_join,
          windows,
          axis=axis,
          separator=string_separator)
def testEmptyGradients(self):
  """Gradient check of reduce_mean on a [0, 3] input reports zero error."""
  with self.session(use_gpu=True):
    empty_input = array_ops.zeros([0, 3])
    row_means = math_ops.reduce_mean(empty_input, [1])
    gradient_error = gradient_checker.compute_gradient_error(
        empty_input, [0, 3], row_means, [0])
    self.assertEqual(gradient_error, 0)
def test_loss():
  """Returns the mean squared error of `line_template` on the test data."""
  squared_error = math_ops.square(line_template(test_input) - test_output)
  return math_ops.reduce_mean(squared_error)
def normal_function():
  """Returns the mean of the product of two random uniform matrices.

  The operands have shapes (2, 10) and (10, 2).
  """
  left = random_ops.random_uniform((2, 10))
  right = random_ops.random_uniform((10, 2))
  product = math_ops.matmul(left, right)
  return math_ops.reduce_mean(product)