def _log_prob(self, x): x = self._assert_valid_sample(x) # broadcast logits or x if need be. logits = self.logits if (not x.get_shape().is_fully_defined() or not logits.get_shape().is_fully_defined() or x.get_shape() != logits.get_shape()): logits = array_ops.ones_like(x, dtype=logits.dtype) * logits x = array_ops.ones_like(logits, dtype=x.dtype) * x logits_shape = array_ops.shape(logits) if logits.get_shape().ndims == 2: logits_2d = logits x_2d = x else: logits_2d = array_ops.reshape(logits, [-1, self.event_size]) x_2d = array_ops.reshape(x, [-1, self.event_size]) # compute the normalization constant log_norm_const = (math_ops.lgamma(self.event_size) + (self.event_size - 1) * math_ops.log(self.temperature)) # compute the unnormalized density log_softmax = nn_ops.log_softmax(logits_2d - x_2d * self.temperature) log_unnorm_prob = math_ops.reduce_sum(log_softmax, [-1], keep_dims=False) # combine unnormalized density with normalization constant log_prob = log_norm_const + log_unnorm_prob ret = array_ops.reshape(log_prob, logits_shape) return ret
def log_prob(self, event, name="log_prob"): """Log of the probability mass function. Args: event: `int32` or `int64` binary Tensor. name: A name for this operation (optional). Returns: The log-probabilities of the events. """ # TODO(jaana): The current sigmoid_cross_entropy_with_logits has # inconsistent behavior for logits = inf/-inf. with ops.name_scope(self.name): with ops.name_scope(name, values=[self.logits, event]): event = ops.convert_to_tensor(event, name="event") event = math_ops.cast(event, self.logits.dtype) logits = self.logits # sigmoid_cross_entropy_with_logits doesn't broadcast shape, # so we do this here. # TODO(b/30637701): Check dynamic shape, and don't broadcast if the # dynamic shapes are the same. if (not event.get_shape().is_fully_defined() or not logits.get_shape().is_fully_defined() or event.get_shape() != logits.get_shape()): logits = array_ops.ones_like(event) * logits event = array_ops.ones_like(logits) * event return -nn.sigmoid_cross_entropy_with_logits(logits, event)
def _log_prob(self, x): x = self._assert_valid_sample(x) # broadcast logits or x if need be. logits = self.logits if (not x.get_shape().is_fully_defined() or not logits.get_shape().is_fully_defined() or x.get_shape() != logits.get_shape()): logits = array_ops.ones_like(x, dtype=logits.dtype) * logits x = array_ops.ones_like(logits, dtype=x.dtype) * x logits_shape = array_ops.shape(math_ops.reduce_sum(logits, axis=[-1])) logits_2d = array_ops.reshape(logits, [-1, self.event_size]) x_2d = array_ops.reshape(x, [-1, self.event_size]) # compute the normalization constant k = math_ops.cast(self.event_size, x.dtype) log_norm_const = (math_ops.lgamma(k) + (k - 1.) * math_ops.log(self.temperature)) # compute the unnormalized density log_softmax = nn_ops.log_softmax(logits_2d - x_2d * self._temperature_2d) log_unnorm_prob = math_ops.reduce_sum(log_softmax, [-1], keepdims=False) # combine unnormalized density with normalization constant log_prob = log_norm_const + log_unnorm_prob # Reshapes log_prob to be consistent with shape of user-supplied logits ret = array_ops.reshape(log_prob, logits_shape) return ret
def full_exp_with_logits(name, labels=None, logits=None): """Computes exponential loss given `logits`. Args: name: A name for the operation (optional). labels: A `Tensor` of the same type and shape as `logits`. logits: A `Tensor` of type `float32` or `float64`. Returns: A `Tensor` of the same shape as `logits` with the componentwise exponential losses. Raises: ValueError: If `logits` and `labels` do not have the same shape. """ with ops.name_scope(name, "exp_loss", [logits, labels]) as name: logits = ops.convert_to_tensor(logits, name="logits") labels = ops.convert_to_tensor(labels, name="labels") try: labels.get_shape().merge_with(logits.get_shape()) except ValueError: raise ValueError("logits and labels must have the same shape (%s vs %s)" % (logits.get_shape(), labels.get_shape())) # Default threshold of 0 to switch between classes zeros = array_ops.zeros_like(logits, dtype=logits.dtype) ones = array_ops.ones_like(logits, dtype=logits.dtype) neg_ones = -array_ops.ones_like(logits, dtype=logits.dtype) # Convert labels to 1 and -1 cond_labels = (labels > zeros) labels_converted = array_ops.where(cond_labels, ones, neg_ones) return math_ops.exp(-1.0 * logits * labels_converted)
def grad(dmm, dr): return [ math_ops.matmul(dmm, b, transpose_b=True) + math_ops.matmul(array_ops.ones_like(b * dr), b, transpose_b=True), math_ops.matmul(a, dmm, transpose_b=True) + math_ops.matmul(a, array_ops.ones_like(a) * dr, transpose_b=True) ]
def _sample_n(self, n, seed=None): n_draws = math_ops.cast(self.total_count, dtype=dtypes.int32) k = self.event_shape_tensor()[0] # broadcast the total_count and logits to same shape n_draws = array_ops.ones_like( self.logits[..., 0], dtype=n_draws.dtype) * n_draws logits = array_ops.ones_like( n_draws[..., array_ops.newaxis], dtype=self.logits.dtype) * self.logits # flatten the total_count and logits flat_logits = array_ops.reshape(logits, [-1, k]) # [B1B2...Bm, k] flat_ndraws = n * array_ops.reshape(n_draws, [-1]) # [B1B2...Bm] # computes each total_count and logits situation by map_fn def _sample_single(args): logits, n_draw = args[0], args[1] # [K], [] x = random_ops.multinomial(logits[array_ops.newaxis, ...], n_draw, seed) # [1, n*n_draw] x = array_ops.reshape(x, shape=[n, -1]) # [n, n_draw] x = math_ops.reduce_sum(array_ops.one_hot(x, depth=k), axis=-2) # [n, k] return x x = functional_ops.map_fn( _sample_single, [flat_logits, flat_ndraws], dtype=self.dtype) # [B1B2...Bm, n, k] # reshape the results to proper shape x = array_ops.transpose(x, perm=[1, 0, 2]) final_shape = array_ops.concat([[n], self.batch_shape_tensor(), [k]], 0) x = array_ops.reshape(x, final_shape) # [n, B1, B2,..., Bm, k] return x
def _BiasAddGradGrad(op, received_grad): """Gradient for the BiasAddGrad op. Args: op: BiasAddGrad op for which we are calculating gradients. received_grad: The gradients passed to the BiasAddGrad op. Returns: A single gradient Tensor for the input to BiasAddGrad (which is the gradient of the bias term in BiasAdd) """ try: data_format = op.get_attr("data_format") except ValueError: data_format = None shape = array_ops.shape(op.inputs[0]) rank = array_ops.rank(op.inputs[0]) bias_shape = array_ops.shape(received_grad) if data_format == b"NCHW": expanded_shape = array_ops.concat([ array_ops.ones_like(shape[:-3]), bias_shape, array_ops.ones_like(shape[-2:]) ], 0) tile_mults = array_ops.concat([shape[:-3], [1], shape[-2:]], 0) else: expanded_shape = array_ops.concat( [array_ops.ones_like(shape[:-1]), bias_shape], 0) tile_mults = array_ops.concat([shape[:-1], [1]], 0) expanded_grad = array_ops.reshape(received_grad, expanded_shape) return array_ops.tile(expanded_grad, tile_mults)
def sample_n(self, n, seed=None, name="sample_n"): """Sample `n` observations from the Beta Distributions. Args: n: `Scalar` `Tensor` of type `int32` or `int64`, the number of observations to sample. seed: Python integer, the random seed. name: The name to give this op. Returns: samples: `[n, ...]`, a `Tensor` of `n` samples for each of the distributions determined by broadcasting the hyperparameters. """ with ops.name_scope(self.name): with ops.name_scope(name, values=[self.a, self.b, n]): a = array_ops.ones_like(self._a_b_sum, dtype=self.dtype) * self.a b = array_ops.ones_like(self._a_b_sum, dtype=self.dtype) * self.b n = ops.convert_to_tensor(n, name="n") gamma1_sample = random_ops.random_gamma( [n,], a, dtype=self.dtype, seed=seed) gamma2_sample = random_ops.random_gamma( [n,], b, dtype=self.dtype, seed=seed) beta_sample = gamma1_sample / (gamma1_sample + gamma2_sample) n_val = tensor_util.constant_value(n) final_shape = tensor_shape.vector(n_val).concatenate( self._a_b_sum.get_shape()) beta_sample.set_shape(final_shape) return beta_sample
def _sample_n(self, n, seed=None): a = array_ops.ones_like(self.a_b_sum, dtype=self.dtype) * self.a b = array_ops.ones_like(self.a_b_sum, dtype=self.dtype) * self.b gamma1_sample = random_ops.random_gamma([n], a, dtype=self.dtype, seed=seed) gamma2_sample = random_ops.random_gamma([n], b, dtype=self.dtype, seed=seed) beta_sample = gamma1_sample / (gamma1_sample + gamma2_sample) return beta_sample
def _sample_n(self, n, seed=None): a = array_ops.ones_like(self.a_b_sum, dtype=self.dtype) * self.a b = array_ops.ones_like(self.a_b_sum, dtype=self.dtype) * self.b gamma1_sample = random_ops.random_gamma( [n,], a, dtype=self.dtype, seed=seed) gamma2_sample = random_ops.random_gamma( [n,], b, dtype=self.dtype, seed=distribution_util.gen_new_seed(seed, "beta")) beta_sample = gamma1_sample / (gamma1_sample + gamma2_sample) return beta_sample
def grad_fn(inputs, trainable_variables, unused_outputs, unused_grad_outputs): grad_inputs = [ array_ops.ones_like(t) * (i + 1.) for i, t in enumerate(inputs) ] grad_vars = [ array_ops.ones_like(t) * (i + len(inputs) + 1.) for i, t in enumerate(trainable_variables) ] return grad_inputs, grad_vars
def exp_with_logits(name, eps, labels=None, logits=None): """Computes exponential loss given `logits`. The loss returns is exp(-targets*modified_predictions), where modified_predictions are 1 if sigmoid is >= 0.5+eps (eg we predict positive class), -1 if sigmoid < 0.5-eps (e.g. we predict negative class) and ax+b in the interval 0.5-eps, 0.5+eps, where a = 1/eps, b=1/(2eps). Args: name: A name for the operation (optional). eps: For the range (0.5-eps, 0.5+eps) we set the predictions to be ax+b. labels: A `Tensor` of the same type and shape as `logits`. logits: A `Tensor` of type `float32` or `float64`. Returns: A `Tensor` of the same shape as `logits` with the componentwise exponential losses. Raises: ValueError: If `logits` and `labels` do not have the same shape. """ with ops.name_scope(name, "exp_loss", [logits, labels]) as name: logits = ops.convert_to_tensor(logits, name="logits") labels = ops.convert_to_tensor(labels, name="labels") try: labels.get_shape().merge_with(logits.get_shape()) except ValueError: raise ValueError("logits and labels must have the same shape (%s vs %s)" % (logits.get_shape(), labels.get_shape())) # Default threshold to switch between classes zeros = array_ops.zeros_like(logits, dtype=logits.dtype) ones = array_ops.ones_like(logits, dtype=logits.dtype) neg_ones = -array_ops.ones_like(logits, dtype=logits.dtype) # Convert labels to 1 and -1 cond_labels = (labels > zeros) labels_converted = array_ops.where(cond_labels, ones, neg_ones) # Convert predictions to 1 and -1 # The loss we build is min(1, max(-1,ax+b)) # where a=1/eps, b=-1/2eps. a = 1.0 / eps b = -1.0 / 2 / eps probs = math_ops.sigmoid(logits) y = a * probs + b # Build max(-1, ax+b) cond = (y < -1) max_res = array_ops.where(cond, neg_ones, y) # Build min part cond = (max_res > 1) min_res = array_ops.where(cond, ones, max_res) preds_converted = min_res return math_ops.exp(-preds_converted * labels_converted)
def _moment(self, n): """Compute the n'th (uncentered) moment.""" expanded_concentration1 = array_ops.ones_like( self.total_concentration, dtype=self.dtype) * self.concentration1 expanded_concentration0 = array_ops.ones_like( self.total_concentration, dtype=self.dtype) * self.concentration0 beta_arg0 = 1 + n / expanded_concentration1 beta_arg = array_ops.stack([beta_arg0, expanded_concentration0], -1) log_moment = math_ops.log(expanded_concentration0) + special_math_ops.lbeta( beta_arg) return math_ops.exp(log_moment)
def _sample_n(self, n, seed=None): expanded_concentration1 = array_ops.ones_like( self.total_concentration, dtype=self.dtype) * self.concentration1 expanded_concentration0 = array_ops.ones_like( self.total_concentration, dtype=self.dtype) * self.concentration0 shape = array_ops.concat([[n], self.batch_shape_tensor()], 0) uniform_sample = random_ops.random_uniform( shape=shape, minval=0.0, maxval=1.0, dtype=self.dtype, seed=seed) kumaraswamy_sample = (1 - uniform_sample**(1. / expanded_concentration0))**( 1. / expanded_concentration1) return kumaraswamy_sample
def _log_prob(self, counts): if self.validate_args: counts = distribution_util.embed_check_nonnegative_discrete( counts, check_integer=True) counts *= array_ops.ones_like(self.probs) probs = self.probs * array_ops.ones_like(counts) safe_domain = array_ops.where( math_ops.equal(counts, 0.), array_ops.zeros_like(probs), probs) return counts * math_ops.log1p(-safe_domain) + math_ops.log(probs)
def _log_prob(self, x): if self.validate_args: x = distribution_util.embed_check_nonnegative_integer_form(x) else: # For consistency with cdf, we take the floor. x = math_ops.floor(x) x *= array_ops.ones_like(self.probs) probs = self.probs * array_ops.ones_like(x) safe_domain = array_ops.where( math_ops.equal(x, 0.), array_ops.zeros_like(probs), probs) return x * math_ops.log1p(-safe_domain) + math_ops.log(probs)
def broadcast_weights(weights, values): """Broadcast `weights` to the same shape as `values`. This returns a version of `weights` following the same broadcast rules as `mul(weights, values)`, but limited to the weights shapes allowed by `assert_broadcastable`. When computing a weighted average, use this function to broadcast `weights` before summing them; e.g., `reduce_sum(w * v) / reduce_sum(_broadcast_weights(w, v))`. Args: weights: `Tensor` whose shape is broadcastable to `values` according to the rules of `assert_broadcastable`. values: `Tensor` of any shape. Returns: `weights` broadcast to `values` shape according to the rules of `assert_broadcastable`. """ with ops.name_scope(None, "broadcast_weights", (weights, values)) as scope: values = ops.convert_to_tensor(values, name="values") weights = ops.convert_to_tensor( weights, dtype=values.dtype.base_dtype, name="weights") # Try static check for exact match. weights_shape = weights.get_shape() values_shape = values.get_shape() if (weights_shape.is_fully_defined() and values_shape.is_fully_defined() and weights_shape.is_compatible_with(values_shape)): return weights with ops.control_dependencies((assert_broadcastable(weights, values),)): return math_ops.multiply( weights, array_ops.ones_like(values), name=scope)
def reduce_mean(rt_input, axis=None, name=None): """For docs, see: _RAGGED_REDUCE_DOCSTRING.""" with ops.name_scope(name, 'RaggedReduceMean', [rt_input, axis]): total = reduce_sum(rt_input, axis) if ragged_tensor.is_ragged(rt_input): ones = ragged_factory_ops.from_nested_row_splits( array_ops.ones_like(rt_input.inner_values), rt_input.nested_row_splits) else: ones = array_ops.ones_like(rt_input) count = reduce_sum(ones, axis) if ragged_tensor.is_ragged(total): return ragged_factory_ops.from_nested_row_splits( total.inner_values / count.inner_values, total.nested_row_splits) else: return total / count
def testContribSignalSTFT(self): ws = 512 hs = 128 dims = (ws * 20,) shape = BATCH_DIMS + dims data = np.arange(np.prod(shape)) / np.prod(dims) np.random.seed(123) np.random.shuffle(data) data = np.reshape(data.astype(np.float32), shape) window = sps.get_window("hann", ws) expected = sps.stft( data, nperseg=ws, noverlap=ws - hs, boundary=None, window=window)[2] expected = np.swapaxes(expected, -1, -2) expected *= window.sum() # scipy divides by window sum with self.test_session() as sess: with self.test_scope(): ph = array_ops.placeholder( dtypes.as_dtype(data.dtype), shape=data.shape) out = signal.stft(ph, ws, hs) grad = gradients_impl.gradients(out, ph, grad_ys=array_ops.ones_like(out)) # For gradients, we simply verify that they compile & execute. value, _ = sess.run([out, grad], {ph: data}) self.assertAllClose(expected, value, rtol=RTOL, atol=ATOL)
def _variance(self): p = self.p * array_ops.expand_dims(array_ops.ones_like(self.n), -1) outer_prod = math_ops.batch_matmul( array_ops.expand_dims(self._mean_val, -1), array_ops.expand_dims(p, -2)) return array_ops.batch_matrix_set_diag( -outer_prod, self._mean_val - self._mean_val * p)
def mode(self, name="mode"): """Mode of the distribution. Note that the mode for the Beta distribution is only defined when `alpha > 1`. This returns the mode when `alpha > 1`, and NaN otherwise. If `self.allow_nan_stats` is `False`, an exception will be raised rather than returning `NaN`. Args: name: The name for this op. Returns: Mode of the Dirichlet distribution. """ with ops.name_scope(self.name): with ops.op_scope([self._alpha, self._alpha_0], name): one = constant_op.constant(1, self.dtype) mode = (self._alpha - 1)/ ( array_ops.expand_dims(self._alpha_0, -1) - math_ops.cast( self.event_shape()[0], self.dtype)) if self.allow_nan_stats: return math_ops.select( math_ops.greater(self._alpha, 1), mode, (constant_op.constant(float("NaN"), dtype=self.dtype) * array_ops.ones_like(self._alpha, dtype=self.dtype))) else: return control_flow_ops.with_dependencies([ check_ops.assert_less( one, self._alpha, message="mode not defined for components of alpha <= 1") ], mode)
def testMaskingSingleInput(self): class MaskedLayer(base_layers.Layer): def call(self, inputs, mask=None): if mask is not None: return inputs * mask return inputs def compute_mask(self, inputs, mask=None): return array_ops.ones_like(inputs) if context.in_graph_mode(): x = base_layers.Input(shape=(32,)) y = MaskedLayer()(x) # pylint: disable=not-callable network = base_layers.Network(x, y) # test callability on Input x_2 = base_layers.Input(shape=(32,)) y_2 = network(x_2) self.assertEqual(y_2.get_shape().as_list(), [None, 32]) # test callability on regular tensor x_2 = array_ops.placeholder(dtype='float32', shape=(None, 32)) y_2 = network(x_2) self.assertEqual(y_2.get_shape().as_list(), [None, 32]) else: a = constant_op.constant([2] * 32) mask = constant_op.constant([0, 1] * 16) a._keras_mask = mask b = MaskedLayer().apply(a) self.assertTrue(hasattr(b, '_keras_mask')) self.assertAllEqual(self.evaluate(array_ops.ones_like(mask)), self.evaluate(getattr(b, '_keras_mask'))) self.assertAllEqual(self.evaluate(a * mask), self.evaluate(b))
def testCustomGrad(self): def fn(a, b, c): return core_layers.dense(a, 10, use_bias=False) + math_ops.matmul(b, c) def grad_fn(inputs, trainable_variables, unused_outputs, unused_grad_outputs): grad_inputs = [ array_ops.ones_like(t) * (i + 1.) for i, t in enumerate(inputs) ] grad_vars = [ array_ops.ones_like(t) * (i + len(inputs) + 1.) for i, t in enumerate(trainable_variables) ] return grad_inputs, grad_vars a = random_ops.random_uniform([11, 6]) b = random_ops.random_uniform([11, 7]) c = random_ops.random_uniform([7, 10]) w = random_ops.random_uniform([6, 10]) out = rev_block_lib._fn_with_custom_grad(grad_fn)(fn)(a, b, c) loss = math_ops.reduce_mean(out) grads = gradients_impl.gradients( loss, [a, b, c, variables.trainable_variables()[0]]) expected_grads = [ array_ops.ones_like(t) * (i + 1.) for i, t in enumerate([a, b, c, w]) ] with self.test_session() as sess: sess.run(variables.global_variables_initializer()) g_val, eg_val = sess.run([grads, expected_grads]) for g1, g2 in zip(g_val, eg_val): self.assertAllClose(g1, g2)
def _survival_function(self, y): lower_cutoff = self._lower_cutoff upper_cutoff = self._upper_cutoff # Recall the promise: # survival_function(y) := P[Y > y] # = 0, if y >= upper_cutoff, # = 1, if y < lower_cutoff, # = P[X > y], otherwise. # P[Y > j] = P[ceiling(Y) > j] since mass is only at integers, not in # between. j = math_ops.ceil(y) # P[X > j], used when lower_cutoff < X < upper_cutoff. result_so_far = self.distribution.survival_function(j) # Broadcast, because it's possible that this is a single distribution being # evaluated on a number of samples, or something like that. j += array_ops.zeros_like(result_so_far) # Re-define values at the cutoffs. if lower_cutoff is not None: result_so_far = math_ops.select(j < lower_cutoff, array_ops.ones_like(result_so_far), result_so_far) if upper_cutoff is not None: result_so_far = math_ops.select(j >= upper_cutoff, array_ops.zeros_like(result_so_far), result_so_far) return result_so_far
def hinge_loss(labels, logits, weights=1.0, scope=None, loss_collection=ops.GraphKeys.LOSSES, reduction=Reduction.SUM_BY_NONZERO_WEIGHTS): """Adds a hinge loss to the training procedure. Args: labels: The ground truth output tensor. Its shape should match the shape of logits. The values of the tensor are expected to be 0.0 or 1.0. logits: The logits, a float tensor. weights: Optional `Tensor` whose rank is either 0, or the same rank as `labels`, and must be broadcastable to `labels` (i.e., all dimensions must be either `1`, or the same as the corresponding `losses` dimension). scope: The scope for the operations performed in computing the loss. loss_collection: collection to which the loss will be added. reduction: Type of reduction to apply to loss. Returns: Weighted loss float `Tensor`. If `reduction` is `NONE`, this has the same shape as `labels`; otherwise, it is scalar. Raises: ValueError: If the shapes of `logits` and `labels` don't match. """ with ops.name_scope(scope, "hinge_loss", (logits, labels, weights)) as scope: logits = math_ops.to_float(logits) labels = math_ops.to_float(labels) logits.get_shape().assert_is_compatible_with(labels.get_shape()) # We first need to convert binary labels to -1/1 labels (as floats). all_ones = array_ops.ones_like(labels) labels = math_ops.subtract(2 * labels, all_ones) losses = nn_ops.relu( math_ops.subtract(all_ones, math_ops.multiply(labels, logits))) return compute_weighted_loss( losses, weights, scope, loss_collection, reduction=reduction)
def gradient_clipping(grads_and_vars): """Internal function for adaptive clipping.""" grads, variables = zip(*grads_and_vars) norm = clip_ops.global_norm(grads) max_norm, log_mean = _adaptive_max_norm(norm, std_factor, decay, global_step, epsilon, name) # reports the max gradient norm for debugging if report_summary: summary.scalar("global_norm/adaptive_max_gradient_norm", max_norm) # factor will be 1. if norm is smaller than max_norm factor = array_ops.where(norm < max_norm, array_ops.ones_like(norm), math_ops.exp(log_mean) / norm) if static_max_norm is not None: factor = math_ops.minimum(static_max_norm / norm, factor) # apply factor clipped_grads = [] for grad in grads: if grad is None: clipped_grads.append(None) elif isinstance(grad, ops.IndexedSlices): clipped_grads.append( ops.IndexedSlices(grad.values * factor, grad.indices, grad.dense_shape)) else: clipped_grads.append(grad * factor) return list(zip(clipped_grads, variables))
def call(self, inputs, mask=None): self._validate_call_args(inputs=inputs, mask=mask) q = inputs[0] v = inputs[1] k = inputs[2] if len(inputs) > 2 else v q_mask = mask[0] if mask else None v_mask = mask[1] if mask else None scores = self._calculate_scores(query=q, key=k) if v_mask is not None: # Mask of shape [batch_size, 1, Tv]. v_mask = array_ops.expand_dims(v_mask, axis=-2) if self.causal: # Creates a lower triangular mask, so position i cannot attend to # positions j>i. This prevents the flow of information from the future # into the past. scores_shape = array_ops.shape(scores) # causal_mask_shape = [1, Tq, Tv]. causal_mask_shape = array_ops.concat( [array_ops.ones_like(scores_shape[:-2]), scores_shape[-2:]], axis=0) causal_mask = _lower_triangular_mask(causal_mask_shape) else: causal_mask = None scores_mask = _merge_masks(v_mask, causal_mask) result = self._apply_scores(scores=scores, value=v, scores_mask=scores_mask) if q_mask is not None: # Mask of shape [batch_size, Tq, 1]. q_mask = array_ops.expand_dims(q_mask, axis=-1) result *= math_ops.cast(q_mask, dtype=result.dtype) return result
def _num_present(losses, weights, per_batch=False): """Computes the number of elements in the loss function induced by `weights`. A given weights tensor induces different numbers of usable elements in the `losses` tensor. The `weights` tensor is broadcast across `losses` for all possible dimensions. For example, if `losses` is a tensor of dimension `[4, 5, 6, 3]` and `weights` is a tensor of shape `[4, 5]`, then `weights` is, in effect, tiled to match the shape of `losses`. Following this effective tile, the total number of present elements is the number of non-zero weights. Args: losses: `Tensor` of shape `[batch_size, d1, ... dN]`. weights: `Tensor` of shape `[]`, `[batch_size]` or `[batch_size, d1, ... dK]`, where K < N. per_batch: Whether to return the number of elements per batch or as a sum total. Returns: The number of present (non-zero) elements in the losses tensor. If `per_batch` is `True`, the value is returned as a tensor of size `[batch_size]`. Otherwise, a single scalar tensor is returned. """ with ops.name_scope(None, "num_present", (losses, weights)) as scope: weights = math_ops.to_float(weights) present = array_ops.where( math_ops.equal(weights, 0.0), array_ops.zeros_like(weights), array_ops.ones_like(weights)) present = weights_broadcast_ops.broadcast_weights(present, losses) if per_batch: return math_ops.reduce_sum( present, axis=math_ops.range(1, array_ops.rank(present)), keep_dims=True, name=scope) return math_ops.reduce_sum(present, name=scope)
def pdf(self, x, name="pdf"): """The PDF of observations in `x` under these Uniform distribution(s). Args: x: tensor of dtype `dtype`, must be broadcastable with `a` and `b`. name: The name to give this op. Returns: pdf: tensor of dtype `dtype`, the pdf values of `x`. If `x` is `nan`, will return `nan`. """ with ops.name_scope(self.name): with ops.op_scope([self.a, self.b, x], name): x = ops.convert_to_tensor(x, name="x") if x.dtype != self.dtype: raise TypeError("Input x dtype does not match dtype: %s vs. %s" % (x.dtype, self.dtype)) broadcasted_x = x * self._ones() return math_ops.select( math_ops.is_nan(broadcasted_x), broadcasted_x, math_ops.select( math_ops.logical_or(broadcasted_x < self.a, broadcasted_x > self.b), array_ops.zeros_like(broadcasted_x), (1.0 / self.range()) * array_ops.ones_like(broadcasted_x)))
def _log_cdf(self, y): lower_cutoff = self._lower_cutoff upper_cutoff = self._upper_cutoff # Recall the promise: # cdf(y) := P[Y <= y] # = 1, if y >= upper_cutoff, # = 0, if y < lower_cutoff, # = P[X <= y], otherwise. # P[Y <= j] = P[floor(Y) <= j] since mass is only at integers, not in # between. j = math_ops.floor(y) result_so_far = self.distribution.log_cdf(j) # Broadcast, because it's possible that this is a single distribution being # evaluated on a number of samples, or something like that. j += array_ops.zeros_like(result_so_far) # Re-define values at the cutoffs. if lower_cutoff is not None: neg_inf = -np.inf * array_ops.ones_like(result_so_far) result_so_far = math_ops.select(j < lower_cutoff, neg_inf, result_so_far) if upper_cutoff is not None: result_so_far = math_ops.select(j >= upper_cutoff, array_ops.zeros_like(result_so_far), result_so_far) return result_so_far
def _stddev(self): return self.scale * array_ops.ones_like( self.loc) * math.pi / math.sqrt(3)
def _ndtri(p): """Implements ndtri core logic.""" # Constants used in piece-wise rational approximations. Taken from the cephes # library: # https://github.com/scipy/scipy/blob/master/scipy/special/cephes/ndtri.c p0 = list( reversed([ -5.99633501014107895267E1, 9.80010754185999661536E1, -5.66762857469070293439E1, 1.39312609387279679503E1, -1.23916583867381258016E0 ])) q0 = list( reversed([ 1.0, 1.95448858338141759834E0, 4.67627912898881538453E0, 8.63602421390890590575E1, -2.25462687854119370527E2, 2.00260212380060660359E2, -8.20372256168333339912E1, 1.59056225126211695515E1, -1.18331621121330003142E0 ])) p1 = list( reversed([ 4.05544892305962419923E0, 3.15251094599893866154E1, 5.71628192246421288162E1, 4.40805073893200834700E1, 1.46849561928858024014E1, 2.18663306850790267539E0, -1.40256079171354495875E-1, -3.50424626827848203418E-2, -8.57456785154685413611E-4 ])) q1 = list( reversed([ 1.0, 1.57799883256466749731E1, 4.53907635128879210584E1, 4.13172038254672030440E1, 1.50425385692907503408E1, 2.50464946208309415979E0, -1.42182922854787788574E-1, -3.80806407691578277194E-2, -9.33259480895457427372E-4 ])) p2 = list( reversed([ 3.23774891776946035970E0, 6.91522889068984211695E0, 3.93881025292474443415E0, 1.33303460815807542389E0, 2.01485389549179081538E-1, 1.23716634817820021358E-2, 3.01581553508235416007E-4, 2.65806974686737550832E-6, 6.23974539184983293730E-9 ])) q2 = list( reversed([ 1.0, 6.02427039364742014255E0, 3.67983563856160859403E0, 1.37702099489081330271E0, 2.16236993594496635890E-1, 1.34204006088543189037E-2, 3.28014464682127739104E-4, 2.89247864745380683936E-6, 6.79019408009981274425E-9 ])) def _create_polynomial(var, coeffs): """Compute n_th order polynomial via Horner's method.""" if not coeffs: return 0. return coeffs[0] + _create_polynomial(var, coeffs[1:]) * var maybe_complement_p = array_ops.where(p > 1. - np.exp(-2.), 1. - p, p) # Write in an arbitrary value in place of 0 for p since 0 will cause NaNs # later on. The result from the computation when p == 0 is not used so any # number that doesn't result in NaNs is fine. sanitized_mcp = array_ops.where(maybe_complement_p <= 0., 0.5 * array_ops.ones_like(p), maybe_complement_p) # Compute x for p > exp(-2): x/sqrt(2pi) = w + w**3 P0(w**2)/Q0(w**2). w = sanitized_mcp - 0.5 ww = w**2 x_for_big_p = w + w * ww * (_create_polynomial(ww, p0) / _create_polynomial(ww, q0)) x_for_big_p *= -np.sqrt(2. * np.pi) # Compute x for p <= exp(-2): x = z - log(z)/z - (1/z) P(1/z) / Q(1/z), # where z = sqrt(-2. * log(p)), and P/Q are chosen between two different # arrays based on wether p < exp(-32). z = math_ops.sqrt(-2. * math_ops.log(sanitized_mcp)) first_term = z - math_ops.log(z) / z second_term_small_p = (_create_polynomial(1. / z, p2) / _create_polynomial(1. / z, q2)) / z second_term_otherwise = (_create_polynomial(1. / z, p1) / _create_polynomial(1. / z, q1)) / z x_for_small_p = first_term - second_term_small_p x_otherwise = first_term - second_term_otherwise x = array_ops.where(sanitized_mcp > np.exp(-2.), x_for_big_p, array_ops.where(z >= 8.0, x_for_small_p, x_otherwise)) x = array_ops.where(p > 1. - np.exp(-2.), x, -x) infinity = constant_op.constant(np.inf, dtype=x.dtype) * array_ops.ones_like(x) x_nan_replaced = array_ops.where(p <= 0.0, -infinity, array_ops.where(p >= 1.0, infinity, x)) return x_nan_replaced
def _compute_ones_matrix_shape(self): if self.batch_shape.is_fully_defined(): batch_shape = self.batch_shape.concatenate([1, 1]) return np.ones_like(batch_shape, dtype=np.int32) return array_ops.concat( [array_ops.ones_like(self.batch_shape_tensor()), [1, 1]], 0)
def CheckUnitary(self, x, tol): # Tests that x[...,:,:]^H * x[...,:,:] is close to the identity. xx = math_ops.matmul(x, x, adjoint_a=True) identity = array_ops.matrix_band_part(array_ops.ones_like(xx), 0, 0) self.assertAllClose(identity, xx, atol=tol)
def big(): return array_ops.ones_like(gini, dtype=dtypes.float32) * 10000000.
def training_graph(self, input_data, input_labels, random_seed, data_spec, sparse_features=None, input_weights=None): """Constructs a TF graph for training a random tree. Args: input_data: A tensor or placeholder for input data. input_labels: A tensor or placeholder for labels associated with input_data. random_seed: The random number generator seed to use for this tree. 0 means use the current time as the seed. data_spec: A data_ops.TensorForestDataSpec object specifying the original feature/columns of the data. sparse_features: A tf.SparseTensor for sparse input data. input_weights: A float tensor or placeholder holding per-input weights, or None if all inputs are to be weighted equally. Returns: The last op in the random tree training graph. """ epoch = math_ops.to_int32(get_epoch_variable()) serialized_input_spec = data_spec.SerializeToString() if input_weights is None: input_weights = [] if input_data is None: input_data = [] sparse_indices = [] sparse_values = [] sparse_shape = [] if sparse_features is not None: sparse_indices = sparse_features.indices sparse_values = sparse_features.values sparse_shape = sparse_features.dense_shape # Count extremely random stats. (node_sums, node_squares, splits_indices, splits_sums, splits_squares, totals_indices, totals_sums, totals_squares, input_leaves) = (tensor_forest_ops.count_extremely_random_stats( input_data, sparse_indices, sparse_values, sparse_shape, input_labels, input_weights, self.variables.tree, self.variables.tree_thresholds, self.variables.node_to_accumulator_map, self.variables.candidate_split_features, self.variables.candidate_split_thresholds, self.variables.start_epoch, epoch, input_spec=serialized_input_spec, num_classes=self.params.num_output_columns, regression=self.params.regression)) node_update_ops = [] node_update_ops.append( state_ops.assign_add(self.variables.node_sums, node_sums)) splits_update_ops = [] splits_update_ops.append( tensor_forest_ops.scatter_add_ndim( self.variables.candidate_split_sums, splits_indices, splits_sums)) splits_update_ops.append( tensor_forest_ops.scatter_add_ndim(self.variables.accumulator_sums, totals_indices, totals_sums)) if self.params.regression: node_update_ops.append( state_ops.assign_add(self.variables.node_squares, node_squares)) splits_update_ops.append( tensor_forest_ops.scatter_add_ndim( self.variables.candidate_split_squares, splits_indices, splits_squares)) splits_update_ops.append( tensor_forest_ops.scatter_add_ndim( self.variables.accumulator_squares, totals_indices, totals_squares)) # Sample inputs. update_indices, feature_updates, threshold_updates = ( tensor_forest_ops.sample_inputs( input_data, sparse_indices, sparse_values, sparse_shape, input_weights, self.variables.node_to_accumulator_map, input_leaves, self.variables.candidate_split_features, self.variables.candidate_split_thresholds, input_spec=serialized_input_spec, split_initializations_per_input=( self.params.split_initializations_per_input), split_sampling_random_seed=random_seed)) update_features_op = state_ops.scatter_update( self.variables.candidate_split_features, update_indices, feature_updates) update_thresholds_op = state_ops.scatter_update( self.variables.candidate_split_thresholds, update_indices, threshold_updates) # Calculate finished nodes. with ops.control_dependencies(splits_update_ops): # Passing input_leaves to finished nodes here means that nodes that # have become stale won't be deallocated until an input reaches them, # because we're trying to avoid considering every fertile node for # performance reasons. finished, stale = tensor_forest_ops.finished_nodes( input_leaves, self.variables.node_to_accumulator_map, self.variables.candidate_split_sums, self.variables.candidate_split_squares, self.variables.accumulator_sums, self.variables.accumulator_squares, self.variables.start_epoch, epoch, num_split_after_samples=self.params.split_after_samples, min_split_samples=self.params.min_split_samples, dominate_method=self.params.dominate_method, dominate_fraction=self.params.dominate_fraction) # Update leaf scores. # TODO(thomaswc): Store the leaf scores in a TopN and only update the # scores of the leaves that were touched by this batch of input. children = array_ops.squeeze(array_ops.slice(self.variables.tree, [0, 0], [-1, 1]), squeeze_dims=[1]) is_leaf = math_ops.equal(constants.LEAF_NODE, children) leaves = math_ops.to_int32( array_ops.squeeze(array_ops.where(is_leaf), squeeze_dims=[1])) non_fertile_leaves = array_ops.boolean_mask( leaves, math_ops.less( array_ops.gather(self.variables.node_to_accumulator_map, leaves), 0)) # TODO(gilberth): It should be possible to limit the number of non # fertile leaves we calculate scores for, especially since we can only take # at most array_ops.shape(finished)[0] of them. with ops.control_dependencies(node_update_ops): sums = array_ops.gather(self.variables.node_sums, non_fertile_leaves) if self.params.regression: squares = array_ops.gather(self.variables.node_squares, non_fertile_leaves) non_fertile_leaf_scores = self._variance(sums, squares) else: non_fertile_leaf_scores = self._weighted_gini(sums) # Calculate best splits. with ops.control_dependencies(splits_update_ops): split_indices = tensor_forest_ops.best_splits( finished, self.variables.node_to_accumulator_map, self.variables.candidate_split_sums, self.variables.candidate_split_squares, self.variables.accumulator_sums, self.variables.accumulator_squares, regression=self.params.regression) # Grow tree. with ops.control_dependencies( [update_features_op, update_thresholds_op]): (tree_update_indices, tree_children_updates, tree_threshold_updates, new_eot) = (tensor_forest_ops.grow_tree( self.variables.end_of_tree, self.variables.node_to_accumulator_map, finished, split_indices, self.variables.candidate_split_features, self.variables.candidate_split_thresholds)) tree_update_op = state_ops.scatter_update(self.variables.tree, tree_update_indices, tree_children_updates) thresholds_update_op = state_ops.scatter_update( self.variables.tree_thresholds, tree_update_indices, tree_threshold_updates) # TODO(thomaswc): Only update the epoch on the new leaves. new_epoch_updates = epoch * array_ops.ones_like( tree_threshold_updates, dtype=dtypes.int32) epoch_update_op = state_ops.scatter_update( self.variables.start_epoch, tree_update_indices, new_epoch_updates) # Update fertile slots. with ops.control_dependencies([tree_update_op]): (n2a_map_updates, a2n_map_updates, accumulators_cleared, accumulators_allocated) = (tensor_forest_ops.update_fertile_slots( finished, non_fertile_leaves, non_fertile_leaf_scores, self.variables.end_of_tree, self.variables.accumulator_sums, self.variables.node_to_accumulator_map, stale, self.variables.node_sums, regression=self.params.regression)) # Ensure end_of_tree doesn't get updated until UpdateFertileSlots has # used it to calculate new leaves. gated_new_eot, = control_flow_ops.tuple( [new_eot], control_inputs=[n2a_map_updates]) eot_update_op = state_ops.assign(self.variables.end_of_tree, gated_new_eot) updates = [] updates.append(eot_update_op) updates.append(tree_update_op) updates.append(thresholds_update_op) updates.append(epoch_update_op) updates.append( state_ops.scatter_update(self.variables.node_to_accumulator_map, n2a_map_updates[0], n2a_map_updates[1])) updates.append( state_ops.scatter_update(self.variables.accumulator_to_node_map, a2n_map_updates[0], a2n_map_updates[1])) cleared_and_allocated_accumulators = array_ops.concat_v2( [accumulators_cleared, accumulators_allocated], 0) # Calculate values to put into scatter update for candidate counts. # Candidate split counts are always reset back to 0 for both cleared # and allocated accumulators. This means some accumulators might be doubly # reset to 0 if the were released and not allocated, then later allocated. split_values = array_ops.tile( array_ops.expand_dims( array_ops.expand_dims( array_ops.zeros_like(cleared_and_allocated_accumulators, dtype=dtypes.float32), 1), 2), [ 1, self.params.num_splits_to_consider, self.params.num_output_columns ]) updates.append( state_ops.scatter_update(self.variables.candidate_split_sums, cleared_and_allocated_accumulators, split_values)) if self.params.regression: updates.append( state_ops.scatter_update( self.variables.candidate_split_squares, cleared_and_allocated_accumulators, split_values)) # Calculate values to put into scatter update for total counts. total_cleared = array_ops.tile( array_ops.expand_dims( math_ops.negative( array_ops.ones_like(accumulators_cleared, dtype=dtypes.float32)), 1), [1, self.params.num_output_columns]) total_reset = array_ops.tile( array_ops.expand_dims( array_ops.zeros_like(accumulators_allocated, dtype=dtypes.float32), 1), [1, self.params.num_output_columns]) accumulator_updates = array_ops.concat_v2([total_cleared, total_reset], 0) updates.append( state_ops.scatter_update(self.variables.accumulator_sums, cleared_and_allocated_accumulators, accumulator_updates)) if self.params.regression: updates.append( state_ops.scatter_update(self.variables.accumulator_squares, cleared_and_allocated_accumulators, accumulator_updates)) # Calculate values to put into scatter update for candidate splits. split_features_updates = array_ops.tile( array_ops.expand_dims( math_ops.negative( array_ops.ones_like(cleared_and_allocated_accumulators)), 1), [1, self.params.num_splits_to_consider]) updates.append( state_ops.scatter_update(self.variables.candidate_split_features, cleared_and_allocated_accumulators, split_features_updates)) updates += self.finish_iteration() return control_flow_ops.group(*updates)
def _mean(self): return self.loc * array_ops.ones_like(self.scale)
def _stddev(self): return self.scale * array_ops.ones_like(self.loc)
def _ones(self): return array_ops.ones_like(self._df + self._mu + self._sigma)
def tpu_fn(): results = mid_level_api.dequeue() mid_level_api.apply_gradients((None, None, array_ops.ones_like(results[2]))) return results
def random_normal_correlated_columns(shape, mean=0.0, stddev=1.0, dtype=dtypes.float32, eps=1e-4, seed=None): """Batch matrix with (possibly complex) Gaussian entries and correlated cols. Returns random batch matrix `A` with specified element-wise `mean`, `stddev`, living close to an embedded hyperplane. Suppose `shape[-2:] = (M, N)`. If `M < N`, `A` is a random `M x N` [batch] matrix with iid Gaussian entries. If `M >= N`, then the colums of `A` will be made almost dependent as follows: ``` L = random normal N x N-1 matrix, mean = 0, stddev = 1 / sqrt(N - 1) B = random normal M x N-1 matrix, mean = 0, stddev = stddev. G = (L B^H)^H, a random normal M x N matrix, living on N-1 dim hyperplane E = a random normal M x N matrix, mean = 0, stddev = eps mu = a constant M x N matrix, equal to the argument "mean" A = G + E + mu ``` Args: shape: Python list of integers. Shape of the returned tensor. Must be at least length two. mean: `Tensor` giving mean of normal to sample from. stddev: `Tensor` giving stdev of normal to sample from. dtype: `TensorFlow` `dtype` or numpy dtype eps: Distance each column is perturbed from the low-dimensional subspace. seed: Python integer seed for the RNG. Returns: `Tensor` with desired shape and dtype. Raises: ValueError: If `shape` is not at least length 2. """ dtype = dtypes.as_dtype(dtype) if len(shape) < 2: raise ValueError( "Argument shape must be at least length 2. Found: %s" % shape) # Shape is the final shape, e.g. [..., M, N] shape = list(shape) batch_shape = shape[:-2] m, n = shape[-2:] # If there is only one column, "they" are by definition correlated. if n < 2 or n < m: return random_normal(shape, mean=mean, stddev=stddev, dtype=dtype, seed=seed) # Shape of the matrix with only n - 1 columns that we will embed in higher # dimensional space. smaller_shape = batch_shape + [m, n - 1] # Shape of the embedding matrix, mapping batch matrices # from [..., N-1, M] to [..., N, M] embedding_mat_shape = batch_shape + [n, n - 1] # This stddev for the embedding_mat ensures final result has correct stddev. stddev_mat = 1 / np.sqrt(n - 1) with ops.name_scope("random_normal_correlated_columns"): smaller_mat = random_normal(smaller_shape, mean=0.0, stddev=stddev_mat, dtype=dtype, seed=seed) if seed is not None: seed += 1287 embedding_mat = random_normal(embedding_mat_shape, dtype=dtype, seed=seed) embedded_t = math_ops.matmul(embedding_mat, smaller_mat, transpose_b=True) embedded = array_ops.matrix_transpose(embedded_t) mean_mat = array_ops.ones_like(embedded) * mean return embedded + random_normal(shape, stddev=eps, dtype=dtype) + mean_mat
def call(self, inputs, training=None): if training is None: training = keras.backend.learning_phase() return tf_utils.smart_cond( training, lambda: array_ops.ones_like(inputs), lambda: array_ops.zeros_like(inputs))
def _entropy(self): # Use broadcasting rules to calculate the full broadcast scale. scale = self.scale * array_ops.ones_like(self.loc) return 0.5 * math.log(2. * math.pi * math.e) + math_ops.log(scale)
def training_graph(self, input_data, input_labels, **tree_kwargs): """Constructs a TF graph for training a random forest. Args: input_data: A tensor or dict of string->Tensor for input data. input_labels: A tensor or placeholder for labels associated with input_data. **tree_kwargs: Keyword arguments passed to each tree's training_graph. Returns: The last op in the random forest training graph. Raises: NotImplementedError: If trying to use bagging with sparse features. """ processed_dense_features, processed_sparse_features, data_spec = ( data_ops.ParseDataTensorOrDict(input_data)) if input_labels is not None: labels = data_ops.ParseLabelTensorOrDict(input_labels) data_spec = data_spec or self.get_default_data_spec(input_data) tree_graphs = [] for i in range(self.params.num_trees): with ops.device(self.device_assigner.get_device(i)): seed = self.params.base_random_seed if seed != 0: seed += i # If using bagging, randomly select some of the input. tree_data = processed_dense_features tree_labels = labels if self.params.bagging_fraction < 1.0: # TODO(gilberth): Support bagging for sparse features. if processed_sparse_features is not None: raise NotImplementedError( 'Bagging not supported with sparse features.') # TODO(thomaswc): This does sampling without replacment. Consider # also allowing sampling with replacement as an option. batch_size = array_ops.strided_slice( array_ops.shape(processed_dense_features), [0], [1]) r = random_ops.random_uniform(batch_size, seed=seed) mask = math_ops.less( r, array_ops.ones_like(r) * self.params.bagging_fraction) gather_indices = array_ops.squeeze(array_ops.where(mask), squeeze_dims=[1]) # TODO(thomaswc): Calculate out-of-bag data and labels, and store # them for use in calculating statistics later. tree_data = array_ops.gather(processed_dense_features, gather_indices) tree_labels = array_ops.gather(labels, gather_indices) if self.params.bagged_features: if processed_sparse_features is not None: raise NotImplementedError( 'Feature bagging not supported with sparse features.' ) tree_data = self._bag_features(i, tree_data) initialization = self.trees[i].tree_initialization() with ops.control_dependencies([initialization]): tree_graphs.append(self.trees[i].training_graph( tree_data, tree_labels, seed, data_spec=data_spec, sparse_features=processed_sparse_features, **tree_kwargs)) return control_flow_ops.group(*tree_graphs, name='train')
def _entropy(self): # Use broadcasting rules to calculate the full broadcast sigma. scale = self.scale * array_ops.ones_like(self.loc) return 2 + math_ops.log(scale)
def confusion_matrix(labels, predictions, num_classes=None, dtype=dtypes.int32, name=None, weights=None): """Computes the confusion matrix from predictions and labels. Calculate the Confusion Matrix for a pair of prediction and label 1-D int arrays. The matrix columns represent the prediction labels and the rows represent the real labels. The confusion matrix is always a 2-D array of shape `[n, n]`, where `n` is the number of valid labels for a given classification task. Both prediction and labels must be 1-D arrays of the same shape in order for this function to work. If `num_classes` is None, then `num_classes` will be set to the one plus the maximum value in either predictions or labels. Class labels are expected to start at 0. E.g., if `num_classes` was three, then the possible labels would be `[0, 1, 2]`. If `weights` is not `None`, then each prediction contributes its corresponding weight to the total value of the confusion matrix cell. For example: ```python tf.contrib.metrics.confusion_matrix([1, 2, 4], [2, 2, 4]) ==> [[0 0 0 0 0] [0 0 1 0 0] [0 0 1 0 0] [0 0 0 0 0] [0 0 0 0 1]] ``` Note that the possible labels are assumed to be `[0, 1, 2, 3, 4]`, resulting in a 5x5 confusion matrix. Args: labels: 1-D `Tensor` of real labels for the classification task. predictions: 1-D `Tensor` of predictions for a given classification. num_classes: The possible number of labels the classification task can have. If this value is not provided, it will be calculated using both predictions and labels array. dtype: Data type of the confusion matrix. name: Scope name. weights: An optional `Tensor` whose shape matches `predictions`. Returns: A k X k matrix representing the confusion matrix, where k is the number of possible labels in the classification task. Raises: ValueError: If both predictions and labels are not 1-D vectors and have mismatched shapes, or if `weights` is not `None` and its shape doesn't match `predictions`. """ with ops.name_scope(name, 'confusion_matrix', (predictions, labels, num_classes, weights)) as name: labels, predictions = remove_squeezable_dimensions( ops.convert_to_tensor(labels, name='labels'), ops.convert_to_tensor(predictions, name='predictions')) predictions = math_ops.cast(predictions, dtypes.int64) labels = math_ops.cast(labels, dtypes.int64) if num_classes is None: num_classes = math_ops.maximum(math_ops.reduce_max(predictions), math_ops.reduce_max(labels)) + 1 if weights is not None: predictions.get_shape().assert_is_compatible_with( weights.get_shape()) weights = math_ops.cast(weights, dtype) shape = array_ops.stack([num_classes, num_classes]) indices = array_ops.transpose(array_ops.stack([labels, predictions])) values = (array_ops.ones_like(predictions, dtype) if weights is None else weights) cm_sparse = sparse_tensor.SparseTensor( indices=indices, values=values, dense_shape=math_ops.to_int64(shape)) zero_matrix = array_ops.zeros(math_ops.to_int32(shape), dtype) return sparse_ops.sparse_add(zero_matrix, cm_sparse)
def _compute_sampled_logits(weights, biases, inputs, labels, num_sampled, num_classes, num_true=1, sampled_values=None, subtract_log_q=True, remove_accidental_hits=False, partition_strategy="mod", name=None): """Helper function for nce_loss and sampled_softmax_loss functions. Computes sampled output training logits and labels suitable for implementing e.g. noise-contrastive estimation (see nce_loss) or sampled softmax (see sampled_softmax_loss). Note: In the case where num_true > 1, we assign to each target class the target probability 1 / num_true so that the target probabilities sum to 1 per-example. Args: weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor` objects whose concatenation along dimension 0 has shape `[num_classes, dim]`. The (possibly-partitioned) class embeddings. biases: A `Tensor` of shape `[num_classes]`. The class biases. inputs: A `Tensor` of shape `[batch_size, dim]`. The forward activations of the input network. labels: A `Tensor` of type `int64` and shape `[batch_size, num_true]`. The target classes. Note that this format differs from the `labels` argument of `nn.softmax_cross_entropy_with_logits`. num_sampled: An `int`. The number of classes to randomly sample per batch. num_classes: An `int`. The number of possible classes. num_true: An `int`. The number of target classes per training example. sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`, `sampled_expected_count`) returned by a `*_candidate_sampler` function. (if None, we default to `log_uniform_candidate_sampler`) subtract_log_q: A `bool`. whether to subtract the log expected count of the labels in the sample to get the logits of the true labels. Default is True. Turn off for Negative Sampling. remove_accidental_hits: A `bool`. whether to remove "accidental hits" where a sampled class equals one of the target classes. Default is False. partition_strategy: A string specifying the partitioning strategy, relevant if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported. Default is `"mod"`. See `tf.nn.embedding_lookup` for more details. name: A name for the operation (optional). Returns: out_logits, out_labels: `Tensor` objects each with shape `[batch_size, num_true + num_sampled]`, for passing to either `nn.sigmoid_cross_entropy_with_logits` (NCE) or `nn.softmax_cross_entropy_with_logits` (sampled softmax). """ if not isinstance(weights, list): weights = [weights] with ops.op_scope(weights + [biases, inputs, labels], name, "compute_sampled_logits"): if labels.dtype != dtypes.int64: labels = math_ops.cast(labels, dtypes.int64) labels_flat = array_ops.reshape(labels, [-1]) # Sample the negative labels. # sampled shape: [num_sampled] tensor # true_expected_count shape = [batch_size, 1] tensor # sampled_expected_count shape = [num_sampled] tensor if sampled_values is None: sampled_values = candidate_sampling_ops.log_uniform_candidate_sampler( true_classes=labels, num_true=num_true, num_sampled=num_sampled, unique=True, range_max=num_classes) # NOTE: pylint cannot tell that 'sampled_values' is a sequence # pylint: disable=unpacking-non-sequence sampled, true_expected_count, sampled_expected_count = sampled_values # pylint: enable=unpacking-non-sequence # labels_flat is a [batch_size * num_true] tensor # sampled is a [num_sampled] int tensor all_ids = array_ops.concat(0, [labels_flat, sampled]) # weights shape is [num_classes, dim] all_w = embedding_ops.embedding_lookup( weights, all_ids, partition_strategy=partition_strategy) all_b = embedding_ops.embedding_lookup(biases, all_ids) # true_w shape is [batch_size * num_true, dim] # true_b is a [batch_size * num_true] tensor true_w = array_ops.slice( all_w, [0, 0], array_ops.pack([array_ops.shape(labels_flat)[0], -1])) true_b = array_ops.slice(all_b, [0], array_ops.shape(labels_flat)) # inputs shape is [batch_size, dim] # true_w shape is [batch_size * num_true, dim] # row_wise_dots is [batch_size, num_true, dim] dim = array_ops.shape(true_w)[1:2] new_true_w_shape = array_ops.concat(0, [[-1, num_true], dim]) row_wise_dots = math_ops.mul( array_ops.expand_dims(inputs, 1), array_ops.reshape(true_w, new_true_w_shape)) # We want the row-wise dot plus biases which yields a # [batch_size, num_true] tensor of true_logits. dots_as_matrix = array_ops.reshape(row_wise_dots, array_ops.concat(0, [[-1], dim])) true_logits = array_ops.reshape(_sum_rows(dots_as_matrix), [-1, num_true]) true_b = array_ops.reshape(true_b, [-1, num_true]) true_logits += true_b # Lookup weights and biases for sampled labels. # sampled_w shape is [num_sampled, dim] # sampled_b is a [num_sampled] float tensor sampled_w = array_ops.slice( all_w, array_ops.pack([array_ops.shape(labels_flat)[0], 0]), [-1, -1]) sampled_b = array_ops.slice(all_b, array_ops.shape(labels_flat), [-1]) # inputs has shape [batch_size, dim] # sampled_w has shape [num_sampled, dim] # sampled_b has shape [num_sampled] # Apply X*W'+B, which yields [batch_size, num_sampled] sampled_logits = math_ops.matmul(inputs, sampled_w, transpose_b=True) + sampled_b if remove_accidental_hits: acc_hits = candidate_sampling_ops.compute_accidental_hits( labels, sampled, num_true=num_true) acc_indices, acc_ids, acc_weights = acc_hits # This is how SparseToDense expects the indices. acc_indices_2d = array_ops.reshape(acc_indices, [-1, 1]) acc_ids_2d_int32 = array_ops.reshape( math_ops.cast(acc_ids, dtypes.int32), [-1, 1]) sparse_indices = array_ops.concat( 1, [acc_indices_2d, acc_ids_2d_int32], "sparse_indices") # Create sampled_logits_shape = [batch_size, num_sampled] sampled_logits_shape = array_ops.concat(0, [ array_ops.shape(labels)[:1], array_ops.expand_dims(num_sampled, 0) ]) if sampled_logits.dtype != acc_weights.dtype: acc_weights = math_ops.cast(acc_weights, sampled_logits.dtype) sampled_logits += sparse_ops.sparse_to_dense( sparse_indices, sampled_logits_shape, acc_weights, default_value=0.0, validate_indices=False) if subtract_log_q: # Subtract log of Q(l), prior probability that l appears in sampled. true_logits -= math_ops.log(true_expected_count) sampled_logits -= math_ops.log(sampled_expected_count) # Construct output logits and labels. The true labels/logits start at col 0. out_logits = array_ops.concat(1, [true_logits, sampled_logits]) # true_logits is a float tensor, ones_like(true_logits) is a float tensor # of ones. We then divide by num_true to ensure the per-example labels sum # to 1.0, i.e. form a proper probability distribution. out_labels = array_ops.concat(1, [ array_ops.ones_like(true_logits) / num_true, array_ops.zeros_like(sampled_logits) ]) return out_logits, out_labels
def _unbounded_exponential_log_prob(x): """An exponential distribution with log-likelihood NaN for x < 0.""" per_element_potentials = array_ops.where( x < 0, np.nan * array_ops.ones_like(x), -x) return math_ops.reduce_sum(per_element_potentials)
def _create_tpu_estimator_spec(self, features, mode, logits, labels=None, optimizer=None, train_op_fn=None, regularization_losses=None): """Returns an `model_fn._TPUEstimatorSpec`. Args: features: Input `dict` of `Tensor` or `SparseTensor` objects. mode: Estimator's `ModeKeys`. logits: logits `Tensor` with shape `[D0, D1, ... DN, n_classes]`. For many applications, the shape is `[batch_size, n_classes]`. labels: Labels with shape matching `logits`. Can be multi-hot `Tensor` with shape `[D0, D1, ... DN, n_classes]` or `SparseTensor` with `dense_shape` `[D0, D1, ... DN, ?]`. `labels` is required argument when `mode` equals `TRAIN` or `EVAL`. optimizer: `Optimizer` instance to optimize the loss in TRAIN mode. Namely, sets `train_op = optimizer.minimize(loss, global_step)`, which updates variables and increments `global_step`. train_op_fn: Function that takes a scalar loss `Tensor` and returns `train_op`. Used if `optimizer` is `None`. regularization_losses: A list of additional scalar losses to be added to the training loss, such as regularization losses. These losses are usually expressed as a batch average, so for best results users need to set `loss_reduction=SUM_OVER_BATCH_SIZE` or `loss_reduction=SUM_OVER_NONZERO_WEIGHTS` when creating the head to avoid scaling errors. Returns: `model_fn._TPUEstimatorSpec`. Raises: ValueError: If both `train_op_fn` and `optimizer` are `None` in TRAIN mode, or if both are set. """ with ops.name_scope(self._name, 'head'): logits = head_lib._check_logits_final_dim(logits, self.logits_dimension) # pylint:disable=protected-access # Predict. pred_keys = prediction_keys.PredictionKeys with ops.name_scope(None, 'predictions', (logits, )): probabilities = math_ops.sigmoid(logits, name=pred_keys.PROBABILITIES) predictions = { pred_keys.LOGITS: logits, pred_keys.PROBABILITIES: probabilities, } if mode == model_fn.ModeKeys.PREDICT: classifier_output = head_lib._classification_output( # pylint:disable=protected-access scores=probabilities, n_classes=self._n_classes, label_vocabulary=self._label_vocabulary) return model_fn._TPUEstimatorSpec( # pylint:disable=protected-access mode=model_fn.ModeKeys.PREDICT, predictions=predictions, export_outputs={ _DEFAULT_SERVING_KEY: classifier_output, head_lib._CLASSIFY_SERVING_KEY: classifier_output, # pylint:disable=protected-access head_lib._PREDICT_SERVING_KEY: ( # pylint:disable=protected-access export_output.PredictOutput(predictions)) }) (training_loss, unreduced_loss, weights, processed_labels) = self.create_loss(features=features, mode=mode, logits=logits, labels=labels) if regularization_losses: regularization_loss = math_ops.add_n(regularization_losses) regularized_training_loss = math_ops.add_n( [training_loss, regularization_loss]) else: regularization_loss = None regularized_training_loss = training_loss # Eval. if mode == model_fn.ModeKeys.EVAL: return model_fn._TPUEstimatorSpec( # pylint:disable=protected-access mode=model_fn.ModeKeys.EVAL, predictions=predictions, loss=regularized_training_loss, eval_metrics=head_lib._create_eval_metrics_tuple( # pylint:disable=protected-access self._eval_metric_ops, { 'labels': processed_labels, 'probabilities': probabilities, 'weights': weights, 'unreduced_loss': unreduced_loss, 'regularization_loss': regularization_loss, })) # Train. if optimizer is not None: if train_op_fn is not None: raise ValueError( 'train_op_fn and optimizer cannot both be set.') train_op = optimizer.minimize( regularized_training_loss, global_step=training_util.get_global_step()) elif train_op_fn is not None: train_op = train_op_fn(regularized_training_loss) else: raise ValueError( 'train_op_fn and optimizer cannot both be None.') train_op = head_lib._append_update_ops(train_op) # pylint:disable=protected-access # Only summarize mean_loss for SUM reduction to preserve backwards # compatibility. Otherwise skip it to avoid unnecessary computation. if self._loss_reduction == losses.Reduction.SUM: example_weight_sum = math_ops.reduce_sum( weights * array_ops.ones_like(unreduced_loss)) mean_loss = training_loss / example_weight_sum else: mean_loss = None with ops.name_scope(''): keys = metric_keys.MetricKeys summary.scalar( head_lib._summary_key(self._name, keys.LOSS), # pylint:disable=protected-access regularized_training_loss) if mean_loss is not None: summary.scalar( head_lib._summary_key(self._name, keys.LOSS_MEAN), # pylint:disable=protected-access mean_loss) if regularization_loss is not None: summary.scalar( head_lib._summary_key(self._name, keys.LOSS_REGULARIZATION), # pylint:disable=protected-access regularization_loss) return model_fn._TPUEstimatorSpec( # pylint:disable=protected-access mode=model_fn.ModeKeys.TRAIN, predictions=predictions, loss=regularized_training_loss, train_op=train_op)
def clip_by_norm(t, clip_norm, axes=None, name=None): """Clips tensor values to a maximum L2-norm. Given a tensor `t`, and a maximum clip value `clip_norm`, this operation normalizes `t` so that its L2-norm is less than or equal to `clip_norm`, along the dimensions given in `axes`. Specifically, in the default case where all dimensions are used for calculation, if the L2-norm of `t` is already less than or equal to `clip_norm`, then `t` is not modified. If the L2-norm is greater than `clip_norm`, then this operation returns a tensor of the same type and shape as `t` with its values set to: `t * clip_norm / l2norm(t)` In this case, the L2-norm of the output tensor is `clip_norm`. As another example, if `t` is a matrix and `axes == [1]`, then each row of the output will have L2-norm less than or equal to `clip_norm`. If `axes == [0]` instead, each column of the output will be clipped. Code example: >>> some_nums = tf.constant([[1, 2, 3, 4, 5]], dtype=tf.float32) >>> tf.clip_by_norm(some_nums, 2.0).numpy() array([[0.26967996, 0.5393599 , 0.80903983, 1.0787199 , 1.3483998 ]], dtype=float32) This operation is typically used to clip gradients before applying them with an optimizer. Most gradient data is a collection of different shaped tensors for different parts of the model. Thus, this is a common usage: ``` # Get your gradients after training loss_value, grads = grad(model, features, labels) # Apply some clipping grads = [tf.clip_by_norm(g, norm) for g in grads] # Continue on with training optimizer.apply_gradients(grads) ``` Args: t: A `Tensor` or `IndexedSlices`. This must be a floating point type. clip_norm: A 0-D (scalar) `Tensor` > 0. A maximum clipping value, also floating point axes: A 1-D (vector) `Tensor` of type int32 containing the dimensions to use for computing the L2-norm. If `None` (the default), uses all dimensions. name: A name for the operation (optional). Returns: A clipped `Tensor` or `IndexedSlices`. Raises: ValueError: If the clip_norm tensor is not a 0-D scalar tensor. TypeError: If dtype of the input is not a floating point or complex type. """ with ops.name_scope(name, "clip_by_norm", [t, clip_norm]) as name: values = ops.convert_to_tensor( t.values if isinstance(t, ops.IndexedSlices) else t, name="t") # Calculate L2-norm, clip elements by ratio of clip_norm to L2-norm l2sum = math_ops.reduce_sum(values * values, axes, keepdims=True) pred = l2sum > 0 # Two-tap tf.where trick to bypass NaN gradients l2sum_safe = array_ops.where(pred, l2sum, array_ops.ones_like(l2sum)) l2norm = array_ops.where(pred, math_ops.sqrt(l2sum_safe), l2sum) intermediate = values * clip_norm # Assert that the shape is compatible with the initial shape, # to prevent unintentional broadcasting. values.shape.assert_is_compatible_with(intermediate.shape) values_clip = array_ops.identity( intermediate / math_ops.maximum(l2norm, clip_norm), name=name) if isinstance(t, ops.IndexedSlices): return ops.IndexedSlices(values_clip, t.indices, t.dense_shape) return values_clip
def _mini_batch_training_op(self, inputs, cluster_idx_list, cluster_centers, total_counts): """Creates an op for training for mini batch case. Args: inputs: list of input Tensors. cluster_idx_list: A vector (or list of vectors). Each element in the vector corresponds to an input row in 'inp' and specifies the cluster id corresponding to the input. cluster_centers: Tensor Ref of cluster centers. total_counts: Tensor Ref of cluster counts. Returns: An op for doing an update of mini-batch k-means. """ update_ops = [] for inp, cluster_idx in zip(inputs, cluster_idx_list): with ops.colocate_with(inp, ignore_existing=True): assert total_counts is not None cluster_idx = array_ops.reshape(cluster_idx, [-1]) # Dedupe the unique ids of cluster_centers being updated so that updates # can be locally aggregated. unique_ids, unique_idx = array_ops.unique(cluster_idx) num_unique_cluster_idx = array_ops.size(unique_ids) # Fetch the old values of counts and cluster_centers. with ops.colocate_with(total_counts, ignore_existing=True): old_counts = array_ops.gather(total_counts, unique_ids) # TODO(agarwal): This colocation seems to run into problems. Fix it. with ops.colocate_with(cluster_centers, ignore_existing=True): old_cluster_centers = array_ops.gather( cluster_centers, unique_ids) # Locally aggregate the increment to counts. count_updates = math_ops.unsorted_segment_sum( array_ops.ones_like(unique_idx, dtype=total_counts.dtype), unique_idx, num_unique_cluster_idx) # Locally compute the sum of inputs mapped to each id. # For a cluster with old cluster value x, old count n, and with data # d_1,...d_k newly assigned to it, we recompute the new value as # \\(x += (sum_i(d_i) - k * x) / (n + k)\\). # Compute \\(sum_i(d_i)\\), see comment above. cluster_center_updates = math_ops.unsorted_segment_sum( inp, unique_idx, num_unique_cluster_idx) # Shape to enable broadcasting count_updates and learning_rate to inp. # It extends the shape with 1's to match the rank of inp. broadcast_shape = array_ops.concat([ array_ops.reshape(num_unique_cluster_idx, [1]), array_ops.ones(array_ops.reshape( array_ops.rank(inp) - 1, [1]), dtype=dtypes.int32) ], 0) # Subtract k * x, see comment above. cluster_center_updates -= math_ops.cast( array_ops.reshape(count_updates, broadcast_shape), inp.dtype) * old_cluster_centers learning_rate = math_ops.reciprocal( math_ops.cast(old_counts + count_updates, inp.dtype)) learning_rate = array_ops.reshape(learning_rate, broadcast_shape) # scale by 1 / (n + k), see comment above. cluster_center_updates *= learning_rate # Apply the updates. update_counts = state_ops.scatter_add(total_counts, unique_ids, count_updates) update_cluster_centers = state_ops.scatter_add( cluster_centers, unique_ids, cluster_center_updates) update_ops.extend([update_counts, update_cluster_centers]) return control_flow_ops.group(*update_ops)
def mean(self): return self._mu * array_ops.ones_like(self._sigma)
def _list_mle_loss(labels, logits, weights=None, lambda_weight=None, reduction=core_losses.Reduction.SUM_BY_NONZERO_WEIGHTS, name=None, seed=None): """Computes the ListMLE loss [Xia et al. 2008] for a list. Given the labels of graded relevance l_i and the logits s_i, we calculate the ListMLE loss for the given list. The `lambda_weight` re-weights examples based on l_i and r_i. The recommended weighting scheme is the formulation presented in the "Position-Aware ListMLE" paper (Lan et. al) and available using create_p_list_mle_lambda_weight() factory function above. Args: labels: A `Tensor` of the same shape as `logits` representing graded relevance. logits: A `Tensor` with shape [batch_size, list_size]. Each value is the ranking score of the corresponding item. weights: A scalar, a `Tensor` with shape [batch_size, 1] for list-wise weights, or a `Tensor` with shape [batch_size, list_size] for item-wise weights. lambda_weight: A `DCGLambdaWeight` instance. reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to reduce training loss over batch. name: A string used as the name for this loss. seed: A randomization seed used when shuffling ground truth permutations. Returns: An op for the ListMLE loss. """ with ops.name_scope(name, 'list_mle_loss', (labels, logits, weights)): is_label_valid = utils.is_label_valid(labels) # Reset the invalid labels to 0 and reset the invalid logits to a logit with # ~= 0 contribution. labels = array_ops.where(is_label_valid, labels, array_ops.zeros_like(labels)) logits = array_ops.where( is_label_valid, logits, math_ops.log(_EPSILON) * array_ops.ones_like(logits)) weights = 1.0 if weights is None else ops.convert_to_tensor(weights) weights = array_ops.squeeze(weights) # Shuffle labels and logits to add randomness to sort. shuffled_indices = utils.shuffle_valid_indices(is_label_valid, seed) shuffled_labels = array_ops.gather_nd(labels, shuffled_indices) shuffled_logits = array_ops.gather_nd(logits, shuffled_indices) sorted_labels, sorted_logits = utils.sort_by_scores( shuffled_labels, [shuffled_labels, shuffled_logits]) raw_max = math_ops.reduce_max(sorted_logits, axis=1, keepdims=True) sorted_logits = sorted_logits - raw_max sums = math_ops.cumsum(math_ops.exp(sorted_logits), axis=1, reverse=True) sums = math_ops.log(sums) - sorted_logits if lambda_weight is not None and isinstance(lambda_weight, ListMLELambdaWeight): sums *= lambda_weight.individual_weights(sorted_labels) negative_log_likelihood = math_ops.reduce_sum(sums, 1) return core_losses.compute_weighted_loss( negative_log_likelihood, weights=weights, reduction=reduction)
def _ones(self): return array_ops.ones_like(self.a + self.b)
def call(self, inputs, training=None): if training is None: training = K.learning_phase() in_eager_mode = context.executing_eagerly() if self.virtual_batch_size is not None: # Virtual batches (aka ghost batches) can be simulated by reshaping the # Tensor and reusing the existing batch norm implementation original_shape = [-1] + inputs.shape.as_list()[1:] expanded_shape = [self.virtual_batch_size, -1] + original_shape[1:] # Will cause errors if virtual_batch_size does not divide the batch size inputs = array_ops.reshape(inputs, expanded_shape) def undo_virtual_batching(outputs): outputs = array_ops.reshape(outputs, original_shape) return outputs if self.fused: outputs = self._fused_batch_norm(inputs, training=training) if self.virtual_batch_size is not None: # Currently never reaches here since fused_batch_norm does not support # virtual batching outputs = undo_virtual_batching(outputs) return outputs # Compute the axes along which to reduce the mean / variance input_shape = inputs.get_shape() ndims = len(input_shape) reduction_axes = [i for i in range(ndims) if i not in self.axis] if self.virtual_batch_size is not None: del reduction_axes[1] # Do not reduce along virtual batch dim # Broadcasting only necessary for single-axis batch norm where the axis is # not the last dimension broadcast_shape = [1] * ndims broadcast_shape[self.axis[0]] = input_shape.dims[self.axis[0]].value def _broadcast(v): if (v is not None and len(v.get_shape()) != ndims and reduction_axes != list(range(ndims - 1))): return array_ops.reshape(v, broadcast_shape) return v scale, offset = _broadcast(self.gamma), _broadcast(self.beta) def _compose_transforms(scale, offset, then_scale, then_offset): if then_scale is not None: scale *= then_scale offset *= then_scale if then_offset is not None: offset += then_offset return (scale, offset) # Determine a boolean value for `training`: could be True, False, or None. training_value = tf_utils.constant_value(training) if training_value is not False: if self.adjustment: adj_scale, adj_bias = self.adjustment(array_ops.shape(inputs)) # Adjust only during training. adj_scale = tf_utils.smart_cond(training, lambda: adj_scale, lambda: array_ops.ones_like(adj_scale)) adj_bias = tf_utils.smart_cond(training, lambda: adj_bias, lambda: array_ops.zeros_like(adj_bias)) scale, offset = _compose_transforms(adj_scale, adj_bias, scale, offset) # Some of the computations here are not necessary when training==False # but not a constant. However, this makes the code simpler. keep_dims = self.virtual_batch_size is not None or len(self.axis) > 1 mean, variance = self._moments( math_ops.cast(inputs, self._param_dtype), reduction_axes, keep_dims=keep_dims) moving_mean = self.moving_mean moving_variance = self.moving_variance mean = tf_utils.smart_cond(training, lambda: mean, lambda: moving_mean) variance = tf_utils.smart_cond(training, lambda: variance, lambda: moving_variance) if self.virtual_batch_size is not None: # This isn't strictly correct since in ghost batch norm, you are # supposed to sequentially update the moving_mean and moving_variance # with each sub-batch. However, since the moving statistics are only # used during evaluation, it is more efficient to just update in one # step and should not make a significant difference in the result. new_mean = math_ops.reduce_mean(mean, axis=1, keepdims=True) new_variance = math_ops.reduce_mean(variance, axis=1, keepdims=True) else: new_mean, new_variance = mean, variance if self.renorm: r, d, new_mean, new_variance = self._renorm_correction_and_moments( new_mean, new_variance, training) # When training, the normalized values (say, x) will be transformed as # x * gamma + beta without renorm, and (x * r + d) * gamma + beta # = x * (r * gamma) + (d * gamma + beta) with renorm. r = _broadcast(array_ops.stop_gradient(r, name='renorm_r')) d = _broadcast(array_ops.stop_gradient(d, name='renorm_d')) scale, offset = _compose_transforms(r, d, scale, offset) if distribution_strategy_context.in_cross_replica_context(): strategy = distribution_strategy_context.get_strategy() def _do_update(var, value): """Compute the updates for mean and variance.""" if in_eager_mode and not self.trainable: return return strategy.extended.update( var, self._assign_moving_average, (value, self.momentum), group=False) # We need to unwrap the moving_mean or moving_variance in the case of # training being false to match the output of true_fn and false_fn # in the smart cond. mean_update = tf_utils.smart_cond( training, lambda: _do_update(self.moving_mean, new_mean), lambda: strategy.unwrap(self.moving_mean)) variance_update = tf_utils.smart_cond( training, lambda: _do_update(self.moving_variance, new_variance), lambda: strategy.unwrap(self.moving_variance)) else: def _do_update(var, value): """Compute the updates for mean and variance.""" if in_eager_mode and not self.trainable: return return self._assign_moving_average(var, value, self.momentum) mean_update = tf_utils.smart_cond( training, lambda: _do_update(self.moving_mean, new_mean), lambda: self.moving_mean) variance_update = tf_utils.smart_cond( training, lambda: _do_update(self.moving_variance, new_variance), lambda: self.moving_variance) if not context.executing_eagerly(): self.add_update(mean_update, inputs=True) self.add_update(variance_update, inputs=True) else: mean, variance = self.moving_mean, self.moving_variance mean = math_ops.cast(mean, inputs.dtype) variance = math_ops.cast(variance, inputs.dtype) if offset is not None: offset = math_ops.cast(offset, inputs.dtype) if scale is not None: scale = math_ops.cast(scale, inputs.dtype) # TODO(reedwm): Maybe do math in float32 if given float16 inputs, if doing # math in float16 hurts validation accuracy of popular models like resnet. outputs = nn.batch_normalization(inputs, _broadcast(mean), _broadcast(variance), offset, scale, self.epsilon) # If some components of the shape got lost due to adjustments, fix that. outputs.set_shape(input_shape) if self.virtual_batch_size is not None: outputs = undo_virtual_batching(outputs) return outputs
def _renorm_correction_and_moments(self, mean, variance, training): """Returns the correction and update values for renorm.""" stddev = math_ops.sqrt(variance + self.epsilon) # Compute the average mean and standard deviation, as if they were # initialized with this batch's moments. mixed_renorm_mean = (self.renorm_mean + (1. - self.renorm_mean_weight) * mean) mixed_renorm_stddev = (self.renorm_stddev + (1. - self.renorm_stddev_weight) * stddev) # Compute the corrections for batch renorm. r = stddev / mixed_renorm_stddev d = (mean - mixed_renorm_mean) / mixed_renorm_stddev # Ensure the corrections use pre-update moving averages. with ops.control_dependencies([r, d]): mean = array_ops.identity(mean) stddev = array_ops.identity(stddev) rmin, rmax, dmax = [ self.renorm_clipping.get(key) for key in ['rmin', 'rmax', 'dmax'] ] if rmin is not None: r = math_ops.maximum(r, rmin) if rmax is not None: r = math_ops.minimum(r, rmax) if dmax is not None: d = math_ops.maximum(d, -dmax) d = math_ops.minimum(d, dmax) # When not training, use r=1, d=0, and decay=1 meaning no updates. r = _smart_select(training, lambda: r, lambda: array_ops.ones_like(r)) d = _smart_select(training, lambda: d, lambda: array_ops.zeros_like(d)) decay = _smart_select(training, lambda: self.renorm_momentum, lambda: 1.) def _update_renorm_variable(var, weight, value): """Updates a moving average and weight, returns the unbiased value.""" # Update the variables without zero debiasing. The debiasing will be # accomplished by dividing the exponential moving average by the weight. # For example, after a single update, the moving average would be # (1-decay) * value. and the weight will be 1-decay, with their ratio # giving value. # Make sure the weight is not updated until before r and d computation. value = array_ops.identity(value) with ops.control_dependencies([value]): weight_value = array_ops.constant(1., dtype=weight.dtype) new_var = moving_averages.assign_moving_average(var, value, decay, zero_debias=False) new_weight = moving_averages.assign_moving_average( weight, weight_value, decay, zero_debias=False) return new_var / new_weight with ops.colocate_with(self.moving_mean): new_mean = _update_renorm_variable(self.renorm_mean, self.renorm_mean_weight, mean) with ops.colocate_with(self.moving_variance): new_stddev = _update_renorm_variable(self.renorm_stddev, self.renorm_stddev_weight, stddev) # Make sqrt(moving_variance + epsilon) = new_stddev. new_variance = math_ops.square(new_stddev) - self.epsilon return (r, d, new_mean, new_variance)
def _renorm_correction_and_moments(self, mean, variance, training): """Returns the correction and update values for renorm.""" stddev = math_ops.sqrt(variance + self.epsilon) # Compute the average mean and standard deviation, as if they were # initialized with this batch's moments. mixed_renorm_mean = (self.renorm_mean + (1. - self.renorm_mean_weight) * mean) mixed_renorm_stddev = (self.renorm_stddev + (1. - self.renorm_stddev_weight) * stddev) # Compute the corrections for batch renorm. r = stddev / mixed_renorm_stddev d = (mean - mixed_renorm_mean) / mixed_renorm_stddev # Ensure the corrections use pre-update moving averages. with ops.control_dependencies([r, d]): mean = array_ops.identity(mean) stddev = array_ops.identity(stddev) rmin, rmax, dmax = [self.renorm_clipping.get(key) for key in ['rmin', 'rmax', 'dmax']] if rmin is not None: r = math_ops.maximum(r, rmin) if rmax is not None: r = math_ops.minimum(r, rmax) if dmax is not None: d = math_ops.maximum(d, -dmax) d = math_ops.minimum(d, dmax) # When not training, use r=1, d=0. r = tf_utils.smart_cond(training, lambda: r, lambda: array_ops.ones_like(r)) d = tf_utils.smart_cond(training, lambda: d, lambda: array_ops.zeros_like(d)) def _update_renorm_variable(var, weight, value): """Updates a moving average and weight, returns the unbiased value.""" value = array_ops.identity(value) def _do_update(): """Updates the var and weight, returns their updated ratio.""" # Update the variables without zero debiasing. The debiasing will be # accomplished by dividing the exponential moving average by the weight. # For example, after a single update, the moving average would be # (1-decay) * value. and the weight will be 1-decay, with their ratio # giving the value. # Make sure the weight is not updated until before r and d computation. with ops.control_dependencies([value]): weight_value = array_ops.constant(1., dtype=weight.dtype) new_var = self._assign_moving_average(var, value, self.renorm_momentum) new_weight = self._assign_moving_average(weight, weight_value, self.renorm_momentum) # TODO(yuefengz): the updates to var and weighted can not be batched # together if we fetch their updated values here. Consider calculating # new values and delaying the updates. return new_var / new_weight def _fake_update(): return array_ops.identity(var) return tf_utils.smart_cond(training, _do_update, _fake_update) # TODO(yuefengz): colocate the operations new_mean = _update_renorm_variable(self.renorm_mean, self.renorm_mean_weight, mean) new_stddev = _update_renorm_variable(self.renorm_stddev, self.renorm_stddev_weight, stddev) # Make sqrt(moving_variance + epsilon) = new_stddev. new_variance = math_ops.square(new_stddev) - self.epsilon return (r, d, new_mean, new_variance)
def _variance(self): p = self.probs * array_ops.ones_like( self.total_count)[..., array_ops.newaxis] return self._mean_val - self._mean_val * p
def call(self, inputs): return keras.backend.in_train_phase( lambda: array_ops.ones_like(inputs), lambda: array_ops.zeros_like(inputs))
def _create_hook_ops(self, input_row_indices, input_col_indices, train_ops): """Creates ops to update is_row_sweep_var, global_step and completed_sweeps. Creates two boolean tensors `processed_rows` and `processed_cols`, which keep track of which rows/cols have been processed during the current sweep. Returns ops that should be run after each row / col update. - When `self._is_row_sweep_var` is True, it sets processed_rows[input_row_indices] to True. - When `self._is_row_sweep_var` is False, it sets processed_cols[input_col_indices] to True. Args: input_row_indices: A Tensor. The indices of the input rows that are processed during the current sweep. input_col_indices: A Tensor. The indices of the input columns that are processed during the current sweep. train_ops: A list of ops. The ops created by this function have control dependencies on `train_ops`. Returns: A tuple consisting of: update_op: An op to be run jointly with training. It updates the state and increments counters (global step and completed sweeps). is_sweep_done_var: A Boolean tf.Variable, specifies whether the sweep is done, i.e. all rows (during a row sweep) or all columns (during a column sweep) have been processed. switch_op: An op to be run in `self.before_run` when the sweep is done. """ processed_rows_init = array_ops.fill(dims=[self._num_rows], value=False) with ops.colocate_with(processed_rows_init): processed_rows = variable_scope.variable( processed_rows_init, collections=[ops.GraphKeys.GLOBAL_VARIABLES], trainable=False, name="sweep_hook_processed_rows") processed_cols_init = array_ops.fill(dims=[self._num_cols], value=False) with ops.colocate_with(processed_cols_init): processed_cols = variable_scope.variable( processed_cols_init, collections=[ops.GraphKeys.GLOBAL_VARIABLES], trainable=False, name="sweep_hook_processed_cols") switch_ops = control_flow_ops.group( state_ops.assign(self._is_row_sweep_var, math_ops.logical_not(self._is_row_sweep_var)), state_ops.assign(processed_rows, processed_rows_init), state_ops.assign(processed_cols, processed_cols_init)) is_sweep_done_var = variable_scope.variable( False, collections=[ops.GraphKeys.GLOBAL_VARIABLES], trainable=False, name="is_sweep_done") # After running the `train_ops`, updates `processed_rows` or # `processed_cols` tensors, depending on whether this is a row or col sweep. with ops.control_dependencies(train_ops): with ops.colocate_with(processed_rows): update_processed_rows = state_ops.scatter_update( processed_rows, input_row_indices, math_ops.logical_and( self._is_row_sweep_var, array_ops.ones_like(input_row_indices, dtype=dtypes.bool))) with ops.colocate_with(processed_cols): update_processed_cols = state_ops.scatter_update( processed_cols, input_col_indices, math_ops.logical_and( math_ops.logical_not(self._is_row_sweep_var), array_ops.ones_like(input_col_indices, dtype=dtypes.bool))) update_processed_op = control_flow_ops.group( update_processed_rows, update_processed_cols) with ops.control_dependencies([update_processed_op]): is_sweep_done = math_ops.logical_or( math_ops.reduce_all(processed_rows), math_ops.reduce_all(processed_cols)) # Increments global step. global_step = framework_variables.get_global_step() if global_step is not None: global_step_incr_op = state_ops.assign_add( global_step, 1, name="global_step_incr").op else: global_step_incr_op = control_flow_ops.no_op() # Increments completed sweeps. completed_sweeps_incr_op = state_ops.assign_add( self._completed_sweeps_var, math_ops.cast(is_sweep_done, dtypes.int32), use_locking=True).op update_ops = control_flow_ops.group( global_step_incr_op, completed_sweeps_incr_op, state_ops.assign(is_sweep_done_var, is_sweep_done)) return update_ops, is_sweep_done_var, switch_ops