def _broadcast_uniform_partitioned_dimension(self, axis, lengths):
  """Broadcasts the partitioned dimension `axis` to match `lengths`.

  Args:
    axis: Index of the partitioned dimension to broadcast.
    lengths: Target size(s) for dimension `axis`. Handled separately depending
      on whether it is a scalar (`lengths.shape.ndims == 0`) or not.

  Returns:
    A `RaggedTensorDynamicShape` built from the updated partitioned dimension
    sizes and this shape's unchanged inner dimension sizes.
  """
  current_size = self.dimension_size(axis)
  # Dimensions before `axis` are carried over untouched.
  new_sizes = list(self._partitioned_dim_sizes[:axis])

  if lengths.shape.ndims != 0:
    # Non-scalar target: one split boundary per entry of `lengths`.
    splits = math_ops.range(
        array_ops.size(lengths, out_type=self.dim_size_dtype) + 1)
    repeats = lengths
  else:
    # Scalar target: only a size-1 dimension actually takes the new length.
    lengths = array_ops.where(
        math_ops.equal(current_size, 1), lengths, current_size)
    repeats = array_ops.where(math_ops.equal(current_size, 1), lengths, 1)
    splits = array_ops.stack([0, self.num_slices_in_dimension(axis)])
  new_sizes.append(lengths)

  # Propagate the broadcast through every later partitioned dimension.
  for later_size in self._partitioned_dim_sizes[axis + 1:]:
    if later_size.shape.ndims != 0:
      new_sizes.append(
          ragged_util.repeat_ranges(later_size, splits, repeats))
      splits = array_ops.gather(
          ragged_util.lengths_to_splits(later_size), splits)
    else:
      new_sizes.append(later_size)
      splits *= later_size

  return RaggedTensorDynamicShape(new_sizes, self._inner_dim_sizes)
def _survival_function(self, y):
  """Computes `P[Y > y]`, overriding the result at the configured cutoffs.

  Args:
    y: Value(s) at which to evaluate the survival function.

  Returns:
    The wrapped distribution's survival function evaluated at `ceil(y)`,
    replaced by 1 where `ceil(y) < low` and by 0 where `ceil(y) >= high`.
    Each override is applied only when the corresponding cutoff is not `None`.
  """
  lower = self._low
  upper = self._high

  # The contract being implemented:
  #   survival_function(y) := P[Y > y]
  #                         = 0,        if y >= high,
  #                         = 1,        if y < low,
  #                         = P[X > y], otherwise.
  # Mass sits only on the integers, so P[Y > j] = P[ceiling(Y) > j].
  j = math_ops.ceil(y)

  # P[X > j]; this is the answer strictly between the cutoffs.
  survival = self.distribution.survival_function(j)

  # Broadcast `j` against the result: a single distribution may be evaluated
  # at many points (or the other way around).
  j += array_ops.zeros_like(survival)

  # Apply the cutoffs.
  if lower is not None:
    survival = array_ops.where(
        j < lower, array_ops.ones_like(survival), survival)
  if upper is not None:
    survival = array_ops.where(
        j >= upper, array_ops.zeros_like(survival), survival)
  return survival
def _variance(self):
  """Computes the variance, handling the `df <= 2` regimes explicitly.

  Returns:
    `sigma**2 * df / (df - 2)` where `df > 2` and `inf` where `1 < df <= 2`.
    Where `df <= 1` the result is NaN if `self.allow_nan_stats` is true;
    otherwise the returned tensor carries an assertion that every component
    of `df` exceeds 1.
  """
  # Sanitize the denominator with its own `where` before dividing. The value
  # computed for df <= 2 is discarded by the `where` below, but an inf/NaN in
  # the discarded branch would still poison the gradient (0 * inf = NaN).
  denom = array_ops.where(
      math_ops.greater(self.df, 2.),
      self.df - 2.,
      array_ops.ones_like(self.df))
  var = self._ones() * math_ops.square(self.sigma) * self.df / denom
  # When 1 < df <= 2, variance is infinite.
  inf = np.array(np.inf, dtype=self.dtype.as_numpy_dtype())
  result_where_defined = array_ops.where(
      math_ops.greater(self.df, array_ops.fill(self.batch_shape(), 2.)),
      var,
      array_ops.fill(self.batch_shape(), inf, name="inf"))

  if self.allow_nan_stats:
    # Variance is undefined for df <= 1; report NaN there.
    nan = np.array(np.nan, dtype=self.dtype.as_numpy_dtype())
    return array_ops.where(
        math_ops.greater(self.df, self._ones()),
        result_where_defined,
        array_ops.fill(self.batch_shape(), nan, name="nan"))
  else:
    # NaN statistics are disallowed: fail at run time if any df <= 1.
    return control_flow_ops.with_dependencies(
        [
            check_ops.assert_less(
                array_ops.ones((), dtype=self.dtype),
                self.df,
                message="variance not defined for components of df <= 1"),
        ],
        result_where_defined)
def _safe_div(numerator, denominator, name="value"):
  """Divides element-wise, yielding 0 where the denominator is zero.

  The fallback path wraps the denominator in an extra `where` so that a zero
  denominator cannot introduce NaNs into the gradient computation.

  Args:
    numerator: An arbitrary `Tensor`.
    denominator: A `Tensor` whose shape matches `numerator` and whose values
      are assumed to be non-negative.
    name: An optional name for the returned op.

  Returns:
    The element-wise value of the numerator divided by the denominator.
  """
  if not compat.forward_compatible(2018, 11, 1):
    is_positive = math_ops.greater(denominator, 0)
    # Swap zeros for ones so the division itself never sees a zero.
    nonzero_denominator = array_ops.where(
        math_ops.equal(denominator, 0),
        array_ops.ones_like(denominator),
        denominator)
    quotient = math_ops.div(numerator, nonzero_denominator)
    return array_ops.where(
        is_positive, quotient, array_ops.zeros_like(numerator), name=name)
  return math_ops.div_no_nan(numerator, denominator, name=name)
def focal_loss(prediction_tensor, target_tensor, weights=None, alpha=0.25,
               gamma=2):
  r"""Computes the focal loss for multi-label predictions.

  With p = sigmoid(x) and z = target_tensor, the per-entry loss is
      FL = -alpha * (z - p)^gamma * log(p)
           - (1 - alpha) * p^gamma * log(1 - p)
  where the first term is zeroed wherever z < p and the second wherever z > 0.

  Args:
    prediction_tensor: A float tensor of shape [batch_size, num_anchors,
      num_classes] representing the predicted logits for each class.
    target_tensor: A float tensor of shape [batch_size, num_anchors,
      num_classes] representing one-hot encoded classification targets.
    weights: A float tensor of shape [batch_size, num_anchors]. Accepted but
      not used by this implementation.
    alpha: A scalar for the focal loss alpha hyper-parameter.
    gamma: A scalar for the focal loss gamma hyper-parameter.

  Returns:
    loss: A (scalar) tensor holding the mean of the per-entry loss.
  """
  probs = tf.nn.sigmoid(prediction_tensor)
  zero_fill = array_ops.zeros_like(probs, dtype=probs.dtype)

  # (z - p) where the target is at least the prediction, otherwise 0.
  positive_gap = array_ops.where(
      target_tensor >= probs, target_tensor - probs, zero_fill)
  # p where the target is not positive, otherwise 0.
  negative_prob = array_ops.where(target_tensor > zero_fill, zero_fill, probs)

  # Probabilities are clipped away from 0 before taking the log.
  positive_term = -alpha * (positive_gap ** gamma) * tf.log(
      tf.clip_by_value(probs, 1e-8, 1.0))
  negative_term = (1 - alpha) * (negative_prob ** gamma) * tf.log(
      tf.clip_by_value(1.0 - probs, 1e-8, 1.0))
  return tf.reduce_mean(positive_term - negative_term)
def _variance(self):
  """Computes the variance, handling the `df <= 2` regimes explicitly.

  Returns:
    `scale**2 * df / (df - 2)` where `df > 2` and `inf` where `1 < df <= 2`.
    Where `df <= 1` the result is NaN if `self.allow_nan_stats` is true;
    otherwise the returned tensor carries an assertion that every component
    of `df` exceeds 1.
  """
  # The denominator gets its own `where`, nested inside the one below, so
  # that the gradient can never hit a NaN.
  safe_denominator = array_ops.where(
      math_ops.greater(self.df, 2.),
      self.df - 2.,
      array_ops.ones_like(self.df))
  # No abs(scale) needed: scale is squared.
  batch_ones = array_ops.ones(self.batch_shape_tensor(), dtype=self.dtype)
  finite_variance = (
      batch_ones * math_ops.square(self.scale) * self.df / safe_denominator)
  # The variance is infinite when 1 < df <= 2.
  inf = np.array(np.inf, dtype=self.dtype.as_numpy_dtype())
  variance_or_inf = array_ops.where(
      self.df > array_ops.fill(self.batch_shape_tensor(), 2.),
      finite_variance,
      array_ops.fill(self.batch_shape_tensor(), inf, name="inf"))

  if not self.allow_nan_stats:
    # NaN statistics are disallowed: fail at run time if any df <= 1.
    df_above_one = check_ops.assert_less(
        array_ops.ones([], dtype=self.dtype),
        self.df,
        message="variance not defined for components of df <= 1")
    return control_flow_ops.with_dependencies(
        [df_above_one], variance_or_inf)

  # Variance is undefined for df <= 1; report NaN there.
  nan = np.array(np.nan, dtype=self.dtype.as_numpy_dtype())
  df_exceeds_one = math_ops.greater(
      self.df,
      array_ops.ones(self.batch_shape_tensor(), dtype=self.dtype))
  return array_ops.where(
      df_exceeds_one,
      variance_or_inf,
      array_ops.fill(self.batch_shape_tensor(), nan, name="nan"))
def _safe_div(numerator, denominator, name="value"):
  """Computes a safe divide which returns 0 if the denominator is zero.

  Note that the function contains an additional conditional check that is
  necessary for avoiding situations where the loss is zero causing NaNs to
  creep into the gradient computation.

  Args:
    numerator: An arbitrary `Tensor`.
    denominator: A Python float, or a `Tensor` whose shape matches `numerator`
      and whose values are assumed to be non-negative.
    name: An optional name for the returned op. Only applied on the general
      tensor path; the float and eager-scalar shortcuts ignore it.

  Returns:
    The element-wise value of the numerator divided by the denominator.
  """
  if isinstance(denominator, float):
    # A Python float is compared in Python. Routing it through
    # `math_ops.equal` yields a Tensor, and using a Tensor as a Python bool
    # raises when building a graph.
    if denominator == 0.0:
      return ops.convert_to_tensor(0.0, dtype=numerator.dtype)
    return math_ops.div(numerator, denominator)
  if context.in_eager_mode() and denominator._rank() == 0:  # pylint: disable=protected-access
    # Eager scalars have concrete values, so the zero test can run directly.
    if math_ops.equal(denominator, 0.0):
      return ops.convert_to_tensor(0.0, dtype=numerator.dtype)
    return math_ops.div(numerator, denominator)
  # General case: replace zeros in the denominator by ones before dividing so
  # the division never produces inf/NaN, then select 0 for those entries.
  return array_ops.where(
      math_ops.greater(denominator, 0),
      math_ops.div(numerator,
                   array_ops.where(
                       math_ops.equal(denominator, 0),
                       array_ops.ones_like(denominator), denominator)),
      array_ops.zeros_like(numerator),
      name=name)
def body(time, outputs_ta, state, inputs, finished, sequence_lengths):
  """Internal while_loop body.

  Args:
    time: scalar int32 tensor.
    outputs_ta: structure of TensorArray.
    state: (structure of) state tensors and TensorArrays.
    inputs: (structure of) input tensors.
    finished: bool tensor (keeping track of what's finished).
    sequence_lengths: int32 tensor (keeping track of time of finish).

  Returns:
    `(time + 1, outputs_ta, next_state, next_inputs, next_finished,
      next_sequence_lengths)`.
  """
  step_outputs, step_state, step_inputs, step_finished = decoder.step(
      time, inputs, state)

  now_finished = math_ops.logical_or(step_finished, finished)
  if maximum_iterations is not None:
    now_finished = math_ops.logical_or(
        now_finished, time + 1 >= maximum_iterations)

  # Entries that finish on this very step record `time + 1` as their length.
  just_finished = math_ops.logical_and(
      math_ops.logical_not(finished), now_finished)
  updated_lengths = array_ops.where(
      just_finished,
      array_ops.fill(array_ops.shape(sequence_lengths), time + 1),
      sequence_lengths)

  nest.assert_same_structure(state, step_state)
  nest.assert_same_structure(outputs_ta, step_outputs)
  nest.assert_same_structure(inputs, step_inputs)

  def _zero_if_finished(out, zero):
    # Outputs emitted after an entry has finished are replaced by zeros.
    return array_ops.where(finished, zero, out)

  def _maybe_copy_state(new, cur):
    # TensorArrays and scalar states are passed through unchanged; everything
    # else keeps its previous value once the entry has finished.
    if isinstance(cur, tensor_array_ops.TensorArray):
      return new
    new.set_shape(cur.shape)
    if new.shape.ndims == 0:
      return new
    return array_ops.where(finished, cur, new)

  if impute_finished:
    emit = nest.map_structure(_zero_if_finished, step_outputs, zero_outputs)
    carried_state = nest.map_structure(_maybe_copy_state, step_state, state)
  else:
    emit = step_outputs
    carried_state = step_state

  outputs_ta = nest.map_structure(
      lambda ta, out: ta.write(time, out), outputs_ta, emit)
  return (time + 1, outputs_ta, carried_state, step_inputs, now_finished,
          updated_lengths)
def _cdf(self, x):
  """Evaluates the CDF on the interval bounded by `self.a` and `self.b`.

  Args:
    x: Value(s) at which to evaluate the CDF.

  Returns:
    0 where `x < self.a`, 1 where `x >= self.b`, and
    `(x - self.a) / self.range()` elsewhere.
  """
  x_full = x * array_ops.ones(self.batch_shape())
  # Constant fills shaped like x broadcast against both endpoints.
  zero_fill = array_ops.zeros_like(x + self.a + self.b, dtype=self.dtype)
  one_fill = array_ops.ones_like(x + self.a + self.b, dtype=self.dtype)

  below_support = x < self.a
  linear_part = (x_full - self.a) / self.range()
  clipped_below = array_ops.where(below_support, zero_fill, linear_part)

  at_or_above_upper = x >= self.b
  return array_ops.where(at_or_above_upper, one_fill, clipped_below)
def per_example_quantile_regression_loss(labels, weights, predictions,
                                         quantile):
  """Smoothed loss for quantile regression.

  The standard quantile regression loss is quantile*(y-y') when y>y' and
  (quantile-1)*(y-y') otherwise, where y' is a prediction and y is a label.
  This implementation uses that loss but squares it in the region where the
  loss value is below 1.

  Args:
    labels: Rank 2 (N, D) tensor of per-example labels.
    weights: Rank 2 (N, 1) tensor of per-example weights, or `None` for an
      unweighted loss.
    predictions: Rank 2 (N, D) tensor of per-example predictions.
    quantile: The quantile to use.

  Returns:
    loss: A tensor of per-example quantile loss.
    update_op: An update operation to update the loss's internal state (a
      no-op here).
  """
  labels = math_ops.to_float(labels)
  residual = labels - predictions

  # Loss used when the label exceeds the prediction.
  loss_if_under = array_ops.where(
      residual * quantile < 1.0,
      math_ops.square(quantile * residual),
      quantile * residual)
  # Loss used when the prediction is at or above the label.
  loss_if_over = array_ops.where(
      residual * (quantile - 1) < 1,
      math_ops.square((quantile - 1) * residual),
      (quantile - 1) * residual)

  loss = array_ops.where(residual > 0, loss_if_under, loss_if_over)
  if weights is not None:
    return loss * weights, control_flow_ops.no_op()
  return loss, control_flow_ops.no_op()
def _log_cdf(self, y):
  """Computes `log P[Y <= y]`, overriding the result at the cutoffs.

  Args:
    y: Value(s) at which to evaluate the log-CDF.

  Returns:
    The wrapped distribution's log-CDF evaluated at `floor(y)`, replaced by
    `-inf` where `floor(y) < low` and by 0 where `floor(y) >= high`. Each
    override is applied only when the corresponding cutoff is not `None`.
  """
  lower = self._low
  upper = self._high

  # The contract being implemented:
  #   cdf(y) := P[Y <= y]
  #           = 1,         if y >= high,
  #           = 0,         if y < low,
  #           = P[X <= y], otherwise.
  # Mass sits only on the integers, so P[Y <= j] = P[floor(Y) <= j].
  j = math_ops.floor(y)

  log_cdf = self.distribution.log_cdf(j)

  # Broadcast `j` against the result: a single distribution may be evaluated
  # at many points (or the other way around).
  j += array_ops.zeros_like(log_cdf)

  # Apply the cutoffs (log 0 = -inf below, log 1 = 0 above).
  if lower is not None:
    minus_inf = -np.inf * array_ops.ones_like(log_cdf)
    log_cdf = array_ops.where(j < lower, minus_inf, log_cdf)
  if upper is not None:
    log_cdf = array_ops.where(
        j >= upper, array_ops.zeros_like(log_cdf), log_cdf)
  return log_cdf
def _cdf(self, y):
  """Computes `P[Y <= y]`, overriding the result at the cutoffs.

  Args:
    y: Value(s) at which to evaluate the CDF.

  Returns:
    The wrapped distribution's CDF evaluated at `floor(y)`, replaced by 0
    where `floor(y) < lower_cutoff` and by 1 where `floor(y) >= upper_cutoff`.
    Each override is applied only when the corresponding cutoff is not `None`.
  """
  lower = self._lower_cutoff
  upper = self._upper_cutoff

  # The contract being implemented:
  #   cdf(y) := P[Y <= y]
  #           = 1,         if y >= upper_cutoff,
  #           = 0,         if y < lower_cutoff,
  #           = P[X <= y], otherwise.
  # Mass sits only on the integers, so P[Y <= j] = P[floor(Y) <= j].
  j = math_ops.floor(y)

  # P[X <= j]; this is the answer strictly between the cutoffs.
  cdf = self.distribution.cdf(j)

  # Broadcast `j` against the result: a single distribution may be evaluated
  # at many points (or the other way around).
  j += array_ops.zeros_like(cdf)

  # Apply the cutoffs.
  if lower is not None:
    cdf = array_ops.where(j < lower, array_ops.zeros_like(cdf), cdf)
  if upper is not None:
    cdf = array_ops.where(j >= upper, array_ops.ones_like(cdf), cdf)
  return cdf
def per_example_maxent_loss(labels, weights, logits, num_classes, eps=1e-15):
  """Maximum entropy loss for multiclass problems.

  Maximum entropy generalizes logistic loss to the case where more than two
  classes are present.

  Args:
    labels: Rank 2 (N, 1) or Rank 1 (N) tensor of per-example labels.
    weights: Rank 2 (N, 1) tensor of per-example weights, or `None` for an
      unweighted loss.
    logits: Rank 2 (N, K) tensor of per-example predictions, K - num of
      classes.
    num_classes: number of classes in classification task. Used to expand
      label indices into one-hot encodings.
    eps: tolerance, used as a minimum possible value.

  Returns:
    loss: A Rank 2 (N, 1) tensor of per-example maxent loss
    update_op: An update operation to update the loss's internal state (a
      no-op here).
  """
  labels = math_ops.to_int64(labels)
  # Promote rank-1 labels to rank 2.
  if len(labels.get_shape()) != 2:
    labels = array_ops.expand_dims(labels, 1)

  # Class indices -> one-hot rows.
  one_hot = array_ops.one_hot(indices=labels, depth=num_classes)
  labels = math_ops.reduce_sum(input_tensor=one_hot, reduction_indices=[1])
  labels = math_ops.to_float(labels)

  # Softmax over classes, with `eps` added to the normalizer.
  exp_logits = math_ops.exp(logits)
  totals = math_ops.reduce_sum(exp_logits, 1, keepdims=True)
  softmax = math_ops.divide(exp_logits, math_ops.add(totals, eps))

  # Probability assigned to the true class of each example.
  true_class_prob = math_ops.reduce_sum(labels * softmax, 1)

  # Keep the probability inside [eps, 1 - eps] before taking the log.
  lower_fill = array_ops.zeros_like(true_class_prob, dtype=logits.dtype) + eps
  upper_fill = array_ops.ones_like(true_class_prob, dtype=logits.dtype) - eps
  # maximum(eps, prob)
  true_class_prob = array_ops.where(
      true_class_prob >= eps, true_class_prob, lower_fill)
  # minimum(1 - eps, prob)
  true_class_prob = array_ops.where(
      true_class_prob <= 1 - eps, true_class_prob, upper_fill)

  loss = array_ops.expand_dims(-math_ops.log(true_class_prob), 1)
  if weights is not None:
    return loss * weights, control_flow_ops.no_op()
  return loss, control_flow_ops.no_op()
def _cdf(self, x):
  """Evaluates the CDF on the interval bounded by `self.low` and `self.high`.

  Args:
    x: Value(s) at which to evaluate the CDF.

  Returns:
    0 where `x < self.low`, 1 where `x >= self.high`, and
    `(x - self.low) / self.range()` elsewhere, shaped like `x` broadcast
    against the batch shape.
  """
  full_shape = array_ops.broadcast_dynamic_shape(
      array_ops.shape(x), self.batch_shape_tensor())
  zero_fill = array_ops.zeros(full_shape, dtype=self.dtype)
  one_fill = array_ops.ones(full_shape, dtype=self.dtype)
  x_full = x * one_fill

  below_support = x < self.low
  linear_part = (x_full - self.low) / self.range()
  clipped_below = array_ops.where(below_support, zero_fill, linear_part)

  at_or_above_upper = x >= self.high
  return array_ops.where(at_or_above_upper, one_fill, clipped_below)
def _nest_where(vals, cases):
  """Builds a chain of nested `where` ops that picks a case by threshold.

  `cases[i]` is selected for the first `i` with `l1_norm < vals[i]`; the last
  entry of `cases` is the fallback when no threshold is exceeded.

  Args:
    vals: Sequence of thresholds compared against `l1_norm`.
    cases: Sequence of candidate values, exactly one longer than `vals`.

  Returns:
    The result of the nested `array_ops.where` selection.
  """
  assert len(vals) == len(cases) - 1
  below_first = math_ops.less(l1_norm, const(vals[0]))
  if len(vals) == 1:
    fallback = cases[1]
  else:
    # Remaining thresholds are resolved recursively on the tail.
    fallback = _nest_where(vals[1:], cases[1:])
  return array_ops.where(below_first, cases[0], fallback)
def _get_coordinatewise_learning_rate(self, grad, var):
  """Computes the learning rate for `var` from running gradient moments.

  Updates the 'first_moment' and 'second_moment' slots of `var` from `grad`
  and derives a preconditioner from the clipped second-moment slot.

  Args:
    grad: The gradient for `var`; an `ops.Tensor` or `ops.IndexedSlices`.
    var: The variable whose slots are read and updated.

  Returns:
    `2 * batch_size / (total_num_examples * diag_preconditioner)`, where the
    preconditioner has been scaled by the batch size and, if
    `self._use_single_learning_rate` is set, reduced to its mean.

  Raises:
    errors.InvalidArgumentError: If `grad` is neither an `ops.Tensor` nor an
      `ops.IndexedSlices`.
  """
  # Compute the learning rate using a moving average for the diagonal of BB^T
  avg_first = self.get_slot(var, 'first_moment')
  avg_second = self.get_slot(var, 'second_moment')
  decay_tensor = math_ops.cast(self._decay_tensor, var.dtype)
  batch_size = math_ops.cast(self._batch_size_tensor, var.dtype)

  # Create an estimator for the moving average of gradient mean and variance
  # via Welford's algorithm
  if isinstance(grad, ops.Tensor):
    delta = grad - avg_first
    # While the counter is below 1 the full delta is added; afterwards it is
    # scaled by (1 - decay).
    first_moment_update = avg_first.assign_add(
        array_ops.where(self._counter < 1, math_ops.cast(1, var.dtype),
                        1. - decay_tensor) * delta)

    # The second moment is updated only after the first-moment update ran.
    with ops.control_dependencies([first_moment_update]):
      second_moment_update = avg_second.assign_add(
          math_ops.cast(self._counter < 1, var.dtype) * -(1. - decay_tensor) * (
              avg_second - decay_tensor * math_ops.square(delta)))
    # Clip the second moment to [1e-12, 1e12], gated on the update above.
    diag_preconditioner = control_flow_ops.with_dependencies(
        [second_moment_update],
        clip_ops.clip_by_value(avg_second, 1e-12, 1e12))
  elif isinstance(grad, ops.IndexedSlices):
    # Sparse gradient: only the rows named by `grad.indices` are touched.
    delta = grad.values - array_ops.gather_nd(avg_first, grad.indices)
    first_moment_update = state_ops.scatter_add(
        avg_first, grad.indices,
        array_ops.where(self._counter < 1, math_ops.cast(1., var.dtype),
                        1. - decay_tensor) * delta)

    with ops.control_dependencies([first_moment_update]):
      avg_second = state_ops.scatter_add(
          avg_second, grad.indices,
          math_ops.cast(self._counter < 1, var.dtype) * -(1. - decay_tensor) * (
              array_ops.gather_nd(avg_second, grad.indices) -
              decay_tensor * math_ops.square(delta)))
      # From here on `avg_second` holds only the gathered rows.
      avg_second = array_ops.gather_nd(avg_second, grad.indices)
      # TODO(b/70783772)
      diag_preconditioner = clip_ops.clip_by_value(avg_second, 1e-12, 1e12)
  else:
    raise errors.InvalidArgumentError(
        None, None, 'grad must of type Tensor or IndexedSlice')

  diag_preconditioner *= batch_size

  if self._use_single_learning_rate:
    # Collapse the per-coordinate preconditioner into a single scalar.
    diag_preconditioner = math_ops.reduce_mean(diag_preconditioner)

  # From Theorem 2 Corollary 1 of Mandt et al. 2017
  return 2. * batch_size / (
      math_ops.cast(self._total_num_examples, var.dtype.base_dtype) *
      diag_preconditioner)
def _prob(self, x):
  """Evaluates the density on the half-open interval `[low, high)`.

  Args:
    x: Value(s) at which to evaluate the density.

  Returns:
    NaN where `x` is NaN, 0 where `x < self.low` or `x >= self.high`, and
    `1 / self.range()` elsewhere.
  """
  x_full = x * array_ops.ones(self.batch_shape_tensor())
  nan_mask = math_ops.is_nan(x_full)
  outside_support = math_ops.logical_or(
      x_full < self.low, x_full >= self.high)
  density = array_ops.where(
      outside_support,
      array_ops.zeros_like(x_full),
      array_ops.ones_like(x_full) / self.range())
  # NaN inputs are propagated unchanged.
  return array_ops.where(nan_mask, x_full, density)
def _prob(self, x):
  """Evaluates the density on the closed interval `[a, b]`.

  Args:
    x: Value(s) at which to evaluate the density.

  Returns:
    NaN where `x` is NaN, 0 where `x < self.a` or `x > self.b`, and
    `1 / self.range()` elsewhere.
  """
  x_full = x * array_ops.ones(self.batch_shape())
  nan_mask = math_ops.is_nan(x_full)
  outside_support = math_ops.logical_or(x_full < self.a, x_full > self.b)
  density = array_ops.where(
      outside_support,
      array_ops.zeros_like(x_full),
      (1. / self.range()) * array_ops.ones_like(x_full))
  # NaN inputs are propagated unchanged.
  return array_ops.where(nan_mask, x_full, density)
def _loop_body(iter_, total, to_skip):
  """Adds one slice of `samples` to `total`, honoring the remaining skip.

  Args:
    iter_: Index of the slice `samples[..., iter_]` handled in this step.
    total: Running total accumulated so far.
    to_skip: Amount still to be skipped before contributions count.

  Returns:
    `[iter_ + 1, updated_total, updated_to_skip]`.
  """
  whole_step_skipped = step <= to_skip
  partly_skipped = to_skip > 0.
  # Only the part of the step beyond the remaining skip contributes.
  partial_total = total + (step - to_skip) * samples[..., iter_]
  full_total = total + step * samples[..., iter_]
  counted_total = array_ops.where(partly_skipped, partial_total, full_total)
  total = array_ops.where(whole_step_skipped, total, counted_total)

  # Consume the skip budget; it drops to zero once a step is not fully skipped.
  to_skip = array_ops.where(step <= to_skip, to_skip - step, 0.)
  return [iter_ + 1, total, to_skip]
def exp_with_logits(name, eps, labels=None, logits=None):
  """Computes exponential loss given `logits`.

  The loss returned is exp(-targets * modified_predictions), where
  modified_predictions is 1 if sigmoid is >= 0.5+eps (i.e. the positive class
  is predicted), -1 if sigmoid is < 0.5-eps (i.e. the negative class is
  predicted), and a*x+b on the interval (0.5-eps, 0.5+eps), where a = 1/eps
  and b = -1/(2*eps).

  Args:
    name: A name for the operation (optional).
    eps: For the range (0.5-eps, 0.5+eps) we set the predictions to be ax+b.
    labels: A `Tensor` of the same type and shape as `logits`.
    logits: A `Tensor` of type `float32` or `float64`.

  Returns:
    A `Tensor` of the same shape as `logits` with the componentwise
    exponential losses.

  Raises:
    ValueError: If `logits` and `labels` do not have the same shape.
  """
  with ops.name_scope(name, "exp_loss", [logits, labels]) as name:
    logits = ops.convert_to_tensor(logits, name="logits")
    labels = ops.convert_to_tensor(labels, name="labels")
    try:
      labels.get_shape().merge_with(logits.get_shape())
    except ValueError:
      raise ValueError("logits and labels must have the same shape (%s vs %s)" %
                       (logits.get_shape(), labels.get_shape()))

  # Constant fills used for thresholding and clamping.
  zero_fill = array_ops.zeros_like(logits, dtype=logits.dtype)
  plus_one = array_ops.ones_like(logits, dtype=logits.dtype)
  minus_one = -array_ops.ones_like(logits, dtype=logits.dtype)

  # Map labels to +1 (positive) or -1 (everything else).
  is_positive_label = (labels > zero_fill)
  signed_labels = array_ops.where(is_positive_label, plus_one, minus_one)

  # Map predictions into [-1, 1] via min(1, max(-1, a*p + b)),
  # with a = 1/eps and b = -1/(2*eps).
  slope = 1.0 / eps
  offset = -1.0 / 2 / eps
  probs = math_ops.sigmoid(logits)
  linear = slope * probs + offset
  # max(-1, a*p + b)
  clamped_low = array_ops.where(linear < -1, minus_one, linear)
  # min(1, ...)
  signed_preds = array_ops.where(clamped_low > 1, plus_one, clamped_low)

  return math_ops.exp(-signed_preds * signed_labels)
def pick_vector(cond, true_vector, false_vector, name="pick_vector"):
  """Picks possibly different length row `Tensor`s based on condition.

  Value `Tensor`s should have exactly one dimension.

  If the value of `cond` is statically known (e.g. a Python Boolean or a
  `tf.constant`), either `true_vector` or `false_vector` is returned
  immediately, exactly as passed in: no selection ops are built and the
  vectors are not validated.

  Args:
    cond: `Tensor`. Must have `dtype=tf.bool` and be scalar.
    true_vector: `Tensor` of one dimension. Returned when cond is `True`.
    false_vector: `Tensor` of one dimension. Returned when cond is `False`.
    name: `String`. The name to give this op.

  Example:

  ```python
  pick_vector(tf.less(0, 5), tf.range(10, 12), tf.range(15, 18))
  # result is tensor: [10, 11].
  pick_vector(tf.less(5, 0), tf.range(10, 12), tf.range(15, 18))
  # result is tensor: [15, 16, 17].
  ```

  Returns:
    true_or_false_vector: `Tensor`.

  Raises:
    TypeError: if `cond.dtype != tf.bool`
    TypeError: if `cond` is not a constant and
      `true_vector.dtype != false_vector.dtype`
  """
  with ops.name_scope(name, values=(cond, true_vector, false_vector)):
    cond = ops.convert_to_tensor(cond, name="cond")
    if cond.dtype != dtypes.bool:
      raise TypeError("%s.dtype=%s which is not %s" %
                      (cond.name, cond.dtype, dtypes.bool))

    # Shortcut: a statically known condition needs no graph-level selection.
    static_cond = tensor_util.constant_value(cond)
    if static_cond is not None:
      if static_cond:
        return true_vector
      return false_vector

    true_vector = ops.convert_to_tensor(true_vector, name="true_vector")
    false_vector = ops.convert_to_tensor(false_vector, name="false_vector")
    if true_vector.dtype != false_vector.dtype:
      raise TypeError(
          "%s.dtype=%s does not match %s.dtype=%s"
          % (true_vector.name, true_vector.dtype,
             false_vector.name, false_vector.dtype))

    # Concatenate both vectors and slice out whichever half `cond` selects:
    # [0, n) for the true vector, [n, end) for the false one.
    n = array_ops.shape(true_vector)[0]
    joined = array_ops.concat((true_vector, false_vector), 0)
    begin = [array_ops.where(cond, 0, n)]
    size = [array_ops.where(cond, n, -1)]
    return array_ops.slice(joined, begin, size)
def _forward_log_det_jacobian(self, x):
  """Computes the forward log-det-Jacobian of the scale transformation.

  Args:
    x: Input `Tensor`; only the size of its last dimension is read, and only
      when the scale is an identity multiplier.

  Returns:
    For an identity-multiplier scale, `log|scale|` times 1 (when
    `event_ndims` is 0) or times the size of the last dimension of `x`.
    Otherwise `self._scale.sqrt_log_abs_det()` with the padded leading
    dimension removed when applicable.
  """
  if not self._is_only_identity_multiplier:
    fldj = self._scale.sqrt_log_abs_det()
    # Squeeze off the padded dimension, if one was added.
    first_kept = array_ops.where(self._rank_two_event_ndims_one, 1, 0)
    return array_ops.reshape(fldj, array_ops.shape(fldj)[first_kept:])

  # TODO(jvdillon): We don't pad in this case and instead let the fldj be
  # applied via broadcast.
  last_dim = math_ops.cast(array_ops.shape(x)[-1], dtype=self._scale.dtype)
  log_abs_scale = math_ops.log(math_ops.abs(self._scale))
  multiplicity = array_ops.where(
      math_ops.equal(self._shaper.event_ndims, 0), 1., last_dim)
  return log_abs_scale * multiplicity
def clip_by_norm(t, clip_norm, axes=None, name=None):
  """Clips tensor values to a maximum L2-norm.

  Given a tensor `t` and a maximum clip value `clip_norm`, this operation
  normalizes `t` so that its L2-norm, taken along the dimensions given in
  `axes`, is less than or equal to `clip_norm`. In the default case, where all
  dimensions are used, `t` is left unmodified if its L2-norm is already at
  most `clip_norm`. Otherwise the result has the same type and shape as `t`
  with its values set to:

  `t * clip_norm / l2norm(t)`

  so that the L2-norm of the output is `clip_norm`.

  As another example, if `t` is a matrix and `axes == [1]`, then each row of
  the output will have L2-norm at most `clip_norm`. If `axes == [0]` instead,
  each column of the output will be clipped.

  This operation is typically used to clip gradients before applying them with
  an optimizer.

  Args:
    t: A `Tensor` or `IndexedSlices`.
    clip_norm: A 0-D (scalar) `Tensor` > 0. A maximum clipping value.
    axes: A 1-D (vector) `Tensor` of type int32 containing the dimensions to
      use for computing the L2-norm. If `None` (the default), uses all
      dimensions.
    name: A name for the operation (optional).

  Returns:
    A clipped `Tensor` or `IndexedSlices`.
  """
  with ops.name_scope(name, "clip_by_norm", [t, clip_norm]) as name:
    if isinstance(t, ops.IndexedSlices):
      values = ops.convert_to_tensor(t.values, name="t")
    else:
      values = ops.convert_to_tensor(t, name="t")

    # L2-norm along `axes`; elements are then scaled by clip_norm / L2-norm.
    squared_sum = math_ops.reduce_sum(values * values, axes, keepdims=True)
    is_nonzero = squared_sum > 0
    # Two-tap tf.where trick: sqrt only ever sees positive inputs, so its
    # gradient at zero cannot produce NaN.
    sqrt_input = array_ops.where(
        is_nonzero, squared_sum, array_ops.ones_like(squared_sum))
    l2norm = array_ops.where(
        is_nonzero, math_ops.sqrt(sqrt_input), squared_sum)
    scaled = values * clip_norm
    # Guard against unintentional broadcasting: the scaled tensor must be
    # shape-compatible with the input.
    _ = values.shape.merge_with(scaled.shape)
    clipped = array_ops.identity(
        scaled / math_ops.maximum(l2norm, clip_norm), name=name)

    if not isinstance(t, ops.IndexedSlices):
      return clipped
    return ops.IndexedSlices(clipped, t.indices, t.dense_shape)
def _ndtr(x):
  """Implements ndtr core logic.

  Args:
    x: Floating-point `Tensor` of inputs.

  Returns:
    A `Tensor` with the dtype of `x`, assembled from `erf` for small
    `|x| / sqrt(2)` and from `erfc` otherwise, then halved.
  """
  half_sqrt_2 = constant_op.constant(
      0.5 * math.sqrt(2.), dtype=x.dtype, name="half_sqrt_2")
  scaled = x * half_sqrt_2
  magnitude = math_ops.abs(scaled)

  is_central = math_ops.less(magnitude, half_sqrt_2)
  central_value = 1. + math_ops.erf(scaled)
  # Away from the center, use erfc of the magnitude, mirrored for scaled > 0.
  is_upper_tail = math_ops.greater(scaled, 0.)
  upper_tail_value = 2. - math_ops.erfc(magnitude)
  lower_tail_value = math_ops.erfc(magnitude)
  tail_value = array_ops.where(
      is_upper_tail, upper_tail_value, lower_tail_value)

  twice_ndtr = array_ops.where(is_central, central_value, tail_value)
  return 0.5 * twice_ndtr
def testShapeMismatch(self):
  """Checks that `where` rejects a condition of incompatible shape.

  The condition has shape (8,) while both value arrays have shape (16, 3, 2);
  a `ValueError` is expected for every tested dtype.
  """
  # Builtin `bool` replaces `np.bool`: the alias was removed in NumPy 1.24 and
  # raises AttributeError there. Both name the same type on older releases.
  c = np.random.randint(0, 2, 8).astype(bool)
  x = np.random.rand(16, 3, 2) * 100
  y = np.random.rand(16, 3, 2) * 100
  for t in [
      np.float16, np.float32, np.float64, np.int32, np.int64, np.complex64,
      np.complex128
  ]:
    xt = x.astype(t)
    yt = y.astype(t)
    with self.assertRaises(ValueError):
      array_ops.where(c, xt, yt)
def softplus_inverse(x, name=None):
  """Computes the inverse softplus, i.e., x = softplus_inverse(softplus(x)).

  Mathematically this op is equivalent to:

  ```none
  softplus_inverse = log(exp(x) - 1.)
  ```

  Args:
    x: `Tensor`. Non-negative (not enforced), floating-point.
    name: A name for the operation (optional).

  Returns:
    `Tensor`. Has the same type/shape as input `x`.
  """
  with ops.name_scope(name, "softplus_inverse", values=[x]):
    x = ops.convert_to_tensor(x, name="x")
    # Derivation of a numerically stable form:
    #     x = softplus(y) = Log[1 + exp{y}], (which means x > 0).
    # ==> exp{x} = 1 + exp{y}                                (1)
    # ==> y = Log[exp{x} - 1]                                (2)
    #       = Log[(exp{x} - 1) / exp{x}] + Log[exp{x}]
    #       = Log[(1 - exp{-x}) / 1] + Log[exp{x}]
    #       = Log[1 - exp{-x}] + x                           (3)
    # (2) is the "obvious" inverse, but (3) is more stable for large x. For
    # small x (e.g. x = 1e-10), (3) would become -inf because 1 - exp{-x}
    # rounds to zero; there we use 1 - exp{-x} approx x for small x > 0.
    #
    # On top of that, small/large values are clamped to be congruent with the
    # logic in:
    # tensorflow/core/kernels/softplus_op.h
    #
    # Finally, wherever the input is too small or too large it is replaced by
    # one before evaluating (3), so that no unselected codepath is +/- inf.
    # The gradient of `where` behaves like `pred*pred_true +
    # (1-pred)*pred_false`, so an `inf` in an unselected path would give
    # `0*inf=nan`. The replacement only happens where the value is never
    # actually used. Ones are used rather than zeros because
    # `log(expm1(0.)) = -inf`.
    threshold = np.log(np.finfo(x.dtype.as_numpy_dtype).eps) + 2.
    below_range = math_ops.less(x, np.exp(threshold))
    above_range = math_ops.greater(x, -threshold)
    value_if_below = math_ops.log(x)
    value_if_above = x
    # The surrogate ones never reach the output: the final `where` picks the
    # clamped values wherever they were substituted.
    x = array_ops.where(
        math_ops.logical_or(below_range, above_range),
        array_ops.ones_like(x),
        x)
    in_range_value = x + math_ops.log(-math_ops.expm1(-x))  # == log(expm1(x))
    above_or_in_range = array_ops.where(
        above_range, value_if_above, in_range_value)
    return array_ops.where(below_range, value_if_below, above_or_in_range)
def sparsemax_loss(logits, sparsemax, labels, name=None):
  """Computes sparsemax loss function [1].

  [1]: https://arxiv.org/abs/1602.02068

  Args:
    logits: A `Tensor`. Must be one of the following types: `half`, `float32`,
      `float64`.
    sparsemax: A `Tensor`. Must have the same type as `logits`.
    labels: A `Tensor`. Must have the same type as `logits`.
    name: A name for the operation (optional).

  Returns:
    A `Tensor`. Has the same type as `logits`.
  """
  with ops.name_scope(name, "sparsemax_loss",
                      [logits, sparsemax, labels]) as name:
    logits = ops.convert_to_tensor(logits, name="logits")
    sparsemax = ops.convert_to_tensor(sparsemax, name="sparsemax")
    labels = ops.convert_to_tensor(labels, name="labels")

    # The paper calls the logits z. In theory a constant could be subtracted
    # from them for numerical stability, but this algorithm has no major
    # source of numerical instability.
    z = logits

    # Sum over the support. A conditional `where` is used instead of a
    # multiplication to support z = -inf: with no support (sparsemax = 0) a
    # multiplication would give 0 * -inf = nan, which is not correct here.
    in_support_or_nan = math_ops.logical_or(
        sparsemax > 0, math_ops.is_nan(sparsemax))
    support_term = array_ops.where(
        in_support_or_nan,
        sparsemax * (z - 0.5 * sparsemax),
        array_ops.zeros_like(sparsemax))

    # - z_k + ||q||^2
    label_term = labels * (0.5 * labels - z)

    # Where labels = 0 and z = -inf, the label term would be 0 * -inf = nan.
    # Since the label is 0, z = -inf should carry no cost there, so it is
    # zeroed out. The condition also matches z = inf; in that case sparsemax
    # is nan, so the support term is nan as well and no additional special
    # treatment is needed.
    zero_label_inf_logit = math_ops.logical_and(
        math_ops.equal(labels, 0), math_ops.is_inf(z))
    safe_label_term = array_ops.where(
        zero_label_inf_logit, array_ops.zeros_like(z), label_term)

    return math_ops.reduce_sum(support_term + safe_label_term, axis=1)
def compute_lr(self, grad, var):
  """Scales the learning rate by a trust ratio of weight and gradient norms.

  Args:
    grad: Gradient for `var`.
    var: The variable being updated; its name is matched against the entries
      of `self._skip_list`.

  Returns:
    `self._learning_rate` unchanged when any skip-list entry is a substring of
    `var.name`; otherwise the learning rate multiplied by the trust ratio,
    which is 1.0 whenever the weight norm or the gradient norm is not
    positive.
  """
  skip_list = self._skip_list
  if skip_list is not None and any(v in var.name for v in skip_list):
    return self._learning_rate

  w_norm = linalg_ops.norm(var, ord=2)
  g_norm = linalg_ops.norm(grad, ord=2)
  has_weight = math_ops.greater(w_norm, 0)
  has_gradient = math_ops.greater(g_norm, 0)
  raw_ratio = (self._eeta * w_norm /
               (g_norm + self._weight_decay * w_norm + self._epsilon))
  ratio_given_weight = array_ops.where(has_gradient, raw_ratio, 1.0)
  trust_ratio = array_ops.where(has_weight, ratio_given_weight, 1.0)
  return self._learning_rate * trust_ratio
def body_fn(t, state, ta):
  """Runs one time step of `cell` for example `i` and records its output.

  Args:
    t: Current time step.
    state: (Structure of) state tensors carried between steps.
    ta: TensorArray collecting the per-step outputs.

  Returns:
    `(t + 1, new_state, ta)`. Once `t >= sequence_length_i` the written output
    is `zeros` and the previous state is carried forward unchanged.
  """
  step_input = array_ops.expand_dims(
      array_ops.gather(inputs_ta.read(t), i), 0)
  output, stepped_state = cell(step_input, state)
  output = array_ops.reshape(output, [-1])
  # TODO(agarwal): one optimization that dynamic_rnn uses is to avoid the
  # array_ops.where when t < min(sequence_length). Doing that requires
  # supporting tf.cond pfor conversion.
  done = t >= sequence_length_i
  output = array_ops.where(done, zeros, output)
  ta = ta.write(t, output)

  old_parts = nest.flatten(state)
  new_parts = nest.flatten(stepped_state)
  kept_parts = [
      array_ops.where(done, old, new) for old, new in zip(old_parts, new_parts)
  ]
  return t + 1, nest.pack_sequence_as(state, kept_parts), ta
def _MaximumMinimumGrad(op, grad, selector_op):
  """Factors out the code for the gradient of Maximum or Minimum.

  Args:
    op: The forward op; its first two inputs are the operands.
    grad: Incoming gradient with respect to the op's output.
    selector_op: Callable applied to the two operands that yields the mask of
      positions where the gradient flows to the first operand.

  Returns:
    A pair `(gx, gy)` of gradients, each reduced over its broadcast
    dimensions and reshaped to the shape of the corresponding operand.
  """
  x, y = op.inputs[0], op.inputs[1]
  grad_dtype = grad.dtype
  x_shape = array_ops.shape(x)
  y_shape = array_ops.shape(y)
  grad_shape = array_ops.shape(grad)
  zero_grad = array_ops.zeros(grad_shape, grad_dtype)
  picks_x = selector_op(x, y)
  reduce_x, reduce_y = gen_array_ops._broadcast_gradient_args(x_shape, y_shape)

  # Route the gradient to x where x was selected and to y everywhere else.
  routed_to_x = array_ops.where(picks_x, grad, zero_grad)
  routed_to_y = array_ops.where(picks_x, zero_grad, grad)

  gx = array_ops.reshape(math_ops.reduce_sum(routed_to_x, reduce_x), x_shape)
  gy = array_ops.reshape(math_ops.reduce_sum(routed_to_y, reduce_y), y_shape)
  return (gx, gy)
def _loss(logits):
  """The loss of pairwise logits with l_i > l_j.

  Args:
    logits: `Tensor` of pairwise logits.

  Returns:
    `1 - sigmoid(logits)` where `logits > 0` and `sigmoid(-logits)` elsewhere.
  """
  is_positive = math_ops.greater(logits, 0)
  loss_if_positive = 1. - math_ops.sigmoid(logits)
  loss_otherwise = math_ops.sigmoid(-logits)
  return array_ops.where(is_positive, loss_if_positive, loss_otherwise)
def training_graph(self, input_data, input_labels, random_seed,
                   data_spec, epoch=None, input_weights=None):
  """Constructs a TF graph for training a random tree.

  The graph accumulates per-node and per-candidate-split statistics for the
  batch, samples new candidate splits, determines finished nodes and their
  best splits, grows the tree, reassigns accumulator slots and resets the
  statistics of cleared/allocated accumulators.

  Args:
    input_data: A tensor or SparseTensor or placeholder for input data.
    input_labels: A tensor or placeholder for labels associated with
      input_data.
    random_seed: The random number generator seed to use for this tree.  0
      means use the current time as the seed.
    data_spec: A list of tf.dtype values specifying the original types of
      each column.
    epoch: A tensor or placeholder for the epoch the training data comes from.
      Defaults to `[0]` when None.
    input_weights: A float tensor or placeholder holding per-input weights,
      or None if all inputs are to be weighted equally (an empty list is then
      passed to the training ops).

  Returns:
    The last op in the random tree training graph: a group of every update op
    built here plus the ops returned by `self.finish_iteration()`.
  """
  epoch = [0] if epoch is None else epoch

  if input_weights is None:
    input_weights = []

  # Sparse input is handed to the ops as separate indices/values/shape
  # arguments; in that case the dense `input_data` argument is left empty.
  sparse_indices = []
  sparse_values = []
  sparse_shape = []
  if isinstance(input_data, ops.SparseTensor):
    sparse_indices = input_data.indices
    sparse_values = input_data.values
    sparse_shape = input_data.shape
    input_data = []

  # Count extremely random stats.
  (node_sums, node_squares, splits_indices, splits_sums,
   splits_squares, totals_indices, totals_sums,
   totals_squares, input_leaves) = (
       self.training_ops.count_extremely_random_stats(
           input_data, sparse_indices, sparse_values, sparse_shape,
           data_spec, input_labels, input_weights, self.variables.tree,
           self.variables.tree_thresholds,
           self.variables.node_to_accumulator_map,
           self.variables.candidate_split_features,
           self.variables.candidate_split_thresholds,
           self.variables.start_epoch, epoch,
           num_classes=self.params.num_output_columns,
           regression=self.params.regression))
  # Ops that fold this batch's statistics into the persistent variables.
  # Later stages take control dependencies on these lists.
  node_update_ops = []
  node_update_ops.append(
      state_ops.assign_add(self.variables.node_sums, node_sums))

  splits_update_ops = []
  splits_update_ops.append(
      self.training_ops.scatter_add_ndim(
          self.variables.candidate_split_sums,
          splits_indices, splits_sums))
  splits_update_ops.append(
      self.training_ops.scatter_add_ndim(self.variables.accumulator_sums,
                                         totals_indices, totals_sums))

  # The squared statistics are only maintained for regression.
  if self.params.regression:
    node_update_ops.append(
        state_ops.assign_add(self.variables.node_squares, node_squares))
    splits_update_ops.append(
        self.training_ops.scatter_add_ndim(
            self.variables.candidate_split_squares,
            splits_indices, splits_squares))
    splits_update_ops.append(
        self.training_ops.scatter_add_ndim(
            self.variables.accumulator_squares,
            totals_indices, totals_squares))

  # Sample inputs.
  update_indices, feature_updates, threshold_updates = (
      self.training_ops.sample_inputs(
          input_data, sparse_indices, sparse_values, sparse_shape,
          input_weights, self.variables.node_to_accumulator_map,
          input_leaves, self.variables.candidate_split_features,
          self.variables.candidate_split_thresholds,
          split_initializations_per_input=(
              self.params.split_initializations_per_input),
          split_sampling_random_seed=random_seed))

  update_features_op = state_ops.scatter_update(
      self.variables.candidate_split_features, update_indices,
      feature_updates)
  update_thresholds_op = state_ops.scatter_update(
      self.variables.candidate_split_thresholds, update_indices,
      threshold_updates)

  # Calculate finished nodes.
  # Runs only after the split statistics of this batch have been applied.
  with ops.control_dependencies(splits_update_ops):
    finished, stale = self.training_ops.finished_nodes(
        self.variables.accumulator_to_node_map,
        self.variables.node_to_accumulator_map,
        self.variables.candidate_split_sums,
        self.variables.candidate_split_squares,
        self.variables.accumulator_sums,
        self.variables.accumulator_squares,
        self.variables.start_epoch, epoch,
        num_split_after_samples=self.params.split_after_samples,
        min_split_samples=self.params.min_split_samples)

  # Update leaf scores.
  # TODO(thomaswc): Store the leaf scores in a TopN and only update the
  # scores of the leaves that were touched by this batch of input.
  # `children` is the first column of the tree variable; nodes whose entry
  # equals constants.LEAF_NODE are leaves.
  children = array_ops.squeeze(array_ops.slice(self.variables.tree,
                                               [0, 0], [-1, 1]),
                               squeeze_dims=[1])
  is_leaf = math_ops.equal(constants.LEAF_NODE, children)
  leaves = math_ops.to_int32(
      array_ops.squeeze(array_ops.where(is_leaf), squeeze_dims=[1]))
  # Leaves with a negative accumulator-map entry are the non-fertile ones.
  non_fertile_leaves = array_ops.boolean_mask(
      leaves, math_ops.less(
          array_ops.gather(self.variables.node_to_accumulator_map, leaves),
          0))

  # TODO(gilberth): It should be possible to limit the number of non
  # fertile leaves we calculate scores for, especially since we can only take
  # at most array_ops.shape(finished)[0] of them.
  with ops.control_dependencies(node_update_ops):
    sums = array_ops.gather(self.variables.node_sums, non_fertile_leaves)
    if self.params.regression:
      squares = array_ops.gather(self.variables.node_squares,
                                 non_fertile_leaves)
      non_fertile_leaf_scores = self._variance(sums, squares)
    else:
      non_fertile_leaf_scores = self._weighted_gini(sums)

  # Calculate best splits.
  with ops.control_dependencies(splits_update_ops):
    split_indices = self.training_ops.best_splits(
        finished, self.variables.node_to_accumulator_map,
        self.variables.candidate_split_sums,
        self.variables.candidate_split_squares,
        self.variables.accumulator_sums,
        self.variables.accumulator_squares,
        regression=self.params.regression)

  # Grow tree.
  # Waits for the freshly sampled candidate features/thresholds to be
  # written before they are read.
  with ops.control_dependencies(
      [update_features_op, update_thresholds_op]):
    (tree_update_indices, tree_children_updates,
     tree_threshold_updates, new_eot) = (self.training_ops.grow_tree(
         self.variables.end_of_tree,
         self.variables.node_to_accumulator_map,
         finished, split_indices,
         self.variables.candidate_split_features,
         self.variables.candidate_split_thresholds))
    tree_update_op = state_ops.scatter_update(self.variables.tree,
                                              tree_update_indices,
                                              tree_children_updates)
    thresholds_update_op = state_ops.scatter_update(
        self.variables.tree_thresholds, tree_update_indices,
        tree_threshold_updates)
    # TODO(thomaswc): Only update the epoch on the new leaves.
    new_epoch_updates = epoch * array_ops.ones_like(
        tree_threshold_updates, dtype=dtypes.int32)
    epoch_update_op = state_ops.scatter_update(
        self.variables.start_epoch, tree_update_indices,
        new_epoch_updates)

  # Update fertile slots.
  with ops.control_dependencies([tree_update_op]):
    (n2a_map_updates, a2n_map_updates, accumulators_cleared,
     accumulators_allocated) = (self.training_ops.update_fertile_slots(
         finished, non_fertile_leaves, non_fertile_leaf_scores,
         self.variables.end_of_tree,
         self.variables.accumulator_sums,
         self.variables.node_to_accumulator_map, stale,
         regression=self.params.regression))

  # Ensure end_of_tree doesn't get updated until UpdateFertileSlots has
  # used it to calculate new leaves.
  gated_new_eot, = control_flow_ops.tuple(
      [new_eot], control_inputs=[n2a_map_updates])
  eot_update_op = state_ops.assign(self.variables.end_of_tree,
                                   gated_new_eot)

  updates = []
  updates.append(eot_update_op)
  updates.append(tree_update_op)
  updates.append(thresholds_update_op)
  updates.append(epoch_update_op)

  # Row 0 of each map update holds the indices, row 1 the new values.
  updates.append(
      state_ops.scatter_update(self.variables.node_to_accumulator_map,
                               n2a_map_updates[0], n2a_map_updates[1]))

  updates.append(
      state_ops.scatter_update(self.variables.accumulator_to_node_map,
                               a2n_map_updates[0], a2n_map_updates[1]))

  cleared_and_allocated_accumulators = array_ops.concat(
      0, [accumulators_cleared, accumulators_allocated])

  # Calculate values to put into scatter update for candidate counts.
  # Candidate split counts are always reset back to 0 for both cleared
  # and allocated accumulators. This means some accumulators might be doubly
  # reset to 0 if they were released and not allocated, then later allocated.
  split_values = array_ops.tile(
      array_ops.expand_dims(
          array_ops.expand_dims(
              array_ops.zeros_like(cleared_and_allocated_accumulators,
                                   dtype=dtypes.float32), 1), 2),
      [
          1, self.params.num_splits_to_consider,
          self.params.num_output_columns
      ])
  updates.append(
      state_ops.scatter_update(self.variables.candidate_split_sums,
                               cleared_and_allocated_accumulators,
                               split_values))
  if self.params.regression:
    updates.append(
        state_ops.scatter_update(
            self.variables.candidate_split_squares,
            cleared_and_allocated_accumulators, split_values))

  # Calculate values to put into scatter update for total counts.
  # Cleared accumulators are written as -1, newly allocated ones as 0.
  total_cleared = array_ops.tile(
      array_ops.expand_dims(
          math_ops.neg(
              array_ops.ones_like(accumulators_cleared,
                                  dtype=dtypes.float32)), 1),
      [1, self.params.num_output_columns])
  total_reset = array_ops.tile(
      array_ops.expand_dims(
          array_ops.zeros_like(accumulators_allocated,
                               dtype=dtypes.float32), 1),
      [1, self.params.num_output_columns])
  accumulator_updates = array_ops.concat(0, [total_cleared, total_reset])
  updates.append(
      state_ops.scatter_update(self.variables.accumulator_sums,
                               cleared_and_allocated_accumulators,
                               accumulator_updates))
  if self.params.regression:
    updates.append(
        state_ops.scatter_update(self.variables.accumulator_squares,
                                 cleared_and_allocated_accumulators,
                                 accumulator_updates))

  # Calculate values to put into scatter update for candidate splits.
  # Every candidate feature slot of the affected accumulators is set to -1.
  split_features_updates = array_ops.tile(
      array_ops.expand_dims(
          math_ops.neg(
              array_ops.ones_like(cleared_and_allocated_accumulators)), 1),
      [1, self.params.num_splits_to_consider])
  updates.append(
      state_ops.scatter_update(self.variables.candidate_split_features,
                               cleared_and_allocated_accumulators,
                               split_features_updates))

  updates += self.finish_iteration()

  return control_flow_ops.group(*updates)
def _SelectGrad(op, grad):
  """Gradient for a select op with inputs `(condition, x, y)`.

  The condition gets no gradient. `grad` is routed to `x` where the condition
  holds and to `y` elsewhere, with zeros filling the remaining positions.
  """
  condition = op.inputs[0]
  no_grad = array_ops.zeros_like(op.inputs[1])
  grad_x = array_ops.where(condition, grad, no_grad)
  grad_y = array_ops.where(condition, no_grad, grad)
  return (None, grad_x, grad_y)
def reduce_weighted_logsumexp(logx, w=None, axis=None, keep_dims=False,
                              return_sign=False, name=None):
  """Computes `log(abs(sum(weight * exp(elements across tensor dimensions))))`.

  When every weight in `w` is known to be positive, calling
  `tf.reduce_logsumexp(logx + tf.log(w))` directly is more efficient than
  `du.reduce_weighted_logsumexp(logx, w)`.

  `logx` is reduced along the dimensions listed in `axis`. Unless `keep_dims`
  is true, the rank of the result drops by 1 for each entry in `axis`; with
  `keep_dims` true the reduced dimensions are kept with length 1. If `axis`
  has no entries, all dimensions are reduced and a single-element tensor is
  returned.

  Compared with a literal `log(sum(w * exp(logx)))`, this routine shifts by
  the largest `logx + log|w|` before exponentiating, which avoids overflow
  from the `exp` of large inputs and underflow from the `log` of small ones.

  For example:

  ```python
  x = tf.constant([[0., 0, 0],
                   [0, 0, 0]])

  w = tf.constant([[-1., 1, 1],
                   [1, 1, 1]])

  du.reduce_weighted_logsumexp(x, w)
  # ==> log(-1*1 + 1*1 + 1*1 + 1*1 + 1*1 + 1*1) = log(4)

  du.reduce_weighted_logsumexp(x, w, axis=0)
  # ==> [log(-1+1), log(1+1), log(1+1)]

  du.reduce_weighted_logsumexp(x, w, axis=1)
  # ==> [log(-1+1+1), log(1+1+1)]

  du.reduce_weighted_logsumexp(x, w, axis=1, keep_dims=True)
  # ==> [[log(-1+1+1)], [log(1+1+1)]]

  du.reduce_weighted_logsumexp(x, w, axis=[0, 1])
  # ==> log(-1+5)
  ```

  Args:
    logx: The tensor to reduce. Should have numeric type.
    w: The weight tensor. Should have numeric type identical to `logx`. When
      `None`, the result is the plain `reduce_logsumexp` of `logx`.
    axis: The dimensions to reduce. If `None` (the default), reduces all
      dimensions. Must be in the range `[-rank(logx), rank(logx))`.
    keep_dims: If true, retains reduced dimensions with length 1.
    return_sign: If `True`, returns the sign of the result.
    name: A name for the operation (optional).

  Returns:
    lswe: The `log(abs(sum(weight * exp(x))))` reduced tensor.
    sign: (Optional) The sign of `sum(weight * exp(x))`.
  """
  with ops.name_scope(name, "reduce_weighted_logsumexp", [logx, w]):
    logx = ops.convert_to_tensor(logx, name="logx")

    # Unweighted case: delegate, and report an all-ones sign if requested.
    if w is None:
      plain_lse = math_ops.reduce_logsumexp(
          logx, axis=axis, keep_dims=keep_dims)
      if not return_sign:
        return plain_lse
      return plain_lse, array_ops.ones_like(plain_lse)

    w = ops.convert_to_tensor(w, dtype=logx.dtype, name="w")
    log_terms = logx + math_ops.log(math_ops.abs(w))
    shift = math_ops.reduce_max(log_terms, axis=axis, keep_dims=True)
    # If the largest element is `-inf` or `inf` then we don't bother
    # subtracting off the max, because otherwise we'd get `inf - inf = NaN`.
    # This is fine: any value may be subtracted so long as it is added back
    # after taking the `log(sum(...))`.
    shift = array_ops.where(math_ops.is_inf(shift),
                            array_ops.zeros_like(shift),
                            shift)
    signed_terms = math_ops.sign(w) * math_ops.exp(log_terms - shift)
    signed_sum = math_ops.reduce_sum(
        signed_terms, axis=axis, keep_dims=keep_dims)
    if not keep_dims:
      # The shift was computed with kept dims; drop them to match the sum.
      shift = array_ops.squeeze(shift, axis)
    result_sign = math_ops.sign(signed_sum)
    log_abs_sum = shift + math_ops.log(result_sign * signed_sum)
    return (log_abs_sum, result_sign) if return_sign else log_abs_sum
def loop_fn(i):
  """Selects between entry `i` of `a` and of `b` according to entry `i` of `cond`."""
  picked_a, picked_b, picked_cond = (
      array_ops.gather(source, i) for source in (a, b, cond))
  return array_ops.where(picked_cond, picked_a, picked_b)
def get_losses(self, logits, localisations,
               gclasses, glocalisations, gscores,
               match_threshold=0.5,
               negative_ratio=2.5,
               alpha=1.,
               label_smoothing=0.,
               scope=None):
    """Loss functions for training the SSD 300 VGG network.

    This function defines the different loss components of the SSD, and
    adds them to the TF loss collection. The positive and negative
    cross-entropy sums and the localization sum are each registered with
    `tf.losses.add_loss` and also stored in the 'EXTRA_LOSSES' collection.

    Arguments:
      logits: (list of) predictions logits Tensors;
      localisations: (list of) localisations Tensors;
      gclasses: (list of) groundtruth labels Tensors;
      glocalisations: (list of) groundtruth localisations Tensors;
      gscores: (list of) groundtruth score Tensors; flattened and
        concatenated here but not otherwise used;
      match_threshold: not referenced in this function;
      negative_ratio: multiplier on the number of positives giving the
        number of hard negatives to keep (capped by the number of
        negatives, and at least 1);
      alpha: weight applied to the localization loss;
      label_smoothing: not referenced in this function;
      scope: optional name scope (defaults to 'ssd_losses').

    Returns:
      A scalar loss: 0 when there are no positive anchors, otherwise the
      sum of the tf.GraphKeys.LOSSES collection divided by the number of
      positives, plus the sum of the regularization losses.
    """
    with tf.name_scope(scope, 'ssd_losses'):
        lshape = tfe.get_shape(logits[0], 5)
        num_classes = lshape[-1]
        # batch_size = lshape[0]

        # Flatten out all vectors!
        flogits = []
        fgclasses = []
        fgscores = []
        flocalisations = []
        fglocalisations = []
        for i in range(len(logits)):
            flogits.append(tf.reshape(logits[i], [-1, num_classes]))
            fgclasses.append(tf.reshape(gclasses[i], [-1]))
            fgscores.append(tf.reshape(gscores[i], [-1]))
            flocalisations.append(tf.reshape(localisations[i], [-1, 4]))
            fglocalisations.append(tf.reshape(glocalisations[i], [-1, 4]))
        # Concatenate the flattened per-layer tensors into single tensors.
        logits = tf.concat(flogits, axis=0)
        gclasses = tf.concat(fgclasses, axis=0)
        gscores = tf.concat(fgscores, axis=0)
        localisations = tf.concat(flocalisations, axis=0)
        glocalisations = tf.concat(fglocalisations, axis=0)
        dtype = logits.dtype

        # Compute positive matching mask...
        pmask = gclasses > 0
        fpmask = tf.cast(pmask, dtype)
        n_positives = tf.reduce_sum(fpmask)

        # Hard negative mining...
        # For no_classes, we only care that a false positive's label is 0;
        # this is why pmask suffices for our needs.
        no_classes = tf.cast(pmask, tf.int32)
        predictions = slim.softmax(logits)
        nmask = tf.logical_not(pmask)
        fnmask = tf.cast(nmask, dtype)
        # Class-0 probability for negatives, 1.0 for positives (fnmask is 0
        # there), so positives sort last in the top_k below.
        nvalues = tf.where(nmask,
                           predictions[:, 0],
                           1. - fnmask)
        nvalues_flat = tf.reshape(nvalues, [-1])
        # Number of negative entries to select.
        max_neg_entries = tf.cast(tf.reduce_sum(fnmask), tf.int32)
        n_neg = tf.cast(negative_ratio * n_positives, tf.int32)
        n_neg = tf.minimum(n_neg, max_neg_entries)
        # Avoid n_neg being zero, which would cause an error in top_k below.
        n_neg = tf.maximum(n_neg, 1)

        # top_k on the negated values yields the n_neg smallest class-0
        # probabilities; the last one is the selection threshold.
        val, idxes = tf.nn.top_k(-nvalues_flat, k=n_neg)
        max_hard_pred = -val[-1]
        # Final negative mask, hard negative mining
        nmask = tf.logical_and(nmask, nvalues <= max_hard_pred)
        fnmask = tf.cast(nmask, dtype)

        # Add cross-entropy loss.
        with tf.name_scope('cross_entropy_pos'):
            total_cross_pos = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=gclasses)
            total_cross_pos = tf.reduce_sum(total_cross_pos * fpmask,
                                            name="cross_entropy_pos")
            tf.losses.add_loss(total_cross_pos)

        with tf.name_scope('cross_entropy_neg'):
            total_cross_neg = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits, labels=no_classes)
            total_cross_neg = tf.reduce_sum(total_cross_neg * fnmask,
                                            name="cross_entropy_neg")
            tf.losses.add_loss(total_cross_neg)

        # Add localization loss: smooth L1, L2, ...
        with tf.name_scope('localization'):
            # Weights Tensor: positive mask + random negative.
            weights = tf.expand_dims(alpha * fpmask, axis=-1)
            total_loc = custom_layers.abs_smooth_2(
                localisations - glocalisations)
            total_loc = tf.reduce_sum(total_loc * weights,
                                      name="localization")
            tf.losses.add_loss(total_loc)

        total_cross = tf.add(total_cross_pos, total_cross_neg,
                             'cross_entropy')

        # Add to EXTRA LOSSES TF.collection
        tf.add_to_collection('EXTRA_LOSSES', total_cross_pos)
        tf.add_to_collection('EXTRA_LOSSES', total_cross_neg)
        tf.add_to_collection('EXTRA_LOSSES', total_cross)
        tf.add_to_collection('EXTRA_LOSSES', total_loc)

        # Stick with the original paper in terms of defining the model
        # loss: sum of all collected losses, normalized by the number of
        # positives (0 when there are none).
        model_loss = tf.get_collection(tf.GraphKeys.LOSSES)
        model_loss = tf.add_n(model_loss)
        model_loss = array_ops.where(tf.equal(n_positives, 0),
                                     array_ops.zeros_like(model_loss),
                                     tf.div(1.0, n_positives) * model_loss)
        # Add regularization loss
        regularization_losses = tf.get_collection(
            tf.GraphKeys.REGULARIZATION_LOSSES)
        regularization_loss = tf.add_n(regularization_losses,
                                       name='regularization_loss')

        # If the model loss is zero, there is no need to do a gradient
        # update on this batch.
        total_loss = array_ops.where(
            tf.equal(n_positives, 0),
            array_ops.zeros_like(model_loss),
            tf.add(model_loss, regularization_loss))

        # debugging info
        tf.summary.scalar("postive_num", n_positives)
        tf.summary.scalar("negative_num", n_neg)
        tf.summary.scalar("regularization_loss", regularization_loss)
        # with tf.name_scope('variables_loc'):
        #     selected_p = tf.boolean_mask(glocalisations, pmask)
        #     p_mean, p_variance = tf.nn.moments(selected_p, [0])
        #     tf.summary.scalar("mean_cx", p_mean[0])
        #     tf.summary.scalar("mean_cy", p_mean[1])
        #     tf.summary.scalar("mean_w", p_mean[2])
        #     tf.summary.scalar("mean_h", p_mean[3])
        #
        #     tf.summary.scalar("var_cx", p_variance[0])
        #     tf.summary.scalar("var_cy", p_variance[1])
        #     tf.summary.scalar("var_w", p_variance[2])
        #     tf.summary.scalar("var_h", p_variance[3])

        return total_loss
def log_poisson_loss(targets, log_input, compute_full_loss=False, name=None):
  """Computes log Poisson loss given `log_input`.

  Gives the log-likelihood loss between the prediction and the target under
  the assumption that the target has a Poisson distribution.
  Caveat: By default, this is not the exact loss, but the loss minus a
    constant term [log(z!)]. That has no effect for optimization, but
    does not play well with relative loss comparisons. To compute an
    approximation of the log factorial term, specify
    compute_full_loss=True to enable Stirling's Approximation.

  With `c = log(x) = log_input` and `z = targets`, the log Poisson loss is

        -log(exp(-x) * (x^z) / z!)
      = -log(exp(-x) * (x^z)) + log(z!)
      ~ -log(exp(-x)) - log(x^z) [+ z * log(z) - z + 0.5 * log(2 * pi * z)]
      = x - z * log(x) [+ z * log(z) - z + 0.5 * log(2 * pi * z)]
      = exp(c) - z * c [+ z * log(z) - z + 0.5 * log(2 * pi * z)]

  The bracketed term is Stirling's Approximation for log(z!). It does not
  depend on x and so does not affect optimization, but it matters for
  relative loss comparisons. It is only added when
  `compute_full_loss == True`, and is replaced by zero wherever
  `0 <= targets <= 1`.

  Args:
    targets: A `Tensor` of the same type and shape as `log_input`.
    log_input: A `Tensor` of type `float32` or `float64`.
    compute_full_loss: whether to compute the full loss. If false, a constant
      term is dropped in favor of more efficient optimization.
    name: A name for the operation (optional).

  Returns:
    A `Tensor` of the same shape as `log_input` with the componentwise
    log Poisson losses.

  Raises:
    ValueError: If `log_input` and `targets` do not have the same shape.
  """
  with ops.name_scope(name, "log_poisson_loss", [log_input, targets]) as name:
    log_input = ops.convert_to_tensor(log_input, name="log_input")
    targets = ops.convert_to_tensor(targets, name="targets")
    try:
      targets.get_shape().merge_with(log_input.get_shape())
    except ValueError:
      raise ValueError(
          "log_input and targets must have the same shape (%s vs %s)" %
          (log_input.get_shape(), targets.get_shape()))

    loss = math_ops.exp(log_input) - log_input * targets
    if not compute_full_loss:
      return loss

    # Constants are created as tensors so that their dtypes match that of
    # the targets.
    half = constant_op.constant(0.5, dtype=targets.dtype)
    two_pi = constant_op.constant(2 * math.pi, dtype=targets.dtype)
    stirling_term = (targets * math_ops.log(targets)) - targets + (
        half * math_ops.log(two_pi * targets))
    zero = array_ops.zeros_like(targets, dtype=targets.dtype)
    one = array_ops.ones_like(targets, dtype=targets.dtype)
    # Skip the approximation for targets in [0, 1].
    in_unit_interval = math_ops.logical_and(targets >= zero, targets <= one)
    return loss + array_ops.where(in_unit_interval, zero, stirling_term)
def clip_by_norm(t, clip_norm, axes=None, name=None):
  """Clips tensor values to a maximum L2-norm.

  Given a tensor `t`, and a maximum clip value `clip_norm`, this operation
  normalizes `t` so that its L2-norm is less than or equal to `clip_norm`,
  along the dimensions given in `axes`. Specifically, in the default case
  where all dimensions are used for calculation, if the L2-norm of `t` is
  already less than or equal to `clip_norm`, then `t` is not modified. If
  the L2-norm is greater than `clip_norm`, then this operation returns a
  tensor of the same type and shape as `t` with its values set to:

  `t * clip_norm / l2norm(t)`

  In this case, the L2-norm of the output tensor is `clip_norm`.

  As another example, if `t` is a matrix and `axes == [1]`, then each row
  of the output will have L2-norm less than or equal to `clip_norm`. If
  `axes == [0]` instead, each column of the output will be clipped.

  Code example:

  >>> some_nums = tf.constant([[1, 2, 3, 4, 5]], dtype=tf.float32)
  >>> tf.clip_by_norm(some_nums, 2.0).numpy()
  array([[0.26967996, 0.5393599 , 0.80903983, 1.0787199 , 1.3483998 ]],
        dtype=float32)

  This operation is typically used to clip gradients before applying them with
  an optimizer.  Most gradient data is a collection of different shaped tensors
  for different parts of the model.  Thus, this is a common usage:

  ```
  # Get your gradients after training
  loss_value, grads = grad(model, features, labels)

  # Apply some clipping
  grads = [tf.clip_by_norm(g, norm) for g in grads]

  # Continue on with training
  optimizer.apply_gradients(grads)
  ```

  Args:
    t: A `Tensor` or `IndexedSlices`.  This must be a floating point type.
    clip_norm: A 0-D (scalar) `Tensor` > 0. A maximum clipping value, also
      floating point
    axes: A 1-D (vector) `Tensor` of type int32 containing the dimensions
      to use for computing the L2-norm. If `None` (the default), uses all
      dimensions.
    name: A name for the operation (optional).

  Returns:
    A clipped `Tensor` or `IndexedSlices`.

  Raises:
    ValueError: If the clip_norm tensor is not a 0-D scalar tensor.
    TypeError: If dtype of the input is not a floating point or
      complex type.
  """
  with ops.name_scope(name, "clip_by_norm", [t, clip_norm]) as name:
    is_indexed_slices = isinstance(t, ops.IndexedSlices)
    values = ops.convert_to_tensor(
        t.values if is_indexed_slices else t, name="t")

    # Squared L2-norm along `axes`, kept broadcastable against `values`.
    squared_norm = math_ops.reduce_sum(values * values, axes, keepdims=True)
    is_nonzero = squared_norm > 0
    # Two-tap tf.where trick to bypass NaN gradients: sqrt never sees a 0.
    safe_squared_norm = array_ops.where(
        is_nonzero, squared_norm, array_ops.ones_like(squared_norm))
    norm = array_ops.where(
        is_nonzero, math_ops.sqrt(safe_squared_norm), squared_norm)
    scaled = values * clip_norm
    # The product must keep the input's shape; this check guards against
    # unintentional broadcasting.
    values.shape.merge_with(scaled.shape)
    clipped = array_ops.identity(
        scaled / math_ops.maximum(norm, clip_norm), name=name)

    if not is_indexed_slices:
      return clipped
    return ops.IndexedSlices(clipped, t.indices, t.dense_shape)
def _copy_one_through(output, new_output):
  """Returns `output` where `time >= sequence_length`, else `new_output`.

  The select is colocated with `new_output`.
  """
  sequence_done = (time >= sequence_length)
  with ops.colocate_with(new_output):
    selected = array_ops.where(sequence_done, output, new_output)
  return selected
def safe_embedding_lookup_sparse(
    embedding_weights,
    sparse_ids,
    sparse_weights=None,
    combiner="mean",
    default_id=None,
    name="safe_embedding_lookup_sparse",
    partition_strategy=None,  # not used
    max_norm=None,
    return_trainable=False,
):
  """Provides a dynamic version of `tf.nn.safe_embedding_lookup_sparse`.

  Lookup embedding results, accounting for empty features and invalid weights.

  Any IDs will be treated as valid, including non-positive IDs.
  Unless `combiner` is "sum", the ids and weights are passed through
  `_prune_invalid_weights`, which is meant to drop invalid weights (<= 0)
  together with their ids. For an entry with no features, the embedding
  vector for `default_id` is returned, or the 0-vector if `default_id` is
  not supplied.

  The ids and weights may be multi-dimensional. Embeddings are always
  aggregated along the last dimension.

  Args:
    embedding_weights: A single `dynamic_embedding.Variable` instance
      representing the complete embedding tensor.
    sparse_ids: `SparseTensor` of shape `[d_0, d_1, ..., d_n]` containing the
      ids. `d_0` is typically batch size.
    sparse_weights: `SparseTensor` of same shape as `sparse_ids`, containing
      float weights corresponding to `sparse_ids`, or `None` if all weights are
      assumed to be 1.0.
    combiner: A string specifying how to combine embedding results for each
      entry. Currently "mean", "sqrtn" and "sum" are supported, with "mean" the
      default.
    default_id: The id to use for an entry with no features. When `None`
      (or 0), empty rows are filled with id 0 for the lookup; when `None`,
      their results are additionally replaced by zeros.
    name: A name for this operation. Name is optional in graph mode and required
      in eager mode.
    partition_strategy: Forwarded unchanged to `embedding_lookup_sparse`. The
      signature marks it as not used; presumably it is ignored downstream.
    max_norm: If not `None`, all embeddings are l2-normalized to max_norm before
      combining.
    return_trainable: If `True`, also return the trainable wrapper produced by
      `embedding_lookup_sparse`.

  Returns:
    combined_embeddings:
      A dense `Tensor` of shape `[d_0, d_1, ..., d_{n-1}, e_1, ..., e_m]`.
    trainable_wrap:
      A TrainableWrapper object used to fill the Optimizers `var_list`
        Only provided if `return_trainable` is True.

  Raises:
    ValueError: if `embedding_weights` is `None`.
    TypeError: if `embedding_weights.key_dtype` differs from
      `sparse_ids.dtype`, or `embedding_weights.value_dtype` differs from
      `sparse_weights.dtype`.
  """
  if embedding_weights is None:
    raise ValueError("Missing embedding_weights %s." % embedding_weights)

  if embedding_weights.key_dtype != sparse_ids.dtype:
    raise TypeError(
        "embedding_weights.key_dtype should be same with sparse_ids.dtype: "
        "{} vs. {}".format(embedding_weights.key_dtype, sparse_ids.dtype))

  weights_dtype = sparse_weights.dtype if sparse_weights is not None else None
  if weights_dtype and embedding_weights.value_dtype != weights_dtype:
    raise TypeError(
        "embedding_weights.value_dtype should be same with sparse_weights.dtype"
        ": {} vs. {}".format(embedding_weights.value_dtype, weights_dtype))

  # Prefix the op names with the enclosing variable scope, when there is one.
  scope = variable_scope.get_variable_scope()
  full_name = scope.name + "/" + name if scope.name else name
  with ops.name_scope(full_name + "/"):
    # Reshape higher-rank sparse ids and weights to linear segment ids.
    original_shape = sparse_ids.dense_shape
    # Use the static rank when it is known, otherwise a dynamic size tensor.
    original_rank_dim = tensor_shape.dimension_value(
        sparse_ids.dense_shape.get_shape()[0])
    original_rank = (array_ops.size(original_shape)
                     if original_rank_dim is None else original_rank_dim)
    # Collapse all leading dimensions into one, keeping the last dimension.
    sparse_ids = de.math.sparse_reshape(
        sparse_ids,
        [
            math_ops.reduce_prod(
                array_ops.slice(original_shape, [0], [original_rank - 1])),
            array_ops.gather(original_shape, original_rank - 1),
        ],
    )
    if sparse_weights is not None:
      # Re-index the weights with the reshaped ids' indices and shape.
      sparse_weights = sparse_tensor.SparseTensor(sparse_ids.indices,
                                                  sparse_weights.values,
                                                  sparse_ids.dense_shape)

    # Prune invalid weights.
    if combiner != "sum":
      sparse_ids, sparse_weights = _prune_invalid_weights(
          sparse_ids, sparse_weights)

    # Fill in dummy values for empty features, if necessary.
    sparse_ids, is_row_empty = de.math.sparse_fill_empty_rows(
        sparse_ids, default_id or 0)
    if sparse_weights is not None:
      sparse_weights, _ = de.math.sparse_fill_empty_rows(sparse_weights, 1.0)

    # The trainable wrapper is always requested here; whether it is returned
    # to the caller is decided at the end.
    result, trainable_ = embedding_lookup_sparse(
        embedding_weights,
        sparse_ids,
        sparse_weights,
        combiner=combiner,
        partition_strategy=partition_strategy,
        name=name + "/embedding_lookup_sparse",
        max_norm=max_norm,
        return_trainable=True,
    )

    if default_id is None:
      # Broadcast is_row_empty to the same shape as embedding_lookup_result,
      # for use in Select.
      is_row_empty = array_ops.tile(
          array_ops.reshape(is_row_empty, [-1, 1]),
          array_ops.stack([1, array_ops.shape(result)[1]]),
      )

      result = array_ops.where(is_row_empty,
                               array_ops.zeros_like(result),
                               result,
                               name="where")

    # Reshape back from linear ids back into higher-dimensional dense result.
    final_result = array_ops.reshape(
        result,
        array_ops.concat(
            [
                array_ops.slice(
                    math_ops.cast(original_shape, dtypes.int32),
                    [0],
                    [original_rank - 1],
                ),
                array_ops.slice(array_ops.shape(result), [1], [-1]),
            ],
            0,
        ),
    )
    # Static shape: (rank - 1) unknown leading dims followed by the
    # embedding dims of `result`.
    final_result.set_shape(
        tensor_shape.unknown_shape(
            (tensor_shape.Dimension(original_rank_dim) - 1).value).concatenate(
                result.get_shape()[1:]))
    return (final_result, trainable_) if return_trainable else final_result
def _single_seq_fn():
  """Log-normalizer built from `first_input` alone.

  Takes the log-sum-exp of `first_input` over axis 1 and zeroes the result
  for sequences whose length is <= 0.
  """
  unmasked = math_ops.reduce_logsumexp(first_input, [1])
  # Mask `log_norm` of the sequences with length <= zero.
  is_empty = math_ops.less_equal(sequence_lengths, 0)
  return array_ops.where(is_empty, array_ops.zeros_like(unmasked), unmasked)
def _embedding_lookup_and_transform(params,
                                    ids,
                                    partition_strategy="mod",
                                    name=None,
                                    max_norm=None,
                                    transform_fn=None):
  """Helper function for embedding_lookup and _compute_sampled_logits.

  This function is a generalization of embedding_lookup that optionally
  applies a caller-specified transformation to each embedding. This is
  done through the `transform_fn` argument. If provided, the function is
  applied to each partitioned tensor of retrieved embeddings, colocated
  with the embeddings. This function will be called with a single `Tensor`
  argument of the same type as the `params` tensor and should return a
  `Tensor`. The shape of the argument will be the same as `params` except
  for the size of the first dimension. The first dimension of the result's
  shape must be the same size as the argument's.

  Args:
    params: See embedding_lookup.
    ids: See embedding_lookup.
    partition_strategy: See embedding_lookup. Only "mod" and "div" are
      accepted here.
    name: See embedding_lookup.
    max_norm: See embedding_lookup.
    transform_fn: An optional function to apply to each retrieved embedding.
      If max_norm is provided, transform_fn is applied to the norm-limited
      embeddings.

  Returns:
    See embedding_lookup for details.
  Raises:
    ValueError: If `params` is empty, or if `partition_strategy` is not
      recognized.
  """
  if params is None or params in ((), []):
    raise ValueError("Need at least one param")
  if isinstance(params, variables.PartitionedVariable):
    params = list(params)  # Iterate to get the underlying Variables.
  if not isinstance(params, list):
    params = [params]

  with ops.name_scope(name, "embedding_lookup", params + [ids]) as name:
    # NOTE: within this function `np` is the partition count, not a module.
    np = len(params)  # Number of partitions
    # Preserve the resource variable status to avoid accidental dense reads.
    if not any(
        isinstance(p, resource_variable_ops.ResourceVariable) for p in params):
      params = ops.convert_n_to_tensor_or_indexed_slices(params, name="params")
    ids = ops.convert_to_tensor(ids, name="ids")
    # Fast path: a single partition can be gathered from directly, provided
    # no flattening is needed for transform_fn.
    if np == 1 and (not transform_fn or ids.get_shape().ndims == 1):
      with ops.colocate_with(params[0]):
        result = _clip(array_ops.gather(params[0], ids, name=name),
                       ids, max_norm)
        if transform_fn:
          result = transform_fn(result)
        return result
    else:
      # Flatten the ids. There are two cases where we need to do this.
      # - There is more than one params tensor.
      # - There is a transform_fn and ids is not statically known to be 1-D.
      #   We must flatten in this case because transform_fn expects a flat
      #   tensor of embeddings.
      flat_ids = array_ops.reshape(ids, [-1])
      # Position of every flat id, used to stitch results back in order.
      original_indices = math_ops.range(array_ops.size(flat_ids))

      # Create p_assignments and set new_ids depending on the strategy.
      if partition_strategy == "mod":
        p_assignments = flat_ids % np
        new_ids = flat_ids // np
      elif partition_strategy == "div":
        # Compute num_total_ids as the sum of dim-0 of params, then assign to
        # partitions based on a constant number of ids per partition. Optimize
        # if we already know the full shape statically.
        dim_0_size = params[0].get_shape()[0]
        for p in xrange(1, np):
          dim_0_size += params[p].get_shape()[0]
        if dim_0_size.value:
          num_total_ids = constant_op.constant(
              dim_0_size.value, flat_ids.dtype)
        else:
          # Mix static sizes with dynamic shape lookups where needed.
          dim_0_sizes = []
          for p in xrange(np):
            if params[p].get_shape()[0].value is not None:
              dim_0_sizes.append(params[p].get_shape()[0].value)
            else:
              with ops.colocate_with(params[p]):
                dim_0_sizes.append(
                    array_ops.shape(params[p])[0])
          num_total_ids = math_ops.reduce_sum(
              math_ops.cast(array_ops.stack(dim_0_sizes), flat_ids.dtype))
        ids_per_partition = num_total_ids // np
        extras = num_total_ids % np

        # The first `extras` partitions hold (ids_per_partition + 1) ids, the
        # rest hold ids_per_partition; the max of the two quotients gives the
        # partition for either case.
        p_assignments = math_ops.maximum(
            flat_ids // (ids_per_partition + 1),
            (flat_ids - extras) // ids_per_partition)

        # Emulate a conditional using a boolean indicator tensor
        new_ids = array_ops.where(
            p_assignments < extras,
            flat_ids % (ids_per_partition + 1),
            (flat_ids - extras) % ids_per_partition)
      else:
        raise ValueError("Unrecognized partition strategy: " +
                         partition_strategy)

      # Cast partition assignments to int32 for use in dynamic_partition.
      # There really should not be more than 2^32 partitions.
      p_assignments = math_ops.cast(p_assignments, dtypes.int32)
      # Partition list of ids based on assignments into np separate lists
      gather_ids = data_flow_ops.dynamic_partition(
          new_ids, p_assignments, np)
      # Similarly, partition the original indices.
      pindices = data_flow_ops.dynamic_partition(original_indices,
                                                 p_assignments, np)
      # Do np separate lookups, finding embeddings for plist[p] in params[p]
      partitioned_result = []
      for p in xrange(np):
        pids = gather_ids[p]
        with ops.colocate_with(params[p]):
          result = array_ops.gather(params[p], pids)
          if transform_fn:
            # If transform_fn is provided, the clip_by_norm precedes
            # the transform and hence must be co-located. See below
            # for the counterpart if transform_fn is not provided.
            result = transform_fn(_clip(result, pids, max_norm))
        partitioned_result.append(result)
      # Stitch these back together
      ret = data_flow_ops.parallel_dynamic_stitch(pindices,
                                                  partitioned_result,
                                                  name=name)

      # Determine the static element shape.
      if transform_fn is None:
        element_shape_s = params[0].get_shape()[1:]
        for p in params[1:]:
          element_shape_s = element_shape_s.merge_with(
              p.get_shape()[1:])
      else:
        element_shape_s = ret.get_shape()[1:]

      # Compute the dynamic element shape.
      if element_shape_s.is_fully_defined():
        element_shape_d = element_shape_s
      elif transform_fn is None:
        # It's important that we compute params[0].shape on the right device
        # to avoid data motion.
        with ops.colocate_with(params[0]):
          params_shape = array_ops.shape(params[0])
        element_shape_d = params_shape[1:]
      else:
        element_shape_d = array_ops.shape(ret)[1:]

      # Reshape to reverse the flattening of ids.
      ret = array_ops.reshape(
          ret, array_ops.concat([array_ops.shape(ids), element_shape_d], 0))

      # Normally the reshape is sufficient, but setting shape explicitly
      # teaches shape inference that params[1:].get_shape() matters
      # (in the case that transform_fn is None).
      ret.set_shape(ids.get_shape().concatenate(element_shape_s))
      if not transform_fn:
        # If transform_fn was provided, the clip_by_norm was done above.
        ret = _clip(ret, ids, max_norm)
      return ret
def connected_components(images):
    """Label the connected components of a batch of boolean images.

    A component is a set of pixels inside a single input image that are all
    adjacent and share the same non-zero value. Adjacency uses a squared
    connectivity of one: every True entry is joined with its neighbours
    above, below, left and right. Component ids are consecutive integers
    1 through n across all images, assigned in the order in which the first
    pixel of each component appears in row-major order (lexicographic by
    image_index_in_batch, row, col). Zero entries always receive id 0.

    On a single 2D array this is equivalent to
    `scipy.ndimage.measurements.label` with the default structuring element.

    Args:
      images: A 2D (H, W) or 3D (N, H, W) Tensor of boolean image(s).

    Returns:
      Components with the same shape as `images`. False entries in `images`
      have value 0, and all True entries map to a component id > 0.

    Raises:
      TypeError: if `images` is not 2D or 3D.
    """
    with ops.name_scope("connected_components"):
        image_or_images = ops.convert_to_tensor(images, name="images")
        static_rank = len(image_or_images.get_shape())
        if static_rank not in (2, 3):
            raise TypeError(
                "images should have rank 2 (HW) or 3 (NHW). Static shape is %s" %
                image_or_images.get_shape())

        # The op is fed a 3D batch, so a lone image gets a leading axis that
        # is stripped again just before returning.
        single_image = static_rank == 2
        if single_image:
            batch = image_or_images[None, :, :]
        else:
            batch = image_or_images
        raw_components = gen_image_ops.image_connected_components(batch)

        # TODO(ringwalt): Component id renaming should be done in the op, to
        # avoid constructing multiple additional large tensors.
        flat_components = array_ops.reshape(raw_components, [-1])
        distinct_ids, index_into_distinct = array_ops.unique(flat_components)
        zero_positions = array_ops.where(
            math_ops.equal(distinct_ids, 0))[:, 0]

        # One consecutive id (starting at 1) for every distinct nonzero id.
        num_nonzero_ids = (array_ops.shape(distinct_ids)[0] -
                           array_ops.shape(zero_positions)[0])
        consecutive_ids = math_ops.range(num_nonzero_ids) + 1

        def _ids_without_zero():
            # Zero never occurs, so the consecutive ids can be used directly.
            return consecutive_ids

        def _ids_with_zero():
            # Splice a 0 into the consecutive ids at the position where zero
            # sits in distinct_ids (zero_positions has length 1 here).
            split_at = math_ops.cast(zero_positions[0], dtypes.int32)
            head = consecutive_ids[:split_at]
            tail = consecutive_ids[split_at:]
            return array_ops.concat([head, [0], tail], axis=0)

        zero_absent = math_ops.equal(array_ops.shape(zero_positions)[0], 0)
        renumbered_ids = control_flow_ops.cond(
            zero_absent, _ids_without_zero, _ids_with_zero)
        relabelled = array_ops.reshape(
            array_ops.gather(renumbered_ids, index_into_distinct),
            array_ops.shape(raw_components))

        if single_image:
            return relabelled[0, :, :]
        return relabelled
def _list_mle_loss(labels, logits, weights=None, lambda_weight=None,
                   reduction=core_losses.Reduction.SUM_BY_NONZERO_WEIGHTS,
                   name=None, seed=None):
    """Computes the ListMLE loss [Xia et al. 2008] for a list.

    Given graded relevance labels l_i and logits s_i, the items are ordered by
    label (ties are broken randomly through a shuffle) and the negative
    log-likelihood of that ordering under the logits is returned.

    `lambda_weight` re-weights the per-position terms. The recommended scheme
    is the one from the "Position-Aware ListMLE" paper (Lan et al.), available
    through the `create_p_list_mle_lambda_weight()` factory.

    Args:
      labels: A `Tensor` of the same shape as `logits` representing graded
        relevance.
      logits: A `Tensor` with shape [batch_size, list_size]. Each value is the
        ranking score of the corresponding item.
      weights: A scalar, a `Tensor` with shape [batch_size, 1] for list-wise
        weights, or a `Tensor` with shape [batch_size, list_size] for
        item-wise weights.
      lambda_weight: Optional weighting object. It is applied only when it is
        a `ListMLELambdaWeight` instance; any other value is ignored.
      reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to
        reduce training loss over batch.
      name: A string used as the name for this loss.
      seed: A randomization seed used when shuffling ground truth
        permutations.

    Returns:
      An op for the ListMLE loss.
    """
    with ops.name_scope(name, 'list_mle_loss', (labels, logits, weights)):
        valid_mask = utils.is_label_valid(labels)
        # Invalid labels become 0 and invalid logits become log(_EPSILON) so
        # that they contribute ~0 to the loss.
        labels = array_ops.where(
            valid_mask, labels, array_ops.zeros_like(labels))
        masked_out_logits = (
            math_ops.log(_EPSILON) * array_ops.ones_like(logits))
        logits = array_ops.where(valid_mask, logits, masked_out_logits)

        if weights is None:
            weights = 1.0
        else:
            weights = ops.convert_to_tensor(weights)
        weights = array_ops.squeeze(weights)

        # Shuffle labels and logits to add randomness to sort.
        permutation = utils.shuffle_valid_indices(valid_mask, seed)
        permuted_labels = array_ops.gather_nd(labels, permutation)
        permuted_logits = array_ops.gather_nd(logits, permutation)
        ranked_labels, ranked_logits = utils.sort_by_scores(
            permuted_labels, [permuted_labels, permuted_logits])

        # Subtract the per-list maximum before exponentiating.
        row_max = math_ops.reduce_max(ranked_logits, axis=1, keepdims=True)
        ranked_logits = ranked_logits - row_max

        # Reverse cumulative sum: entry i holds the sum of exp(logit) over
        # positions i..end of the list.
        tail_exp_sums = math_ops.cumsum(
            math_ops.exp(ranked_logits), axis=1, reverse=True)
        per_position_nll = math_ops.log(tail_exp_sums) - ranked_logits

        # isinstance() is already False for None, so no separate None check.
        if isinstance(lambda_weight, ListMLELambdaWeight):
            per_position_nll = (
                per_position_nll *
                lambda_weight.individual_weights(ranked_labels))

        negative_log_likelihood = math_ops.reduce_sum(per_position_nll, 1)
        return core_losses.compute_weighted_loss(
            negative_log_likelihood, weights=weights, reduction=reduction)
def mean_pairwise_squared_error(predictions, labels=None, weights=1.0,
                                scope=None):
    """Adds a pairwise-errors-squared loss to the training procedure.

    Unlike `mean_squared_error`, which is a measure of the differences between
    corresponding elements of `predictions` and `labels`,
    `mean_pairwise_squared_error` is a measure of the differences between
    pairs of corresponding elements of `predictions` and `labels`.

    For example, if `labels`=[a, b, c] and `predictions`=[x, y, z], three
    pairs of differences are summed to compute the loss:
      loss = [ ((a-b) - (x-y)).^2 + ((a-c) - (x-z)).^2 + ((b-c) - (y-z)).^2 ] / 3

    Note that since the inputs are of size [batch_size, d0, ... dN], the
    corresponding pairs are computed within each batch sample but not across
    samples within a batch. For example, if `predictions` represents a batch
    of 16 grayscale images of dimension [batch_size, 100, 200], then the set
    of pairs is drawn from each image, but not across images.

    `weights` acts as a coefficient for the loss. If a scalar is provided,
    then the loss is simply scaled by the given value. If `weights` is a
    tensor of size [batch_size], then the total loss for each sample of the
    batch is rescaled by the corresponding element in the `weights` vector.

    Args:
      predictions: The predicted outputs, a tensor of size
        [batch_size, d0, .. dN] where N+1 is the total number of dimensions in
        `predictions`.
      labels: The ground truth output tensor, whose shape must match the shape
        of the `predictions` tensor. Required, even though the signature
        carries a `None` default.
      weights: Coefficients for the loss: a scalar, a tensor of shape
        [batch_size] or a tensor whose shape matches `predictions`.
      scope: The scope for the operations performed in computing the loss.

    Returns:
      A scalar `Tensor` representing the loss value.

    Raises:
      ValueError: If `labels` or `predictions` is None, if the shape of
        `predictions` doesn't match that of `labels`, or if the rank of the
        differences or of `weights` is statically unknown.
    """
    # Fail early with the documented exception type instead of an
    # AttributeError from `None.get_shape()` further down.
    if labels is None:
        raise ValueError("labels must not be None.")
    if predictions is None:
        raise ValueError("predictions must not be None.")

    with ops.name_scope(scope, "mean_pairwise_squared_error",
                        [predictions, labels, weights]):
        # Cast first: this converts array-like inputs to tensors so that the
        # static shape check below works for them as well.
        predictions = math_ops.cast(predictions, dtypes.float32)
        labels = math_ops.cast(labels, dtypes.float32)
        predictions.get_shape().assert_is_compatible_with(labels.get_shape())
        weights = math_ops.cast(ops.convert_to_tensor(weights), dtypes.float32)

        diffs = math_ops.subtract(predictions, labels)

        # Need to verify here since the function doesn't use
        # compute_weighted_loss.
        if diffs.get_shape().ndims is None:
            raise ValueError("diffs.get_shape().ndims cannot be None")
        if weights.get_shape().ndims is None:
            raise ValueError("weights.get_shape().ndims cannot be None")

        # Reduce over every axis except the leading (batch) axis.
        axis = list(range(1, diffs.get_shape().ndims))

        sum_squares_diff_per_batch = math_ops.reduce_sum(
            math_ops.square(diffs), axis=axis)
        num_present_per_batch = _num_present(diffs, weights, per_batch=True)

        # loss = 2 * sum(d^2) / n  -  2 * (sum(d))^2 / n^2, per batch sample.
        term1 = 2.0 * math_ops.div_no_nan(
            sum_squares_diff_per_batch, num_present_per_batch, name="value")

        sum_diff = math_ops.reduce_sum(diffs, axis=axis)
        term2 = 2.0 * math_ops.div_no_nan(
            math_ops.square(sum_diff),
            math_ops.square(num_present_per_batch),
            name="value")

        loss = _scale_losses(term1 - term2, weights)

        # Report zero loss when nothing in the batch is present.
        mean_loss = array_ops.where(
            math_ops.reduce_sum(num_present_per_batch) > 0,
            loss,
            array_ops.zeros_like(loss),
            name="value")
        add_loss(mean_loss)
        return mean_loss
def copy_fn(cur_i, cand_i):
    """Choose between the current value and a candidate replacement.

    Entries selected by `elements_finished` (taken from the enclosing scope)
    keep `cur_i`; all other entries take `cand_i`. The selection op is
    colocated with `cand_i`.
    """
    with ops.colocate_with(cand_i):
        chosen = array_ops.where(elements_finished, cur_i, cand_i)
    return chosen
def interpolate_pr_auc(self):
    """Approximates the area under the P-R curve by interpolation.

    The formula is inspired by section 4 of Davis & Goadrich 2006
    (https://www.biostat.wisc.edu/~page/rocpr.pdf), but uses a closed form
    that is not present in the paper. It is derived as follows.

    Precision = TP / (TP + FP) = TP / P, where P = TP + FP is the number of
    predicted positives. TP (true positives), FP (false positives) and P are
    all modelled as varying linearly within each interval [A, B] between
    successive thresholds, which gives

      Precision slope = dTP / dP
                      = (TP_B - TP_A) / (P_B - P_A)
                      = (TP - TP_A) / (P - P_A)
      Precision = (TP_A + slope * (P - P_A)) / P

    The area within the interval is (slope / total_pos_weight) times

      int_A^B{Precision.dP} = int_A^B{(TP_A + slope * (P - P_A)) * dP / P}
      int_A^B{Precision.dP} = int_A^B{slope * dP + intercept * dP / P}

    where intercept = TP_A - slope * P_A = TP_B - slope * P_B, resulting in

      int_A^B{Precision.dP} = TP_B - TP_A + intercept * log(P_B / P_A)

    Bringing back the factor (slope / total_pos_weight) put aside above:

      slope * [dTP + intercept * log(P_B / P_A)] / total_pos_weight

    where dTP == TP_B - TP_A.

    When P_A == 0 the calculation simplifies into

      int_A^B{Precision.dTP} = int_A^B{slope * dTP} = slope * (TP_B - TP_A)

    which amounts to imputing constant precision throughout the first bucket
    having >0 true positives.

    Returns:
      pr_auc: an approximation of the area under the P-R curve.
    """
    last = self.num_thresholds - 1

    # Index i of the `[:last]` slices is endpoint B of interval i and index i
    # of the `[1:]` slices is endpoint A, following the docstring notation.
    tp_a = self.true_positives[1:]
    dtp = self.true_positives[:last] - tp_a

    p = self.true_positives + self.false_positives
    p_b = p[:last]
    p_a = p[1:]
    dp = p_b - p_a

    prec_slope = math_ops.div_no_nan(
        dtp, math_ops.maximum(dp, 0), name='prec_slope')
    intercept = tp_a - math_ops.multiply(prec_slope, p_a)

    # Use a ratio of 1 (log == 0) unless both endpoints have P > 0, which
    # drops the logarithmic term for those intervals.
    both_positive = math_ops.logical_and(p_b > 0, p_a > 0)
    p_ratio = math_ops.div_no_nan(
        p_b, math_ops.maximum(p_a, 0), name='recall_relative_ratio')
    safe_p_ratio = array_ops.where(
        both_positive, p_ratio, array_ops.ones_like(p_a))

    # Total positives (TP + FN) at endpoint A normalise each increment.
    total_positives = math_ops.maximum(tp_a + self.false_negatives[1:], 0)
    increments = math_ops.div_no_nan(
        prec_slope * (dtp + intercept * math_ops.log(safe_p_ratio)),
        total_positives,
        name='pr_auc_increment')
    return math_ops.reduce_sum(increments, name='interpolate_pr_auc')
def dense_to_sparse_non_scalar(tensor):
    """Return a `SparseTensorValue` that lists every element of `tensor`.

    The indices come from `where` applied to an all-True mask shaped like
    `tensor`, so every position is included; the values are gathered from
    `tensor` at those indices and the dense shape is reported as int64.
    """
    everything = array_ops.ones_like(tensor, dtype=dtypes.bool)
    indices = array_ops.where(everything)
    values = array_ops.gather_nd(tensor, indices)
    dense_shape = array_ops.shape(tensor, out_type=dtypes.int64)
    return sparse_tensor.SparseTensorValue(indices, values, dense_shape)
def sigmoid_cross_entropy_with_logits(_sentinel=None,  # pylint: disable=invalid-name
                                      labels=None, logits=None, name=None):
    """Computes sigmoid cross entropy given `logits`.

    Measures the probability error in discrete classification tasks in which
    each class is independent and not mutually exclusive. For instance, one
    could perform multilabel classification where a picture can contain both
    an elephant and a dog at the same time.

    For brevity, let `x = logits`, `z = labels`. The logistic loss is

          z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
        = z * -log(1 / (1 + exp(-x))) + (1 - z) * -log(exp(-x) / (1 + exp(-x)))
        = z * log(1 + exp(-x)) + (1 - z) * (-log(exp(-x)) + log(1 + exp(-x)))
        = z * log(1 + exp(-x)) + (1 - z) * (x + log(1 + exp(-x)))
        = (1 - z) * x + log(1 + exp(-x))
        = x - x * z + log(1 + exp(-x))

    For x < 0, to avoid overflow in exp(-x), the above is reformulated as

          x - x * z + log(1 + exp(-x))
        = log(exp(x)) - x * z + log(1 + exp(-x))
        = - x * z + log(1 + exp(x))

    Hence, to ensure stability and avoid overflow, the implementation uses
    this equivalent formulation

        max(x, 0) - x * z + log(1 + exp(-abs(x)))

    `logits` and `labels` must have the same type and shape.

    Args:
      _sentinel: Used to prevent positional parameters. Internal, do not use.
      labels: A `Tensor` of the same type and shape as `logits`.
      logits: A `Tensor` of type `float32` or `float64`.
      name: A name for the operation (optional).

    Returns:
      A `Tensor` of the same shape as `logits` with the componentwise
      logistic losses.

    Raises:
      ValueError: If `logits` and `labels` do not have the same shape.
    """
    # pylint: disable=protected-access
    nn_ops._ensure_xent_args("sigmoid_cross_entropy_with_logits", _sentinel,
                             labels, logits)
    # pylint: enable=protected-access

    with ops.name_scope(name, "logistic_loss", [logits, labels]) as name:
        logits = ops.convert_to_tensor(logits, name="logits")
        labels = ops.convert_to_tensor(labels, name="labels")
        try:
            labels.get_shape().merge_with(logits.get_shape())
        except ValueError:
            raise ValueError("logits and labels must have the same shape (%s vs %s)"
                             % (logits.get_shape(), labels.get_shape()))

        # The stable form is max(x, 0) - x * z + log(1 + exp(-abs(x))).
        # max() and abs() are spelled with `where` on a shared condition to
        # allow computing gradients at zero.
        zero_logits = array_ops.zeros_like(logits, dtype=logits.dtype)
        is_non_negative = (logits >= zero_logits)
        relu_logits = array_ops.where(is_non_negative, logits, zero_logits)
        neg_abs_logits = array_ops.where(is_non_negative, -logits, logits)

        linear_part = relu_logits - logits * labels
        softplus_part = math_ops.log1p(math_ops.exp(neg_abs_logits))
        return math_ops.add(linear_part, softplus_part, name=name)
def triplet_semihard_loss(labels, embeddings, margin=1.0):
    """Computes the triplet loss with semi-hard negative mining.

    For every positive pair (two embeddings with the same label, excluding an
    embedding paired with itself) the positive distance D_ap is compared with
    one negative distance D_an: the smallest negative distance that is larger
    than D_ap (the semi-hard negative) when such a negative exists in the
    mini-batch, otherwise the largest negative distance. The hinge
    `max(margin + D_ap - D_an, 0)` is then averaged over all positive pairs.
    See: https://arxiv.org/abs/1503.03832.

    Args:
      labels: 1-D tf.int32 `Tensor` with shape [batch_size] of multiclass
        integer labels.
      embeddings: 2-D float `Tensor` of embedding vectors. Embeddings should
        be l2 normalized.
      margin: Float, margin term in the loss definition.

    Returns:
      triplet_loss: tf.float32 scalar.
    """
    # Reshape [batch_size] label tensor to a [batch_size, 1] label tensor.
    label_shape = array_ops.shape(labels)
    assert label_shape.shape == 1
    labels = array_ops.reshape(labels, [label_shape[0], 1])

    # Pairwise squared distances between all embeddings.
    pdist_matrix = pairwise_distance(embeddings, squared=True)
    # same_label[i, j] is True when samples i and j share a label; its
    # negation selects the negatives.
    same_label = math_ops.equal(labels, array_ops.transpose(labels))
    different_label = math_ops.logical_not(same_label)

    batch_size = array_ops.size(labels)

    # Stack batch_size copies of the distance matrix and compare each copy
    # against one column of positive distances.
    pdist_tiled = array_ops.tile(pdist_matrix, [batch_size, 1])
    different_label_tiled = array_ops.tile(different_label, [batch_size, 1])
    positive_distances = array_ops.reshape(
        array_ops.transpose(pdist_matrix), [-1, 1])
    farther_negative = math_ops.logical_and(
        different_label_tiled,
        math_ops.greater(pdist_tiled, positive_distances))

    # True where at least one negative lies beyond the positive distance.
    farther_negative_count = math_ops.reduce_sum(
        math_ops.cast(farther_negative, dtype=dtypes.float32), 1,
        keepdims=True)
    has_semi_hard = array_ops.reshape(
        math_ops.greater(farther_negative_count, 0.0),
        [batch_size, batch_size])
    has_semi_hard = array_ops.transpose(has_semi_hard)

    different_label_f = math_ops.cast(different_label, dtype=dtypes.float32)
    farther_negative_f = math_ops.cast(farther_negative, dtype=dtypes.float32)

    # negatives_outside: smallest D_an where D_an > D_ap.
    negatives_outside = array_ops.reshape(
        masked_minimum(pdist_tiled, farther_negative_f),
        [batch_size, batch_size])
    negatives_outside = array_ops.transpose(negatives_outside)

    # negatives_inside: largest D_an.
    negatives_inside = array_ops.tile(
        masked_maximum(pdist_matrix, different_label_f), [1, batch_size])

    semi_hard_negatives = array_ops.where(
        has_semi_hard, negatives_outside, negatives_inside)
    loss_mat = math_ops.add(margin, pdist_matrix - semi_hard_negatives)

    # Positive pairs are same-label pairs minus the diagonal (self pairs).
    # In lifted-struct, the authors multiply 0.5 for upper triangular;
    # in semihard, they take all positive pairs except the diagonal.
    positive_pairs = math_ops.cast(
        same_label, dtype=dtypes.float32) - array_ops.diag(
            array_ops.ones([batch_size]))
    num_positives = math_ops.reduce_sum(positive_pairs)

    hinge_total = math_ops.reduce_sum(
        math_ops.maximum(math_ops.multiply(loss_mat, positive_pairs), 0.0))
    triplet_loss = math_ops.truediv(
        hinge_total, num_positives, name='triplet_semihard_loss')
    return triplet_loss
def focal_loss(target_tensor, prediction_tensor, classes_num, gamma=2.,
               alpha=.25, e=0.1):
    """Class-balanced focal loss blended with a uniform-target cross entropy.

    Args:
      target_tensor: The label tensor, same shape as `prediction_tensor`.
      prediction_tensor: The output tensor with shape [None, 100], where 100
        is the number of classes.
      classes_num: Sample number of each class; its length is used as the
        number of classes.
      gamma: Exponent applied to `target - prediction` on entries whose
        target is > 0.
      alpha: Not read by this implementation; the balancing weights are
        derived from `classes_num` instead.
      e: Blend factor. The result is `(1 - e) * balanced_focal_loss +
        e * categorical_crossentropy(uniform, prediction_tensor)`.

    Returns:
      The blended loss tensor.
    """
    import tensorflow as tf
    from tensorflow.python.ops import array_ops
    from keras import backend as K

    pred_dtype = prediction_tensor.dtype

    # 1. Focal loss with no balanced weight, as presented in the paper,
    #    function (4).
    zeros = array_ops.zeros_like(prediction_tensor, dtype=pred_dtype)
    one_minus_p = array_ops.where(
        tf.greater(target_tensor, zeros),
        target_tensor - prediction_tensor,
        zeros)
    unbalanced_fl = -1 * (one_minus_p**gamma) * tf.log(
        tf.clip_by_value(prediction_tensor, 1e-8, 1.0))

    # 2. Balanced weight: inverse class frequency, scaled to sum to 1.
    classes_weight = array_ops.zeros_like(prediction_tensor, dtype=pred_dtype)
    total_num = float(sum(classes_num))
    inverse_freq = [total_num / count for count in classes_num]
    inverse_freq_sum = sum(inverse_freq)
    scaled_weights = [w / inverse_freq_sum for w in inverse_freq]
    weights_tensor = tf.convert_to_tensor(scaled_weights, dtype=pred_dtype)
    # Adding onto zeros broadcasts the per-class weights to the prediction
    # shape.
    classes_weight = classes_weight + weights_tensor
    balance = array_ops.where(
        tf.greater(target_tensor, zeros), classes_weight, zeros)

    # 3. Balanced focal loss.
    balanced_fl = tf.reduce_mean(balance * unbalanced_fl)

    # 4. Add a cross entropy against a uniform target to prevent overfit.
    #    reference : https://spaces.ac.cn/archives/4493
    nb_classes = len(classes_num)
    uniform_target = K.ones_like(prediction_tensor) / nb_classes
    final_loss = (1 - e) * balanced_fl + e * K.categorical_crossentropy(
        uniform_target, prediction_tensor)
    # gp_loss=tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.ones_like(prediction_tensor)/nb_classes, logits=prediction_tensor)
    # gp_loss=tf.reduce_mean(tf.reduce_sum(gp_loss, axis=1), name='loss')
    # fianal_loss = (1-e) * balanced_fl + e * gp_loss
    return final_loss


# Alternative multi-label (sigmoid) formulation, kept for reference only:
#
# def focal_loss(prediction_tensor, target_tensor, weights=None, alpha=0.25, gamma=2):
#     r"""Compute focal loss for predictions.
#         Multi-labels Focal loss formula:
#             FL = -alpha * (z-p)^gamma * log(p) -(1-alpha) * p^gamma * log(1-p)
#                  ,which alpha = 0.25, gamma = 2, p = sigmoid(x), z = target_tensor.
#     Args:
#      prediction_tensor: A float tensor of shape [batch_size, num_anchors,
#         num_classes] representing the predicted logits for each class
#      target_tensor: A float tensor of shape [batch_size, num_anchors,
#         num_classes] representing one-hot encoded classification targets
#      weights: A float tensor of shape [batch_size, num_anchors]
#      alpha: A scalar tensor for focal loss alpha hyper-parameter
#      gamma: A scalar tensor for focal loss gamma hyper-parameter
#     Returns:
#         loss: A (scalar) tensor representing the value of the loss function
#     """
#     sigmoid_p = tf.nn.sigmoid(prediction_tensor)
#     zeros = array_ops.zeros_like(sigmoid_p, dtype=sigmoid_p.dtype)
#
#     # For positive prediction, only need consider front part loss, back part is 0;
#     # target_tensor > zeros <=> z=1, so positive coefficient = z - p.
#     pos_p_sub = array_ops.where(target_tensor > zeros, target_tensor - sigmoid_p, zeros)
#
#     # For negative prediction, only need consider back part loss, front part is 0;
#     # target_tensor > zeros <=> z=1, so negative coefficient = 0.
#     neg_p_sub = array_ops.where(target_tensor > zeros, zeros, sigmoid_p)
#     per_entry_cross_ent = - alpha * (pos_p_sub ** gamma) * tf.log(tf.clip_by_value(sigmoid_p, 1e-8, 1.0)) \
#                           - (1 - alpha) * (neg_p_sub ** gamma) * tf.log(tf.clip_by_value(1.0 - sigmoid_p, 1e-8, 1.0))
#     return tf.reduce_sum(per_entry_cross_ent)
def streaming_covariance(predictions,
                         labels,
                         weights=None,
                         metrics_collections=None,
                         updates_collections=None,
                         name=None):
    """Computes the unbiased sample covariance between `predictions` and `labels`.

    The `streaming_covariance` function creates four local variables,
    `comoment`, `mean_prediction`, `mean_label`, and `count`, which are used
    to compute the sample covariance between predictions and labels across
    multiple batches of data. The covariance is ultimately returned as an
    idempotent operation that simply divides `comoment` by `count` - 1. We use
    `count` - 1 in order to get an unbiased estimate.

    The algorithm used for this online computation is described in
    https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance.
    Specifically, the formula used to combine two sample comoments is
    `C_AB = C_A + C_B + (E[x_A] - E[x_B]) * (E[y_A] - E[y_B]) * n_A * n_B / n_AB`
    The comoment for a single batch of data is simply
    `sum((x - E[x]) * (y - E[y]))`, optionally weighted.

    If `weights` is not None, then it is used to compute weighted comoments,
    means, and count. NOTE: these weights are treated as "frequency weights",
    as opposed to "reliability weights". See discussion of the difference on
    https://wikipedia.org/wiki/Weighted_arithmetic_mean#Weighted_sample_variance

    To facilitate the computation of covariance across multiple batches of
    data, the function creates an `update_op` operation, which updates
    underlying variables and returns the updated covariance.

    Both returned tensors evaluate to NaN while `count` is less than or equal
    to 1.

    Args:
      predictions: A `Tensor` of arbitrary size.
      labels: A `Tensor` of the same size as `predictions`.
      weights: Optional `Tensor` indicating the frequency with which an
        example is sampled. Rank must be 0, or the same rank as `labels`, and
        must be broadcastable to `labels` (i.e., all dimensions must be either
        `1`, or the same as the corresponding `labels` dimension).
      metrics_collections: An optional list of collections that the metric
        value variable should be added to.
      updates_collections: An optional list of collections that the metric
        update ops should be added to.
      name: An optional variable_scope name.

    Returns:
      covariance: A `Tensor` representing the current unbiased sample
        covariance, `comoment` / (`count` - 1).
      update_op: An operation that updates the local variables appropriately.

    Raises:
      ValueError: If labels and predictions are of different sizes or if
        either `metrics_collections` or `updates_collections` are not a list
        or tuple.
    """
    with variable_scope.variable_scope(name, 'covariance',
                                       (predictions, labels, weights)):
        predictions, labels, weights = metrics_impl._remove_squeezable_dimensions(  # pylint: disable=protected-access
            predictions, labels, weights)
        predictions.get_shape().assert_is_compatible_with(labels.get_shape())

        # Scalar float32 accumulators carried across batches.
        count_ = metric_variable([], dtypes.float32, name='count')
        mean_prediction = metric_variable([], dtypes.float32,
                                          name='mean_prediction')
        mean_label = metric_variable([], dtypes.float32, name='mean_label')
        comoment = metric_variable(  # C_A in update equation
            [], dtypes.float32, name='comoment')

        if weights is None:
            # Unweighted: every element counts once.
            batch_count = math_ops.to_float(
                array_ops.size(labels))  # n_B in eqn
            weighted_predictions = predictions
            weighted_labels = labels
        else:
            # Weighted: the batch "count" is the total weight.
            weights = weights_broadcast_ops.broadcast_weights(weights, labels)
            batch_count = math_ops.reduce_sum(weights)  # n_B in eqn
            weighted_predictions = math_ops.multiply(predictions, weights)
            weighted_labels = math_ops.multiply(labels, weights)

        update_count = state_ops.assign_add(count_, batch_count)  # n_AB in eqn
        # Derived from the result of the assign_add, so this is the count as
        # it was before this batch was added.
        prev_count = update_count - batch_count  # n_A in update equation

        # We update the means by Delta=Error*BatchCount/(BatchCount+PrevCount)
        # batch_mean_prediction is E[x_B] in the update equation
        batch_mean_prediction = _safe_div(
            math_ops.reduce_sum(weighted_predictions), batch_count,
            'batch_mean_prediction')
        delta_mean_prediction = _safe_div(
            (batch_mean_prediction - mean_prediction) * batch_count,
            update_count, 'delta_mean_prediction')
        update_mean_prediction = state_ops.assign_add(mean_prediction,
                                                      delta_mean_prediction)
        # prev_mean_prediction is E[x_A] in the update equation
        prev_mean_prediction = update_mean_prediction - delta_mean_prediction

        # batch_mean_label is E[y_B] in the update equation
        batch_mean_label = _safe_div(math_ops.reduce_sum(weighted_labels),
                                     batch_count, 'batch_mean_label')
        delta_mean_label = _safe_div(
            (batch_mean_label - mean_label) * batch_count, update_count,
            'delta_mean_label')
        update_mean_label = state_ops.assign_add(mean_label, delta_mean_label)
        # prev_mean_label is E[y_A] in the update equation
        prev_mean_label = update_mean_label - delta_mean_label

        # Residual products use the raw (unweighted) values; the weights, if
        # any, are applied once when summing below.
        unweighted_batch_coresiduals = ((predictions - batch_mean_prediction) *
                                        (labels - batch_mean_label))
        # batch_comoment is C_B in the update equation
        if weights is None:
            batch_comoment = math_ops.reduce_sum(unweighted_batch_coresiduals)
        else:
            batch_comoment = math_ops.reduce_sum(unweighted_batch_coresiduals *
                                                 weights)

        # View delta_comoment as = C_AB - C_A in the update equation above.
        # Since C_A is stored in a var, by how much do we need to increment
        # that var to make the var = C_AB?
        delta_comoment = (batch_comoment +
                          (prev_mean_prediction - batch_mean_prediction) *
                          (prev_mean_label - batch_mean_label) *
                          (prev_count * batch_count / update_count))
        update_comoment = state_ops.assign_add(comoment, delta_comoment)

        # Value tensor: reads the variables as they currently are; NaN while
        # count <= 1.
        covariance = array_ops.where(math_ops.less_equal(count_, 1.),
                                     float('nan'),
                                     math_ops.truediv(comoment, count_ - 1),
                                     name='covariance')
        # Same expression, but built under a control dependency on the
        # comoment update so that evaluating it applies the update first.
        with ops.control_dependencies([update_comoment]):
            update_op = array_ops.where(math_ops.less_equal(count_, 1.),
                                        float('nan'),
                                        math_ops.truediv(comoment, count_ - 1),
                                        name='update_op')

    if metrics_collections:
        ops.add_to_collections(metrics_collections, covariance)

    if updates_collections:
        ops.add_to_collections(updates_collections, update_op)

    return covariance, update_op
def body(time, outputs_ta, state, inputs, finished, sequence_lengths,
         bit_num, cur_interval):
    """Internal while_loop body.

    Runs one `decoder.step`, updates the finished flags, sequence lengths and
    encoded-bit counter, and writes this step's outputs into `outputs_ta`.
    `decoder`, `maximum_iterations`, `impute_finished` and `zero_outputs` are
    taken from the enclosing scope.

    Args:
      time: scalar int32 tensor.
      outputs_ta: structure of TensorArray.
      state: (structure of) state tensors and TensorArrays.
      inputs: (structure of) input tensors.
      finished: bool tensor (keeping track of what's finished).
      sequence_lengths: int32 tensor (keeping track of time of finish).
      bit_num: int32 tensor (number of bits encoded so far; this step's count
        is added to it).
      cur_interval: float32 shape=(2) (AC algorithm divide interval).

    Returns:
      `(time + 1, outputs_ta, next_state, next_inputs, next_finished,
        next_sequence_lengths, bit_num, next_cur_interval)`.
    """
    step_results = decoder.step(time, inputs, state, bit_num, cur_interval)
    (next_outputs, decoder_state, next_inputs, decoder_finished,
     num_bits_encoded, next_cur_interval) = step_results

    next_finished = math_ops.logical_or(decoder_finished, finished)
    bit_num += num_bits_encoded
    # if no_hidden_sign==True: no hide
    if maximum_iterations is not None:
        next_finished = math_ops.logical_or(
            next_finished, time + 1 >= maximum_iterations)

    # Entries that finish on this very step get their length set to
    # time + 1; all other entries keep their recorded length.
    just_finished = math_ops.logical_and(
        math_ops.logical_not(finished), next_finished)
    next_sequence_lengths = array_ops.where(
        just_finished,
        array_ops.fill(array_ops.shape(sequence_lengths), time + 1),
        sequence_lengths)

    nest.assert_same_structure(state, decoder_state)
    nest.assert_same_structure(outputs_ta, next_outputs)
    nest.assert_same_structure(inputs, next_inputs)
    nest.assert_same_structure(cur_interval, next_cur_interval)

    def _zero_if_finished(out, zero):
        # Zero out output values past finish.
        return array_ops.where(finished, zero, out)

    def _maybe_copy_state(new, cur):
        # Copy through states past finish. TensorArrays and scalar states
        # get passed through untouched.
        if isinstance(cur, tensor_array_ops.TensorArray):
            return new
        new.set_shape(cur.shape)
        if new.shape.ndims == 0:
            return new
        return array_ops.where(finished, cur, new)

    def _write_step(ta, out):
        return ta.write(time, out)

    if impute_finished:
        emit = nest.map_structure(
            _zero_if_finished, next_outputs, zero_outputs)
        next_state = nest.map_structure(
            _maybe_copy_state, decoder_state, state)
    else:
        emit = next_outputs
        next_state = decoder_state

    outputs_ta = nest.map_structure(_write_step, outputs_ta, emit)
    return (time + 1, outputs_ta, next_state, next_inputs, next_finished,
            next_sequence_lengths, bit_num, next_cur_interval)
def batch_matrix_pow(matrices, powers):
    """Compute powers of matrices, e.g. A^3 = matmul(matmul(A, A), A).

    Uses exponentiation by squaring, with O(log(p)) matrix multiplications to
    compute A^p.

    Args:
      matrices: [batch size x N x N]
      powers: Which integer power to raise each matrix to [batch size]

    Returns:
      The matrices raised to their respective powers, same dimensions as the
      "matrices" argument.
    """

    def _any_power_above_one(current_argument, residual_powers, accumulator):
        """Loop condition: continue while any residual power exceeds 1."""
        del current_argument, accumulator  # not used for condition
        above_one = math_ops.greater(
            residual_powers, array_ops.ones_like(residual_powers))
        return math_ops.reduce_any(above_one)

    def _square_step(current_argument, residual_powers, accumulator):
        """Compute one step of iterative exponentiation by squaring.

        The recursive form is:
          power(A, p) = { power(matmul(A, A), p / 2) for even p
                        { matmul(A, power(matmul(A, A), (p - 1) / 2)) for odd p
          power(A, 0) = I

        The power(A, 0) = I case is handled by starting with the accumulator
        set to the identity matrix.

        Args:
          current_argument: On this step, what is the first argument
            (A^2..^2) to the (unrolled) recursive function?
            [batch size x N x N]
          residual_powers: On this step, what is the second argument
            (residual p)? [batch_size]
          accumulator: Accumulates the exterior multiplications from the odd
            powers (initially the identity matrix). [batch_size x N x N]

        Returns:
          Updated versions of each argument for one step of the unrolled
          computation. Batch entries whose residual power is not greater
          than 1 are returned unchanged.
        """
        is_even = math_ops.equal(
            residual_powers % 2,
            array_ops.zeros(array_ops.shape(residual_powers),
                            dtype=dtypes.int32))
        # Odd powers peel one factor of the current argument off into the
        # accumulator; even powers leave the accumulator alone.
        stepped_accumulator = array_ops.where(
            is_even, accumulator,
            math_ops.matmul(accumulator, current_argument))
        squared_argument = math_ops.matmul(current_argument, current_argument)
        still_active = math_ops.greater(residual_powers, 1)
        halved_powers = (residual_powers - residual_powers % 2) // 2

        # Stop updating if we've reached our base case; some batch elements
        # may finish sooner than others.
        next_accumulator = array_ops.where(
            still_active, stepped_accumulator, accumulator)
        next_argument = array_ops.where(
            still_active, squared_argument, current_argument)
        next_powers = array_ops.where(
            still_active, halved_powers, residual_powers)
        return (next_argument, next_powers, next_accumulator)

    matrices = ops.convert_to_tensor(matrices)
    powers = math_ops.cast(powers, dtype=dtypes.int32)

    # One N x N identity per batch entry, used both as the initial
    # accumulator and as the result for zero powers.
    matrix_dim = array_ops.shape(matrices)[1]
    identity = array_ops.expand_dims(
        array_ops.diag(array_ops.ones([matrix_dim], dtype=matrices.dtype)),
        0)
    batch_dim = array_ops.shape(matrices)[0]
    ident_tiled = array_ops.tile(identity, [batch_dim, 1, 1])

    loop_vars = [matrices, powers, ident_tiled]
    final_argument, final_residual_power, final_accumulator = (
        control_flow_ops.while_loop(
            _any_power_above_one, _square_step, loop_vars))

    # The loop exits with every residual power at 0 or 1 (for non-negative
    # inputs): 0 yields the identity, otherwise one last multiplication
    # folds the remaining factor into the accumulator.
    power_is_zero = math_ops.equal(
        final_residual_power,
        array_ops.zeros_like(final_residual_power, dtype=dtypes.int32))
    return array_ops.where(
        power_is_zero, ident_tiled,
        math_ops.matmul(final_argument, final_accumulator))
def b(i, r):
    """Loop body: advance the counter and conditionally square `r`.

    Where `i < squarings` (`squarings` comes from the enclosing scope) the
    result is `matmul(r, r)`; elsewhere `r` is passed through unchanged.

    Returns:
      The pair `(i + 1, updated r)`.
    """
    next_i = i + 1
    needs_squaring = math_ops.less(i, squarings)
    next_r = array_ops.where(needs_squaring, math_ops.matmul(r, r), r)
    return next_i, next_r
def kernel(target_log_prob_fn,
           current_state,
           step_size,
           num_leapfrog_steps,
           seed=None,
           current_target_log_prob=None,
           current_grads_target_log_prob=None,
           name=None):
    """Runs one iteration of Hamiltonian Monte Carlo.

    Hamiltonian Monte Carlo (HMC) is a Markov chain Monte Carlo (MCMC)
    algorithm that takes a series of gradient-informed steps to produce
    a Metropolis proposal. This function applies one step of HMC to
    randomly update the variable `x`.

    This function can update multiple chains in parallel. It assumes that all
    leftmost dimensions of `current_state` index independent chain states (and
    are therefore updated independently). The output of
    `target_log_prob_fn()` should sum log-probabilities across all event
    dimensions. Slices along the rightmost dimensions may have different
    target distributions; for example, `current_state[0, :]` could have a
    different target distribution from `current_state[1, :]`. This is up to
    `target_log_prob_fn()`. (The number of independent chains is
    `tf.size(target_log_prob_fn(*current_state))`.)

    #### Examples:

    ##### Simple chain with warm-up.

    ```python
    tfd = tf.contrib.distributions

    # Tuning acceptance rates:
    dtype = np.float32
    target_accept_rate = 0.631
    num_warmup_iter = 500
    num_chain_iter = 500

    x = tf.get_variable(name="x", initializer=dtype(1))
    step_size = tf.get_variable(name="step_size", initializer=dtype(1))

    target = tfd.Normal(loc=dtype(0), scale=dtype(1))

    next_x, other_results = hmc.kernel(
        target_log_prob_fn=target.log_prob,
        current_state=x,
        step_size=step_size,
        num_leapfrog_steps=3)

    x_update = x.assign(next_x)

    step_size_update = step_size.assign_add(
        step_size * tf.where(
            tf.exp(tf.minimum(other_results.log_accept_ratio, 0.)) >
                target_accept_rate,
            0.01, -0.01))

    warmup = tf.group([x_update, step_size_update])

    tf.global_variables_initializer().run()

    sess.graph.finalize()  # No more graph building.

    # Warm up the sampler and adapt the step size
    for _ in xrange(num_warmup_iter):
      sess.run(warmup)

    # Collect samples without adapting step size
    samples = np.zeros([num_chain_iter])
    for i in xrange(num_chain_iter):
      _, x_, target_log_prob_, grad_ = sess.run([
          x_update,
          x,
          other_results.current_target_log_prob,
          other_results.current_grads_target_log_prob])
      samples[i] = x_

    print(samples.mean(), samples.std())
    ```

    ##### Sample from more complicated posterior.

    I.e.,

    ```none
      W ~ MVN(loc=0, scale=sigma * eye(dims))
      for i=1...num_samples:
          X[i] ~ MVN(loc=0, scale=eye(dims))
        eps[i] ~ Normal(loc=0, scale=1)
          Y[i] = X[i].T * W + eps[i]
    ```

    ```python
    tfd = tf.contrib.distributions

    def make_training_data(num_samples, dims, sigma):
      dt = np.asarray(sigma).dtype
      zeros = tf.zeros(dims, dtype=dt)
      x = tfd.MultivariateNormalDiag(
          loc=zeros).sample(num_samples, seed=1)
      w = tfd.MultivariateNormalDiag(
          loc=zeros,
          scale_identity_multiplier=sigma).sample(seed=2)
      noise = tfd.Normal(
          loc=dt(0),
          scale=dt(1)).sample(num_samples, seed=3)
      y = tf.tensordot(x, w, axes=[[1], [0]]) + noise
      return y, x, w

    def make_prior(sigma, dims):
      # p(w | sigma)
      return tfd.MultivariateNormalDiag(
          loc=tf.zeros([dims], dtype=sigma.dtype),
          scale_identity_multiplier=sigma)

    def make_likelihood(x, w):
      # p(y | x, w)
      return tfd.MultivariateNormalDiag(
          loc=tf.tensordot(x, w, axes=[[1], [0]]))

    # Setup assumptions.
    dtype = np.float32
    num_samples = 150
    dims = 10
    num_iters = int(5e3)

    true_sigma = dtype(0.5)
    y, x, true_weights = make_training_data(num_samples, dims, true_sigma)

    # Estimate of `log(true_sigma)`.
    log_sigma = tf.get_variable(name="log_sigma", initializer=dtype(0))
    sigma = tf.exp(log_sigma)

    # State of the Markov chain.
    weights = tf.get_variable(
        name="weights",
        initializer=np.random.randn(dims).astype(dtype))

    prior = make_prior(sigma, dims)

    def joint_log_prob_fn(w):
      # f(w) = log p(w, y | x)
      return prior.log_prob(w) + make_likelihood(x, w).log_prob(y)

    weights_update = weights.assign(
        hmc.kernel(target_log_prob_fn=joint_log_prob_fn,
                   current_state=weights,
                   step_size=0.1,
                   num_leapfrog_steps=5)[0])

    with tf.control_dependencies([weights_update]):
      loss = -prior.log_prob(weights)

    optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
    log_sigma_update = optimizer.minimize(loss, var_list=[log_sigma])

    sess.graph.finalize()  # No more graph building.

    tf.global_variables_initializer().run()

    sigma_history = np.zeros(num_iters, dtype)
    weights_history = np.zeros([num_iters, dims], dtype)

    for i in xrange(num_iters):
      _, sigma_, weights_ = sess.run([log_sigma_update, sigma, weights])
      weights_history[i, :] = weights_
      sigma_history[i] = sigma_

    true_weights_ = sess.run(true_weights)

    # Should converge to something close to true_sigma.
    plt.plot(sigma_history);
    plt.ylabel("sigma");
    plt.xlabel("iteration");
    ```

    Args:
      target_log_prob_fn: Python callable which takes an argument like
        `current_state` (or `*current_state` if it's a list) and returns its
        (possibly unnormalized) log-density under the target distribution.
      current_state: `Tensor` or Python `list` of `Tensor`s representing the
        current state(s) of the Markov chain(s). The first `r` dimensions
        index independent chains,
        `r = tf.rank(target_log_prob_fn(*current_state))`.
      step_size: `Tensor` or Python `list` of `Tensor`s representing the step
        size for the leapfrog integrator. Must broadcast with the shape of
        `current_state`. Larger step sizes lead to faster progress, but
        too-large step sizes make rejection exponentially more likely. When
        possible, it's often helpful to match per-variable step sizes to the
        standard deviations of the target distribution in each variable.
      num_leapfrog_steps: Integer number of steps to run the leapfrog
        integrator for. Total progress per HMC step is roughly proportional
        to `step_size * num_leapfrog_steps`.
      seed: Python integer to seed the random number generator.
      current_target_log_prob: (Optional) `Tensor` representing the value of
        `target_log_prob_fn` at the `current_state`. The only reason to
        specify this argument is to reduce TF graph size.
        Default value: `None` (i.e., compute as needed).
      current_grads_target_log_prob: (Optional) Python list of `Tensor`s
        representing gradient of `current_target_log_prob` at the
        `current_state` and wrt the `current_state`. Must have same shape as
        `current_state`. The only reason to specify this argument is to
        reduce TF graph size.
        Default value: `None` (i.e., compute as needed).
      name: Python `str` name prefixed to Ops created by this function.
        Default value: `None` (i.e., "hmc_kernel").

    Returns:
      A two-element Python list `[next_state, kernel_results]`:
      next_state: Tensor or Python list of `Tensor`s representing the
        state(s) of the Markov chain(s) at each result step. Has same shape
        as `current_state`.
      kernel_results: `collections.namedtuple` of internal calculations used
        to advance the chain.

    Raises:
      ValueError: if there isn't one `step_size` or a list with same length
        as `current_state`.
    """
    with ops.name_scope(name, "hmc_kernel", [
            current_state, step_size, num_leapfrog_steps, seed,
            current_target_log_prob, current_grads_target_log_prob
    ]):
        with ops.name_scope("initialize"):
            # Normalize the arguments into per-part lists.
            # NOTE(review): `_prepare_args` is defined elsewhere; presumably
            # it also fills in the log-prob and gradients when they are
            # `None` -- confirm.
            [
                current_state_parts, step_sizes, current_target_log_prob,
                current_grads_target_log_prob
            ] = _prepare_args(target_log_prob_fn,
                              current_state,
                              step_size,
                              current_target_log_prob,
                              current_grads_target_log_prob,
                              maybe_expand=True)
            independent_chain_ndims = distributions_util.prefer_static_rank(
                current_target_log_prob)
            # One standard-normal momentum draw per state part, shaped like
            # that part. The seed is re-derived after every draw; the last
            # derived value is what seeds the accept/reject uniform below.
            current_momentums = []
            for s in current_state_parts:
                current_momentums.append(
                    random_ops.random_normal(shape=array_ops.shape(s),
                                             dtype=s.dtype.base_dtype,
                                             seed=seed))
                seed = distributions_util.gen_new_seed(
                    seed, salt="hmc_kernel_momentums")
            num_leapfrog_steps = ops.convert_to_tensor(
                num_leapfrog_steps,
                dtype=dtypes.int32,
                name="num_leapfrog_steps")
        [
            proposed_momentums,
            proposed_state_parts,
            proposed_target_log_prob,
            proposed_grads_target_log_prob,
        ] = _leapfrog_integrator(current_momentums,
                                 target_log_prob_fn,
                                 current_state_parts,
                                 step_sizes,
                                 num_leapfrog_steps,
                                 current_target_log_prob,
                                 current_grads_target_log_prob)
        energy_change = _compute_energy_change(current_target_log_prob,
                                               current_momentums,
                                               proposed_target_log_prob,
                                               proposed_momentums,
                                               independent_chain_ndims)
        log_accept_ratio = -energy_change
        # Metropolis test carried out in log space:
        # u < exp(log_accept_ratio), where u~Uniform[0,1)
        # ==> log(u) < log_accept_ratio
        random_value = random_ops.random_uniform(
            shape=array_ops.shape(energy_change),
            dtype=energy_change.dtype,
            seed=seed)
        random_negative = math_ops.log(random_value)
        is_accepted = random_negative < log_accept_ratio
        accepted_target_log_prob = array_ops.where(is_accepted,
                                                   proposed_target_log_prob,
                                                   current_target_log_prob)
        # Pick proposed vs. current for every state part and gradient.
        # NOTE(review): `_choose` receives the chain rank, presumably so it
        # can apply `is_accepted` across the trailing event dims -- confirm.
        next_state_parts = [
            _choose(is_accepted,
                    proposed_state_part,
                    current_state_part,
                    independent_chain_ndims)
            for current_state_part, proposed_state_part in zip(
                current_state_parts, proposed_state_parts)
        ]
        accepted_grads_target_log_prob = [
            _choose(is_accepted,
                    proposed_grad,
                    grad,
                    independent_chain_ndims)
            for proposed_grad, grad in zip(proposed_grads_target_log_prob,
                                           current_grads_target_log_prob)
        ]
        # Hand back a bare element when the caller passed a non-list state,
        # and the list of parts otherwise.
        maybe_flatten = lambda x: x if _is_list_like(current_state) else x[0]
        return [
            maybe_flatten(next_state_parts),
            KernelResults(
                log_accept_ratio=log_accept_ratio,
                current_grads_target_log_prob=accepted_grads_target_log_prob,
                current_target_log_prob=accepted_target_log_prob,
                is_accepted=is_accepted,
                proposed_grads_target_log_prob=proposed_grads_target_log_prob,
                proposed_state=maybe_flatten(proposed_state_parts),
                proposed_target_log_prob=proposed_target_log_prob,
            ),
        ]
def training_graph(self, input_data, input_labels, data_spec=None,
                   epoch=None, **tree_kwargs):
    """Builds the TF training graph for every tree of the random forest.

    Each tree is placed on the device chosen by `self.device_assigner`,
    optionally trained on a random subset of the rows (bagging) and/or a
    subset of the features, and its training op is created under a control
    dependency on that tree's initialization op.

    Args:
      input_data: A tensor or SparseTensor or placeholder for input data.
      input_labels: A tensor or placeholder for labels associated with
        input_data.
      data_spec: A list of tf.dtype values specifying the original types of
        each column. Defaults to `[constants.DATA_FLOAT]`.
      epoch: A tensor or placeholder for the epoch the training data comes
        from. Defaults to `[0]`.
      **tree_kwargs: Keyword arguments passed to each tree's training_graph.

    Returns:
      The last op in the random forest training graph: a group named
      'train' over all per-tree training ops.
    """
    if data_spec is None:
        data_spec = [constants.DATA_FLOAT]

    per_tree_ops = []
    for i in range(self.params.num_trees):
        with ops.device(self.device_assigner.get_device(i)):
            # A base seed of 0 is left untouched; any other base seed is
            # offset by the tree index.
            base_seed = self.params.base_random_seed
            seed = base_seed if base_seed == 0 else base_seed + i

            # If using bagging, randomly select some of the input.
            tree_data, tree_labels = input_data, input_labels
            if self.params.bagging_fraction < 1.0:
                # TODO(thomaswc): This does sampling without replacement.
                # Consider also allowing sampling with replacement as an
                # option.
                num_rows = array_ops.slice(
                    array_ops.shape(input_data), [0], [1])
                draws = random_ops.random_uniform(num_rows, seed=seed)
                threshold = (
                    array_ops.ones_like(draws) * self.params.bagging_fraction)
                keep = math_ops.less(draws, threshold)
                kept_rows = array_ops.squeeze(
                    array_ops.where(keep), squeeze_dims=[1])
                # TODO(thomaswc): Calculate out-of-bag data and labels, and
                # store them for use in calculating statistics later.
                tree_data = array_ops.gather(input_data, kept_rows)
                tree_labels = array_ops.gather(input_labels, kept_rows)

            if self.params.bagged_features:
                tree_data = self._bag_features(i, tree_data)

            tree = self.trees[i]
            init_op = tree.tree_initialization()
            with ops.control_dependencies([init_op]):
                tree_epoch = [0] if epoch is None else epoch
                train_op = tree.training_graph(
                    tree_data, tree_labels, seed,
                    data_spec=data_spec,
                    epoch=tree_epoch,
                    **tree_kwargs)
                per_tree_ops.append(train_op)

    return control_flow_ops.group(*per_tree_ops, name='train')
def rotate_transpose(x, shift, name="rotate_transpose"):
    """Circularly moves dims left or right.

    Effectively identical to:

    ```python
    numpy.transpose(x, numpy.roll(numpy.arange(len(x.shape)), shift))
    ```

    When both the rank of `x` and the value of `shift` are statically known,
    the permutation is computed with NumPy while the graph is built.
    Otherwise it is assembled from graph ops and evaluated at runtime.

    Example:

    ```python
    x = ...  # Tensor of shape [1, 2, 3, 4].
    rotate_transpose(x, -1)  # result shape: [2, 3, 4, 1]
    rotate_transpose(x, -2)  # result shape: [3, 4, 1, 2]
    rotate_transpose(x,  1)  # result shape: [4, 1, 2, 3]
    rotate_transpose(x,  2)  # result shape: [3, 4, 1, 2]
    rotate_transpose(x, 7) == rotate_transpose(x, 3)
    rotate_transpose(x, -7) == rotate_transpose(x, -3)
    ```

    Args:
      x: `Tensor`.
      shift: `Tensor`. Number of dimensions to transpose left (shift<0) or
        transpose right (shift>0).
      name: Python `str`. The name to give this op.

    Returns:
      rotated_x: Input `Tensor` with dimensions circularly rotated by shift.

    Raises:
      TypeError: if shift is not integer type.
    """
    with ops.name_scope(name, values=[x, shift]):
        x = ops.convert_to_tensor(x, name="x")
        shift = ops.convert_to_tensor(shift, name="shift")
        # The assertion's result is intentionally not assigned back to
        # `shift`, which preserves its constant-ness.
        check_ops.assert_integer(shift)
        static_shift = tensor_util.constant_value(shift)
        static_rank = x.get_shape().ndims

        if static_rank is None or static_shift is None:
            # Dynamic case. Think of a non-negative amount plus a direction:
            #   shifting left gives   last(x, n-shift) + first(x, shift)
            #   shifting right gives  last(x, shift) + first(x, n-shift)
            # With last(a) == slice(a, n) and first(a) == slice(0, a), both
            # collapse to a single split point
            #   a = cond(shift<0, -shift, n-shift)
            #   result dims = x[a:n] + x[0:a]
            # The shift is reduced modulo the rank so it can be specified
            # independently of the array it is applied to (like python).
            rank = array_ops.rank(x)
            split_at = array_ops.where(math_ops.less(shift, 0),
                                       math_ops.mod(-shift, rank),
                                       rank - math_ops.mod(shift, rank))
            leading = math_ops.range(0, split_at)
            trailing = math_ops.range(split_at, rank)
            dynamic_perm = array_ops.concat([trailing, leading], 0)
            return array_ops.transpose(x, perm=dynamic_perm)

        # Static case: scalars and vectors have nothing to rotate.
        if static_rank < 2:
            return x
        wrapped_shift = np.sign(static_shift) * (
            abs(static_shift) % static_rank)
        if wrapped_shift == 0:
            return x
        static_perm = np.roll(np.arange(static_rank), wrapped_shift)
        return array_ops.transpose(x, perm=static_perm)
def call(self, inputs, states, edge_types, cell_mask, training=True):
    """Run one step of the cell and return an attention-pooled hidden state.

    For each of up to `self.recurrent_size` incoming states the method
    computes GRU-style gate pre-activations, accumulates the masked
    candidate contributions into a single candidate `hh`, and then combines
    the incoming states together with `hh` through a masked softmax.

    Shape notes in this method are carried over from the original author's
    inline annotations; they are not enforced by the code.

    Args:
      inputs: annotated as batch_size x embedding_dim.
      states: sequence of hidden states, each annotated as
        batch_size x embedding_dim; may hold at most `self.recurrent_size`
        entries.
      edge_types: indexed as `edge_types[:, k]` and fed to
        `self.edge_embeddings`.
      cell_mask: annotated as batch_size x recurrent_size; used as the
        condition of `array_ops.where`, so presumably boolean -- TODO
        confirm.
      training: not read anywhere in this method.

    Returns:
      A pair `(h, [h])`, where `h` is annotated as
      batch_size x embedding_dim.

    Raises:
      ValueError: if `len(states)` exceeds `self.recurrent_size`.
    """
    # inputs: batch_size*embedding_dim, states: 4*batch_size*embedding_dim,
    # cell_mask: batch_size*recurrent_size
    batch_size = inputs.shape[0]
    state_size = len(states)
    if state_size > self.recurrent_size:
        raise ValueError("length of states exceeds recurrent_size.")
    if self.use_bias:
        # Row 0 of the bias is applied to the input projection; the
        # remaining rows are indexed per recurrent state below.
        unstacked_biases = array_ops.unstack(self.bias)  # unstacked_biases: (recurrent_size+1)*embedding_dim
        input_bias, recurrent_bias = unstacked_biases[0], unstacked_biases[1:]  # input_bias: (3*embedding_dim), recurrent_bias: recurrent_size*(3*embedding_dim)
    matrix_x = K.dot(inputs, self.kernel)  # matrix_x: batch_size*(3*embedding_dim)
    if self.use_bias:
        matrix_x = K.bias_add(matrix_x, input_bias)
    # Split the input projection into its update (z), reset (r) and
    # candidate (h) parts.
    x_z = matrix_x[:, :self.units]  # x_z: batch_size*embedding_dim
    x_r = matrix_x[:, self.units: 2 * self.units]  # x_r: batch_size*embedding_dim
    x_h = matrix_x[:, 2 * self.units:]  # x_h: batch_size*embedding_dim

    def _expand_mask(mask_t, input_t, fixed_dim=1):
        # Append trailing axes to `mask_t` until its rank matches `input_t`,
        # then tile it over every axis of `input_t` from `fixed_dim` on.
        # mask_t: batch_size*1, input_t: batch_size*embedding_dim
        assert not nest.is_sequence(mask_t)
        assert not nest.is_sequence(input_t)
        rank_diff = len(input_t.shape) - len(mask_t.shape)  # rank_diff: 0 per the original annotation
        for _ in range(rank_diff):
            mask_t = array_ops.expand_dims(mask_t, -1)
        multiples = [1] * fixed_dim + input_t.shape.as_list()[fixed_dim:]  # multiples: [1, embedding_dim]
        return array_ops.tile(mask_t, multiples)

    accumulate_h = array_ops.zeros([batch_size, self.units])  # accumulate_h: batch_size*embedding_dim
    # NOTE(review): accumulate_z_h and accumulate_z are never read again in
    # this method.
    accumulate_z_h = array_ops.zeros([batch_size, self.units])  # accumulate_z_h: batch_size*embedding_dim
    accumulate_z = array_ops.zeros([batch_size, self.units])  # accumulate_z: batch_size*embedding_dim
    # `args` is not defined in this method; presumably a module-level
    # configuration mapping -- TODO confirm.
    loop = 1 if args['ablationD'] else self.recurrent_size
    z_list = []
    h_list = []
    for k in range(loop):
        # edge embedding
        edge_embed = self.edge_embeddings(edge_types[:, k])  # edge_embed: batch_size*embedding_dim
        # mask
        tiled_mask_t = _expand_mask(cell_mask[:, k], edge_embed)  # tiled_mask_t: batch_size*embedding_dim
        # NOTE(review): the masked edge_embed is not read again below.
        edge_embed = array_ops.where(tiled_mask_t, edge_embed, array_ops.ones_like(edge_embed))  # edge_embed: batch_size*embedding_dim
        state = states[k]  # state: batch_size*embedding_dim
        h_list.append(state)
        matrix_inner = K.dot(state, self.recurrent_kernel[k])  # matrix_inner: batch_size*(3*embedding_dim), states[k]: batch_size*embedding_dim
        if self.use_bias:
            matrix_inner = K.bias_add(matrix_inner, recurrent_bias[k])
        recurrent_z = matrix_inner[:, :self.units]  # recurrent_z: batch_size*embedding_dim
        recurrent_r = matrix_inner[:, self.units: 2 * self.units]  # recurrent_r: batch_size*embedding_dim
        # Kept for the softmax attention computed after the loop.
        z_list.append(recurrent_z)
        # NOTE(review): z is computed but not read again in this method.
        z = self.recurrent_activation(x_z + recurrent_z)  # z: batch_size*embedding_dim
        r = self.recurrent_activation(x_r + recurrent_r)  # r: batch_size*embedding_dim
        # Original note: "comment for sum_after".
        recurrent_h = r * matrix_inner[:, 2 * self.units:]  # recurrent_h: batch_size*embedding_dim
        # Masked-out positions contribute zero to the accumulated candidate.
        recurrent_h = array_ops.where(tiled_mask_t, recurrent_h, array_ops.zeros_like(recurrent_h))  # recurrent_h: batch_size*embedding_dim
        accumulate_h = accumulate_h + recurrent_h  # accumulate_h: batch_size*embedding_dim
    # Candidate state from the input projection plus the accumulated
    # recurrent contribution divided by `loop`.
    hh = self.activation(x_h + accumulate_h / loop)  # hh: batch_size*embedding_dim
    h_list.append(hh)  # h_list: input_hidden without linear
    z_list.append(hh)  # z_list: input_hidden after linear
    hidden_bank = tf.transpose(tf.stack(z_list, axis=0), [1, 0, 2])  # hidden_bank: batch_size * (recurrent_size + 1) * embedding_dim
    x_z_temp = tf.tile(tf.expand_dims(x_z, axis=1), [1, hidden_bank.shape[1], 1])  # x_z_temp: batch_size * (recurrent_size + 1) * embedding_dim
    # Attention logits: self.v applied to activation(x_z + bank entry), one
    # logit per bank entry.
    prob_logits = tf.matmul(tf.tile(tf.expand_dims(tf.transpose(self.v, [1, 0]), axis=0), [batch_size, 1, 1]), tf.transpose(self.activation(x_z_temp + hidden_bank), [0, 2, 1]))
    prob_logits = tf.squeeze(tf.transpose(prob_logits, [0, 2, 1]), axis=2)
    # Build the attention mask: one column per cell_mask column plus an
    # always-true column for hh.
    # NOTE(review): when `loop` is 1 the bank holds 2 entries while this
    # mask has cell_mask.shape[1] + 1 columns; these only agree if
    # cell_mask has a single column -- confirm.
    mask_list = []
    cell_mask_slices = tf.split(cell_mask, num_or_size_splits=cell_mask.shape[1], axis=1)
    for tensor in cell_mask_slices:
        mask_list.append(tensor)
    hh_mask = tf.ones([batch_size, 1], dtype=tf.bool)  # hh_mask: batch_size * 1
    mask_list.append(hh_mask)
    new_mask = tf.squeeze(tf.stack(mask_list, axis=1), axis=2)  # new_mask: batch_size * (recurrent_size + 1)
    # Masked-out logits are replaced with -inf before the softmax.
    prob_logits_temp = array_ops.where(new_mask, prob_logits, (-1 * np.ones_like(prob_logits) * np.inf))
    prob_soft = self.softmax(prob_logits_temp)  # prob_soft: batch_size * (recurrent_size + 1)
    prob_soft_temp = tf.tile(tf.expand_dims(prob_soft, axis=2), [1, 1, hidden_bank.shape[2]])  # prob_soft_temp: batch_size * (recurrent_size + 1) * embedding_dim
    output_hidden_bank = tf.transpose(tf.stack(h_list, axis=0), [1, 0, 2])  # output_hidden_bank: batch_size * (recurrent_size + 1) * embedding_dim
    # Attention-weighted sum over the bank axis.
    h = tf.reduce_sum((output_hidden_bank * prob_soft_temp), axis=1)  # h: batch_size * embedding_dim
    return h, [h]
def mean_pairwise_squared_error(
        labels, predictions, weights=1.0, scope=None,
        loss_collection=ops.GraphKeys.LOSSES):
    """Adds a pairwise-errors-squared loss to the training procedure.

    Unlike `mean_squared_error`, which is a measure of the differences
    between corresponding elements of `predictions` and `labels`,
    `mean_pairwise_squared_error` is a measure of the differences between
    pairs of corresponding elements of `predictions` and `labels`.

    For example, if `labels`=[a, b, c] and `predictions`=[x, y, z], three
    pairs of differences are summed to compute the loss:
      loss = [ ((a-b) - (x-y)).^2 + ((a-c) - (x-z)).^2 +
               ((b-c) - (y-z)).^2 ] / 3

    Note that since the inputs are of shape `[batch_size, d0, ... dN]`, the
    corresponding pairs are computed within each batch sample but not across
    samples within a batch. For example, if `predictions` represents a batch
    of 16 grayscale images of dimension [batch_size, 100, 200], then the set
    of pairs is drawn from each image, but not across images.

    `weights` acts as a coefficient for the loss. If a scalar is provided,
    then the loss is simply scaled by the given value. If `weights` is a
    tensor of size `[batch_size]`, then the total loss for each sample of the
    batch is rescaled by the corresponding element in the `weights` vector.

    Args:
      labels: The ground truth output tensor, whose shape must match the
        shape of `predictions`.
      predictions: The predicted outputs, a tensor of size
        `[batch_size, d0, .. dN]` where N+1 is the total number of dimensions
        in `predictions`.
      weights: Coefficients for the loss: a scalar, a tensor of shape
        `[batch_size]` or a tensor whose shape matches `predictions`.
      scope: The scope for the operations performed in computing the loss.
      loss_collection: collection to which the loss will be added.

    Returns:
      A scalar `Tensor` that returns the weighted loss.

    Raises:
      ValueError: If the shape of `predictions` doesn't match that of
        `labels` or if the shape of `weights` is invalid. Also if `labels`
        or `predictions` is None.

    @compatibility(eager)
    The `loss_collection` argument is ignored when executing eagerly.
    Consider holding on to the return value or collecting losses via a
    `tf.keras.Model`.
    @end_compatibility
    """
    if labels is None:
        raise ValueError("labels must not be None.")
    if predictions is None:
        raise ValueError("predictions must not be None.")

    with ops.name_scope(scope, "mean_pairwise_squared_error",
                        (predictions, labels, weights)) as scope:
        weights = math_ops.cast(weights, dtype=dtypes.float32)
        labels = math_ops.cast(labels, dtype=dtypes.float32)
        broadcast_check = weights_broadcast_ops.assert_broadcastable(
            weights, labels)
        # Everything below is created under the broadcast assertion so the
        # check is a control dependency of the loss computation.
        with ops.control_dependencies((broadcast_check,)):
            predictions = math_ops.cast(predictions, dtype=dtypes.float32)
            predictions.get_shape().assert_is_compatible_with(
                labels.get_shape())

            errors = math_ops.subtract(predictions, labels)
            # Reduce over every axis except the leading batch axis.
            non_batch_axes = math_ops.range(1, array_ops.rank(errors))

            squared_error_sum = math_ops.reduce_sum(
                math_ops.square(errors), axis=non_batch_axes, keepdims=True)
            present_count = _num_present(errors, weights, per_batch=True)

            # div_no_nan yields zero wherever the denominator is zero.
            mean_square_term = 2.0 * math_ops.div_no_nan(
                squared_error_sum,
                math_ops.maximum(present_count - 1, 0),
                name="value")

            error_sum = math_ops.reduce_sum(
                errors, axis=non_batch_axes, keepdims=True)
            pair_count = math_ops.multiply(present_count, present_count - 1)
            square_mean_term = 2.0 * math_ops.div_no_nan(
                math_ops.square(error_sum),
                math_ops.maximum(pair_count, 0),
                name="value")

            per_sample_loss = math_ops.multiply(
                mean_square_term - square_mean_term, weights)
            total_loss = math_ops.reduce_sum(per_sample_loss)

            # Report zero instead of the computed sum when nothing is
            # present.
            any_present = math_ops.reduce_sum(present_count) > 0
            mean_loss = array_ops.where(
                any_present,
                total_loss,
                array_ops.zeros_like(total_loss),
                name="value")
            util.add_loss(mean_loss, loss_collection)
            return mean_loss