def per_example_maxent_loss(labels, weights, logits, num_classes, eps=1e-15):
    """Maximum entropy loss for multiclass problems.

    Maximum entropy is a generalization of logistic loss for the case when more
    than 2 classes are present.

    Args:
      labels: Rank 2 (N, 1) or Rank 1 (N) tensor of per-example labels.
      weights: Rank 2 (N, 1) tensor of per-example weights, or None to return
        the unweighted loss.
      logits: Rank 2 (N, K) tensor of per-example predictions, K - num of
        classes.
      num_classes: number of classes in classification task. Used to expand
        label indices into one-hot encodings.
      eps: tolerance, used as a minimum possible value.

    Returns:
      loss: A Rank 2 (N, 1) tensor of per-example maxent loss
      update_op: An update operation to update the loss's internal state
        (always a no-op here).
    """
    labels = math_ops.to_int64(labels)
    # If labels are of rank 1, make them rank 2.
    if len(labels.get_shape()) != 2:
        labels = array_ops.expand_dims(labels, 1)
    # Labels are indices of classes, convert them to one hot encodings.
    target_one_hot = array_ops.one_hot(indices=labels, depth=num_classes)
    labels = math_ops.reduce_sum(input_tensor=target_one_hot, axis=[1])
    # Match the logits dtype so the product below also works for non-float32
    # logits (identical to the previous float32 cast for float32 logits).
    labels = math_ops.cast(labels, logits.dtype)

    # Calculate softmax probabilities for each class. The per-row maximum is
    # subtracted first: softmax is shift-invariant, and exp() of a large raw
    # logit overflows to inf, which used to turn the ratio into NaN.
    max_logits = math_ops.reduce_max(logits, 1, keepdims=True)
    unnormalized_probs = math_ops.exp(math_ops.subtract(logits, max_logits))
    normalizers = math_ops.reduce_sum(unnormalized_probs, 1, keepdims=True)
    softmax_predictions = math_ops.divide(unnormalized_probs,
                                          math_ops.add(normalizers, eps))

    # Pull out the probabilities for real label.
    probs_for_real_class = math_ops.reduce_sum(labels * softmax_predictions, 1)

    # Add handling for values near 0 and 1.
    lower_bound = array_ops.zeros_like(
        probs_for_real_class, dtype=logits.dtype) + eps
    upper_bound = array_ops.ones_like(
        probs_for_real_class, dtype=logits.dtype) - eps

    # Take maximum(eps, pred)
    cond = (probs_for_real_class >= eps)
    probs_for_real_class = array_ops.where(cond, probs_for_real_class,
                                           lower_bound)

    # Take minimum(1-eps, pred)
    cond = (probs_for_real_class <= 1 - eps)
    probs_for_real_class = array_ops.where(cond, probs_for_real_class,
                                           upper_bound)

    unweighted_loss = array_ops.expand_dims(
        -math_ops.log(probs_for_real_class), 1)
    if weights is None:
        return unweighted_loss, control_flow_ops.no_op()
    return unweighted_loss * weights, control_flow_ops.no_op()
def body(i, prev_c, prev_h, actions, log_probs):
    """Run one decoding step and return the updated loop state.

    Relies on names from the enclosing scope (`self`, `lstm`, `w_lstm`,
    `forget_bias`, `attn_w_2`, `attn_mem`, `attn_v`, `device_softmax`,
    `device_go_embedding`, `device_embeddings`, `mode`, `y`).
    NOTE(review): the signature suggests this is a `while_loop` body; confirm
    against the caller.

    Args:
      i: current step index.
      prev_c: LSTM cell state from the previous step.
      prev_h: LSTM hidden state from the previous step.
      actions: array-like object supporting `read`/`write`; receives the
        action chosen at step `i`.
      log_probs: running sum of per-step cross-entropy values.

    Returns:
      Tuple `(i + 1, next_c, next_h, actions, log_probs)`.

    Raises:
      NotImplementedError: if `mode` is not "sample", "greedy" or "target".
    """
    # pylint: disable=g-long-lambda
    # Step 0 feeds the tiled "go" embedding; later steps feed the embedding of
    # the action written at the previous step.
    signal = control_flow_ops.cond(
        math_ops.equal(i, 0),
        lambda: array_ops.tile(device_go_embedding, [self.hparams.num_children, 1]),
        lambda: embedding_ops.embedding_lookup(device_embeddings, actions.read(i - 1))
    )
    if self.hparams.keep_prob is not None:
        signal = nn_ops.dropout(signal, self.hparams.keep_prob)
    next_c, next_h = lstm(signal, prev_c, prev_h, w_lstm, forget_bias)
    # Attention: score each group entry of `attn_mem` against the projected
    # hidden state, softmax the scores, and take the weighted sum of `attn_mem`.
    query = math_ops.matmul(next_h, attn_w_2)
    query = array_ops.reshape(
        query, [self.hparams.num_children, 1, self.hparams.hidden_size])
    query = math_ops.tanh(query + attn_mem)
    query = array_ops.reshape(query, [
        self.hparams.num_children * self.num_groups, self.hparams.hidden_size
    ])
    query = math_ops.matmul(query, attn_v)
    query = array_ops.reshape(query, [self.hparams.num_children, self.num_groups])
    query = nn_ops.softmax(query)
    query = array_ops.reshape(query, [self.hparams.num_children, self.num_groups, 1])
    query = math_ops.reduce_sum(attn_mem * query, axis=1)
    # Concatenate the hidden state with the attention context before the
    # output projection.
    query = array_ops.concat([next_h, query], axis=1)
    logits = math_ops.matmul(query, device_softmax)
    logits /= self.hparams.temperature
    if self.hparams.tanh_constant > 0:
        # Squash logits into (-tanh_constant, tanh_constant).
        logits = math_ops.tanh(logits) * self.hparams.tanh_constant
    if self.hparams.logits_std_noise > 0:
        # Gaussian noise scaled by the RMS of the logits; it is only added
        # while global_step <= stop_noise_step.
        num_in_logits = math_ops.cast(
            array_ops.size(logits), dtype=dtypes.float32)
        avg_norm = math_ops.divide(
            linalg_ops.norm(logits), math_ops.sqrt(num_in_logits))
        logits_noise = random_ops.random_normal(
            array_ops.shape(logits),
            stddev=self.hparams.logits_std_noise * avg_norm)
        logits = control_flow_ops.cond(
            self.global_step > self.hparams.stop_noise_step,
            lambda: logits, lambda: logits + logits_noise)
    # Choose the action for this step according to `mode`.
    if mode == "sample":
        next_y = random_ops.multinomial(logits, 1, seed=self.hparams.seed)
    elif mode == "greedy":
        next_y = math_ops.argmax(logits, 1)
    elif mode == "target":
        # Take column `i` of the provided targets `y`.
        next_y = array_ops.slice(y, [0, i], [-1, 1])
    else:
        raise NotImplementedError
    next_y = math_ops.to_int32(next_y)
    next_y = array_ops.reshape(next_y, [self.hparams.num_children])
    actions = actions.write(i, next_y)
    # Accumulates the cross-entropy of the chosen action, i.e. the negative
    # log-probability under softmax(logits).
    log_probs += nn_ops.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=next_y)
    return i + 1, next_c, next_h, actions, log_probs
def _safe_div(numerator, denominator, name):
    """Divides two values, returning 0 if the denominator is <= 0.

    The division is evaluated on a sanitized denominator (non-positive entries
    replaced by 1), so the unselected branch of the final `tf.where` never
    produces inf/NaN. Without this, a zero denominator makes the gradient of
    the whole expression NaN even though the forward value is 0.

    Args:
      numerator: A real `Tensor`.
      denominator: A real `Tensor`, with dtype matching `numerator`.
      name: Name for the returned op.

    Returns:
      0 if `denominator` <= 0, else `numerator` / `denominator`
    """
    is_positive = math_ops.greater(denominator, 0)
    # Only the positions that are discarded below get the placeholder 1.
    safe_denominator = tf.where(is_positive, denominator,
                                tf.ones_like(denominator))
    return tf.where(is_positive,
                    math_ops.divide(numerator, safe_denominator),
                    tf.zeros_like(numerator),
                    name=name)
def testFloorDivGrad(self):
    """Checks gradients of divide/div and the absent gradient of floordiv."""
    with self.test_session():
        a = variables.Variable(2.)
        b = variables.Variable(4.)
        with self.test_session() as sess:
            sess.run(variables.global_variables_initializer())
            # True division: d(a/b)/da = 1/b, d(a/b)/db = -a/b^2.
            for divide_fn in (math_ops.divide, math_ops.div):
                grads = gradients.gradients(divide_fn(a, b), [a, b])
                self.assertAllEqual([g.eval() for g in grads], [.25, -.125])
            # Floor division reports no gradient for either input.
            floor_grads = gradients.gradients(math_ops.floordiv(a, b), [a, b])
            evaluated = [None if g is None else g.eval() for g in floor_grads]
            self.assertAllEqual(evaluated, [None, None])
def _SegmentMinOrMaxGrad(op, grad):
    """Gradient for SegmentMin and SegmentMax.

    Each segment's incoming gradient is split evenly among the input elements
    equal to that segment's output; every other element receives zero.

    Args:
      op: The SegmentMin/SegmentMax op; `op.inputs[0]` is the data and
        `op.inputs[1]` the segment ids.
      grad: Gradient with respect to the op's output.

    Returns:
      A pair: the gradient w.r.t. the data, and None for the segment ids.
    """
    data = op.inputs[0]
    segment_ids = op.inputs[1]
    zeros = array_ops.zeros_like(data, dtype=data.dtype)

    # Mark the elements that attain their segment's min/max and count them.
    per_element_output = array_ops.gather(op.outputs[0], segment_ids)
    is_selected = math_ops.equal(data, per_element_output)
    selected_as_number = math_ops.cast(is_selected, grad.dtype)
    num_selected = math_ops.segment_sum(selected_as_number, segment_ids)

    # Share each segment's gradient equally among its selected elements.
    per_segment_share = math_ops.divide(grad, num_selected)
    per_element_share = array_ops.gather(per_segment_share, segment_ids)
    return array_ops.where(is_selected, per_element_share, zeros), None
def __call__(self, step):
    """Return the cyclic learning rate for `step`.

    Computes `learning_rate + (max_lr - learning_rate) * max(0, 1 - x)` with
    `x = abs(step / step_size - 2 * cycle + 1)` and
    `cycle = floor(1 + step / (2 * step_size))`. In mode 'triangular2' the
    amplitude is divided by `2 ** (cycle - 1)`; in mode 'exp_range' it is
    multiplied by `gamma ** step`.

    Args:
      step: current step; cast to the dtype of the learning rate.

    Returns:
      A tensor holding the learning rate, in the dtype of
      `self.learning_rate`.
    """
    with ops.name_scope(self.name, "CyclicLearningRate",
                        [self.learning_rate, step]) as name:
        learning_rate = ops.convert_to_tensor(self.learning_rate,
                                              name="learning_rate")
        dtype = learning_rate.dtype
        step = math_ops.cast(step, dtype)
        step_size = math_ops.cast(self.step_size, dtype)
        max_lr = math_ops.cast(self.max_lr, dtype)

        # computing: cycle = floor( 1 + step / ( 2 * step_size ) )
        double_step = math_ops.multiply(2., step_size)
        global_div_double_step = math_ops.divide(step, double_step)
        cycle = math_ops.floor(math_ops.add(1., global_div_double_step))

        # computing: x = abs( step / step_size - 2 * cycle + 1 )
        double_cycle = math_ops.multiply(2., cycle)
        global_div_step = math_ops.divide(step, step_size)
        tmp = math_ops.subtract(global_div_step, double_cycle)
        x = math_ops.abs(math_ops.add(1., tmp))

        # computing: clr = ( max_lr - learning_rate ) * max( 0, 1 - x )
        a1 = math_ops.maximum(0., math_ops.subtract(1., x))
        a2 = math_ops.subtract(max_lr, learning_rate)
        clr = math_ops.multiply(a1, a2)

        if self.mode == 'triangular2':
            # Cast the divisor to `dtype` rather than a fixed float32 so the
            # division also type-checks for non-float32 learning rates.
            clr = math_ops.divide(
                clr,
                math_ops.cast(
                    math_ops.pow(2, math_ops.cast(cycle - 1, tf.int32)),
                    dtype))
        if self.mode == 'exp_range':
            gamma = math_ops.cast(self.gamma, dtype)
            clr = math_ops.multiply(math_ops.pow(gamma, step), clr)
        return math_ops.add(clr, learning_rate, name=name)
def cyclic_lr():
    """Compute the cyclic learning rate from the enclosing scope's values.

    Reads `global_step`, `step_size`, `learning_rate`, `max_lr`, `mode`,
    `gamma` and `name` from the enclosing scope.

    Returns:
      `learning_rate + (max_lr - learning_rate) * max(0, 1 - x)`, where
      `x = abs(global_step / step_size - 2 * cycle + 1)` and
      `cycle = floor(1 + global_step / (2 * step_size))`; the amplitude is
      further scaled when `mode` is 'triangular2' or 'exp_range'.
    """
    # cycle = floor(1 + global_step / (2 * step_size))
    double_step = math_ops.multiply(2., step_size)
    global_div_double_step = math_ops.divide(global_step, double_step)
    cycle = math_ops.floor(math_ops.add(1., global_div_double_step))

    # x = abs(global_step / step_size - 2 * cycle + 1)
    double_cycle = math_ops.multiply(2., cycle)
    global_div_step = math_ops.divide(global_step, step_size)
    tmp = math_ops.subtract(global_div_step, double_cycle)
    x = math_ops.abs(math_ops.add(1., tmp))

    # clr = (max_lr - learning_rate) * max(0, 1 - x)
    a1 = math_ops.maximum(0., math_ops.subtract(1., x))
    a2 = math_ops.subtract(max_lr, learning_rate)
    clr = math_ops.multiply(a1, a2)

    if mode == 'triangular2':
        # Halve the amplitude every cycle. The divisor follows the dtype of
        # `clr` instead of a fixed float32, so other float dtypes work too.
        clr = math_ops.divide(
            clr,
            math_ops.cast(
                math_ops.pow(2, math_ops.cast(cycle - 1, tf.int32)),
                clr.dtype))
    if mode == 'exp_range':
        clr = math_ops.multiply(math_ops.pow(gamma, global_step), clr)
    return math_ops.add(clr, learning_rate, name=name)
def calculate_bboxes_intersection(self, bbox_ref, bboxes):
    """Return, per box, the fraction of its area overlapped by `bbox_ref`.

    Both inputs are transposed so that the first axis indexes the four
    coordinates, read here as (ymin, xmin, ymax, xmax).

    Args:
      bbox_ref: reference box(es).
      bboxes: boxes to score against the reference.

    Returns:
      intersection_area / box_area for each box, or 0 where the box area is
      not positive.
    """
    bboxes = tf.transpose(bboxes)
    bbox_ref = tf.transpose(bbox_ref)

    # Corners of the overlap rectangle.
    top = tf.maximum(bboxes[0], bbox_ref[0])
    left = tf.maximum(bboxes[1], bbox_ref[1])
    bottom = tf.minimum(bboxes[2], bbox_ref[2])
    right = tf.minimum(bboxes[3], bbox_ref[3])

    # Negative extents mean no overlap and are clamped to zero.
    overlap_height = tf.maximum(bottom - top, 0.)
    overlap_width = tf.maximum(right - left, 0.)
    overlap_area = overlap_height * overlap_width

    box_area = (bboxes[2] - bboxes[0]) * (bboxes[3] - bboxes[1])
    has_area = math_ops.greater(box_area, 0)
    ratio = math_ops.divide(overlap_area, box_area)
    return tf.where(has_area, ratio, tf.zeros_like(overlap_area))
def ignore(x, binary_tensor, name=None):
    """Mask `x` with `binary_tensor` and rescale by the kept fraction.

    The kept fraction is `sum(binary_tensor) / size(binary_tensor)`; the result
    is `x / kept_fraction * binary_tensor`.

    Args:
      x: tensor (or value convertible to one) to mask.
      binary_tensor: mask multiplied into `x`; presumably made of 0/1 values,
        verify against the caller.
      name: optional name scope for the created ops.

    Returns:
      A tensor with the static shape of `x`.
    """
    with ops.name_scope(name, "ignore", [x]) as name:
        x = ops.convert_to_tensor(x, name="x")
        kept = math_ops.reduce_sum(binary_tensor)
        # Count elements with `size` and cast the count, instead of asking
        # `shape` for a float `out_type` (only int32/int64 are valid there),
        # so masks with dynamic shapes are supported as well.
        total = math_ops.cast(array_ops.size(binary_tensor), kept.dtype)
        keep_ratio = math_ops.divide(kept, total)
        keep_ratio.get_shape().assert_is_compatible_with(tensor_shape.scalar())
        # The former debug `tf.Session()` + `print` was dropped: it evaluated
        # the ratio at graph-construction time in a throwaway session.
        ret = math_ops.div(x, keep_ratio) * binary_tensor
        ret.set_shape(x.get_shape())
        return ret
def _optimal_step_size(last_step, error_ratio, safety=0.9, ifactor=10.0,
                       dfactor=0.2, order=5):
    """Calculate the optimal size for the next Runge-Kutta step.

    The new step is `last_step / factor`, where
    `factor = error_ratio ** (1 / order) / safety` clamped to the interval
    `[1 / ifactor, 1 / dfactor]`.

    Args:
      last_step: size of the previous step; its dtype is used throughout.
      error_ratio: error measure of the previous step.
      safety: divisor applied to the raw factor.
      ifactor: maximum factor by which the step may grow.
      dfactor: minimum factor to which the step may shrink.
      order: order used for the exponent `1 / order`.

    Returns:
      The proposed size of the next step.
    """
    step_dtype = last_step.dtype
    error_ratio = math_ops.cast(error_ratio, step_dtype)
    exponent = math_ops.cast(1 / order, step_dtype)

    smallest_factor = 1 / ifactor
    largest_factor = 1 / dfactor
    raw_factor = error_ratio**exponent / safety
    factor = math_ops.maximum(
        smallest_factor, math_ops.minimum(raw_factor, largest_factor))
    return math_ops.divide(last_step, factor)
def gen_crossentropy(y_true, y_pred, q=0.7, k=-1.0):
    """Generalized cross entropy, optionally truncated.

    Only the predictions at positions where `y_true == 1` contribute. For each
    such prediction `p` the loss term is `(1 - p**q) / q`.

    Args:
      y_true: labels; entries equal to 1 select the predictions to score.
      y_pred: predictions.
      q: exponent of the loss.
      k: truncation threshold. With `k == -1` no truncation is applied;
        otherwise predictions `<= k` contribute the constant
        `(1 - k**q) / q`.

    Returns:
      The mean of the loss terms over the last axis.
    """
    selected = array_ops.boolean_mask(y_pred, gen_math_ops.equal(y_true, 1))
    # float64 scalars keep the arithmetic below in a single dtype.
    one = np.float64(1.)
    q = np.float64(q)

    def _power_loss():
        # (1 - p^q) / q for every selected prediction.
        return math_ops.divide(
            math_ops.subtract(one, math_ops.pow(selected, q)), q)

    if k == -1:
        return K.mean(_power_loss(), axis=-1)

    # Truncated variant: predictions at or below k get a constant loss. The
    # constant is materialised with `fill` so both branches share a shape.
    k = np.float64(k)
    truncated = array_ops.where(
        gen_math_ops.less_equal(selected, k),
        gen_array_ops.fill(array_ops.shape(selected), (one - k**q) / q),
        _power_loss())
    return K.mean(truncated, axis=-1)
def safe_divide(numerator, denominator, name):
    """Divides two values, returning 0 if the denominator is <= 0.

    Non-positive denominators are replaced by 1 before the division is
    evaluated. The forward result is unchanged, but the branch discarded by
    the outer `tf.where` no longer contains inf/NaN, which would otherwise
    make the gradient NaN for zero denominators.

    Args:
      numerator: A real `Tensor`.
      denominator: A real `Tensor`, with dtype matching `numerator`.
      name: Name for the returned op.

    Returns:
      0 if `denominator` <= 0, else `numerator` / `denominator`
    """
    positive = math_ops.greater(denominator, 0)
    # The placeholder value is only ever used at positions masked out below.
    guarded_denominator = tf.where(positive, denominator,
                                   tf.ones_like(denominator))
    quotient = math_ops.divide(numerator, guarded_denominator)
    return tf.where(positive, quotient, tf.zeros_like(numerator), name=name)
def cyclic_lr():
    """Helper to recompute learning rate; most helpful in eager-mode.

    Reads `global_step`, `step_size`, `learning_rate`, `max_lr`, `mode`,
    `gamma` and `name` from the enclosing scope.

    Returns:
      The cyclic learning rate
      `learning_rate + (max_lr - learning_rate) * max(0, 1 - x)`, with the
      amplitude further scaled when `mode` is 'triangular2' or 'exp_range'.
    """
    # computing: cycle = floor( 1 + global_step / ( 2 * step_size ) )
    double_step = math_ops.multiply(2., step_size)
    global_div_double_step = math_ops.divide(global_step, double_step)
    cycle = math_ops.floor(math_ops.add(1., global_div_double_step))

    # computing: x = abs( global_step / step_size - 2 * cycle + 1 )
    double_cycle = math_ops.multiply(2., cycle)
    global_div_step = math_ops.divide(global_step, step_size)
    tmp = math_ops.subtract(global_div_step, double_cycle)
    x = math_ops.abs(math_ops.add(1., tmp))

    # computing: clr = ( max_lr - learning_rate ) * max( 0, 1 - x )
    a1 = math_ops.maximum(0., math_ops.subtract(1., x))
    a2 = math_ops.subtract(max_lr, learning_rate)
    clr = math_ops.multiply(a1, a2)

    if mode == 'triangular2':
        # The divisor 2 ** (cycle - 1) is cast to the dtype of `clr`; a fixed
        # float32 cast made the division fail for other float dtypes.
        halving = math_ops.pow(2, math_ops.cast(cycle - 1, tf.int32))
        clr = math_ops.divide(clr, math_ops.cast(halving, clr.dtype))
    if mode == 'exp_range':
        clr = math_ops.multiply(math_ops.pow(gamma, global_step), clr)
    return math_ops.add(clr, learning_rate, name=name)
def __call__(self, step):
    """Return the inverse-time-decayed learning rate for `step`.

    Computes `initial_learning_rate / (1 + decay_rate * p)` where
    `p = step / decay_steps`, floored when `self.staircase` is set.

    Args:
      step: current step; cast to the dtype of the initial learning rate.

    Returns:
      The decayed learning rate tensor.
    """
    with ops.name_scope_v2(self.name or "InverseTimeDecay") as name:
        base_rate = ops.convert_to_tensor_v2_with_dispatch(
            self.initial_learning_rate, name="initial_learning_rate")
        dtype = base_rate.dtype
        decay_steps = math_ops.cast(self.decay_steps, dtype)
        decay_rate = math_ops.cast(self.decay_rate, dtype)
        step_in_dtype = math_ops.cast(step, dtype)

        # Number of (possibly fractional) decay periods elapsed.
        periods = step_in_dtype / decay_steps
        if self.staircase:
            periods = math_ops.floor(periods)

        one = math_ops.cast(constant_op.constant(1), dtype)
        scaled_periods = math_ops.multiply(decay_rate, periods)
        denominator = math_ops.add(one, scaled_periods)
        return math_ops.divide(base_rate, denominator, name=name)
def make_grouping_predictions(self, input_layer, reuse=None):
    """Model that predicts grouping (grouping_actions).

    Args:
      input_layer: group_input_layer, a tensor of size
        [1, num_ops, hidden_size].
      reuse: accepted for interface compatibility; the variable scope is
        always opened with `reuse=True` regardless of this value.

    Returns:
      grouping_actions: sampled actions, shape [batch_size, num_ops].
      grouping_log_probs: per-example sum of the sparse softmax cross-entropy
        of the sampled actions.
    """
    hparams = self.hparams
    with variable_scope.variable_scope(hparams.name, reuse=True):
        ff_weights = variable_scope.get_variable("w_grouping_ff")
        softmax_weights = variable_scope.get_variable("w_grouping_softmax")

        batch_size = array_ops.shape(input_layer)[0]
        embedding_dim = array_ops.shape(input_layer)[2]
        num_rows = batch_size * self.num_ops

        # Flatten to one row per op, then apply the two projections.
        flat_input = array_ops.reshape(input_layer, [num_rows, embedding_dim])
        hidden = math_ops.matmul(flat_input, ff_weights)
        logits = math_ops.matmul(hidden, softmax_weights)

        if hparams.logits_std_noise > 0:
            # Noise is scaled by the RMS of the logits and only applied while
            # global_step <= stop_noise_step.
            logit_count = math_ops.cast(array_ops.size(logits),
                                        dtype=dtypes.float32)
            rms = math_ops.divide(linalg_ops.norm(logits),
                                  math_ops.sqrt(logit_count))
            noise = random_ops.random_normal(
                array_ops.shape(logits),
                stddev=hparams.logits_std_noise * rms)
            logits = control_flow_ops.cond(
                self.global_step > hparams.stop_noise_step,
                lambda: logits,
                lambda: logits + noise)

        logits = array_ops.reshape(
            logits, [batch_size * self.num_ops, self.num_groups])

        # Sample one group per op.
        sampled = random_ops.multinomial(logits, 1, seed=hparams.seed)
        sampled = math_ops.to_int32(sampled)
        sampled = array_ops.reshape(sampled, [batch_size, self.num_ops])

        flat_labels = array_ops.reshape(sampled, [-1])
        per_op_xent = nn_ops.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=flat_labels)
        per_op_xent = array_ops.reshape(per_op_xent, [batch_size, -1])
        summed_xent = math_ops.reduce_sum(per_op_xent, 1)

        return sampled, summed_xent
def __call__(self, step):
    """Return the learning rate for `step`, decreasing linearly.

    Up to and including `self.shiftstep` the rate is `self.initlr`. After
    that it is `initlr - rate * (step - shiftstep)` with
    `rate = abs((finallr - initlr) / nsteps)`.
    NOTE(review): the result is not clamped at `finallr`; confirm this is
    intended.

    Args:
      step: current step.

    Returns:
      The learning rate for `step`.
    """
    lr_span = math_ops.subtract(self.finallr, self.initlr)
    per_step_drop = math_ops.abs(math_ops.divide(lr_span, self.nsteps))
    steps_past_shift = math_ops.subtract(step, self.shiftstep)
    decayed = math_ops.subtract(
        self.initlr, math_ops.multiply(per_step_drop, steps_past_shift))
    decay_started = math_ops.greater(step, self.shiftstep)
    return control_flow_ops.cond(
        decay_started, lambda: decayed, lambda: self.initlr)
def __call__(self, step):
    """Return a randomly perturbed learning rate for `step`.

    Computes `initial_learning_rate - sigma * Z / (step + 1)` where `Z` is a
    fresh standard-normal sample; everything is done in float32.

    Args:
      step: current step.

    Returns:
      The perturbed learning rate tensor.
    """
    with tf.name_scope(self.name or "Stolera") as name:
        dtype = tf.dtypes.float32
        base_rate = tf.convert_to_tensor(
            self.initial_learning_rate, dtype=dtype,
            name="initial_learning_rate")
        sigma = math_ops.cast(self.sigma, dtype)

        # Shift by one so that step 0 does not divide by zero.
        shifted_step = math_ops.add(
            math_ops.cast(step, dtype), tf.constant(1, dtype=dtype))

        noise = tf.random.normal([1], mean=0.0, stddev=1.0, dtype=dtype)
        damped_noise = math_ops.divide(noise[0], shifted_step)
        perturbation = math_ops.multiply(sigma, damped_noise)
        return math_ops.subtract(base_rate, perturbation, name=name)
def _MinOrMaxGrad(op, grad):
    """Gradient for Min or Max; the same code serves both.

    The incoming gradient is distributed evenly among all input elements
    equal to the reduced output along the reduction axes.

    Args:
      op: The Min/Max op; `op.inputs[0]` is the input tensor and
        `op.inputs[1]` holds the reduction axes.
      grad: Gradient with respect to the op's output.

    Returns:
      A list: the gradient w.r.t. the input, and None for the axes.
    """
    values = op.inputs[0]
    axes = op.inputs[1]

    # Reshape the output and its gradient so they broadcast against `values`.
    kept_dims_shape = math_ops.reduced_shape(array_ops.shape(values), axes)
    reduced = array_ops.reshape(op.outputs[0], kept_dims_shape)
    grad = array_ops.reshape(grad, kept_dims_shape)

    # Elements equal to the min/max share the gradient when there are ties.
    indicators = math_ops.cast(math_ops.equal(reduced, values), grad.dtype)
    tie_counts = array_ops.reshape(
        math_ops.reduce_sum(indicators, axes), kept_dims_shape)
    return [math_ops.divide(indicators, tie_counts) * grad, None]
def apply_dropout():
    """Apply dropout to `x` and store the sampled mask.

    Reads `rate`, `noise_shape`, `seed`, `x` and `binary_tensor_var` from the
    enclosing scope.

    Returns:
      `x / keep_prob * mask`, computed only after the mask has been assigned
      to `binary_tensor_var`.
    """
    keep_prob = 1 - rate
    # keep_prob + U[0, 1) lies in [keep_prob, 1 + keep_prob); flooring gives 0
    # for values below 1.0 and 1 otherwise.
    uniform_noise = random_ops.random_uniform(
        noise_shape, seed=seed, dtype=x.dtype)
    mask = math_ops.floor(keep_prob + uniform_noise)

    # The control dependency forces the mask to be saved before the output is
    # produced.
    save_mask = binary_tensor_var.assign(mask)
    with tf.control_dependencies([save_mask]):
        return math_ops.divide(x, keep_prob) * mask
def make_grouping_predictions(self, input_layer, reuse=None):
    """Predict a group for every op (grouping_actions).

    Args:
      input_layer: group_input_layer, tensor of size
        [1, num_ops, hidden_size].
      reuse: unused; the variable scope below is always entered with
        `reuse=True`.

    Returns:
      grouping_actions: the sampled group index per op, reshaped to
        [batch_size, num_ops].
      grouping_log_probs: the sparse softmax cross-entropy of those samples,
        summed per batch entry.
    """
    with variable_scope.variable_scope(self.hparams.name, reuse=True):
        w_ff = variable_scope.get_variable("w_grouping_ff")
        w_softmax = variable_scope.get_variable("w_grouping_softmax")

        batch_size = array_ops.shape(input_layer)[0]
        embedding_dim = array_ops.shape(input_layer)[2]

        rows = array_ops.reshape(
            input_layer, [batch_size * self.num_ops, embedding_dim])
        logits = math_ops.matmul(math_ops.matmul(rows, w_ff), w_softmax)

        noise_scale = self.hparams.logits_std_noise
        if noise_scale > 0:
            # stddev of the noise = noise_scale * ||logits|| / sqrt(#logits)
            n_logits = math_ops.cast(
                array_ops.size(logits), dtype=dtypes.float32)
            avg_norm = math_ops.divide(
                linalg_ops.norm(logits), math_ops.sqrt(n_logits))
            logits_noise = random_ops.random_normal(
                array_ops.shape(logits), stddev=noise_scale * avg_norm)
            # Noise is switched off once global_step exceeds stop_noise_step.
            logits = control_flow_ops.cond(
                self.global_step > self.hparams.stop_noise_step,
                lambda: logits,
                lambda: logits + logits_noise)

        logits = array_ops.reshape(
            logits, [batch_size * self.num_ops, self.num_groups])

        drawn = random_ops.multinomial(logits, 1, seed=self.hparams.seed)
        grouping_actions = array_ops.reshape(
            math_ops.to_int32(drawn), [batch_size, self.num_ops])

        xent = nn_ops.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=array_ops.reshape(grouping_actions, [-1]))
        xent = array_ops.reshape(xent, [batch_size, -1])
        grouping_log_probs = math_ops.reduce_sum(xent, 1)

        return grouping_actions, grouping_log_probs
def sample(self, time, outputs, state, name=None):
    """Sample ids for SampleEmbeddingHelper.

    The outputs are treated as logits and sampled from (rather than taking
    the argmax). When a softmax temperature is set, the logits are divided by
    it first.

    Args:
      time: unused.
      outputs: a single `Tensor` of logits.
      state: unused.
      name: unused.

    Returns:
      The sampled ids.

    Raises:
      TypeError: if `outputs` is not a single `Tensor`.
    """
    del time, state  # unused by sample_fn
    if not isinstance(outputs, ops.Tensor):
        raise TypeError("Expected outputs to be a single Tensor, got: %s" %
                        type(outputs))

    temperature = self._softmax_temperature
    if temperature is None:
        logits = outputs
    else:
        logits = math_ops.divide(outputs, temperature)

    return categorical.Categorical(logits=logits).sample(seed=self._seed)
def _UnsortedSegmentMinOrMaxGrad(op, grad):
    """Gradient for UnsortedSegmentMin and UnsortedSegmentMax.

    Each segment's gradient is divided evenly among the input elements that
    equal the segment's output; elements flagged as non-positive by
    `_GatherDropNegatives` never count as selected.

    Args:
      op: The op being differentiated; inputs are (data, segment_ids,
        num_segments).
      grad: Gradient with respect to the op's output.

    Returns:
      A triple: the gradient w.r.t. the data, then None for both
      `segment_ids` and `num_segments`.
    """
    data = op.inputs[0]
    segment_ids = op.inputs[1]
    num_segments = op.inputs[2]

    # Find the elements that attain their segment's min/max.
    gathered_outputs, zero_clipped_indices, is_positive = (
        _GatherDropNegatives(op.outputs[0], segment_ids))
    matches_output = math_ops.equal(data, gathered_outputs)
    is_selected = math_ops.logical_and(matches_output, is_positive)

    # Count the selected elements per segment and split the gradient.
    num_selected = math_ops.unsorted_segment_sum(
        math_ops.cast(is_selected, grad.dtype), segment_ids, num_segments)
    weighted_grads = math_ops.divide(grad, num_selected)

    gathered_grads, _, _ = _GatherDropNegatives(
        weighted_grads, None, zero_clipped_indices, is_positive)
    no_grad = array_ops.zeros_like(gathered_grads)
    return array_ops.where(is_selected, gathered_grads, no_grad), None, None
def _optimal_step_size(last_step, error_ratio, safety=0.9, ifactor=10.0,
                       dfactor=0.2, order=5):
    """Calculate the optimal size for the next Runge-Kutta step.

    The next step is `last_step / factor`, with
    `factor = sqrt(error_ratio) ** (1 / order) / safety` clamped to
    `[1 / ifactor, 1 / dfactor]`. When `error_ratio < 1` the upper clamp is 1
    instead of `1 / dfactor`, so the step is never reduced in that case.

    Args:
      last_step: size of the previous step; its dtype is used throughout.
      error_ratio: scalar error measure of the previous step.
      safety: divisor applied to the raw factor.
      ifactor: maximum factor by which the step may grow.
      dfactor: minimum factor to which the step may shrink.
      order: order used for the exponent `1 / order`.

    Returns:
      The proposed size of the next step.
    """
    step_dtype = last_step.dtype
    error_ratio = math_ops.cast(error_ratio, step_dtype)
    # Both `tf.cond` branches must return the same dtype. They used to return
    # a float64 constant and a bare Python float, which only matched in eager
    # mode; both now follow the dtype of `last_step`.
    effective_dfactor = tf.cond(
        error_ratio < 1,
        lambda: tf.constant(1, dtype=step_dtype),
        lambda: math_ops.cast(dfactor, step_dtype))
    error_ratio = tf.cast(tf.math.sqrt(error_ratio), step_dtype)
    exponent = math_ops.cast(1 / order, step_dtype)
    factor = math_ops.maximum(
        1 / ifactor,
        math_ops.minimum(error_ratio**exponent / safety,
                         1 / effective_dfactor))
    return math_ops.divide(last_step, factor)
def mean_squared_norm_loss(
        labels, predictions, weights=1.0, scope=None,
        loss_collection=ops.GraphKeys.LOSSES,
        reduction=tf.losses.Reduction.SUM_BY_NONZERO_WEIGHTS):
    """Squared error with each residual normalised by `max(label, 1)`.

    The per-element error is `((predictions - labels) / max(labels, 1)) ** 2`,
    which is then handed to `tf.losses.compute_weighted_loss`.

    Args:
      labels: ground-truth values; converted to float.
      predictions: predicted values; converted to float and required to be
        shape-compatible with `labels`.
      weights: weights forwarded to `compute_weighted_loss`.
      scope: optional name scope for the created ops.
      loss_collection: collection forwarded to `compute_weighted_loss`.
      reduction: reduction forwarded to `compute_weighted_loss`.

    Returns:
      The result of `tf.losses.compute_weighted_loss`.
    """
    with ops.name_scope(scope, "mean_squared_norm_loss",
                        (predictions, labels, weights)) as scope:
        predictions = math_ops.to_float(predictions)
        labels = math_ops.to_float(labels)
        predictions.get_shape().assert_is_compatible_with(labels.get_shape())

        # Labels below 1 are normalised by 1 so small labels do not blow up
        # the relative error.
        normaliser = tf.maximum(labels, 1.0)
        residual = math_ops.subtract(predictions, labels)
        squared_relative_error = math_ops.square(
            math_ops.divide(residual, normaliser))
        return tf.losses.compute_weighted_loss(
            squared_relative_error, weights, scope, loss_collection,
            reduction=reduction)
def __call__(self, step):
    """Return a randomly perturbed learning rate for `step`.

    With `dt = 1` this computes
    `initial_learning_rate * dt - sigma * Z / (step + 1) * sqrt(dt)`, where
    `Z` is a fresh standard-normal sample; everything is done in float32.

    Args:
      step: current step.

    Returns:
      The perturbed learning rate tensor.
    """
    with tf.name_scope(self.name or "Dilera") as name:
        dtype = tf.dtypes.float32
        base_rate = tf.convert_to_tensor(
            self.initial_learning_rate, dtype=dtype,
            name="initial_learning_rate")
        sigma = math_ops.cast(self.sigma, dtype)
        step_in_dtype = math_ops.cast(step, dtype)
        dt = tf.constant(1, dtype=dtype)

        # Shift by one so that step 0 does not divide by zero.
        shifted_step = math_ops.add(step_in_dtype, tf.constant(1, dtype=dtype))

        noise = tf.random.normal([1], mean=0.0, stddev=1.0, dtype=dtype)
        damped_noise = math_ops.divide(noise[0], shifted_step)
        scaled_noise = math_ops.multiply(sigma, damped_noise)
        diffusion = math_ops.multiply(scaled_noise, math_ops.sqrt(dt))
        drift = math_ops.multiply(base_rate, dt)
        return math_ops.subtract(drift, diffusion, name=name)
def softmax(logits: ragged_tensor.Ragged, axis=None, name=None):
    """Computes softmax activations along `axis`.

    Equivalent to `exp(logits) / reduce_sum(exp(logits), axis)`, evaluated
    after subtracting the per-slice maximum from the logits so that the
    exponentials stay bounded.

    Args:
      logits: the input logits.
      axis: The dimension softmax is performed on. Defaults to -1, the last
        dimension.
      name: A name for the operation (optional).

    Returns:
      The softmax of `logits` along `axis`.
    """
    axis = -1 if axis is None else axis
    with ops.name_scope(name, 'RaggedSoftmax', [logits]) as name:
        peak = reduce_max(logits, axis=axis, keepdims=True)
        shifted_exp = math_ops.exp(math_ops.subtract(logits, peak))
        normaliser = reduce_sum(shifted_exp, axis=axis, keepdims=True)
        return math_ops.divide(shifted_exp, normaliser)
def normalize_for_graph_lstm(tensor):
    """Normalizes a rank-3 Tensor to the range [-0.5, 0.5].

    The tensor is expected to have shape
    [batch_size, number_of_nodes, output_size]. Per batch entry and output
    dimension, the midpoint between the minimum and maximum over the nodes is
    shifted to zero. All dimensions are then divided by the same factor: the
    largest max-min extent found across the output dimensions.

    Returns:
      The normalized Tensor, and a function that undoes the scaling (but not
      the shift).

    Example usage:
    ```
    normalized_tensor, undo_scaling = normalize_for_graph_lstm(input_tensor)
    normalized_output_tensor = some_op(normalized_tensor)
    output_tensor = undo_scaling(normalized_output_tensor)
    ```
    """
    assert len(tensor.shape) == 3

    # Extent of the data along the node axis, per output dimension.
    upper = math_ops.reduce_max(tensor, axis=1, keepdims=True)
    lower = math_ops.reduce_min(tensor, axis=1, keepdims=True)
    extent = math_ops.subtract(upper, lower)

    # One shared scale per batch entry: the largest extent of any dimension.
    scale = math_ops.reduce_max(extent, axis=2, keepdims=True)

    centred = tensor - lower - extent / 2
    normalized_tensor = math_ops.divide(centred, scale)

    # Full inverse (rescale and shift back); currently not returned.
    def unnormalize(tensor):
        return math_ops.multiply(tensor, scale) + extent / 2 + lower

    # Partial inverse: rescale only, leaving the output centred around 0.
    def undo_scaling(tensor):
        return math_ops.multiply(tensor, scale)

    return normalized_tensor, undo_scaling
def lr_annealing(learning_rate, current_epoch, total_epochs, alpha, beta,
                 name=None):
    """Applies learning rate annealing to the initial learning rate.

    Returns
    `learning_rate * (1 + alpha * current_epoch / total_epochs) ** (-beta)`.

    Args:
      learning_rate: initial learning rate; its dtype is used for all other
        arguments.
      current_epoch: index of the current epoch.
      total_epochs: total number of epochs.
      alpha: multiplier applied to the epoch ratio.
      beta: exponent of the decay.
      name: optional name scope; also names the returned op.

    Returns:
      The annealed learning rate tensor.
    """
    with ops.name_scope(
            name, "Lr_Annealing",
            [learning_rate, current_epoch, total_epochs, alpha, beta]) as name:
        learning_rate = ops.convert_to_tensor(learning_rate,
                                              name="learning_rate")
        dtype = learning_rate.dtype
        current_epoch, total_epochs, alpha, beta = (
            math_ops.cast(value, dtype)
            for value in (current_epoch, total_epochs, alpha, beta))

        progress = math_ops.divide(current_epoch, total_epochs)
        base = math_ops.add(1., math_ops.multiply(alpha, progress))
        decay = math_ops.pow(base, -beta)
        return math_ops.multiply(learning_rate, decay, name=name)
def cosine_similarity(vec_a, vec_b):
    '''
    Computes the cosine similarity between tensors 'vec_a' and 'vec_b'.

    The dot product and both L2 norms are reduced along axis 1, so the inputs
    need at least two dimensions (e.g. one vector per row).
    NOTE(review): earlier documentation described the inputs as rank 1, which
    an axis-1 reduction does not support; confirm the intended shape.

    Arguments:
        vec_a - tensor with the vectors along axis 1.
        vec_b - tensor with the same shape as 'vec_a'.

    Returns:
        cos_sim - tensor of cosine similarities, with axis 1 reduced away.
    '''
    def _l2_norm(vec):
        return linalg_ops.norm(vec, ord=2, axis=1)

    numerator = math_ops.reduce_sum(vec_a * vec_b, axis=1)
    # A small constant in the denominator prevents 0/0 for zero vectors.
    denominator = math_ops.add(_l2_norm(vec_a) * _l2_norm(vec_b), 1e-8)
    return math_ops.divide(numerator, denominator)
def signal_to_noise(y_true, y_pred, mode='snr', data_format=None,
                    epsilon=1e-8):
    '''Signal-to-noise ratio. (metric)

    Returns `10 * log10(signal / noise)` averaged over the entries left after
    reducing the axes given by `get_channels(y_true, data_format)`.
    Arguments:
        mode: (1) snr:  signal = sum( y_true^2 )
              (2) psnr: signal = max( y_true^2 )
              The comparison is case-insensitive; any value other than 'psnr'
              selects snr. In both modes
              noise = sum( (y_true - y_pred)^2 ) + epsilon.
        data_format: forwarded to get_channels to pick the reduced axes.
        epsilon: added to the noise to avoid zero division.
    Input:
        y_true: label tensor.
        y_pred: prediction tensor.
    Output:
        scalar, the mean SNR.
    '''
    reduced_axes = get_channels(y_true, data_format)

    if mode.casefold() == 'psnr':
        reduce_signal = math_ops.reduce_max
    else:
        reduce_signal = math_ops.reduce_sum
    signal = reduce_signal(gen_math_ops.square(y_true), axis=reduced_axes)

    squared_error = gen_math_ops.square(y_true - y_pred)
    noise = math_ops.reduce_sum(squared_error, axis=reduced_axes) + epsilon

    # 10 / ln(10) turns the natural log into decibels.
    to_decibel = (10.0/2.3025851)
    log_ratio = gen_math_ops.log(math_ops.divide(signal, noise))
    return to_decibel*math_ops.reduce_mean(log_ratio)
def _NthElementGrad(op, grad):
    """Return the gradients for NthElement.

    Args:
      op: The NthElementOp for which we need to generate gradients.
      grad: Tensor. The gradients passed to the NthElementOp

    Returns:
      A list of two tensors, the first being the gradient w.r.t. the input,
      the second being the gradient w.r.t. the N (None).
    """
    values = op.inputs[0]
    nth_values = array_ops.expand_dims(op.outputs[0], -1)

    # Flag every element along the last dimension that equals the selected
    # value; when several match, the gradient is shared equally among them.
    indicators = math_ops.cast(math_ops.equal(nth_values, values), grad.dtype)
    grad = array_ops.expand_dims(grad, -1)
    match_counts = array_ops.expand_dims(
        math_ops.reduce_sum(indicators, -1), -1)

    return [math_ops.divide(indicators, match_counts) * grad, None]
def _finish(self, state): var_dtype = self._variables[0].dtype.base_dtype # Update global step. global_step = self._get_global_step(state) update_global_step = state_ops.assign_add(global_step, 1.) # Update the first moment estimate. beta1 = state.get_hyper("beta1", dtype=var_dtype) moment1 = self._get_moment1(state) flat_grad = self._get_flat_grad(state) # moment1_t := beta1 * moment1_{t-1} + (1 - beta1) * flat_grad_t update_moment1 = moment1.assign(beta1 * moment1 + (1. - beta1) * flat_grad) # Update the gradient buffer. window = state.get_hyper("window") grad_buffer = self._get_grad_buffer(state) next_grad_index = math_ops.floormod( math_ops.to_int32(update_global_step - 1.), window) # grad_buffer[(t-1) % window] := moment1_t update_grad_buffer = state_ops.scatter_update(grad_buffer, next_grad_index, update_moment1) # Compute the update step. eps = state.get_hyper("eps", dtype=var_dtype) svd_eps = state.get_hyper("svd_eps", dtype=var_dtype) sigma_eps = state.get_hyper("sigma_eps", dtype=var_dtype) lr = state.get_hyper("lr", dtype=var_dtype) denom = math_ops.sqrt( math_ops.minimum( ops.convert_to_tensor(update_global_step), ops.convert_to_tensor(math_ops.cast(window, dtype=var_dtype)))) moment1_2d = array_ops.expand_dims(update_moment1, -1) # m = grad_buffer^T / sqrt(min(t, window)) # m has shape [model dimension, window], where model dimension is the sum # of the dimensions of the flattened variables. m = array_ops.transpose(math_ops.divide(update_grad_buffer, denom)) # sigma, u, _ = SVD(m^Tm + I * svd_eps) mm = math_ops.matmul(m, m, transpose_a=True) damping = math_ops.cast(linalg_ops.eye(window), dtype=var_dtype) * svd_eps sigma, u, _ = linalg_ops.svd(mm + damping) sigma_sqrt = math_ops.sqrt(sigma) sigma_sqrt_min = math_ops.reduce_min(sigma_sqrt) # sigma_sqrt_inv = 1 / (\sqrt{sigma} + sigma_eps) ^ 3 # We add sigma_eps to alleviate numerical instability. # Note that (m^Tm)^(-3/2) = u diag(sigma_sqrt_inv) u^T. 
sigma_sqrt_inv = math_ops.divide( math_ops.cast(1.0, dtype=var_dtype), math_ops.pow(sigma_sqrt + sigma_eps, 3)) # In full matrix AdaGrad, the update step computes (mm^T)^(-1/2)g, where the # inversion of a model dimension by model dimension matrix is needed. To # speed up this computation we calculate the following instead: # m(m^Tm)^(-3/2)m^T moment1 = m u diag(sigma_sqrt_inv) u^T m^T moment1. new_step = array_ops.expand_dims( array_ops.zeros(flat_grad.get_shape(), dtype=var_dtype), -1) head = math_ops.matmul( m, math_ops.matmul( u, math_ops.matmul( array_ops.diag(sigma_sqrt_inv), math_ops.matmul( u, math_ops.matmul(m, moment1_2d, transpose_a=True), transpose_a=True)))) # When inverting (mm^t)^(1/2), we also add epsilon * I regularization for # degenerate cases. We expand ((mm^t)^(1/2) + epsilon * I)^(-1) using # Woodbury's identity. # For full derivation please see paper at # https://arxiv.org/pdf/1806.02958.pdf tail = moment1_2d - math_ops.matmul( m, math_ops.matmul( u, math_ops.matmul( array_ops.diag( math_ops.divide(math_ops.cast(1.0, dtype=var_dtype), sigma)), math_ops.matmul( u, math_ops.matmul(m, moment1_2d, transpose_a=True), transpose_a=True)))) scaled_tail = math_ops.divide(tail, sigma_sqrt_min) update_new_step = control_flow_ops.cond( sigma_sqrt_min > eps, lambda: math_ops.add(head, scaled_tail), lambda: math_ops.add(new_step, head)) # Update each variable. update_step = [] for var in self._variables: dim = self.shape_dict[var.name] start_index = self.index_dict[var.name] end_index = start_index + dim var_update_correct_shape = array_ops.reshape( update_new_step[start_index:end_index], var.get_shape()) var_updated = state_ops.assign_sub(var, lr * var_update_correct_shape) update_step.append(var_updated) return control_flow_ops.group(update_step)
def testComplexDiv(self):
    """Smoke test: `divide` and `div` evaluate on a complex constant."""
    foo = array_ops.constant([1. + 3.j])
    # `cached_session()` is the replacement for the deprecated
    # `test_session()`.
    with self.cached_session():
        # The results are discarded; the test passes if neither op raises.
        _ = math_ops.divide(foo, 1.).eval()
        _ = math_ops.div(foo, 2.).eval()
def testDivideName(self):
    """Check that the `name` passed to `divide` names the output tensor."""
    # `cached_session()` is the replacement for the deprecated
    # `test_session()`.
    with self.cached_session():
        op = math_ops.divide(
            array_ops.constant(3),
            array_ops.constant(4),
            name="my_cool_divide")
        self.assertEqual(op.name, "my_cool_divide:0")
def weighted_moving_average(value,
                            decay,
                            weight,
                            truediv=True,
                            collections=None,
                            name=None):
    """Compute the weighted moving average of `value`.

    Conceptually the result is
    `moving_average(value * weight) / moving_average(weight)`, where each
    moving average is updated as
    `new_value = decay * old_value + (1 - decay) * update`.

    Two internal, non-trainable variables are kept: one for the moving
    average of `value * weight` and one for the moving average of `weight`.
    No user-visible variable is modified, which is why this signature differs
    from `assign_moving_average`.

    Args:
      value: A numeric `Tensor`.
      decay: A float `Tensor` or float value. The moving average decay.
      weight: `Tensor` holding the current weight. Its shape should be able to
        multiply `value`.
      truediv: Boolean. If true, the ratio is formed with `math_ops.truediv`;
        otherwise `math_ops.divide` is used.
      collections: List of graph collection keys the two internal variables
        are added to. Defaults to `[GraphKeys.GLOBAL_VARIABLES]`.
      name: Optional name of the returned operation. Defaults to
        "WeightedMovingAvg".

    Returns:
      An Operation that updates and returns the weighted moving average.
    """
    var_collections = (
        [ops.GraphKeys.GLOBAL_VARIABLES] if collections is None
        else collections)

    with variable_scope.variable_scope(
            name, "WeightedMovingAvg", [value, weight, decay]) as scope:

        def _zero_accumulator(var_name, like):
            # Non-trainable, zero-initialised variable shaped/typed as `like`.
            return variable_scope.get_variable(
                var_name,
                shape=like.get_shape(),
                dtype=like.dtype,
                initializer=init_ops.zeros_initializer(),
                trainable=False,
                collections=var_collections)

        weighted_value_avg = _zero_accumulator("value_x_weight", value)
        weight_avg = _zero_accumulator("weight", weight)

        numerator = assign_moving_average(
            weighted_value_avg, value * weight, decay, zero_debias=False)
        denominator = assign_moving_average(
            weight_avg, weight, decay, zero_debias=False)

        ratio_fn = math_ops.truediv if truediv else math_ops.divide
        return ratio_fn(numerator, denominator, name=scope.name)
def _embedded_lattices(calibrated_input_tensor,
                       input_dim,
                       output_dim,
                       interpolation_type,
                       monotonic_num_lattices,
                       monotonic_lattice_rank,
                       monotonic_lattice_size,
                       non_monotonic_num_lattices,
                       non_monotonic_lattice_rank,
                       non_monotonic_lattice_size,
                       linear_embedding_calibration_min,
                       linear_embedding_calibration_max,
                       linear_embedding_calibration_num_keypoints,
                       is_monotone=None,
                       lattice_l1_reg=None,
                       lattice_l2_reg=None,
                       lattice_l1_torsion_reg=None,
                       lattice_l2_torsion_reg=None,
                       lattice_l1_laplacian_reg=None,
                       lattice_l2_laplacian_reg=None):
    """Creates an ensemble of lattices with a linear embedding.

    This function constructs the following deep lattice network:
      calibrated_input -> linear_embedding -> calibration -> ensemble of
      lattices.
    The outputs of the lattices in the ensemble are then averaged and a bias
    term is added to make the final prediction.

    The ensemble of lattices consists of two parts: monotonic lattices and
    non-monotonic lattices. The input to the monotonic lattices is an output
    of linear_embedding that contains both monotonic and non-monotonic
    calibrated_input. All inputs to the monotonic lattices are set to be
    monotonic to preserve end-to-end monotonicity in the monotonic feature.
    The input to the non-monotonic lattices is an output of linear_embedding
    that only contains non-monotonic calibrated_input. All inputs to the
    non-monotonic lattices are set to be non-monotonic, since we do not need
    to guarantee monotonicity.

    Args:
      calibrated_input_tensor: [batch_size, input_dim] tensor.
      input_dim: (int) input dimension.
      output_dim: (int) output dimension.
      interpolation_type: defines whether the lattice will interpolate using
        the full hypercube or only the simplex ("hyper-triangle") around the
        point being evaluated. Valid values: 'hypercube' or 'simplex'.
      monotonic_num_lattices: (int) number of monotonic lattices in the
        ensemble lattices layer. Any falsy value (e.g. None) is treated as 0.
      monotonic_lattice_rank: (int) number of inputs to each monotonic lattice
        in the ensemble lattices layer.
      monotonic_lattice_size: (int) lattice cell size for each monotonic
        lattice in the ensemble lattices layer.
      non_monotonic_num_lattices: (int) number of non-monotonic lattices in
        the ensemble lattices layer. Any falsy value (e.g. None) is treated
        as 0.
      non_monotonic_lattice_rank: (int) number of inputs to each non-monotonic
        lattice in the ensemble lattices layer.
      non_monotonic_lattice_size: (int) lattice cell size for each
        non-monotonic lattice in the ensemble lattices layer.
      linear_embedding_calibration_min: (float) a minimum input keypoints
        value for linear_embedding calibration.
      linear_embedding_calibration_max: (float) a maximum input keypoints
        value for linear_embedding calibration.
      linear_embedding_calibration_num_keypoints: (int) a number of keypoints
        for linear_embedding calibration.
      is_monotone: (bool, list of booleans) if is_monotone[k] is true then
        calibrated_input_tensor[:, k] is considered to be a monotonic input.
      lattice_l1_reg: (float) lattice l1 regularization amount.
      lattice_l2_reg: (float) lattice l2 regularization amount.
      lattice_l1_torsion_reg: (float) lattice l1 torsion regularization
        amount.
      lattice_l2_torsion_reg: (float) lattice l2 torsion regularization
        amount.
      lattice_l1_laplacian_reg: (float) lattice l1 laplacian regularization
        amount.
      lattice_l2_laplacian_reg: (float) lattice l2 laplacian regularization
        amount.

    Returns:
      A tuple of (output_tensor, projection_ops, regularization).

    Raises:
      ValueError: If there are no non-monotonic inputs but
        non_monotonic_num_lattices is not zero.
    """
    projections = []
    regularization = None

    # Explicitly normalise empty lattice counts (None, 0, ...) to zero once,
    # so every later check can rely on a plain integer.
    monotonic_num_lattices = monotonic_num_lattices or 0
    non_monotonic_num_lattices = non_monotonic_num_lattices or 0

    # Step 1. Create a linear embedding.
    # The rank may be unset when the matching count is zero, so only multiply
    # when there is at least one lattice of that kind.
    if monotonic_num_lattices:
        monotonic_embedding_dim = (
            monotonic_num_lattices * monotonic_lattice_rank)
    else:
        monotonic_embedding_dim = 0
    if non_monotonic_num_lattices:
        non_monotonic_embedding_dim = (
            non_monotonic_num_lattices * non_monotonic_lattice_rank)
    else:
        non_monotonic_embedding_dim = 0

    if is_monotone is not None:
        is_monotone = tools.cast_to_list(is_monotone, input_dim, 'is_monotone')
    with variable_scope.variable_scope('linear_embedding'):
        packed_results = monotone_linear_layers.split_monotone_linear_layer(
            calibrated_input_tensor,
            input_dim,
            monotonic_embedding_dim,
            non_monotonic_embedding_dim,
            is_monotone=is_monotone)
        (monotonic_output, _, non_monotonic_output, _, proj,
         _) = packed_results
        if proj is not None:
            projections.append(proj)

    # Step 2. Create ensemble of monotonic lattices.
    if monotonic_num_lattices == 0:
        m_lattice_outputs = None
    else:
        with variable_scope.variable_scope('monotonic_lattices'):
            m_lattice_outputs, projs, reg = _ensemble_lattices_layer(
                monotonic_output,
                monotonic_embedding_dim,
                output_dim,
                interpolation_type,
                linear_embedding_calibration_min,
                linear_embedding_calibration_max,
                linear_embedding_calibration_num_keypoints,
                monotonic_num_lattices,
                monotonic_lattice_rank,
                monotonic_lattice_size,
                is_monotone=True,
                l1_reg=lattice_l1_reg,
                l2_reg=lattice_l2_reg,
                l1_torsion_reg=lattice_l1_torsion_reg,
                l2_torsion_reg=lattice_l2_torsion_reg,
                l1_laplacian_reg=lattice_l1_laplacian_reg,
                l2_laplacian_reg=lattice_l2_laplacian_reg)
            if projs:
                projections += projs
            regularization = tools.add_if_not_none(regularization, reg)

    # Step 3. Construct non-monotonic ensembles.
    if non_monotonic_output is None and non_monotonic_num_lattices > 0:
        raise ValueError(
            'All input signals are monotonic but the number of non monotonic '
            'lattices is not zero.')
    if non_monotonic_num_lattices == 0:
        n_lattice_outputs = None
    else:
        with variable_scope.variable_scope('non_monotonic_lattices'):
            n_lattice_outputs, projs, reg = _ensemble_lattices_layer(
                non_monotonic_output,
                non_monotonic_embedding_dim,
                output_dim,
                interpolation_type,
                linear_embedding_calibration_min,
                linear_embedding_calibration_max,
                linear_embedding_calibration_num_keypoints,
                non_monotonic_num_lattices,
                non_monotonic_lattice_rank,
                non_monotonic_lattice_size,
                is_monotone=False,
                l1_reg=lattice_l1_reg,
                l2_reg=lattice_l2_reg,
                l1_torsion_reg=lattice_l1_torsion_reg,
                l2_torsion_reg=lattice_l2_torsion_reg,
                l1_laplacian_reg=lattice_l1_laplacian_reg,
                l2_laplacian_reg=lattice_l2_laplacian_reg)
            if projs:
                projections += projs
            regularization = tools.add_if_not_none(regularization, reg)

    # Step 4. Take average to make a final prediction.
    with variable_scope.variable_scope('ensemble_average'):
        output = variable_scope.get_variable(
            name='ensemble_bias',
            initializer=[0.0] * output_dim,
            dtype=calibrated_input_tensor.dtype)
        # Both ensembles are tested with `is not None` so the presence check
        # is explicit and identical for the two halves.
        if m_lattice_outputs is not None:
            output += math_ops.divide(
                math_ops.add_n(m_lattice_outputs), monotonic_num_lattices)
        if n_lattice_outputs is not None:
            output += math_ops.divide(
                math_ops.add_n(n_lattice_outputs), non_monotonic_num_lattices)

    return (output, projections, regularization)
def _ComputeBatchNormCorrections(context, match, freeze_batch_norm_delay,
                                 fused_batch_norm):
    """Computes batch norm correction params.

    Before batch normalization is frozen:
      We use batch statistics for batch norm.
        correction_scale = sigma_b/sigma_mv
        correction_recip = 1/correction_scale
        correction_offset = 0

    After batch normalization is frozen:
        correction_scale = sigma_b/sigma_mv
        correction_recip = 1
        correction_offset = gamma*(mu_b/sigma_b-mu_mv/sigma_mv).

    Batch norm is frozen if the quantization step >= freeze_batch_norm_delay.
    If freeze_batch_norm_delay is None, batch norm is never frozen.

    The corrections ensure that:
    a) The weights are quantized after scaling by gamma/sigma_mv. This enables
       smoother training as the scaling on the weights changes slowly, rather
       than jump across mini-batches
    b) Changing the values of the corrections allows for one to switch between
       using batch statistics to using moving mean and average, without
       requiring changes to batch_norm

    Args:
      context: The scope under which we look for batch norm params
      match: Object containing required batch norm tensors for correction
        computation.
      freeze_batch_norm_delay: Delay in steps at which computation switches
        from regular batch norm to frozen mean and variance.
      fused_batch_norm: Bool, true if fused batch norm is used.

    Returns:
      A tuple of correction_scale, correction_recip, correction_offset
    """
    g = ops.get_default_graph()
    prefix = '' if not context else context + '/'
    with g.name_scope(prefix + 'batch_norm_correction'):
        recip_sigma_mv = math_ops.rsqrt(
            match.moving_variance_tensor + match.batch_epsilon)
        recip_sigma = math_ops.rsqrt(
            match.variance_tensor + match.batch_epsilon)
        # (1/sigma_mv) / (1/sigma_b) == sigma_b / sigma_mv
        correction_scale = math_ops.divide(
            recip_sigma_mv, recip_sigma, name='scale_compute')
        correction_scale = array_ops.identity(
            correction_scale, name='correction_scale')
        correction_recip = math_ops.reciprocal(
            correction_scale, name='reciprocal_compute')
        correction_offset = math_ops.multiply(
            match.gamma_tensor,
            match.mean_tensor * recip_sigma -
            match.moving_mean_tensor * recip_sigma_mv,
            name='offset_compute')

        if freeze_batch_norm_delay is not None:
            use_mv_avg = math_ops.greater_equal(
                common.CreateOrGetQuantizationStep(),
                freeze_batch_norm_delay,
                name='use_moving_average')
        else:
            use_mv_avg = False

        bn_decay_zero = 0.0
        # The consumer list is captured before the cond below is built,
        # presumably so the cond's own read of the decay tensor is not
        # rerouted onto itself -- keep this ordering.
        bn_decay_mean_consumers = list(match.bn_decay_mean_tensor.consumers())

        bn_decay_mean_out = utils.smart_cond(
            use_mv_avg,
            lambda: bn_decay_zero,
            lambda: match.bn_decay_mean_tensor,
            name='freeze_moving_mean')
        graph_editor.reroute_ts(
            [bn_decay_mean_out], [match.bn_decay_mean_tensor],
            can_modify=bn_decay_mean_consumers)

        # The variance decay is only rerouted on the non-fused path. The
        # explicit `is False` comparison is kept so that only a literal False
        # takes this branch.
        if fused_batch_norm is False:
            bn_decay_var_consumers = list(
                match.bn_decay_var_tensor.consumers())
            bn_decay_var_out = utils.smart_cond(
                use_mv_avg,
                lambda: bn_decay_zero,
                lambda: match.bn_decay_var_tensor,
                name='freeze_moving_var')
            graph_editor.reroute_ts(
                [bn_decay_var_out], [match.bn_decay_var_tensor],
                can_modify=bn_decay_var_consumers)

        correction_recip = utils.smart_cond(
            use_mv_avg,
            lambda: array_ops.ones(correction_scale.shape),
            lambda: correction_recip,
            name='correction_recip')

        correction_offset = utils.smart_cond(
            use_mv_avg,
            lambda: correction_offset,
            lambda: array_ops.zeros(correction_offset.shape),
            name='correction_offset')
    return correction_scale, correction_recip, correction_offset
def testComplexDiv(self):
    """Evaluate `divide` and `div` on a complex constant.

    There are no value assertions; the test passes when neither evaluation
    raises.
    """
    complex_input = array_ops.constant([1. + 3.j])
    with self.cached_session():
        math_ops.divide(complex_input, 1.).eval()
        math_ops.div(complex_input, 2.).eval()