def _show_max_abs(tensor):
  """Trace the largest absolute value found in `tensor`.

  The maximum of `abs(tensor)` is cast to float64, clamped from below at
  zero, and handed to `_print_tensor` together with the original tensor.
  `op_name` and `output_idx` are not parameters of this function; they are
  resolved from the enclosing scope.

  Args:
    tensor: The tensor to summarise.

  Returns:
    Whatever `_print_tensor` returns for the summarised value.
  """
  max_abs = math_ops.reduce_max(math_ops.abs(tensor))
  max_abs_f64 = math_ops.cast(max_abs, dtypes.float64)
  floor = constant_op.constant(0, dtypes.float64)
  clamped = gen_math_ops.maximum(floor, max_abs_f64)
  return _print_tensor(op_name, output_idx, -1, tensor, clamped)
def GraphFn(self, x):
  """Apply twelve broadcasting binary ops to `x` and concatenate the results.

  Every op pairs `x` with a freshly drawn random-normal constant of the same
  dtype. The twelve results are concatenated along the last axis and then
  reshaped to `[2, -1]` under the name "output_0".

  Args:
    x: Input tensor.

  Returns:
    The reshaped concatenation of all twelve results.
  """
  dtype = x.dtype

  def rand_const(*dims):
    # Random-normal constant with the requested shape, in the input's dtype.
    return constant_op.constant(np.random.randn(*dims), dtype=dtype)

  # Each constant is drawn right where it is consumed, so the sequence of
  # random draws and of created ops is unchanged.
  first_half = [
      x / rand_const(3, 1, 1),
      rand_const(3, 1, 1) / x,
      rand_const(1, 3, 1) + x,
      x * rand_const(1, 3, 1),
      x - rand_const(3, 1, 1),
      rand_const(3, 1, 1) - x,
  ]
  second_half = [
      x - rand_const(3, 1),
      rand_const(3, 1) - x,
      gen_math_ops.maximum(x, rand_const(3, 1, 1)),
      gen_math_ops.minimum(rand_const(3, 1), x),
      x * rand_const(3),
      rand_const(1) * x,
  ]
  joined_first = array_ops.concat(first_half, axis=-1)
  joined_second = array_ops.concat(second_half, axis=3)
  merged = array_ops.concat([joined_first, joined_second], axis=-1)
  return gen_array_ops.reshape(merged, [2, -1], name="output_0")
def decrease_loss_scale():
  """Multiply the loss scale by `self._decr_ratio`, floored at 1.

  `self` comes from the enclosing scope.

  Returns:
    A grouped op containing the loss-scale assignment and the op returned by
    `self._reset_stats()`.
  """
  scaled = self._loss_scale * self._decr_ratio
  # Never let the loss scale drop below 1.
  floored = gen_math_ops.maximum(1., scaled)
  assign_op = state_ops.assign(self._loss_scale, floored)
  return control_flow_ops.group(assign_op, self._reset_stats())
def saturate_cast(value, dtype, name=None):
  """Performs a safe saturating cast of `value` to `dtype`.

  The input is cast to `dtype` without any scaling. Where the source type can
  represent values outside the range of the target type, the input is clamped
  to the target range before the cast so that it cannot over- or underflow.

  Args:
    value: A `Tensor`.
    dtype: The desired output `DType`.
    name: A name for the operation (optional).

  Returns:
    `value` safely cast to `dtype`.
  """
  with ops.op_scope([value], name, "saturate_cast") as name:
    value = ops.convert_to_tensor(value, name="value")
    dtype = dtypes.as_dtype(dtype).base_dtype
    source_dtype = value.dtype

    # Clamp only on the side(s) where the source range exceeds the target
    # range; this also covers casts to unsigned types.
    if source_dtype.min < dtype.min:
      lower = ops.convert_to_tensor(dtype.min, dtype=source_dtype, name="min")
      value = gen_math_ops.maximum(value, lower)
    if source_dtype.max > dtype.max:
      upper = ops.convert_to_tensor(dtype.max, dtype=source_dtype, name="max")
      value = gen_math_ops.minimum(value, upper)

    return cast(value, dtype, name=name)
def decr_loss_scale():
  """Scale the loss scale by `self._decr_ratio` (floored at 1) and reset stats.

  `self` comes from the enclosing scope.

  Returns:
    A grouped op containing the assignment and the op from
    `self._reset_stats()`.
  """
  candidate = self._loss_scale * self._decr_ratio
  new_scale = gen_math_ops.maximum(1., candidate)
  update_op = state_ops.assign(self._loss_scale, new_scale)
  # When loss_scale is updated, both good and bad steps are reset.
  reset_op = self._reset_stats()
  return control_flow_ops.group(update_op, reset_op)
def _show_max_abs(tensor):
  """Return the largest absolute value in `tensor` as a float32 tensor of shape [1].

  Args:
    tensor: The tensor to summarise; it is cast to float32 first.

  Returns:
    A tensor reshaped to `[1]` holding `max(0, max(abs(tensor)))`.
  """
  as_float = math_ops.cast(tensor, dtypes.float32)
  peak = math_ops.reduce_max(math_ops.abs(as_float))
  floor = constant_op.constant(0, dtypes.float32)
  peak = gen_math_ops.maximum(floor, peak)
  # The shape has to be 1. Set it if it does not have the information.
  return array_ops.reshape(peak, [1])
def posdef_inv_eig(tensor, identity, damping):
  """Computes inverse(tensor + damping * identity) with eigendecomposition.

  Args:
    tensor: The matrix to invert after damping.
    identity: Matrix that is scaled by `damping` and added to `tensor`.
    damping: Damping coefficient; also used as a floor on the eigenvalues.

  Returns:
    `V / lambda @ V^T`, built from the eigenvectors `V` and the floored
    eigenvalues `lambda` of the damped matrix.
  """
  damped = tensor + damping * identity
  eigvals, eigvecs = linalg_ops.self_adjoint_eig(damped)
  # TODO(GD): it's a little hacky
  eigvals = gen_math_ops.maximum(eigvals, damping)
  scaled_vecs = eigvecs / eigvals
  return math_ops.matmul(scaled_vecs, eigvecs, transpose_b=True)
def _show_max_abs(tensor):
  """Summarise `tensor` by its largest absolute value.

  Args:
    tensor: The tensor to summarise; it is cast to float32 before reduction.

  Returns:
    A float32 tensor reshaped to `[1]` containing the maximum of
    `abs(tensor)`, clamped from below at zero.
  """
  tensor_f32 = math_ops.cast(tensor, dtypes.float32)
  magnitudes = math_ops.abs(tensor_f32)
  largest = math_ops.reduce_max(magnitudes)
  zero = constant_op.constant(0, dtypes.float32)
  clamped = gen_math_ops.maximum(zero, largest)
  # The shape has to be 1. Set it if it does not have the information.
  return array_ops.reshape(clamped, [1])
def gen_non_linearity(A, non_linearity):
  """Apply the activation selected by `non_linearity` to `A`.

  Args:
    A: Input tensor.
    non_linearity: One of "tanh", "sigmoid", "relu", "quantTanh" or
      "quantSigm". Any other value falls back to tanh.

  Returns:
    The activated tensor.
  """

  def clamp(t, low, high):
    # Clip from above first, then from below.
    return gen_math_ops.maximum(gen_math_ops.minimum(t, high), low)

  if non_linearity == "tanh":
    return math_ops.tanh(A)
  if non_linearity == "sigmoid":
    return math_ops.sigmoid(A)
  if non_linearity == "relu":
    return gen_math_ops.maximum(A, 0.0)
  if non_linearity == "quantTanh":
    return clamp(A, -1.0, 1.0)
  if non_linearity == "quantSigm":
    return clamp((A + 1.0) / 2.0, 0.0, 1.0)
  # Unrecognised names fall back to tanh.
  return math_ops.tanh(A)
def get_range_len(start, limit, delta):
  """Compute the number of steps of size `delta` needed to cover `limit - start`.

  This is a ceiling division of `limit - start` by `delta`, with the result
  clamped so that it is never negative.

  Args:
    start: Start of the range.
    limit: End of the range.
    delta: Step size.

  Returns:
    A tensor holding the non-negative length.
  """
  span = ops.convert_to_tensor(limit - start)
  whole_steps = span // delta
  # A non-zero remainder means one more (partial) step is needed.
  has_partial_step = gen_math_ops.not_equal(
      span % delta, array_ops.zeros_like(whole_steps))
  length = whole_steps + math_ops.cast(has_partial_step, span.dtype)
  return gen_math_ops.maximum(length, array_ops.zeros_like(length))
def sequence_mask(lengths, maxlen=None, dtype=dtypes.bool, name=None):
  """Returns a mask tensor representing the first N positions of each cell.

  If `lengths` has shape `[d_1, d_2, ..., d_n]` the resulting tensor `mask` has
  dtype `dtype` and shape `[d_1, d_2, ..., d_n, maxlen]`, with

  ```
  mask[i_1, i_2, ..., i_n, j] = (j < lengths[i_1, i_2, ..., i_n])
  ```

  Examples:

  ```python
  tf.sequence_mask([1, 3, 2], 5)  # [[True, False, False, False, False],
                                  #  [True, True, True, False, False],
                                  #  [True, True, False, False, False]]

  tf.sequence_mask([[1, 3],[2,0]])  # [[[True, False, False],
                                    #   [True, True, True]],
                                    #  [[True, True, False],
                                    #   [False, False, False]]]
  ```

  Args:
    lengths: integer tensor, all its values <= maxlen.
    maxlen: scalar integer tensor, size of last dimension of returned tensor.
      Default is the maximum value in `lengths`.
    dtype: output type of the resulting tensor.
    name: name of the op.

  Returns:
    A mask tensor of shape `lengths.shape + (maxlen,)`, cast to specified dtype.

  Raises:
    ValueError: if `maxlen` is not a scalar.
  """
  with ops.name_scope(name, "SequenceMask", [lengths, maxlen]):
    lengths = ops.convert_to_tensor(lengths)

    if maxlen is not None:
      maxlen = ops.convert_to_tensor(maxlen)
    else:
      # Default to the largest length, but never below zero.
      maxlen = gen_math_ops._max(lengths, _all_dimensions(lengths))
      maxlen = gen_math_ops.maximum(constant(0, maxlen.dtype), maxlen)

    if maxlen.get_shape().ndims is not None and maxlen.get_shape().ndims != 0:
      raise ValueError("maxlen must be scalar for sequence_mask")

    # Compare a range row vector of size maxlen, e.g. [0, 1, 2, 3, 4], with
    # the lengths laid out as a trailing column, e.g. [[1], [3], [2]].
    # Broadcasting both operands yields a (len(lengths), maxlen) matrix.
    range_start = constant(0, maxlen.dtype)
    range_step = constant(1, maxlen.dtype)
    positions = gen_math_ops._range(range_start, maxlen, range_step)

    # Since maxlen >= max(lengths), it is safe to use maxlen as a cast
    # authoritative type. Whenever maxlen fits into tf.int32, so do the lengths.
    length_column = gen_math_ops.cast(expand_dims(lengths, -1), maxlen.dtype)
    mask = positions < length_column

    needs_cast = dtype is not None and not (
        mask.dtype.base_dtype == dtype.base_dtype)
    if needs_cast:
      return gen_math_ops.cast(mask, dtype)
    return mask
def loss_op(self, targets, prediction_ops):
  """Create loss_op.

  The loss is the negative log-likelihood of `targets` under a two-component
  mixture (normal data and anomaly), averaged over the number of elements in
  `targets`.

  Args:
    targets: Observed values.
    prediction_ops: Dict providing "mean" and "covariance"; it is also passed
      on to `self._anomaly_log_prob`.

  Returns:
    The scalar loss tensor.
  """
  mean = prediction_ops["mean"]
  variance = prediction_ops["covariance"]

  # Normal data log probability (variance floored at 1e-5 before the sqrt).
  std = math_ops.sqrt(gen_math_ops.maximum(variance, 1e-5))
  normal_term = math_utils.normal_log_prob(targets, std, mean)
  normal_term += math_ops.log(1 - self._anomaly_prior_probability)

  # Anomaly log probability.
  anomaly_term = self._anomaly_log_prob(targets, prediction_ops)
  anomaly_term += math_ops.log(self._anomaly_prior_probability)

  # log(exp(a) + exp(b)) is rewritten as max + log(1 + exp(min - max)) for
  # numerical stability.
  smaller = gen_math_ops.minimum(normal_term, anomaly_term)
  larger = gen_math_ops.maximum(normal_term, anomaly_term)
  mixture_log_prob = larger + math_ops.log(
      1 + gen_math_ops.exp(smaller - larger))

  total = -math_ops.reduce_sum(mixture_log_prob)
  element_count = math_ops.cast(
      math_ops.reduce_prod(array_ops.shape(targets)), self.dtype)
  total /= element_count
  return total
def loss_op(self, targets, prediction_ops):
  """Create loss_op.

  Computes the mean negative log-likelihood of `targets` under a mixture of a
  normal-data component and an anomaly component.

  Args:
    targets: Observed values.
    prediction_ops: Dict with "mean" and "covariance" entries; forwarded to
      `self._anomaly_log_prob` as well.

  Returns:
    The scalar loss tensor.
  """
  predicted_mean = prediction_ops["mean"]
  predicted_cov = prediction_ops["covariance"]

  # Normal data log probability.
  floored_cov = gen_math_ops.maximum(predicted_cov, 1e-5)
  sigma = math_ops.sqrt(floored_cov)
  log_p_normal = math_utils.normal_log_prob(targets, sigma, predicted_mean)
  log_p_normal += math_ops.log(1 - self._anomaly_prior_probability)

  # Anomaly log probability.
  log_p_anomaly = self._anomaly_log_prob(targets, prediction_ops)
  log_p_anomaly += math_ops.log(self._anomaly_prior_probability)

  # We need log(exp(log_p_normal) + exp(log_p_anomaly)). For numerical
  # stability, factor out the larger of the two terms.
  low = gen_math_ops.minimum(log_p_normal, log_p_anomaly)
  high = gen_math_ops.maximum(log_p_normal, log_p_anomaly)
  mixed = high + math_ops.log(1 + gen_math_ops.exp(low - high))

  loss = -math_ops.reduce_sum(mixed)
  loss /= math_ops.cast(
      math_ops.reduce_prod(array_ops.shape(targets)), self.dtype)
  return loss
def _anomaly_log_prob(self, targets, prediction_ops):
  """Log probability of `targets` under the configured anomaly distribution.

  Args:
    targets: Observed values.
    prediction_ops: Dict providing "mean" and "anomaly_params". The latter is
      used as a variance for the Gaussian case and as a scale for the Cauchy
      case.

  Returns:
    The log-probability tensor.
  """
  mean = prediction_ops["mean"]

  if self._anomaly_distribution == AnomalyMixtureARModel.GAUSSIAN_ANOMALY:
    variance = prediction_ops["anomaly_params"]
    # Floor the variance at 1e-5 before taking the square root.
    sigma = math_ops.sqrt(gen_math_ops.maximum(variance, 1e-5))
    return math_utils.normal_log_prob(targets, sigma, mean)

  assert self._anomaly_distribution == AnomalyMixtureARModel.CAUCHY_ANOMALY
  scale = prediction_ops["anomaly_params"]
  return math_utils.cauchy_log_prob(targets, scale, mean)
def _anomaly_log_prob(self, targets, prediction_ops):
  """Evaluate the anomaly component's log probability for `targets`.

  Args:
    targets: Observed values.
    prediction_ops: Dict with "mean" and "anomaly_params" entries.

  Returns:
    Log probability under a Gaussian (with "anomaly_params" as variance) or a
    Cauchy (with "anomaly_params" as scale), depending on
    `self._anomaly_distribution`.
  """
  center = prediction_ops["mean"]
  is_gaussian = (
      self._anomaly_distribution == AnomalyMixtureARModel.GAUSSIAN_ANOMALY)

  if not is_gaussian:
    assert self._anomaly_distribution == AnomalyMixtureARModel.CAUCHY_ANOMALY
    cauchy_scale = prediction_ops["anomaly_params"]
    return math_utils.cauchy_log_prob(targets, cauchy_scale, center)

  gaussian_variance = prediction_ops["anomaly_params"]
  floored_variance = gen_math_ops.maximum(gaussian_variance, 1e-5)
  gaussian_sigma = math_ops.sqrt(floored_variance)
  return math_utils.normal_log_prob(targets, gaussian_sigma, center)
def gen_non_linearity(A, non_linearity):
  """Apply the activation selected by `non_linearity` to `A`.

  Args:
    A: Input tensor.
    non_linearity: Either one of the names "tanh", "sigmoid", "relu",
      "quantTanh", "quantSigm", "quantSigm4", or a callable that is applied
      to `A` directly.

  Returns:
    The activated tensor.

  Raises:
    ValueError: If `non_linearity` is neither a known name nor a callable.
  """
  if non_linearity == "tanh":
    return math_ops.tanh(A)
  elif non_linearity == "sigmoid":
    return math_ops.sigmoid(A)
  elif non_linearity == "relu":
    return gen_math_ops.maximum(A, 0.0)
  elif non_linearity == "quantTanh":
    return gen_math_ops.maximum(gen_math_ops.minimum(A, 1.0), -1.0)
  elif non_linearity == "quantSigm":
    A = (A + 1.0) / 2.0
    return gen_math_ops.maximum(gen_math_ops.minimum(A, 1.0), 0.0)
  elif non_linearity == "quantSigm4":
    A = (A + 2.0) / 4.0
    return gen_math_ops.maximum(gen_math_ops.minimum(A, 1.0), 0.0)

  # non_linearity is a user specified function.
  if not callable(non_linearity):
    # The message is built from adjacent literals: the previous `+ +`
    # concatenation applied unary plus to a str and raised TypeError before
    # the ValueError could be constructed.
    raise ValueError(
        "non_linearity is either a callable or a value in "
        "['tanh', 'sigmoid', 'relu', 'quantTanh', 'quantSigm', "
        "'quantSigm4']")
  return non_linearity(A)
def get_updates(self, loss, params):
  """Build the update ops for one optimisation step.

  Creates first/second moment accumulators (and `vhat` accumulators when
  `self.amsgrad` is set), derives a per-parameter step size that is clamped
  between `lower_bound` and `upper_bound`, and records the assign ops.

  Args:
    loss: Passed through to `self.get_gradients`.
    params: Parameters to update; gradients are obtained for them from
      `self.get_gradients`.

  Returns:
    `self.updates`, the list of assign ops created here.
  """
  grads = self.get_gradients(loss, params)
  self.updates = []

  lr = self.lr
  # Optional decay of the base learning rate with the iteration count.
  if self.initial_decay > 0:
    lr = lr * ( 1. / (1. + self.decay * math_ops.cast(self.iterations,K.dtype(self.decay))) )

  # The step count is read under a control dependency on the increment of
  # `self.iterations`.
  # NOTE(review): the source formatting was lost; only the `t = ...` line is
  # assumed to belong to this `with` block -- confirm against the original.
  with ops.control_dependencies([state_ops.assign_add(self.iterations, 1)]):
    t = math_ops.cast(self.iterations, K.floatx())
  # Bias-correction factor built from beta_1 and beta_2 at step t.
  lr_t = gen_math_ops.sqrt(1. - math_ops.pow(self.beta_2, t)) / (1. - math_ops.pow(self.beta_1, t))

  # Bounds applied to the per-parameter step size below; both depend on t.
  lower_bound = self.lr_boost * (1. - 1. / (self.gamma * t + 1.))
  upper_bound = self.lr_boost * (1. + 1. / (self.gamma * t))
  # Weight given to the current gradient in the first-moment update.
  if self.sgdcorr:
    m_rate = 1. - self.beta_1 / (self.gamma * t + 1.)
  else:
    m_rate = 1. - self.beta_1

  ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
  vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
  if self.amsgrad:
    vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
  else:
    # Placeholders so that the zip below has the same arity in both modes.
    vhats = [K.zeros(1) for _ in params]
  self.weights = [self.iterations] + ms + vs + vhats

  for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
    m_t = (self.beta_1 * m) + m_rate * g
    v_t = (self.beta_2 * v) + (1. - self.beta_2) * math_ops.square(g)
    if self.amsgrad:
      # Use the running maximum of the second moment for the denominator.
      vhat_t = math_ops.maximum(vhat, v_t)
      lr_v = gen_math_ops.reciprocal(gen_math_ops.sqrt(vhat_t) + self.epsilon)
      self.updates.append(state_ops.assign(vhat, vhat_t))
    else:
      lr_v = gen_math_ops.reciprocal(gen_math_ops.sqrt(v_t) + self.epsilon)

    # Clamp the per-parameter step size into [lower_bound, upper_bound].
    lr_bound = gen_math_ops.minimum(gen_math_ops.maximum(lr_t * lr_v, lower_bound), upper_bound)
    p_t = p - lr * lr_bound * m_t

    self.updates.append(state_ops.assign(m, m_t))
    self.updates.append(state_ops.assign(v, v_t))
    new_p = p_t

    # Apply constraints.
    if getattr(p, 'constraint', None) is not None:
      new_p = p.constraint(new_p)

    self.updates.append(state_ops.assign(p, new_p))
  return self.updates
def loss_op(self, targets, prediction_ops):
  """Create loss_op.

  Depending on `self.loss` this is either a negative normal log-likelihood or
  a sum of squared errors, in both cases divided by the number of elements in
  `targets`.

  Args:
    targets: Observed values.
    prediction_ops: Dict with a "mean" entry and, for the likelihood loss, a
      "covariance" entry.

  Returns:
    The scalar loss tensor.
  """
  mean = prediction_ops["mean"]

  if self.loss == ARModel.NORMAL_LIKELIHOOD_LOSS:
    variance = prediction_ops["covariance"]
    # Floor the variance at 1e-5 before taking the square root.
    std = math_ops.sqrt(gen_math_ops.maximum(variance, 1e-5))
    target_dist = distributions.Normal(loc=targets, scale=std)
    total = -math_ops.reduce_sum(target_dist.log_prob(mean))
  else:
    assert self.loss == ARModel.SQUARED_LOSS, self.loss
    total = math_ops.reduce_sum(math_ops.square(mean - targets))

  element_count = math_ops.cast(
      math_ops.reduce_prod(array_ops.shape(targets)), total.dtype)
  total /= element_count
  return total
def _unsorted_segment_N(data, segment_ids, num_segments):
  """Helper function for unsorted_segment_mean/_sqrtN.

  Counts the entries of each segment and replaces zero counts by 1 so that
  the result can be used as a divisor.

  Args:
    data: Tensor whose dtype and rank determine the dtype and the broadcast
      shape of the result.
    segment_ids: Segment id for each entry.
    num_segments: Number of segments.

  Returns:
    Per-segment counts (at least 1), reshaped to `[num_segments, 1, ..., 1]`.
  """
  # bincount doesn't support negative indices so we use unsorted_segment_sum
  ids_shape = array_ops.shape_internal(segment_ids)
  unit_weights = array_ops.ones(ids_shape, dtype=data.dtype)
  counts = gen_math_ops.unsorted_segment_sum(
      unit_weights, segment_ids, num_segments)

  # Add one singleton dimension for every non-reduced axis of `data`.
  trailing_dims = data.shape.ndims - segment_ids.shape.ndims
  counts = array_ops.reshape(counts, [num_segments] + [1] * trailing_dims)
  return gen_math_ops.maximum(counts, 1)
def loss_op(self, targets, prediction_ops):
  """Create loss_op.

  Uses a negative normal log-likelihood when `self.loss` is
  `ARModel.NORMAL_LIKELIHOOD_LOSS`, otherwise a sum of squared errors. The
  result is divided by the number of elements in `targets`.

  Args:
    targets: Observed values.
    prediction_ops: Dict with "mean" and, for the likelihood loss,
      "covariance".

  Returns:
    The scalar loss tensor.
  """
  predicted = prediction_ops["mean"]
  use_likelihood = self.loss == ARModel.NORMAL_LIKELIHOOD_LOSS

  if not use_likelihood:
    assert self.loss == ARModel.SQUARED_LOSS, self.loss
    loss = math_ops.reduce_sum(math_ops.square(predicted - targets))
  else:
    covariance = prediction_ops["covariance"]
    floored = gen_math_ops.maximum(covariance, 1e-5)
    sigma = math_ops.sqrt(floored)
    normal = distributions.normal.Normal(loc=targets, scale=sigma)
    loss = -math_ops.reduce_sum(normal.log_prob(predicted))

  num_elements = math_ops.reduce_prod(array_ops.shape(targets))
  loss /= math_ops.cast(num_elements, loss.dtype)
  return loss
def __sample_w_rej(self, n, seed):
  """Set up the coefficients for `self.__while_loop` and run it.

  Stores the blended coefficient in `self.__b` and the two loop outputs in
  `self.__e` and `self.__w`.

  Args:
    n: Forwarded to `self.__while_loop`.
    seed: Forwarded to `self.__while_loop`.

  Returns:
    `self.__w` as produced by the loop.
  """
  root = math_ops.sqrt((4 * (self.scale**2)) + (self.__mf - 1)**2)
  b_exact = (-2 * self.scale + root) / (self.__mf - 1)

  # Taylor approximation, blended in smoothly for 10 < scale < 11 to avoid
  # numerical errors for large scale.
  b_taylor = (self.__mf - 1) / (4 * self.scale)
  blend = gen_math_ops.minimum(gen_math_ops.maximum(0., self.scale - 10), 1.)
  b = b_taylor * blend + b_exact * (1 - blend)

  a = (self.__mf - 1 + 2 * self.scale + root) / 4
  d = (4 * a * b) / (1 + b) - (self.__mf - 1) * math_ops.log(self.__mf - 1)

  # Run the loop before touching any attribute, as the original single
  # tuple assignment did.
  loop_result = self.__while_loop(b, a, d, n, seed)
  self.__b = b
  self.__e, self.__w = loop_result
  return self.__w
def GetParams(self):
  """Testing Concatenation in TF-TRT conversion.

  Builds a graph that applies twelve broadcasting binary ops to a
  `[2, 3, 3, 1]` float32 placeholder, concatenates the results and reshapes
  them to `[2, -1]`.

  Returns:
    A `TfTrtIntegrationTestParams` describing the graph, its input and its
    output.
  """
  dtype = dtypes.float32
  input_name = "input"
  input_dims = [2, 3, 3, 1]
  output_name = "output"

  def rand_const(*dims):
    # Random-normal constant with the requested shape.
    return constant_op.constant(np.random.randn(*dims), dtype=dtype)

  g = ops.Graph()
  with g.as_default():
    x = array_ops.placeholder(dtype=dtype, shape=input_dims, name=input_name)

    # Each constant is drawn right where it is consumed, so the sequence of
    # random draws and of created ops is unchanged.
    first_half = [
        x / rand_const(3, 1, 1),
        rand_const(3, 1, 1) / x,
        rand_const(1, 3, 1) + x,
        x * rand_const(1, 3, 1),
        x - rand_const(3, 1, 1),
        rand_const(3, 1, 1) - x,
    ]
    second_half = [
        x - rand_const(3, 1),
        rand_const(3, 1) - x,
        gen_math_ops.maximum(x, rand_const(3, 1, 1)),
        gen_math_ops.minimum(rand_const(3, 1), x),
        x * rand_const(3),
        rand_const(1) * x,
    ]
    joined_first = array_ops.concat(first_half, axis=-1)
    joined_second = array_ops.concat(second_half, axis=3)
    merged = array_ops.concat([joined_first, joined_second], axis=-1)
    gen_array_ops.reshape(merged, [2, -1], name=output_name)

  return trt_test.TfTrtIntegrationTestParams(
      gdef=g.as_graph_def(),
      input_names=[input_name],
      input_dims=[input_dims],
      output_names=[output_name],
      expected_output_dims=[(2, 126)])
def _anomaly_log_prob(self, targets, prediction_ops):
  """Log probability of the prediction under the anomaly distribution.

  The distribution is centred on `targets` and evaluated at the predicted
  mean.

  Args:
    targets: Observed values, used as the distribution's location.
    prediction_ops: Dict with "mean" and "anomaly_params" entries. The latter
      is a variance in the Gaussian case and a scale in the Cauchy case.

  Returns:
    The log-probability tensor.
  """
  mean = prediction_ops["mean"]

  if self._anomaly_distribution == AnomalyMixtureARModel.GAUSSIAN_ANOMALY:
    variance = prediction_ops["anomaly_params"]
    # Floor the variance at 1e-5 before taking the square root.
    sigma = math_ops.sqrt(gen_math_ops.maximum(variance, 1e-5))
    gaussian = distributions.Normal(loc=targets, scale=sigma)
    return gaussian.log_prob(mean)

  assert self._anomaly_distribution == AnomalyMixtureARModel.CAUCHY_ANOMALY
  scale = prediction_ops["anomaly_params"]
  # The Cauchy distribution is expressed as a Student-t with df = 1.
  one_df = array_ops.ones([], dtype=scale.dtype)
  cauchy = distributions.StudentT(df=one_df, loc=targets, scale=scale)
  return cauchy.log_prob(mean)
def _anomaly_log_prob(self, targets, prediction_ops):
  """Evaluate the anomaly component's log probability at the predicted mean.

  Args:
    targets: Observed values, used as the location of the distribution.
    prediction_ops: Dict with "mean" and "anomaly_params" entries.

  Returns:
    Log probability under a Normal (with "anomaly_params" as variance) or a
    Student-t with one degree of freedom (with "anomaly_params" as scale),
    depending on `self._anomaly_distribution`.
  """
  center = prediction_ops["mean"]
  is_gaussian = (
      self._anomaly_distribution == AnomalyMixtureARModel.GAUSSIAN_ANOMALY)

  if not is_gaussian:
    assert self._anomaly_distribution == AnomalyMixtureARModel.CAUCHY_ANOMALY
    cauchy_scale = prediction_ops["anomaly_params"]
    student_t = distributions.StudentT(
        df=array_ops.ones([], dtype=cauchy_scale.dtype),
        loc=targets,
        scale=cauchy_scale)
    return student_t.log_prob(center)

  gaussian_variance = prediction_ops["anomaly_params"]
  floored_variance = gen_math_ops.maximum(gaussian_variance, 1e-5)
  gaussian_sigma = math_ops.sqrt(floored_variance)
  normal = distributions.Normal(loc=targets, scale=gaussian_sigma)
  return normal.log_prob(center)
def GetParams(self):
  """Testing Concatenation in TF-TRT conversion.

  Builds a graph that applies twelve broadcasting binary ops to a
  `[2, 3, 3, 1]` float32 placeholder, concatenates the results and reshapes
  them to `[2, -1]` under `self.output_name`.

  Returns:
    A `TfTrtIntegrationTestParams` describing the graph, the expected engine
    and output dimensions, and the comparison tolerances.
  """
  dtype = dtypes.float32
  input_name = "input"
  input_dims = [2, 3, 3, 1]

  def rand_const(*dims):
    # Random-normal constant with the requested shape.
    return constant_op.constant(np.random.randn(*dims), dtype=dtype)

  g = ops.Graph()
  with g.as_default():
    x = array_ops.placeholder(dtype=dtype, shape=input_dims, name=input_name)

    # Each constant is drawn right where it is consumed, so the sequence of
    # random draws and of created ops is unchanged.
    first_half = [
        x / rand_const(3, 1, 1),
        rand_const(3, 1, 1) / x,
        rand_const(1, 3, 1) + x,
        x * rand_const(1, 3, 1),
        x - rand_const(3, 1, 1),
        rand_const(3, 1, 1) - x,
    ]
    second_half = [
        x - rand_const(3, 1),
        rand_const(3, 1) - x,
        gen_math_ops.maximum(x, rand_const(3, 1, 1)),
        gen_math_ops.minimum(rand_const(3, 1), x),
        x * rand_const(3),
        rand_const(1) * x,
    ]
    joined_first = array_ops.concat(first_half, axis=-1)
    joined_second = array_ops.concat(second_half, axis=3)
    merged = array_ops.concat([joined_first, joined_second], axis=-1)
    gen_array_ops.reshape(merged, [2, -1], name=self.output_name)

  return trt_test.TfTrtIntegrationTestParams(
      gdef=g.as_graph_def(),
      input_names=[input_name],
      input_dims=[input_dims],
      expected_engines=["my_trt_op_0"],
      expected_output_dims=(2, 126),
      allclose_atol=1.e-03,
      allclose_rtol=1.e-03)
def _apply_dense(self, grad, var):
  """Apply one AMSGrad step to `var` for a dense gradient.

  Updates the "first_mom", "second_mom" and "second_mom_max" slots of `var`
  and subtracts the resulting step from `var`. Every assignment honours
  `self._use_locking`.

  Args:
    grad: Gradient tensor for `var`.
    var: The variable to update.

  Returns:
    A grouped op that runs the variable update together with the three slot
    updates.
  """
  # bias-corrected learning rate
  lr = self._lr_t * math_ops.sqrt(1. - self._beta2_power) / (
      1. - self._beta1_power)

  first_mom = self.get_slot(var, "first_mom")
  second_mom = self.get_slot(var, "second_mom")
  second_mom_max = self.get_slot(var, "second_mom_max")

  first_update = first_mom.assign(
      self._beta1_t * first_mom + self._one_minus_beta1 * grad,
      use_locking=self._use_locking)
  second_update = second_mom.assign(
      self._beta2_t * second_mom
      + self._one_minus_beta2 * math_ops.square(grad),
      use_locking=self._use_locking)

  # AMSGrad compared to ADAM: keep the running maximum of the second moment.
  # `use_locking` is passed here as well, so that this assignment follows the
  # same locking policy as the other slot updates.
  second_max_update = second_mom_max.assign(
      gen_math_ops.maximum(second_mom_max, second_update),
      use_locking=self._use_locking)

  var_update = var.assign_sub(
      lr * first_update / (math_ops.sqrt(second_max_update) + self._epsilon_t),
      use_locking=self._use_locking)

  return control_flow_ops.group(
      *[var_update, first_update, second_update, second_max_update])
def clip_covariance(covariance_matrix, maximum_variance_ratio,
                    minimum_variance):
  """Enforce constraints on a covariance matrix to improve numerical stability.

  Args:
    covariance_matrix: A [..., N, N] batch of covariance matrices.
    maximum_variance_ratio: The maximum allowed ratio of two diagonal entries.
      Any entries lower than the maximum entry divided by this ratio will be
      set to that value.
    minimum_variance: A floor for diagonal entries in the returned matrix.

  Returns:
    A new covariance matrix with the requested constraints enforced. If the
    input was positive definite, the output will be too.
  """
  # TODO(allenl): Smarter scaling here so that correlations are preserved when
  # fiddling with diagonal elements.
  diag = array_ops.matrix_diag_part(covariance_matrix)
  largest = math_ops.reduce_max(diag, axis=-1, keep_dims=True)
  # Raise small entries up to (largest entry / allowed ratio) first ...
  ratio_floor = largest / maximum_variance_ratio
  ratio_clipped = gen_math_ops.maximum(diag, ratio_floor)
  # ... then apply the absolute floor.
  clipped = math_ops.maximum(ratio_clipped, minimum_variance)
  return array_ops.matrix_set_diag(covariance_matrix, clipped)
def posdef_inv_eig(tensor, identity, damping):
  """Computes inverse(tensor + damping * identity) with eigendecomposition.

  The eigendecomposition is run on a double-precision copy of the damped
  matrix and the results are converted back to float afterwards.

  Args:
    tensor: The matrix to invert after damping.
    identity: Matrix that is scaled by `damping` and added to `tensor`.
    damping: Damping coefficient; also used as a floor on the eigenvalues.

  Returns:
    `V / lambda @ V^T`, built from the eigenvectors `V` and the floored
    eigenvalues `lambda`.
  """
  # Earlier notes in this function recorded that calling self_adjoint_eig
  # directly on the input did not work, whereas pinning it to '/cpu:0' or
  # (as done here) going through double precision did.
  damped_double = tf.to_double(tensor + damping * identity)
  eigvals_double, eigvecs_double = linalg_ops.self_adjoint_eig(damped_double)
  eigvals = tf.to_float(eigvals_double)
  eigvecs = tf.to_float(eigvecs_double)

  # TODO(GD): it's a little hacky
  eigvals = gen_math_ops.maximum(eigvals, damping)
  scaled_vecs = eigvecs / eigvals
  return math_ops.matmul(scaled_vecs, eigvecs, transpose_b=True)
def clip_covariance(
    covariance_matrix, maximum_variance_ratio, minimum_variance):
  """Enforce constraints on a covariance matrix to improve numerical stability.

  Args:
    covariance_matrix: A [..., N, N] batch of covariance matrices.
    maximum_variance_ratio: The maximum allowed ratio of two diagonal entries.
      Any entries lower than the maximum entry divided by this ratio will be
      set to that value.
    minimum_variance: A floor for diagonal entries in the returned matrix.

  Returns:
    A new covariance matrix with the requested constraints enforced. If the
    input was positive definite, the output will be too.
  """
  # TODO(allenl): Smarter scaling here so that correlations are preserved when
  # fiddling with diagonal elements.
  variances = array_ops.matrix_diag_part(covariance_matrix)
  peak_variance = math_ops.reduce_max(variances, axis=-1, keep_dims=True)
  # First bound the spread of the diagonal, then apply the absolute floor.
  bounded = gen_math_ops.maximum(
      variances, peak_variance / maximum_variance_ratio)
  floored = math_ops.maximum(bounded, minimum_variance)
  return array_ops.matrix_set_diag(covariance_matrix, floored)
def repeat_with_axis(data, repeats, axis, name=None):
  """Repeats elements of `data`.

  Args:
    data: An `N`-dimensional tensor.
    repeats: A 1-D integer tensor specifying how many times each element in
      `axis` should be repeated.  `len(repeats)` must equal `data.shape[axis]`.
      Supports broadcasting from a scalar value.
    axis: `int`.  The axis along which to repeat values.  Must be less than
      `max(N, 1)`.
    name: A name for the operation.

  Returns:
    A tensor with `max(N, 1)` dimensions.  Has the same shape as `data`,
    except that dimension `axis` has size `sum(repeats)`.

  Raises:
    TypeError: If `axis` is not an `int`.

  #### Examples:
    ```python
    >>> repeat(['a', 'b', 'c'], repeats=[3, 0, 2], axis=0)
    ['a', 'a', 'a', 'c', 'c']
    >>> repeat([[1, 2], [3, 4]], repeats=[2, 3], axis=0)
    [[1, 2], [1, 2], [3, 4], [3, 4], [3, 4]]
    >>> repeat([[1, 2], [3, 4]], repeats=[2, 3], axis=1)
    [[1, 1, 2, 2, 2], [3, 3, 4, 4, 4]]
    ```
  """
  if not isinstance(axis, int):
    raise TypeError("axis must be an int; got %s" % type(axis).__name__)

  with ops.name_scope(name, "Repeat", [data, repeats]):
    data = ops.convert_to_tensor(data, name="data")
    repeats = convert_to_int_tensor(repeats, name="repeats")
    repeats.shape.with_rank_at_most(1)

    # A scalar `data` is upgraded to a vector.
    data = _with_nonzero_rank(data)
    dyn_shape = shape(data)

    # Normalise a negative `axis` to its positive equivalent.
    axis = get_positive_axis(axis, data.shape.ndims)
    tile_axis = axis + 1

    # Check data Tensor shapes.
    if repeats.shape.ndims == 1:
      data.shape.dims[axis].assert_is_compatible_with(repeats.shape[0])

    # A statically known scalar `repeats` only needs a tile and a reshape.
    if repeats.shape.ndims == 0:
      with_new_dim = expand_dims(data, tile_axis)
      repeated = tile_one_dimension(with_new_dim, tile_axis, repeats)
      flat_shape = concat(
          [dyn_shape[:axis], [-1], dyn_shape[axis + 1:]], axis=0)
      return tf.reshape(repeated, flat_shape)

    # Broadcast the `repeats` tensor so rank(repeats) == axis + 1.
    if repeats.shape.ndims != tile_axis:
      dyn_repeats_shape = shape(repeats)
      dyn_repeats_rank = rank(repeats)
      target_shape = concat(
          [dyn_shape[:tile_axis - dyn_repeats_rank], dyn_repeats_shape],
          axis=0)
      repeats = broadcast_to(repeats, target_shape)
      repeats.set_shape([None] * tile_axis)

    # Create a "sequence mask" based on `repeats`, where slices across `axis`
    # contain one `True` value for each repetition.  E.g., if
    # `repeats = [3, 1, 2]`, then `mask = [[1, 1, 1], [1, 0, 0], [1, 1, 0]]`.
    largest_repeat = gen_math_ops.maximum(
        0, gen_math_ops._max(repeats, _all_dimensions(repeats)))
    keep_mask = tf.sequence_mask(repeats, largest_repeat)

    # Add a new dimension around each value that needs to be repeated, and
    # then tile that new dimension to match the maximum number of repetitions.
    with_new_dim = expand_dims(data, tile_axis)
    repeated = tile_one_dimension(with_new_dim, tile_axis, largest_repeat)

    # `boolean_mask` discards the surplus copies and flattens all dimensions
    # up through `axis`.
    kept = tf.boolean_mask(repeated, keep_mask)

    # Reshape the output tensor to add the outer dimensions back.
    if axis != 0:
      flat_shape = concat(
          [dyn_shape[:axis], [-1], dyn_shape[axis + 1:]], axis=0)
      result = tf.reshape(kept, flat_shape)
    else:
      result = kept

    # Preserve shape information.
    if data.shape.ndims is not None:
      static_axis_size = 0 if repeats.shape[0] == 0 else None
      leading = data.shape[:axis]
      trailing = data.shape[axis + 1:]
      result.set_shape(
          leading.concatenate([static_axis_size]).concatenate(trailing))

    return result
def get_updates(self, loss, params):
  """Build the update ops for one optimisation step.

  Combines a warming momentum schedule with first/second moment estimates
  (and `vhat` accumulators when `self.amsgrad` is set). The per-parameter
  step size is clamped between `lower_bound` and `upper_bound`.

  Args:
    loss: Passed through to `self.get_gradients`.
    params: Parameters to update; gradients are obtained for them from
      `self.get_gradients`.

  Returns:
    `self.updates`: the iteration increment, the `(m_schedule, new value)`
    pair and the assign ops created for every parameter.
  """
  grads = self.get_gradients(loss, params)
  # The iteration counter increment is the first recorded update.
  self.updates = [state_ops.assign_add(self.iterations, 1)]

  lr = self.lr
  # Optional decay of the base learning rate with the iteration count.
  if self.initial_decay > 0:
    lr = lr * ( 1. / (1. + self.decay * math_ops.cast(self.iterations,K.dtype(self.decay))) )

  # Step number as a float: the current iteration count plus one.
  t = math_ops.cast(self.iterations, K.floatx()) + 1

  # Bounds applied to the per-parameter step size below; both depend on t.
  lower_bound = self.lr_boost * (1. - 1. / (self.gamma * t + 1.))
  upper_bound = self.lr_boost * (1. + 1. / (self.gamma * t))
  # Weight given to the current gradient in the first-moment update.
  if self.sgdcorr:
    m_rate = 1. - self.beta_1 / (self.gamma * t + 1.)
  else:
    m_rate = 1. - self.beta_1

  # Due to the recommendations in [2], i.e. warming momentum schedule
  momentum_cache_t = self.beta_1 * ( 1. - 0.5 * (math_ops.pow(K.cast_to_floatx(0.96), t * self.schedule_decay)))
  momentum_cache_t_1 = self.beta_1 * ( 1. - 0.5 * (math_ops.pow(K.cast_to_floatx(0.96), (t + 1) * self.schedule_decay)))
  m_schedule_new = self.m_schedule * momentum_cache_t
  m_schedule_next = self.m_schedule * momentum_cache_t * momentum_cache_t_1
  # Recorded as a (variable, new value) pair rather than an assign op.
  self.updates.append((self.m_schedule, m_schedule_new))

  ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
  vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
  if self.amsgrad:
    vhats = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
  else:
    # Placeholders so that the zip below has the same arity in both modes.
    vhats = [K.zeros(1) for _ in params]

  self.weights = [self.iterations] + ms + vs + vhats

  for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
    # the following equations given in [1]
    g_prime = g / (1. - m_schedule_new)
    m_t = self.beta_1 * m + m_rate * g
    m_t_prime = m_t / (1. - m_schedule_next)
    v_t = self.beta_2 * v + (1. - self.beta_2) * math_ops.square(g)

    if self.amsgrad:
      # Use the running maximum of the second moment.
      vhat_t = math_ops.maximum(vhat, v_t)
      self.updates.append(state_ops.assign(vhat, vhat_t))
      v_t_prime = vhat_t / (1. - math_ops.pow(self.beta_2, t))
    else:
      v_t_prime = v_t / (1. - math_ops.pow(self.beta_2, t))

    m_t_bar = (m_rate / (1.-self.beta_1)) * (1. - momentum_cache_t) * g_prime + momentum_cache_t_1 * m_t_prime

    # beta_1_reduce divides lr_v here and multiplies the step in p_t below.
    beta_1_reduce = 1. - math_ops.pow(self.beta_1, t)
    lr_v = gen_math_ops.reciprocal((gen_math_ops.sqrt(v_t_prime) + self.epsilon) * beta_1_reduce)

    self.updates.append(state_ops.assign(m, m_t))
    self.updates.append(state_ops.assign(v, v_t))

    # Clamp the per-parameter step size into [lower_bound, upper_bound].
    lr_bound = gen_math_ops.minimum(gen_math_ops.maximum(lr_v, lower_bound), upper_bound)
    p_t = p - lr * lr_bound * beta_1_reduce * m_t_bar
    new_p = p_t

    # Apply constraints.
    if getattr(p, 'constraint', None) is not None:
      new_p = p.constraint(new_p)

    self.updates.append(state_ops.assign(p, new_p))
  return self.updates
def decr_loss_scale():
  """Apply `self._decr_ratio` to the loss scale, keeping it at 1 or above.

  `self` comes from the enclosing scope.

  Returns:
    A grouped op containing the loss-scale assignment and the op returned by
    `self._reset_stats()`.
  """
  reduced = self._loss_scale * self._decr_ratio
  update_op = state_ops.assign(
      self._loss_scale, gen_math_ops.maximum(1., reduced))
  # When loss_scale is updated, both good and bad steps are reset.
  return control_flow_ops.group(update_op, self._reset_stats())
def _update_statistics_from_mini_batch(
        self, statistics, auxiliary_variables, times, values):
    """Given mini-batch input, update `statistics` and `auxiliary_variables`.

    The accumulators in `auxiliary_variables` are updated first; every
    assignment to `statistics` is then created under a control dependency
    on those accumulator updates.

    Args:
        statistics: Object whose `overall_feature_moments`,
            `series_start_moments`, `start_time` and
            `total_observation_count` members are assigned here.
        auxiliary_variables: Object holding the running accumulators
            `max_time_seen`, `chunk_count`,
            `inter_observation_duration_sum`, `example_count`,
            `overall_feature_sum` and `overall_feature_sum_of_squares`.
        times: Tensor of observation times. It is indexed as `times[:, 0]`
            and reduced over axis 1 -- presumably shaped
            [batch, window]; confirm against callers.
        values: Tensor of observations; cast to `self._dtype` and summed
            over axes 0 and 1 -- presumably shaped
            [batch, window, features]; confirm against callers.

    Returns:
        A grouped op covering the overall mean and variance updates, the
        series-start update, the start-time update and the total
        observation count update.
    """
    values = math_ops.cast(values, self._dtype)
    # The density (measured in times per observation) that we see in each part
    # of the mini-batch.
    # NOTE(review): the denominator is `shape(times)[1] - 1`, which is zero
    # when axis 1 has length one -- confirm callers never pass such input.
    batch_inter_observation_duration = (math_ops.cast(
        math_ops.reduce_max(times, axis=1) - math_ops.reduce_min(times, axis=1),
        self._dtype) / math_ops.cast(
            array_ops.shape(times)[1] - 1, self._dtype))
    # Co-locate updates with their variables to minimize race conditions when
    # updating statistics.
    with ops.colocate_with(auxiliary_variables.max_time_seen):
        # There is a race condition if this value is being updated from multiple
        # workers. However, it should eventually reach the correct value if the
        # last chunk is presented enough times.
        max_time_seen_assign = state_ops.assign(
            auxiliary_variables.max_time_seen,
            gen_math_ops.maximum(auxiliary_variables.max_time_seen,
                                 math_ops.reduce_max(times)))
    with ops.colocate_with(auxiliary_variables.chunk_count):
        # One chunk per entry along axis 0 of `times`.
        chunk_count_assign = state_ops.assign_add(auxiliary_variables.chunk_count,
                                                  array_ops.shape(
                                                      times, out_type=dtypes.int64)[0])
    with ops.colocate_with(auxiliary_variables.inter_observation_duration_sum):
        inter_observation_duration_assign = state_ops.assign_add(
            auxiliary_variables.inter_observation_duration_sum,
            math_ops.reduce_sum(batch_inter_observation_duration))
    with ops.colocate_with(auxiliary_variables.example_count):
        # Every element of `times` counts as one example.
        example_count_assign = state_ops.assign_add(
            auxiliary_variables.example_count,
            array_ops.size(times, out_type=dtypes.int64))
    # Note: These mean/variance updates assume that all points are equally
    # likely, which is not true if _chunks_ are sampled uniformly from the space
    # of all possible contiguous chunks, since points at the start and end of
    # the series are then members of fewer chunks. For series which are much
    # longer than the chunk size (the usual/expected case), this effect becomes
    # irrelevant.
    with ops.colocate_with(auxiliary_variables.overall_feature_sum):
        overall_feature_sum_assign = state_ops.assign_add(
            auxiliary_variables.overall_feature_sum,
            math_ops.reduce_sum(values, axis=[0, 1]))
    with ops.colocate_with(auxiliary_variables.overall_feature_sum_of_squares):
        overall_feature_sum_of_squares_assign = state_ops.assign_add(
            auxiliary_variables.overall_feature_sum_of_squares,
            math_ops.reduce_sum(values**2, axis=[0, 1]))
    # Single handle on all accumulator updates, used below as a control
    # dependency for the derived statistics.
    per_chunk_aux_updates = control_flow_ops.group(
        max_time_seen_assign, chunk_count_assign,
        inter_observation_duration_assign, example_count_assign,
        overall_feature_sum_assign, overall_feature_sum_of_squares_assign)
    with ops.control_dependencies([per_chunk_aux_updates]):
        example_count_float = math_ops.cast(auxiliary_variables.example_count,
                                            self._dtype)
        new_feature_mean = (auxiliary_variables.overall_feature_sum /
                            example_count_float)
        overall_feature_mean_update = state_ops.assign(
            statistics.overall_feature_moments.mean, new_feature_mean)
        # Variance is derived from the running sum and sum of squares:
        # E[x^2] - E[x]^2, rescaled by n / (n - 1).
        overall_feature_var_update = state_ops.assign(
            statistics.overall_feature_moments.variance,
            # De-biased n / (n - 1) variance correction
            example_count_float / (example_count_float - 1.) *
            (auxiliary_variables.overall_feature_sum_of_squares /
             example_count_float - new_feature_mean**2))
        # Index (along axis 0) of the chunk whose first time is smallest.
        # TODO(b/35675805): Remove this cast
        min_time_batch = math_ops.cast(math_ops.argmin(times[:, 0]), dtypes.int32)

        def series_start_updates():
            # If this is the lowest-time chunk that we have seen so far, update
            # series start moments to reflect that. Note that these statistics are
            # "best effort", as there are race conditions in the update (however,
            # they should eventually converge if the start of the series is
            # presented enough times).
            mean, variance = nn.moments(
                values[min_time_batch, :self._starting_variance_window_size],
                axes=[0])
            return control_flow_ops.group(
                state_ops.assign(statistics.series_start_moments.mean, mean),
                state_ops.assign(statistics.series_start_moments.variance,
                                 variance))

        with ops.colocate_with(statistics.start_time):
            series_start_update = control_flow_ops.cond(
                # Update moments whenever we even match the lowest time seen so far,
                # to ensure that series start statistics are eventually updated to
                # their correct values, despite race conditions (i.e. eventually
                # statistics.start_time will reflect the global lowest time, and
                # given that we will eventually update the series start moments to
                # their correct values).
                math_ops.less_equal(times[min_time_batch, 0],
                                    statistics.start_time),
                series_start_updates,
                control_flow_ops.no_op)
            # The start time is only lowered after the comparison above has
            # been evaluated against its previous value.
            with ops.control_dependencies([series_start_update]):
                # There is a race condition if this update is performed in parallel on
                # multiple workers. Since models may be sensitive to being presented
                # with times before the putative start time, the value of this
                # variable is post-processed above to guarantee that each worker is
                # presented with a start time which is at least as low as the lowest
                # time in its current mini-batch.
                start_time_update = state_ops.assign(statistics.start_time,
                                                     gen_math_ops.minimum(
                                                         statistics.start_time,
                                                         math_ops.reduce_min(times)))
        inter_observation_duration_estimate = (
            auxiliary_variables.inter_observation_duration_sum / math_ops.cast(
                auxiliary_variables.chunk_count, self._dtype))
        # Estimate the total number of observations as:
        # (end time - start time + 1) * average intra-chunk time density
        total_observation_count_update = state_ops.assign(
            statistics.total_observation_count,
            math_ops.cast(
                gen_math_ops.round(
                    math_ops.cast(auxiliary_variables.max_time_seen -
                                  statistics.start_time + 1, self._dtype) /
                    inter_observation_duration_estimate), dtypes.int64))
        per_chunk_stat_updates = control_flow_ops.group(
            overall_feature_mean_update, overall_feature_var_update,
            series_start_update, start_time_update,
            total_observation_count_update)
    return per_chunk_stat_updates
def bincount(arr,
             weights=None,
             minlength=None,
             maxlength=None,
             dtype=dtypes.int32,
             name=None,
             axis=None,
             binary_output=False):
    """Counts the number of occurrences of each value in an integer array.

    When neither `minlength` nor `maxlength` is supplied, the result has
    length `tf.reduce_max(arr) + 1` for a non-empty `arr` and length 0 for an
    empty one. When `weights` is supplied, bin `i` holds the sum of the
    weights found at the positions where `arr` equals `i`.

    ```python
    values = tf.constant([1,1,2,3,2,4,4,5])
    tf.math.bincount(values) #[0 2 2 1 2 1]
    ```
    The largest element of `values` is 5, so the output has 5 + 1 = 6 bins.
    Each bin holds the number of occurrences of its index: bin 1 is 2 because
    the value 1 appears twice in `values`.

    ```python
    values = tf.constant([1,1,2,3,2,4,4,5])
    weights = tf.constant([1,5,0,1,0,5,4,5])
    tf.math.bincount(values, weights=weights) #[0 6 0 1 9 5]
    ```
    With weights, a bin grows by the matching weight instead of by 1: bin 1
    is 6 because the weights paired with the value 1 are 1 and 5.

    **Bin-counting on a certain axis**

    This example takes a 2 dimensional input and returns a `Tensor` with
    bincounting on each sample.

    >>> data = np.array([[1, 2, 3, 0], [0, 0, 1, 2]], dtype=np.int32)
    >>> tf.math.bincount(data, axis=-1)
    <tf.Tensor: shape=(2, 4), dtype=int32, numpy=
      array([[1, 1, 1, 1],
             [2, 1, 1, 0]], dtype=int32)>

    **Bin-counting with binary_output**

    This example gives binary output instead of counting the occurrence.

    >>> data = np.array([[1, 2, 3, 0], [0, 0, 1, 2]], dtype=np.int32)
    >>> tf.math.bincount(data, axis=-1, binary_output=True)
    <tf.Tensor: shape=(2, 4), dtype=int32, numpy=
      array([[1, 1, 1, 1],
             [1, 1, 1, 0]], dtype=int32)>

    Args:
      arr: A Tensor, RaggedTensor, or SparseTensor whose values should be
        counted. These tensors must have a rank of 2 if `axis=-1`.
      weights: If non-None, must be the same shape as arr. For each value in
        `arr`, the bin will be incremented by the corresponding weight
        instead of 1.
      minlength: If given, ensures the output has length at least
        `minlength`, padding with zeros at the end if necessary.
      maxlength: If given, skips values in `arr` that are equal or greater
        than `maxlength`, ensuring that the output has length at most
        `maxlength`.
      dtype: If `weights` is None, determines the type of the output bins.
      name: A name scope for the associated operations (optional).
      axis: The axis to slice over. Axes at and below `axis` will be
        flattened before bin counting. Currently, only `0`, and `-1` are
        supported. If None, all axes will be flattened (identical to passing
        `0`).
      binary_output: If True, this op will output 1 instead of the number of
        times a token appears (equivalent to one_hot + reduce_any instead of
        one_hot + reduce_add). Defaults to False.

    Returns:
      A vector with the same dtype as `weights` or the given `dtype`. The bin
      values.

    Raises:
      `InvalidArgumentError` if negative values are provided as an input.
      ValueError: if `weights` is given together with `binary_output=True`
        on the general path, or if `axis` is neither `None`, `0` nor `-1`.
    """
    scope_name = name if name is not None else "bincount"
    with ops.name_scope(scope_name):

        def _clamp_size(size, size_dtype):
            # Apply the optional lower bound first, then the optional upper
            # bound, each converted to the dtype of the size tensor.
            if minlength is not None:
                size = gen_math_ops.maximum(
                    ops.convert_to_tensor(
                        minlength, name="minlength", dtype=size_dtype),
                    size)
            if maxlength is not None:
                size = gen_math_ops.minimum(
                    ops.convert_to_tensor(
                        maxlength, name="maxlength", dtype=size_dtype),
                    size)
            return size

        # Somehow forward compatible needs to be False.
        if axis is None and not binary_output:
            # Plain counting over a dense int32 tensor.
            arr = ops.convert_to_tensor(arr, name="arr", dtype=dtypes.int32)
            has_elements = math_ops.reduce_prod(array_ops.shape(arr)) > 0
            nonempty_flag = math_ops.cast(has_elements, dtypes.int32)
            output_size = _clamp_size(
                nonempty_flag * (math_ops.reduce_max(arr) + 1), dtypes.int32)
            if weights is not None:
                weights = ops.convert_to_tensor(weights, name="weights")
                return gen_math_ops.unsorted_segment_sum(
                    weights, arr, output_size)
            empty_weights = constant_op.constant([], dtype)
            flat_arr = array_ops.reshape(arr, [-1])
            return gen_math_ops.bincount(flat_arr, output_size, empty_weights)

        # General path: dense, ragged or sparse input.
        if not isinstance(arr, sparse_tensor.SparseTensor):
            arr = ragged_tensor.convert_to_tensor_or_ragged_tensor(
                arr, name="arr")
        if (weights is not None
                and not isinstance(weights, sparse_tensor.SparseTensor)):
            weights = ragged_tensor.convert_to_tensor_or_ragged_tensor(
                weights, name="weights")

        if binary_output and weights is not None:
            raise ValueError(
                "Arguments `binary_output` and `weights` are mutually "
                "exclusive. Please specify only one.")

        if not arr.dtype.is_integer:
            arr = math_ops.cast(arr, dtypes.int32)
        if axis is None:
            axis = 0
        if axis not in [0, -1]:
            raise ValueError(
                f"Unsupported value for argument axis={axis}. Only 0 and"
                " -1 are currently supported.")

        # Emptiness is judged on the flat values of a ragged input; the
        # maximum is taken over the stored values of a sparse input.
        emptiness_source = (
            arr.values if isinstance(arr, ragged_tensor.RaggedTensor)
            else arr)
        has_elements = math_ops.reduce_prod(
            array_ops.shape(emptiness_source)) > 0
        max_source = (
            arr.values if isinstance(arr, sparse_tensor.SparseTensor)
            else arr)
        output_size = math_ops.cast(has_elements, arr.dtype) * (
            math_ops.reduce_max(max_source) + 1)
        output_size = _clamp_size(output_size, arr.dtype)

        if axis == 0:
            # Flatten everything down to one dense vector of values.
            if isinstance(arr, sparse_tensor.SparseTensor):
                if weights is not None:
                    weights = validate_sparse_weights(arr, weights, dtype)
                arr = arr.values
            elif isinstance(arr, ragged_tensor.RaggedTensor):
                if weights is not None:
                    weights = validate_ragged_weights(arr, weights, dtype)
                arr = arr.values
            else:
                if weights is not None:
                    weights = array_ops.reshape(weights, [-1])
                arr = array_ops.reshape(arr, [-1])

        if isinstance(arr, sparse_tensor.SparseTensor):
            weights = validate_sparse_weights(arr, weights, dtype)
            return gen_math_ops.sparse_bincount(
                indices=arr.indices,
                values=arr.values,
                dense_shape=arr.dense_shape,
                size=output_size,
                weights=weights,
                binary_output=binary_output)

        if isinstance(arr, ragged_tensor.RaggedTensor):
            weights = validate_ragged_weights(arr, weights, dtype)
            return gen_math_ops.ragged_bincount(
                splits=arr.row_splits,
                values=arr.values,
                size=output_size,
                weights=weights,
                binary_output=binary_output)

        weights = validate_dense_weights(arr, weights, dtype)
        return gen_math_ops.dense_bincount(
            input=arr,
            size=output_size,
            weights=weights,
            binary_output=binary_output)
def call(self, inputs, state):
    """Long short-term memory cell (Neat).

    The kernel and bias are split column-wise into a "forward" part of
    width `5 * num_units` (producing `s, t, u, v, w`) and a "recurrent" part
    of width `3 * num_units` (producing `x, y, z`). Each part is further
    split row-wise into an input block and a hidden-state block.

    Args:
      inputs: `2-D` tensor with shape `[batch_size, input_size]`.
      state: An `LSTMStateTuple` of state tensors, each shaped
        `[batch_size, num_units]`, if `state_is_tuple` has been set to
        `True`. Otherwise, a `Tensor` shaped `[batch_size, 2 * num_units]`.

    Returns:
      A pair containing the new hidden state, and the new state (either a
        `LSTMStateTuple` or a concatenated state, depending on
        `state_is_tuple`).
    """
    sigmoid = math_ops.sigmoid
    tanh = math_ops.tanh
    relu = nn_ops.relu
    add = math_ops.add
    multiply = math_ops.multiply
    identity = array_ops.identity

    zero = constant_op.constant(0, dtype=dtypes.int32)
    one = constant_op.constant(1, dtype=dtypes.int32)

    # Parameters of gates are concatenated into one multiply for efficiency.
    if self._state_is_tuple:
        c, h = state
    else:
        c, h = array_ops.split(value=state, num_or_size_splits=2, axis=one,
                               name="c_h_-_split")

    input_depth = int(inputs.get_shape()[1])
    # Column widths of the forward (5 gates) and recurrent (3 gates) parts.
    ratio = [self._num_units * 5, self._num_units * 3]

    # W_f: [input_depth + num_units, 5 * num_units]
    # W_r: [input_depth + num_units, 3 * num_units]
    W_f, W_r = array_ops.split(value=self._kernel,
                               num_or_size_splits=ratio,
                               axis=one,
                               name="W-f_W-r_-_split_-kernel")

    # W_fi: [input_depth, 5 * num_units], W_fh: [num_units, 5 * num_units]
    W_fi, W_fh = array_ops.split(
        value=W_f,
        num_or_size_splits=[input_depth, self._num_units],
        axis=zero,
        name="W-fi_W-fh_-_split_W-f")

    # b_f: [5 * num_units], b_r: [3 * num_units]
    b_f, b_r = array_ops.split(value=self._bias,
                               num_or_size_splits=ratio,
                               axis=zero,
                               name="b-f_b-r_-_split_-bias")

    # sw: [batch_size, 5 * num_units]
    sw = math_ops.add(math_ops.matmul(h, W_fh),
                      math_ops.matmul(inputs, W_fi))
    sw = nn_ops.bias_add(value=sw, bias=b_f)

    # s, t, u, v, w: [batch_size, num_units] each
    s, t, u, v, w = array_ops.split(value=sw,
                                    num_or_size_splits=5,
                                    axis=one,
                                    name="s_t_v_u_w_-_split_sw")

    # W_ri: [input_depth, 3 * num_units], W_rh: [num_units, 3 * num_units]
    W_ri, W_rh = array_ops.split(
        value=W_r,
        num_or_size_splits=[input_depth, self._num_units],
        axis=zero,
        name="W-ri_W-rh_-_split_W-r")

    # xz: [batch_size, 3 * num_units]
    # NOTE(review): the hidden and input projections are combined with an
    # element-wise maximum here, whereas `sw` above sums them -- confirm
    # that this asymmetry is intentional.
    xz = gen_math_ops.maximum(math_ops.matmul(h, W_rh),
                              math_ops.matmul(inputs, W_ri))
    xz = nn_ops.bias_add(xz, b_r)

    # x, y, z: [batch_size, num_units] each
    x, y, z = array_ops.split(value=xz,
                              num_or_size_splits=3,
                              axis=one,
                              name="x_y_z_-_split_xz")

    # Nas cell 2
    new_c = multiply(identity(add(identity(add(c, tanh(z))), identity(y))),
                     sigmoid(add(relu(v), tanh(s))))

    new_h = tanh(
        multiply(
            identity(new_c),
            sigmoid(
                multiply(sigmoid(add(tanh(x), tanh(w))),
                         sigmoid(add(identity(u), tanh(t)))))))

    if self._state_is_tuple:
        new_state = LSTMStateTuple(new_c, new_h)
    else:
        new_state = array_ops.concat([new_c, new_h], 1)
    return new_h, new_state
def _conjugate_gradient(self,
                        loss,
                        z,
                        grads_and_vars,
                        cg_iter,
                        fix_first_step=False,
                        init_deltas=None):
    """Run `cg_iter` conjugate-gradient iterations using `self._Hv`.

    Args:
        loss: Forwarded unchanged to `self._Hv`.
        z: Forwarded unchanged to `self._Hv`.
        grads_and_vars: Re-iterable of `(gradient, variable)` pairs; the
            gradients are the right-hand side of the system being solved.
        cg_iter: Number of iterations to unroll (Python int).
        fix_first_step: If true, every step length is divided by the step
            length found in iteration 0.
        init_deltas: Optional starting point. When given, the initial
            residual is `gradient - Hv(init_deltas)` instead of the
            gradient itself.

    Returns:
        A tuple `(deltas_and_vars, deltas_history, residuals_history)`:
        the accumulated step paired with each variable, the per-iteration
        steps, and the residuals seen at the start of each iteration.

    Raises:
        ValueError: if `self._Hv` returns a different number of tensors
            than there are variables.
    """

    def _flatten_all(tensors):
        # Each tensor reshaped to rank 1 so the list can be concatenated.
        return [gen_array_ops.reshape(tensor, [-1]) for tensor in tensors]

    minus_gradient = [grad for grad, _ in grads_and_vars]
    variables = [var for _, var in grads_and_vars]

    if init_deltas is None:
        H_vars = [array_ops.zeros_like(grad) for grad in minus_gradient]
    else:
        # The zero tensors are still created so the graph is unchanged.
        H_vars = [array_ops.zeros_like(grad) for grad in minus_gradient]
        H_vars = self._Hv(loss, z, variables, init_deltas, self._damping)

    curr_dirs = [grad - hv for grad, hv in zip(minus_gradient, H_vars)]
    curr_residuals = [grad - hv for grad, hv in zip(minus_gradient, H_vars)]
    deltas = [array_ops.zeros_like(direction) for direction in curr_dirs]

    deltas_history = []
    residuals_history = []
    first_alpha = 1

    for iteration in range(cg_iter):
        Hvs = self._Hv(loss, z, variables, curr_dirs, self._damping)
        if len(Hvs) != len(variables):
            raise ValueError("xs and Hvs must have the same length.")

        flat_residuals = _flatten_all(curr_residuals)
        flat_dirs = _flatten_all(curr_dirs)
        flat_Hvs = _flatten_all(Hvs)
        residual_vec = array_ops.concat(flat_residuals, 0)
        dir_vec = array_ops.concat(flat_dirs, 0)
        Hv_vec = array_ops.concat(flat_Hvs, 0)

        # Step length; replaced by 1.0 when not finite, otherwise floored
        # at 1e-6.
        raw_alpha = (_dot(residual_vec, residual_vec)
                     / _dot(dir_vec, Hv_vec))
        alpha = control_flow_ops.cond(
            gen_math_ops.is_finite(raw_alpha),
            lambda: gen_math_ops.maximum(raw_alpha, 1e-6),
            lambda: ops.convert_to_tensor(1.0))

        if fix_first_step and iteration == 0:
            first_alpha = alpha

        curr_deltas = [
            direction * (alpha / first_alpha) for direction in curr_dirs
        ]
        deltas = [
            total + step for step, total in zip(curr_deltas, deltas)
        ]
        deltas_history.append(curr_deltas)
        residuals_history.append(curr_residuals)

        new_residuals = [
            residual - alpha * hv
            for residual, hv in zip(curr_residuals, Hvs)
        ]
        new_residual_vec = array_ops.concat(_flatten_all(new_residuals), 0)

        # Direction update coefficient; replaced by 0.0 when not finite.
        raw_beta = (_dot(new_residual_vec, new_residual_vec)
                    / _dot(residual_vec, residual_vec))
        beta = control_flow_ops.cond(
            gen_math_ops.is_finite(raw_beta),
            lambda: raw_beta,
            lambda: ops.convert_to_tensor(0.0))

        curr_dirs = [
            residual + beta * direction
            for residual, direction in zip(new_residuals, curr_dirs)
        ]
        curr_residuals = new_residuals

    return list(zip(deltas, variables)), deltas_history, residuals_history
def lr_function():
    """Interpolate linearly from `starting_lr`, floored at `ending_lr`."""
    lr_span = ending_lr - starting_lr
    # Fraction of the span covered after `step_counter` of
    # `num_steps_float` steps.
    interpolated_lr = starting_lr + (lr_span * step_counter) / num_steps_float
    return gen_math_ops.maximum(ending_lr, interpolated_lr)
def _update_statistics_from_mini_batch(self, statistics, auxiliary_variables,
                                       times, values):
    """Given mini-batch input, update `statistics` and `auxiliary_variables`.

    The accumulators in `auxiliary_variables` are updated first; every
    assignment to `statistics` is then created under a control dependency
    on those accumulator updates.

    Args:
        statistics: Object whose `overall_feature_moments`,
            `series_start_moments`, `start_time` and
            `total_observation_count` members are assigned here.
        auxiliary_variables: Object holding the running accumulators
            `max_time_seen`, `chunk_count`,
            `inter_observation_duration_sum`, `example_count`,
            `overall_feature_sum` and `overall_feature_sum_of_squares`.
        times: Tensor of observation times. It is indexed as `times[:, 0]`
            and reduced over axis 1 -- presumably shaped
            [batch, window]; confirm against callers.
        values: Tensor of observations; cast to `self._dtype` and summed
            over axes 0 and 1 -- presumably shaped
            [batch, window, features]; confirm against callers.

    Returns:
        A grouped op covering the overall mean and variance updates, the
        series-start update, the start-time update and the total
        observation count update.
    """
    values = math_ops.cast(values, self._dtype)
    # The density (measured in times per observation) that we see in each part
    # of the mini-batch.
    # NOTE(review): the denominator is `shape(times)[1] - 1`, which is zero
    # when axis 1 has length one -- confirm callers never pass such input.
    batch_inter_observation_duration = (
        math_ops.cast(
            math_ops.reduce_max(times, axis=1) -
            math_ops.reduce_min(times, axis=1), self._dtype) /
        math_ops.cast(array_ops.shape(times)[1] - 1, self._dtype))
    # Co-locate updates with their variables to minimize race conditions when
    # updating statistics.
    with ops.colocate_with(auxiliary_variables.max_time_seen):
        # There is a race condition if this value is being updated from multiple
        # workers. However, it should eventually reach the correct value if the
        # last chunk is presented enough times.
        max_time_seen_assign = state_ops.assign(
            auxiliary_variables.max_time_seen,
            gen_math_ops.maximum(auxiliary_variables.max_time_seen,
                                 math_ops.reduce_max(times)))
    with ops.colocate_with(auxiliary_variables.chunk_count):
        # One chunk per entry along axis 0 of `times`.
        chunk_count_assign = state_ops.assign_add(
            auxiliary_variables.chunk_count,
            array_ops.shape(times, out_type=dtypes.int64)[0])
    with ops.colocate_with(
            auxiliary_variables.inter_observation_duration_sum):
        inter_observation_duration_assign = state_ops.assign_add(
            auxiliary_variables.inter_observation_duration_sum,
            math_ops.reduce_sum(batch_inter_observation_duration))
    with ops.colocate_with(auxiliary_variables.example_count):
        # Every element of `times` counts as one example.
        example_count_assign = state_ops.assign_add(
            auxiliary_variables.example_count,
            array_ops.size(times, out_type=dtypes.int64))
    # Note: These mean/variance updates assume that all points are equally
    # likely, which is not true if _chunks_ are sampled uniformly from the space
    # of all possible contiguous chunks, since points at the start and end of
    # the series are then members of fewer chunks. For series which are much
    # longer than the chunk size (the usual/expected case), this effect becomes
    # irrelevant.
    with ops.colocate_with(auxiliary_variables.overall_feature_sum):
        overall_feature_sum_assign = state_ops.assign_add(
            auxiliary_variables.overall_feature_sum,
            math_ops.reduce_sum(values, axis=[0, 1]))
    with ops.colocate_with(
            auxiliary_variables.overall_feature_sum_of_squares):
        overall_feature_sum_of_squares_assign = state_ops.assign_add(
            auxiliary_variables.overall_feature_sum_of_squares,
            math_ops.reduce_sum(values**2, axis=[0, 1]))
    # Single handle on all accumulator updates, used below as a control
    # dependency for the derived statistics.
    per_chunk_aux_updates = control_flow_ops.group(
        max_time_seen_assign, chunk_count_assign,
        inter_observation_duration_assign, example_count_assign,
        overall_feature_sum_assign, overall_feature_sum_of_squares_assign)
    with ops.control_dependencies([per_chunk_aux_updates]):
        example_count_float = math_ops.cast(
            auxiliary_variables.example_count, self._dtype)
        new_feature_mean = (auxiliary_variables.overall_feature_sum /
                            example_count_float)
        overall_feature_mean_update = state_ops.assign(
            statistics.overall_feature_moments.mean, new_feature_mean)
        # Variance is derived from the running sum and sum of squares:
        # E[x^2] - E[x]^2, rescaled by n / (n - 1).
        overall_feature_var_update = state_ops.assign(
            statistics.overall_feature_moments.variance,
            # De-biased n / (n - 1) variance correction
            example_count_float / (example_count_float - 1.) *
            (auxiliary_variables.overall_feature_sum_of_squares /
             example_count_float - new_feature_mean**2))
        # Index (along axis 0) of the chunk whose first time is smallest.
        # TODO(b/35675805): Remove this cast
        min_time_batch = math_ops.cast(math_ops.argmin(times[:, 0]),
                                       dtypes.int32)

        def series_start_updates():
            # If this is the lowest-time chunk that we have seen so far, update
            # series start moments to reflect that. Note that these statistics are
            # "best effort", as there are race conditions in the update (however,
            # they should eventually converge if the start of the series is
            # presented enough times).
            mean, variance = nn.moments(values[
                min_time_batch, :self._starting_variance_window_size],
                                        axes=[0])
            return control_flow_ops.group(
                state_ops.assign(statistics.series_start_moments.mean, mean),
                state_ops.assign(statistics.series_start_moments.variance,
                                 variance))

        with ops.colocate_with(statistics.start_time):
            series_start_update = control_flow_ops.cond(
                # Update moments whenever we even match the lowest time seen so far,
                # to ensure that series start statistics are eventually updated to
                # their correct values, despite race conditions (i.e. eventually
                # statistics.start_time will reflect the global lowest time, and
                # given that we will eventually update the series start moments to
                # their correct values).
                math_ops.less_equal(times[min_time_batch, 0],
                                    statistics.start_time),
                series_start_updates,
                control_flow_ops.no_op)
            # The start time is only lowered after the comparison above has
            # been evaluated against its previous value.
            with ops.control_dependencies([series_start_update]):
                # There is a race condition if this update is performed in parallel on
                # multiple workers. Since models may be sensitive to being presented
                # with times before the putative start time, the value of this
                # variable is post-processed above to guarantee that each worker is
                # presented with a start time which is at least as low as the lowest
                # time in its current mini-batch.
                start_time_update = state_ops.assign(
                    statistics.start_time,
                    gen_math_ops.minimum(statistics.start_time,
                                         math_ops.reduce_min(times)))
        inter_observation_duration_estimate = (
            auxiliary_variables.inter_observation_duration_sum /
            math_ops.cast(auxiliary_variables.chunk_count, self._dtype))
        # Estimate the total number of observations as:
        # (end time - start time + 1) * average intra-chunk time density
        total_observation_count_update = state_ops.assign(
            statistics.total_observation_count,
            math_ops.cast(
                gen_math_ops.round(
                    math_ops.cast(
                        auxiliary_variables.max_time_seen -
                        statistics.start_time + 1, self._dtype) /
                    inter_observation_duration_estimate), dtypes.int64))
        per_chunk_stat_updates = control_flow_ops.group(
            overall_feature_mean_update, overall_feature_var_update,
            series_start_update, start_time_update,
            total_observation_count_update)
    return per_chunk_stat_updates