def replace(self, episodes, length, rows=None):
    """Overwrite complete episodes in the buffers.

    Args:
      episodes: Tuple of transition quantities with batch and time dimensions.
        Paired element-wise with `self._buffers`.
      length: Batch of sequence lengths.
      rows: Episodes to replace, defaults to all.

    Returns:
      Operation.
    """
    if rows is None:
        rows = tf.range(self._capacity)
    assert rows.shape.ndims == 1
    capacity_check = tf.assert_less(
        rows, self._capacity, message='capacity exceeded')
    with tf.control_dependencies([capacity_check]):
        length_check = tf.assert_less_equal(
            length, self._max_length, message='max length exceeded')
    # The length check already depends on the capacity check, so gating the
    # buffer writes on it enforces both.
    with tf.control_dependencies([length_check]):
        buffer_writes = [
            tf.scatter_update(storage, rows, values)
            for storage, values in zip(self._buffers, episodes)
        ]
    # Only record the new lengths once every buffer has been written.
    with tf.control_dependencies(buffer_writes):
        return tf.scatter_update(self._length, rows, length)
def position_embeddings_layer(input_shape,
                              position_embedding_name="position_embeddings",
                              initializer_range=0.02,
                              max_position_embeddings=512):
    """Build position embeddings shaped to broadcast against the input.

    Args:
      input_shape: Sequence of dimension sizes. Index 1 is read as the
        sequence length and index 2 as the embedding width.
      position_embedding_name: Name passed to `create_embedding` for the table.
      initializer_range: Initializer range passed to `create_embedding`.
      max_position_embeddings: Number of rows in the position table; the
        sequence length must not exceed it.

    Returns:
      The first `seq_length` rows of the position table, reshaped to
      `[1, ..., 1, seq_length, width]` with `len(input_shape)` dimensions.
    """
    seq_length, width = input_shape[1], input_shape[2]
    length_check = tf.assert_less_equal(seq_length, max_position_embeddings)
    # `control_dependencies` is TensorFlow's mechanism for ordering ops: the
    # assert runs before the ops created in this block, verifying that the
    # input length does not exceed the maximum supported length.
    with tf.control_dependencies([length_check]):
        position_table = create_embedding(
            shape=[max_position_embeddings, width],
            embedding_name=position_embedding_name,
            initializer_range=initializer_range)
        # Positions are 0..seq_length-1, so a plain slice selects them.
        sliced = tf.slice(position_table, [0, 0], [seq_length, -1])
        # Every leading dimension becomes 1 so the result broadcasts.
        leading_ones = [1] * (len(input_shape) - 2)
        position_embeddings = tf.reshape(
            sliced, leading_ones + [seq_length, width])
    return position_embeddings
def remidify(pitches):
    """Shifts pitch indices in [0, 88) up to MIDI pitches [21, 108]."""
    not_below_range = tf.assert_greater_equal(pitches, 0)
    not_above_range = tf.assert_less_equal(pitches, 87)
    # The shift is only evaluated after both range checks have run.
    with tf.control_dependencies([not_below_range, not_above_range]):
        return pitches + 21
def demidify(pitches):
    """Shifts MIDI pitches [21, 108] down to pitch indices in [0, 88)."""
    not_below_range = tf.assert_greater_equal(pitches, 21)
    not_above_range = tf.assert_less_equal(pitches, 108)
    # The shift is only evaluated after both range checks have run.
    with tf.control_dependencies([not_below_range, not_above_range]):
        return pitches - 21
def assert_less_equal(*args, **kwargs):
    """CPU-pinned wrapper around `tf.assert_less_equal`.

    All positional and keyword arguments are forwarded unchanged. The call is
    made inside a `tf.device("/CPU:0")` scope so the assert always goes on CPU;
    the unwrapped version raises an exception if used with
    `tf.device("/GPU:x")`.
    """
    with tf.device("/CPU:0"):
        cpu_assert = tf.assert_less_equal(*args, **kwargs)
    return cpu_assert
def maybe_split_sequence_lengths(sequence_length, num_splits, total_length):
    """Validates `sequence_length` and splits it across segments if needed.

    The returned value must be used in the graph for the validations to run.

    Args:
      sequence_length: A batch of sequence lengths, either sized `[batch_size]`
        with every value equal to 0 or `total_length`, or sized
        `[batch_size, num_splits]`.
      num_splits: The scalar number of splits of the full sequences.
      total_length: The scalar total sequence length (potentially padded).

    Returns:
      A `[batch_size, num_splits]` Tensor. A 2-D input is passed through
      (behind a bound check); a 1-D input is tiled `num_splits` times along a
      new second axis and integer-divided by `num_splits`.

    Raises:
      ValueError: If `sequence_length` is neither 1-D nor 2-D, or if it is 1-D
        and `total_length` is not evenly divisible by `num_splits`.
      tf.errors.InvalidArgumentError: If `sequence_length` is 1-D and some
        value is neither 0 nor `total_length`, or it is 2-D and some value
        exceeds `total_length // num_splits`.
    """
    rank = sequence_length.shape.ndims
    if rank not in (1, 2):
        raise ValueError(
            'Sequence lengths must be given as a vector or a 2D Tensor whose '
            'second dimension size matches its initial hierarchical split. Got '
            'shape: %s' % sequence_length.shape.as_list())

    if rank == 1:
        if total_length % num_splits != 0:
            raise ValueError(
                '`total_length` must be evenly divisible by `num_splits`.')
        is_empty = tf.equal(sequence_length, 0)
        is_full = tf.equal(sequence_length, total_length)
        all_valid = tf.reduce_all(tf.logical_or(is_empty, is_full))
        value_check = tf.Assert(all_valid, data=[sequence_length])
        with tf.control_dependencies([value_check]):
            tiled = tf.tile(
                tf.expand_dims(sequence_length, axis=1), [1, num_splits])
            sequence_length = tiled // num_splits
        return sequence_length

    max_segment_length = tf.constant(total_length // num_splits, tf.int32)
    segment_check = tf.assert_less_equal(
        sequence_length,
        max_segment_length,
        message='Segment length cannot be more than '
        '`total_length / num_splits`.')
    with tf.control_dependencies([segment_check]):
        # Identity gives the check an op to attach to.
        sequence_length = tf.identity(sequence_length)
    sequence_length.set_shape([sequence_length.shape[0], num_splits])
    return sequence_length
def __call__(self, batch_size):
    """Reads `batch_size` data.

    Args:
      batch_size: Tensor of type `int32`, batch size of the data to be
        retrieved from the dataset. `batch_size` should be less than or
        equal to `max_batch_size`.

    Returns:
      Read data, as produced by `_slice_data` for the stored dataset and the
      requested `batch_size`.
    """
    max_allowed = tf.convert_to_tensor(self._max_batch_size, dtype=tf.int32)
    size_ok = tf.assert_less_equal(
        batch_size,
        max_allowed,
        message='Data set read failure, Batch size greater than max allowed.')
    # The slicing ops are created under the size check.
    with tf.control_dependencies([size_ok]):
        batch = _slice_data(self._dataset, batch_size)
    return batch
def psnr(labels, predictions):
    """Computes average peak signal-to-noise ratio of `predictions`.

    Here PSNR is defined with respect to the maximum value of 1. All image
    tensors must be within the range [0, 1]; only `labels` is range-checked at
    run time.

    Args:
      labels: Tensor of shape [B, H, W, N].
      predictions: Tensor of shape [B, H, W, N].

    Returns:
      The result of `tf.metrics.mean` over the finite PSNR values, i.e. a
      tuple of (psnr, update_op).
    """
    predictions.shape.assert_is_compatible_with(labels.shape)
    label_range_checks = [
        tf.assert_greater_equal(labels, 0.0),
        tf.assert_less_equal(labels, 1.0),
    ]
    with tf.control_dependencies(label_range_checks):
        per_image = tf.image.psnr(labels, predictions, max_val=1.0)
    # Infinite values are dropped so they cannot dominate the mean.
    is_finite = tf.logical_not(tf.is_inf(per_image))
    finite_psnrs = tf.boolean_mask(per_image, is_finite)
    return tf.metrics.mean(finite_psnrs, name='psnr')
def __call__(self, batch_size):
    """Reads `batch_size` data.

    A fresh shuffle of all example indices is created and stored on
    `self._indices`; the first `batch_size` of them select the data.

    Args:
      batch_size: Tensor of type `int32`. Batch size of the data to be
        retrieved from the dataset. `batch_size` should be less than or
        equal to the number of examples in the dataset.

    Returns:
      Read data, as produced by `_extract_data` for the selected indices.
    """
    example_count = tf.convert_to_tensor(self._num_examples, dtype=tf.int32)
    size_ok = tf.assert_less_equal(
        batch_size,
        example_count,
        message='Data set read failure, batch_size > num_examples.')
    with tf.control_dependencies([size_ok]):
        all_indices = tf.range(self._num_examples, dtype=tf.int32)
        shuffled = tf.random.shuffle(all_indices)
        self._indices = shuffled
        return _extract_data(self._dataset, shuffled[:batch_size])
def assert_rank_at_most(x,
                        rank,
                        data=None,
                        summarize=None,
                        message=None,
                        name=None):
    """Assert `x` has rank equal to `rank` or smaller.

    Implemented as `tf1.assert_less_equal(tf.rank(x), rank, ...)` inside a
    name scope, so static-check behaviour and error types are those of
    `tf1.assert_less_equal`.

    Example of adding a dependency to an operation:

    ```python
    with tf.control_dependencies([assert_rank_at_most(x, 2)]):
      output = tf.reduce_sum(x)
    ```

    Args:
      x: Numeric `Tensor`.
      rank: Scalar `Tensor`.
      data: The tensors to print out if the condition is False. Forwarded to
        `tf1.assert_less_equal`.
      summarize: Print this many entries of each tensor.
      message: A string to prefix to the default message.
      name: A name for this operation (optional). Defaults to
        "assert_rank_at_most".

    Returns:
      The assertion produced by `tf1.assert_less_equal` comparing the rank of
      `x` against `rank`.
    """
    scope_name = name or 'assert_rank_at_most'
    with tf.name_scope(scope_name):
        actual_rank = tf.rank(x)
        return tf1.assert_less_equal(
            actual_rank,
            rank,
            data=data,
            summarize=summarize,
            message=message)
def embed(input_ids,
          vocab_size,
          embedding_size,
          position_offset=0,
          initializer_range=0.02,
          max_position_embeddings=512,
          use_one_hot_embeddings=True):
    """Embed token ids and add learned position embeddings.

    :param input_ids: int Tensor of shape [batch_size, seq_length].
    :param vocab_size: number of words in vocab
    :param embedding_size: dimensionality of the embedding
    :param position_offset: aka number of cached tokens. The sequence is
        embedded at positions `position_offset` through
        `position_offset + seq_length - 1`.
    :param initializer_range: float. Range of the weight initialization.
    :param max_position_embeddings: int. Number of rows in the position table;
        `position_offset + seq_length` must not exceed it.
    :param use_one_hot_embeddings: probably want this to be true. If True the
        word lookup is a one-hot matmul, otherwise `tf.nn.embedding_lookup`.
    :return: tuple of (the layer-normalised
        [batch_size, seq_length, embedding_size] embedded tensor, the
        [vocab_size, embedding_size] word embedding table variable)
    """
    (batch_size, seq_length) = get_shape_list(input_ids, expected_rank=2)

    embedding_table = tf.get_variable(
        name='word_embed',
        shape=[vocab_size, embedding_size],
        initializer=create_initializer(initializer_range),
    )

    vocab_check = tf.assert_less_equal(
        tf.reduce_max(input_ids), vocab_size - 1)
    with tf.control_dependencies([vocab_check]):
        if use_one_hot_embeddings:
            flat_input_ids = tf.reshape(input_ids, [-1])
            one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size)
            output_flat = tf.matmul(one_hot_input_ids, embedding_table)
        else:
            output_flat = tf.nn.embedding_lookup(embedding_table, input_ids)

        embedded_input = tf.reshape(
            output_flat, [batch_size, seq_length, embedding_size])

    # The last position used is `position_offset + seq_length - 1`, so the
    # bound must include the offset. `tf.one_hot` yields an all-zero row for an
    # index outside `[0, depth)`, so without this check positions past the end
    # of the table would silently receive a zero position embedding.
    starts_at_zero = position_offset == 0
    if starts_at_zero:
        positions_needed = seq_length
    else:
        positions_needed = seq_length + position_offset
    position_check = tf.assert_less_equal(
        positions_needed, max_position_embeddings)
    with tf.control_dependencies([position_check]):
        full_position_embeddings = tf.get_variable(
            name='pos_embed',
            shape=[max_position_embeddings, embedding_size],
            initializer=create_initializer(initializer_range),
        )
        # Since the position embedding table is a learned variable, we create
        # it using a (long) sequence length `max_position_embeddings`. The
        # actual sequence length might be shorter than this, for faster
        # training of tasks that do not have long sequences.
        #
        # So `full_position_embeddings` is effectively an embedding table
        # for position [0, 1, 2, ..., max_position_embeddings-1]. Without an
        # offset the current sequence has positions [0, 1, ... seq_length-1],
        # so we can just perform a slice.
        if starts_at_zero:
            embedded_input += tf.slice(
                full_position_embeddings, [0, 0], [seq_length, -1])[None]
        else:
            # With an offset, select the rows through a one-hot matmul
            # instead of a slice.
            flat_pos_ids = (
                tf.range(seq_length, dtype=tf.int32) + position_offset)
            one_hot_pos_ids = tf.one_hot(
                flat_pos_ids, depth=max_position_embeddings)

            # [seq_length, max_position_embeddings] x
            # [max_position_embeddings, embedding_size]
            seq_embeds = tf.matmul(one_hot_pos_ids, full_position_embeddings)
            embedded_input += seq_embeds[None]

    return layer_norm(embedded_input, name='embed_norm'), embedding_table
def __init__(self,
             batch_size,
             total_num_examples,
             max_learning_rate=1.,
             preconditioner_decay_rate=0.95,
             burnin=25,
             burnin_max_learning_rate=1e-6,
             use_single_learning_rate=False,
             name=None):
    """Converts the hyperparameters to tensors guarded by validity asserts.

    Each numeric argument is converted with `tf.convert_to_tensor` and then
    wrapped with `distribution_util.with_dependencies` so that the attached
    assertions are dependencies of the stored tensor.

    Args:
      batch_size: Must be greater than zero.
      total_num_examples: Must be greater than zero.
      max_learning_rate: Must be non-negative.
      preconditioner_decay_rate: Must lie in `[0, 1]`.
      burnin: Must be a non-negative integer; the dtype hint is `tf.int64`.
      burnin_max_learning_rate: Must be non-negative.
      use_single_learning_rate: Stored unchanged on the instance.
      name: Optional name; defaults to 'VariationalSGD'.
    """
    default_name = 'VariationalSGD'
    guard = distribution_util.with_dependencies
    scope_values = [
        max_learning_rate, preconditioner_decay_rate, batch_size, burnin,
        burnin_max_learning_rate
    ]
    with tf1.name_scope(name, default_name, scope_values):
        # First convert everything, in the same order as the attributes are
        # validated below.
        decay_rate = tf.convert_to_tensor(
            value=preconditioner_decay_rate, name='preconditioner_decay_rate')
        batch = tf.convert_to_tensor(value=batch_size, name='batch_size')
        num_examples = tf.convert_to_tensor(
            value=total_num_examples, name='total_num_examples')
        burnin_steps = tf.convert_to_tensor(
            value=burnin,
            name='burnin',
            dtype=dtype_util.common_dtype([burnin], dtype_hint=tf.int64))
        burnin_max_lr = tf.convert_to_tensor(
            value=burnin_max_learning_rate, name='burnin_max_learning_rate')
        max_lr = tf.convert_to_tensor(
            value=max_learning_rate, name='max_learning_rate')

        self._use_single_learning_rate = use_single_learning_rate

        decay_rate_checks = [
            tf1.assert_non_negative(
                decay_rate,
                message='`preconditioner_decay_rate` must be non-negative'),
            tf1.assert_less_equal(
                decay_rate,
                1.,
                message='`preconditioner_decay_rate` must be at most 1.'),
        ]
        self._preconditioner_decay_rate = guard(decay_rate_checks, decay_rate)

        batch_checks = [
            tf1.assert_greater(
                batch, 0, message='`batch_size` must be greater than zero')
        ]
        self._batch_size = guard(batch_checks, batch)

        num_examples_checks = [
            tf1.assert_greater(
                num_examples,
                0,
                message='`total_num_examples` must be greater than zero')
        ]
        self._total_num_examples = guard(num_examples_checks, num_examples)

        burnin_checks = [
            tf1.assert_non_negative(
                burnin_steps, message='`burnin` must be non-negative'),
            tf1.assert_integer(
                burnin_steps, message='`burnin` must be an integer'),
        ]
        self._burnin = guard(burnin_checks, burnin_steps)

        burnin_max_lr_checks = [
            tf1.assert_non_negative(
                burnin_max_lr,
                message='`burnin_max_learning_rate` must be non-negative')
        ]
        self._burnin_max_learning_rate = guard(
            burnin_max_lr_checks, burnin_max_lr)

        max_lr_checks = [
            tf1.assert_non_negative(
                max_lr, message='`max_learning_rate` must be non-negative')
        ]
        self._max_learning_rate = guard(max_lr_checks, max_lr)

        super(VariationalSGD, self).__init__(name=name or default_name)
def embedding_postprocessor(input_tensor,
                            use_token_type=False,
                            token_type_ids=None,
                            token_type_vocab_size=None,
                            token_type_embedding_name="token_type_embeddings",
                            use_position_embeddings=True,
                            reset_position_index_per_cell=False,
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,
                            max_position_embeddings=512,
                            extra_embeddings=None,
                            dropout_prob=0.1):
    """Performs various post-processing on a word embedding tensor.

    Args:
      input_tensor: float Tensor of shape [batch_size, seq_length,
        embedding_size].
      use_token_type: bool. Whether to add embeddings for `token_type_ids`.
      token_type_ids: (optional) nested structure of int32 Tensors of shape
        [batch_size, seq_length]. Must be specified if `use_token_type` is
        True.
      token_type_vocab_size: nested structure of ints. The vocabulary size of
        `token_type_ids`. Must match the structure of `token_type_ids`.
      token_type_embedding_name: string. Prefix of the embedding table
        variable names for token type ids; table `i` is named
        `<prefix>_<i>`.
      use_position_embeddings: bool. Whether to add position embeddings for
        the position of each token in the sequence.
      reset_position_index_per_cell: bool. Whether to restart position index
        when a new cell starts. When True the embeddings come from
        `_get_relative_position_embeddings`, otherwise from
        `_get_absolute_position_embeddings`.
      position_embedding_name: string. The name of the embedding table
        variable for positional embeddings.
      initializer_range: float. Range of the weight initialization.
      max_position_embeddings: int. Maximum sequence length that might ever be
        used with this model. This can be longer than the sequence length of
        input_tensor, but cannot be shorter.
      extra_embeddings: (optional) float32 Tensor of shape [batch_size,
        seq_length, embedding_dim]. Additional embeddings that are projected
        to the embedding width with a dense layer and added to the output.
      dropout_prob: float. Dropout probability applied to the final output
        tensor.

    Returns:
      float tensor with same shape as `input_tensor`.

    Raises:
      ValueError: One of the tensor shapes or input values is invalid.
    """
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    width = input_shape[2]

    output = input_tensor

    if use_token_type:
        if token_type_ids is None:
            # The two literals are joined with an explicit space; without it
            # the message reads "...specified if`use_token_type`...".
            raise ValueError("`token_type_ids` must be specified if "
                             "`use_token_type` is True.")

        tf.nest.assert_same_structure(token_type_ids, token_type_vocab_size)
        token_type_ids = tf.nest.flatten(token_type_ids)
        token_type_vocab_size = tf.nest.flatten(token_type_vocab_size)

        for i, (type_ids, type_vocab_size) in enumerate(
                zip(token_type_ids, token_type_vocab_size)):
            token_type_table = tf.get_variable(
                name="%s_%d" % (token_type_embedding_name, i),
                shape=[type_vocab_size, width],
                initializer=create_initializer(initializer_range))
            # This vocab will be small so we always do one-hot here, since it
            # is always faster for a small vocabulary.
            flat_token_type_ids = tf.reshape(type_ids, [-1])
            one_hot_ids = tf.one_hot(
                flat_token_type_ids, depth=type_vocab_size)
            token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
            token_type_embeddings = tf.reshape(
                token_type_embeddings, [batch_size, seq_length, width])
            output += token_type_embeddings

    if use_position_embeddings:
        full_position_embeddings = tf.get_variable(
            name=position_embedding_name,
            shape=[max_position_embeddings, width],
            initializer=create_initializer(initializer_range))
        if not reset_position_index_per_cell:
            # Absolute positions index the table directly, so the sequence
            # must fit inside it.
            assert_op = tf.assert_less_equal(
                seq_length, max_position_embeddings)
            with tf.control_dependencies([assert_op]):
                num_dims = len(output.shape.as_list())
                position_embeddings = _get_absolute_position_embeddings(
                    full_position_embeddings,
                    seq_length=seq_length,
                    width=width,
                    num_dims=num_dims,
                )
        else:
            position_embeddings = _get_relative_position_embeddings(
                full_position_embeddings,
                token_type_ids,
                token_type_vocab_size,
                seq_length,
                batch_size,
                max_position_embeddings,
            )
        output += position_embeddings

    if extra_embeddings is not None:
        # Flatten to 2-D for the dense projection, then restore the shape.
        flat_extra_embeddings = tf.reshape(
            extra_embeddings, [batch_size * seq_length, -1])
        flat_extra_embeddings = tf.layers.dense(
            flat_extra_embeddings,
            width,
            kernel_initializer=create_initializer(initializer_range))
        output += tf.reshape(
            flat_extra_embeddings, [batch_size, seq_length, width])

    output = layer_norm_and_dropout(output, dropout_prob)
    return output
def preprocess_example(example_proto, hparams, is_training):
    """Compute spectral representation, labels, and length from sequence/audio.

    The example is parsed with `parse_example`; its 'id', 'sequence', 'audio'
    and 'velocity_range' entries drive the rest of the pipeline.

    Args:
      example_proto: Example that has not been preprocessed.
      hparams: HParams object specifying hyperparameters.
      is_training: Whether or not this is a training run.

    Returns:
      An InputTensors tuple.

    Raises:
      ValueError: If hparams contains an invalid spec_type (presumably raised
        by the spectrogram helper; not visible in this function).
    """
    record = parse_example(example_proto)
    sequence_id = record['id']
    sequence = record['sequence']
    audio = record['audio']
    velocity_range = record['velocity_range']

    # Audio and labels must share one jitter amount, so it is drawn once here
    # (with NumPy, when this function runs) and reused for both.
    jitter_ms = 0
    if is_training and hparams.jitter_amount_ms > 0:
        jitter_ms = np.random.choice(hparams.jitter_amount_ms, size=1)

    if jitter_ms > 0:
        sequence = jitter_label_op(sequence, jitter_ms / 1000.)

    # possibly shift the entire sequence backward for better forward only
    # training
    if hparams.backward_shift_amount_ms > 0:
        sequence = jitter_label_op(
            sequence, hparams.backward_shift_amount_ms / 1000.)

    if is_training:
        audio = transform_wav_data_op(
            audio, hparams=hparams, jitter_amount_sec=jitter_ms / 1000.)

    spec = wav_to_spec_op(audio, hparams=hparams)
    spectrogram_hash = get_spectrogram_hash_op(spec)

    pianoroll = sequence_to_pianoroll_op(
        sequence, velocity_range, hparams=hparams)
    labels, label_weights, onsets, offsets, velocities = pianoroll

    length = wav_to_num_frames_op(audio, hparams_frames_per_second(hparams))

    # The length limit is only enforced for training examples.
    if hparams.max_expected_train_example_len and is_training:
        asserts = [
            tf.assert_less_equal(
                length, hparams.max_expected_train_example_len)
        ]
    else:
        asserts = []

    with tf.control_dependencies(asserts):
        return InputTensors(
            spec=spec,
            spectrogram_hash=spectrogram_hash,
            labels=labels,
            label_weights=label_weights,
            length=length,
            onsets=onsets,
            offsets=offsets,
            velocities=velocities,
            sequence_id=sequence_id,
            note_sequence=sequence)
def expected_calibration_error(y_true, y_pred, nbins=20):
    """Calculates Expected Calibration Error (ECE).

    ECE is a scalar summary statistic of calibration error. It is the
    sample-weighted average of the difference between the predicted and true
    probabilities of a positive detection across uniformly-spaced model
    confidences [0, 1]. See referenced paper for a thorough explanation.

    Reference:
      Guo, et. al, "On Calibration of Modern Neural Networks"
      Page 2, Expected Calibration Error (ECE).
      https://arxiv.org/pdf/1706.04599.pdf

    Three metric variables of shape `[nbins]` are created, named
    `bin_counts`, `true_sum` and `preds_sum`. For estimation over a stream of
    data, the returned `update_op` accumulates the current batch into them.

    Args:
      y_true: 1-D tf.int64 Tensor of binarized ground truth, corresponding to
        each prediction in y_pred.
      y_pred: 1-D tf.float32 tensor of model confidence scores in range
        [0.0, 1.0].
      nbins: int specifying the number of uniformly-spaced bins into which
        y_pred will be bucketed.

    Returns:
      value_op: `_ece_from_bins` applied to the three accumulator variables.
      update_op: `_ece_from_bins` applied to the three `assign_add` results, so
        evaluating it updates the accumulators.

    Raises:
      InvalidArgumentError: if y_pred is not in [0.0, 1.0].
    """
    bin_counts = metrics_impl.metric_variable(
        [nbins], tf.float32, name='bin_counts')
    bin_true_sum = metrics_impl.metric_variable(
        [nbins], tf.float32, name='true_sum')
    bin_preds_sum = metrics_impl.metric_variable(
        [nbins], tf.float32, name='preds_sum')

    confidence_range_checks = [
        tf.assert_greater_equal(y_pred, 0.0),
        tf.assert_less_equal(y_pred, 1.0),
    ]
    with tf.control_dependencies(confidence_range_checks):
        bin_ids = tf.histogram_fixed_width_bins(
            y_pred, [0.0, 1.0], nbins=nbins)

    with tf.control_dependencies([bin_ids]):
        # Number of predictions that landed in each bin.
        batch_counts = tf.cast(
            tf.bincount(bin_ids, minlength=nbins), dtype=tf.float32)
        update_bin_counts_op = tf.assign_add(bin_counts, batch_counts)

        # Per-bin sum of the ground-truth labels.
        batch_true_sum = tf.cast(
            tf.bincount(bin_ids, weights=y_true, minlength=nbins),
            dtype=tf.float32)
        update_bin_true_sum_op = tf.assign_add(bin_true_sum, batch_true_sum)

        # Per-bin sum of the predicted confidences.
        batch_preds_sum = tf.cast(
            tf.bincount(bin_ids, weights=y_pred, minlength=nbins),
            dtype=tf.float32)
        update_bin_preds_sum_op = tf.assign_add(
            bin_preds_sum, batch_preds_sum)

    ece_update_op = _ece_from_bins(
        update_bin_counts_op,
        update_bin_true_sum_op,
        update_bin_preds_sum_op,
        name='update_op')
    ece = _ece_from_bins(
        bin_counts, bin_true_sum, bin_preds_sum, name='value')
    return ece, ece_update_op
def preprocess_data(sequence_id, sequence, audio, velocity_range, hparams,
                    is_training):
    """Compute spectral representation, labels, and length from sequence/audio.

    Args:
      sequence_id: id of the sequence.
      sequence: String tensor containing serialized NoteSequence proto.
      audio: String tensor containing WAV data.
      velocity_range: String tensor containing max and min velocities of file
        as a serialized VelocityRange.
      hparams: HParams object specifying hyperparameters.
      is_training: Whether or not this is a training run.

    Returns:
      An InputTensors tuple.

    Raises:
      AssertionError: If `hparams.spec_type` is 'tflite_compat_mel' while
        `hparams.spec_log_amplitude` is falsy.
      ValueError: If hparams contains an invalid spec_type (presumably raised
        by the spectrogram helper; not visible in this function).
    """
    # Audio and labels must share one jitter amount, so it is drawn once here
    # (with NumPy, when this function runs) and reused for both.
    jitter_ms = 0
    if is_training and hparams.jitter_amount_ms > 0:
        jitter_ms = np.random.choice(hparams.jitter_amount_ms, size=1)

    if jitter_ms > 0:
        sequence = jitter_label_op(sequence, jitter_ms / 1000.)

    # possibly shift the entire sequence backward for better forward only
    # training
    if hparams.backward_shift_amount_ms > 0:
        sequence = jitter_label_op(
            sequence, hparams.backward_shift_amount_ms / 1000.)

    if is_training:
        audio = transform_wav_data_op(
            audio, hparams=hparams, jitter_amount_sec=jitter_ms / 1000.)

    if hparams.spec_type == 'tflite_compat_mel':
        assert hparams.spec_log_amplitude
        spec = tflite_compat_mel(audio, hparams=hparams)
    else:
        spec = wav_to_spec_op(audio, hparams=hparams)
    spectrogram_hash = get_spectrogram_hash_op(spec)

    pianoroll = sequence_to_pianoroll_op(
        sequence, velocity_range, hparams=hparams)
    labels, label_weights, onsets, offsets, velocities = pianoroll

    length = wav_to_num_frames_op(audio, hparams_frames_per_second(hparams))

    # The length limit is only enforced for training examples.
    if hparams.max_expected_train_example_len and is_training:
        asserts = [
            tf.assert_less_equal(
                length, hparams.max_expected_train_example_len)
        ]
    else:
        asserts = []

    with tf.control_dependencies(asserts):
        return InputTensors(
            spec=spec,
            spectrogram_hash=spectrogram_hash,
            labels=labels,
            label_weights=label_weights,
            length=length,
            onsets=onsets,
            offsets=offsets,
            velocities=velocities,
            sequence_id=sequence_id,
            note_sequence=sequence)
def embedding_postprocessor(input_tensor,
                            use_token_type=False,
                            token_type_ids=None,
                            token_type_vocab_size=16,
                            token_type_embedding_name="token_type_embeddings",
                            use_position_embeddings=True,
                            position_embedding_name="position_embeddings",
                            initializer_range=0.02,
                            max_position_embeddings=512,
                            dropout_prob=0.1):
    """Performs various post-processing on a word embedding tensor.

    Args:
      input_tensor: float Tensor of shape [batch_size, seq_length,
        embedding_size].
      use_token_type: bool. Whether to add embeddings for `token_type_ids`.
      token_type_ids: (optional) int32 Tensor of shape [batch_size,
        seq_length]. Must be specified if `use_token_type` is True.
      token_type_vocab_size: int. The vocabulary size of `token_type_ids`.
      token_type_embedding_name: string. The name of the embedding table
        variable for token type ids.
      use_position_embeddings: bool. Whether to add position embeddings for
        the position of each token in the sequence.
      position_embedding_name: string. The name of the embedding table
        variable for positional embeddings.
      initializer_range: float. Range of the weight initialization.
      max_position_embeddings: int. Maximum sequence length that might ever be
        used with this model. This can be longer than the sequence length of
        input_tensor, but cannot be shorter.
      dropout_prob: float. Dropout probability applied to the final output
        tensor.

    Returns:
      float tensor with same shape as `input_tensor`.

    Raises:
      ValueError: One of the tensor shapes or input values is invalid.
    """
    input_shape = get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    width = input_shape[2]

    output = input_tensor

    if use_token_type:
        if token_type_ids is None:
            # The two literals are joined with an explicit space; without it
            # the message reads "...specified if`use_token_type`...".
            raise ValueError("`token_type_ids` must be specified if "
                             "`use_token_type` is True.")
        token_type_table = tf.get_variable(
            name=token_type_embedding_name,
            shape=[token_type_vocab_size, width],
            initializer=create_initializer(initializer_range))
        # This vocab will be small so we always do one-hot here, since it is
        # always faster for a small vocabulary.
        flat_token_type_ids = tf.reshape(token_type_ids, [-1])
        one_hot_ids = tf.one_hot(
            flat_token_type_ids, depth=token_type_vocab_size)
        token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
        token_type_embeddings = tf.reshape(
            token_type_embeddings, [batch_size, seq_length, width])
        output += token_type_embeddings

    if use_position_embeddings:
        assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
        with tf.control_dependencies([assert_op]):
            full_position_embeddings = tf.get_variable(
                name=position_embedding_name,
                shape=[max_position_embeddings, width],
                initializer=create_initializer(initializer_range))
            # Since the position embedding table is a learned variable, we
            # create it using a (long) sequence length
            # `max_position_embeddings`. The actual sequence length might be
            # shorter than this, for faster training of tasks that do not have
            # long sequences.
            #
            # So `full_position_embeddings` is effectively an embedding table
            # for position [0, 1, 2, ..., max_position_embeddings-1], and the
            # current sequence has positions [0, 1, 2, ... seq_length-1], so
            # we can just perform a slice.
            position_embeddings = tf.slice(
                full_position_embeddings, [0, 0], [seq_length, -1])
            num_dims = len(output.shape.as_list())

            # Only the last two dimensions are relevant (`seq_length` and
            # `width`), so we broadcast among the first dimensions, which is
            # typically just the batch size.
            position_broadcast_shape = (
                [1] * (num_dims - 2) + [seq_length, width])
            position_embeddings = tf.reshape(
                position_embeddings, position_broadcast_shape)
            output += position_embeddings

    output = layer_norm_and_dropout(output, dropout_prob)
    return output
def __init__(self,
             learning_rate,
             preconditioner_decay_rate=0.95,
             data_size=1,
             burnin=25,
             diagonal_bias=1e-8,
             name=None,
             parallel_iterations=10):
    """Converts the hyperparameters to tensors guarded by validity asserts.

    Each validated argument is converted with `tf.convert_to_tensor` and then
    wrapped with `distribution_util.with_dependencies` so that the attached
    assertions are dependencies of the stored tensor.

    Args:
      learning_rate: Converted to a tensor; no assertion is attached here.
      preconditioner_decay_rate: Must lie in `[0, 1]`.
      data_size: Must be greater than zero.
      burnin: Must be a non-negative integer; the dtype hint is `tf.int64`.
      diagonal_bias: Must be non-negative.
      name: Optional name; defaults to 'StochasticGradientLangevinDynamics'.
      parallel_iterations: Stored unchanged on the instance.

    Raises:
      NotImplementedError: If called while executing eagerly.
    """
    default_name = 'StochasticGradientLangevinDynamics'
    guard = distribution_util.with_dependencies
    scope_values = [
        learning_rate, preconditioner_decay_rate, data_size, burnin,
        diagonal_bias
    ]
    with tf1.name_scope(name, default_name, scope_values):
        if tf.executing_eagerly():
            raise NotImplementedError(
                'Eager execution currently not supported for '
                ' SGLD optimizer.')

        decay_rate = tf.convert_to_tensor(
            value=preconditioner_decay_rate, name='preconditioner_decay_rate')
        num_data = tf.convert_to_tensor(value=data_size, name='data_size')
        burnin_steps = tf.convert_to_tensor(
            value=burnin,
            name='burnin',
            dtype=dtype_util.common_dtype([burnin], dtype_hint=tf.int64))
        bias = tf.convert_to_tensor(value=diagonal_bias, name='diagonal_bias')
        # TODO(b/124800185): Consider migrating `learning_rate` to be a
        # hyperparameter handled by the base Optimizer class. This would allow
        # users to plug in a
        # `tf.keras.optimizers.schedules.LearningRateSchedule` object in
        # addition to Tensors.
        self._learning_rate = tf.convert_to_tensor(
            value=learning_rate, name='learning_rate')
        self._parallel_iterations = parallel_iterations

        decay_rate_checks = [
            tf1.assert_non_negative(
                decay_rate,
                message='`preconditioner_decay_rate` must be non-negative'),
            tf1.assert_less_equal(
                decay_rate,
                1.,
                message='`preconditioner_decay_rate` must be at most 1.'),
        ]
        self._preconditioner_decay_rate = guard(decay_rate_checks, decay_rate)

        data_size_checks = [
            tf1.assert_greater(
                num_data, 0, message='`data_size` must be greater than zero')
        ]
        self._data_size = guard(data_size_checks, num_data)

        burnin_checks = [
            tf1.assert_non_negative(
                burnin_steps, message='`burnin` must be non-negative'),
            tf1.assert_integer(
                burnin_steps, message='`burnin` must be an integer'),
        ]
        self._burnin = guard(burnin_checks, burnin_steps)

        bias_checks = [
            tf1.assert_non_negative(
                bias, message='`diagonal_bias` must be non-negative')
        ]
        self._diagonal_bias = guard(bias_checks, bias)

        super(StochasticGradientLangevinDynamics,
              self).__init__(name=name or default_name)
def percentile(x,
               q,
               axis=None,
               interpolation=None,
               keep_dims=False,
               validate_args=False,
               preserve_gradients=True,
               name=None):
    """Compute the `q`-th percentile(s) of `x`.

    For a vector `x`, the `q`-th percentile is the value located `q / 100`
    of the way from the minimum to the maximum of a sorted copy of `x`.
    When that position falls between two data points, `interpolation`
    decides how the two neighbours are combined.

    `q = 50` gives the median, `q = 0` the minimum and `q = 100` the
    maximum.

    Several percentiles can be requested at once by passing a `1-D` `q`;
    dimension zero of the result then indexes the percentiles.

    Compare to `numpy.percentile`.

    Args:
      x: Numeric `N-D` `Tensor` with `N > 0`. If `axis` is not `None`, `x`
        must have a statically known number of dimensions.
      q: Scalar or vector `Tensor` with values in `[0, 100]`. The
        percentile(s). It is cast to `float64` internally.
      axis: Optional `0-D` or `1-D` integer `Tensor` with constant values.
        The axes that index independent samples over which the percentile
        is taken. If `None` (the default), every dimension is treated as a
        sample dimension (`x` is flattened).
      interpolation: One of {'nearest', 'linear', 'lower', 'higher',
        'midpoint'}. Default value: 'nearest'. Method used when the
        desired quantile lies between two data points `i < j`:
          * linear: i + (j - i) * fraction, where fraction is the
            fractional part of the index surrounded by i and j.
          * lower: `i`.
          * higher: `j`.
          * nearest: `i` or `j`, whichever is nearest.
          * midpoint: (i + j) / 2.
        `linear` and `midpoint` do not work with integer dtypes.
      keep_dims: Python `bool`. If `True`, the sample dimensions are kept
        in the output instead of being dropped. With `axis=None` this is
        done by broadcasting the result against an all-ones tensor of
        rank `ndims(x) + ndims(q)`; otherwise it is delegated to
        `_insert_back_keep_dims`.
      validate_args: Whether to add runtime checks on `q` (rank in
        `{0, 1}` and values within `[0, 100]`). If `False` and the
        arguments are incorrect, correct behavior is not guaranteed.
      preserve_gradients: Python `bool`. Only used with 'linear'
        interpolation. If `True`, the two neighbouring indices are forced
        to differ by one so that the gradient w.r.t. `q` is preserved. If
        `False`, that gradient will be (incorrectly) zero when `q`
        corresponds to a point in `x`.
      name: A Python string name to give this `Op`. Default is
        'percentile'.

    Returns:
      A `(rank(q) + N - len(axis))` dimensional `Tensor` of same dtype as
      `x`, or, if `axis` is `None`, a `rank(q)` `Tensor`. The first
      `rank(q)` dimensions index quantiles for different values of `q`.

    Raises:
      ValueError: If argument `interpolation` is not an allowed type.
      TypeError: If the interpolation type is not compatible with the
        dtype of `x` ('linear' / 'midpoint' with an integer dtype).

    #### Examples

    ```python
    # Get 30th percentile with default ('nearest') interpolation.
    x = [1., 2., 3., 4.]
    tfp.stats.percentile(x, q=30.)
    ==> 2.0

    # Get 30th percentile with 'linear' interpolation.
    x = [1., 2., 3., 4.]
    tfp.stats.percentile(x, q=30., interpolation='linear')
    ==> 1.9

    # Get 30th and 70th percentiles with 'lower' interpolation
    x = [1., 2., 3., 4.]
    tfp.stats.percentile(x, q=[30., 70.], interpolation='lower')
    ==> [1., 3.]

    # Get 100th percentile (maximum). By default, this is computed over
    # every dim
    x = [[1., 2.],
         [3., 4.]]
    tfp.stats.percentile(x, q=100.)
    ==> 4.

    # Treat the leading dim as indexing samples, and find the 100th
    # quantile (max) over all such samples.
    x = [[1., 2.],
         [3., 4.]]
    tfp.stats.percentile(x, q=100., axis=[0])
    ==> [3., 4.]
    ```
    """
    name = name or 'percentile'
    allowed_interpolations = {
        'linear', 'lower', 'higher', 'nearest', 'midpoint'
    }

    if interpolation is None:
        interpolation = 'nearest'
    elif interpolation not in allowed_interpolations:
        raise ValueError(
            'Argument `interpolation` must be in %s. Found %s' %
            (allowed_interpolations, interpolation))

    with tf1.name_scope(name, values=[x, q]):
        x = tf.convert_to_tensor(value=x, name='x')

        averages_values = interpolation in {'linear', 'midpoint'}
        if averages_values and x.dtype.is_integer:
            raise TypeError(
                '{} interpolation not allowed with dtype {}'.format(
                    interpolation, x.dtype))

        # Double is needed here and below, else we get the wrong index if
        # the array is huge along axis.
        q = tf.cast(q, tf.float64)
        _get_static_ndims(q, expect_ndims_no_more_than=1)

        if validate_args:
            q_checks = [
                tf1.assert_rank_in(q, [0, 1]),
                tf1.assert_greater_equal(q, tf.cast(0., tf.float64)),
                tf1.assert_less_equal(q, tf.cast(100., tf.float64)),
            ]
            q = distribution_util.with_dependencies(q_checks, q)

        # Move `axis` dims of `x` to the rightmost, call it `y`.
        if axis is None:
            y = tf.reshape(x, [-1])
        else:
            x_ndims = _get_static_ndims(
                x, expect_static=True, expect_ndims_at_least=1)
            axis = _make_static_axis_non_negative_list(axis, x_ndims)
            y = _move_dims_to_flat_end(x, axis, x_ndims, right_end=True)

        frac_at_q_or_above = 1. - q / 100.

        # Sort everything, not just the top 'k' entries, which allows
        # multiple calls to sort only once (under the hood) and use CSE.
        sorted_y = _sort_tensor(y)

        d = tf.cast(tf.shape(input=y)[-1], tf.float64)

        def _get_indices(interp_type):
            """Return int32 positions into `sorted_y` for `interp_type`."""
            # Note `lower` <--> ceiling. Confusing, huh? Due to the fact
            # that _sort_tensor sorts highest to lowest, tf.ceil
            # corresponds to the higher index, but the lower value of y!
            if interp_type == 'lower':
                raw_idx = tf.math.ceil((d - 1) * frac_at_q_or_above)
            elif interp_type == 'higher':
                raw_idx = tf.floor((d - 1) * frac_at_q_or_above)
            elif interp_type == 'nearest':
                raw_idx = tf.round((d - 1) * frac_at_q_or_above)
            # d - 1 will be distinct from d in int32, but not necessarily
            # double. So clip to avoid out of bounds errors.
            int_idx = tf.cast(raw_idx, tf.int32)
            last_valid_idx = tf.shape(input=y)[-1] - 1
            return tf.clip_by_value(int_idx, 0, last_valid_idx)

        if interpolation in ['nearest', 'lower', 'higher']:
            gathered_y = tf.gather(
                sorted_y, _get_indices(interpolation), axis=-1)
        elif interpolation == 'midpoint':
            lower_values = tf.gather(
                sorted_y, _get_indices('lower'), axis=-1)
            higher_values = tf.gather(
                sorted_y, _get_indices('higher'), axis=-1)
            gathered_y = 0.5 * (lower_values + higher_values)
        elif interpolation == 'linear':
            # Copy-paste of docstring on interpolation:
            # linear: i + (j - i) * fraction, where fraction is the
            # fractional part of the index surrounded by i and j.
            larger_y_idx = _get_indices('lower')
            exact_idx = (d - 1) * frac_at_q_or_above
            if preserve_gradients:
                # If q corresponds to a point in x, we will initially have
                # larger_y_idx == smaller_y_idx.
                # This results in the gradient w.r.t. fraction being zero
                # (recall `q` enters only through `fraction`...and see
                # that things cancel).
                # The fix is to ensure that smaller_y_idx and larger_y_idx
                # are always separated by exactly 1.
                smaller_y_idx = tf.maximum(larger_y_idx - 1, 0)
                larger_y_idx = tf.minimum(
                    smaller_y_idx + 1, tf.shape(input=y)[-1] - 1)
                fraction = tf.cast(larger_y_idx, tf.float64) - exact_idx
            else:
                smaller_y_idx = _get_indices('higher')
                fraction = tf.math.ceil(
                    (d - 1) * frac_at_q_or_above) - exact_idx

            fraction = tf.cast(fraction, y.dtype)
            values_at_larger_idx = tf.gather(
                sorted_y, larger_y_idx, axis=-1)
            values_at_smaller_idx = tf.gather(
                sorted_y, smaller_y_idx, axis=-1)
            gathered_y = (values_at_larger_idx * (1 - fraction) +
                          values_at_smaller_idx * fraction)

        # Propagate NaNs
        nan_capable_dtypes = (
            tf.bfloat16, tf.float16, tf.float32, tf.float64)
        if x.dtype in nan_capable_dtypes:
            # Apparently tf.is_nan doesn't like other dtypes
            nan_batch_members = tf.reduce_any(
                input_tensor=tf.math.is_nan(x), axis=axis)
            # Append one size-1 dim per dim of `q` so the mask broadcasts
            # against `gathered_y`.
            right_rank_matched_shape = tf.pad(
                tensor=tf.shape(input=nan_batch_members),
                paddings=[[0, tf.rank(input=q)]],
                constant_values=1)
            nan_batch_members = tf.reshape(
                nan_batch_members, shape=right_rank_matched_shape)
            nan = np.array(np.nan, gathered_y.dtype.as_numpy_dtype)
            gathered_y = tf.where(nan_batch_members, nan, gathered_y)

        # Expand dimensions if requested
        if keep_dims and axis is None:
            ones_vec = tf.ones(
                shape=[
                    _get_best_effort_ndims(x) + _get_best_effort_ndims(q)
                ],
                dtype=tf.int32)
            gathered_y = gathered_y * tf.ones(ones_vec, dtype=x.dtype)
        elif keep_dims:
            gathered_y = _insert_back_keep_dims(gathered_y, axis)

        # If q is a scalar, then result has the right shape.
        # If q is a vector, then result has trailing dim of shape q.shape,
        # which needs to be rotated to dim 0.
        return distribution_util.rotate_transpose(gathered_y, tf.rank(q))