def train(self, sentences):
  token_ids, token_values, token_dense_shape = self._tokenize(sentences)
  tokens_sparse = tf.sparse.SparseTensor(
      indices=token_ids, values=token_values, dense_shape=token_dense_shape)
  tokens = tf.sparse.to_dense(tokens_sparse, default_value="")

  sparse_lookup_ids = tf.sparse.SparseTensor(
      indices=tokens_sparse.indices,
      values=self._words_to_indices(tokens_sparse.values),
      dense_shape=tokens_sparse.dense_shape)
  lookup_ids = tf.sparse.to_dense(sparse_lookup_ids, default_value=0)

  # Targets are the next word for each word of the sentence.
  tokens_ids_seq = lookup_ids[:, 0:-1]
  tokens_ids_target = lookup_ids[:, 1:]
  tokens_prefix = tokens[:, 0:-1]

  # Mask determining which positions we care about for a loss: all positions
  # that have a valid non-terminal token.
  mask = tf.logical_and(
      tf.logical_not(tf.equal(tokens_prefix, "")),
      tf.logical_not(tf.equal(tokens_prefix, "<E>")))
  input_mask = tf.cast(mask, tf.int32)

  with tf.GradientTape() as t:
    sentence_embeddings = tf.nn.embedding_lookup(self._embeddings,
                                                 tokens_ids_seq)
    lstm_initial_state = self._lstm_cell.get_initial_state(
        sentence_embeddings)
    lstm_output = self._rnn_layer(
        inputs=sentence_embeddings, initial_state=lstm_initial_state)

    # Stack LSTM outputs into a batch instead of a 2D array.
    lstm_output = tf.reshape(lstm_output, [-1, self._lstm_cell.output_size])
    logits = self._logit_layer(lstm_output)

    targets = tf.reshape(tokens_ids_target, [-1])
    weights = tf.cast(tf.reshape(input_mask, [-1]), tf.float32)

    losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=targets, logits=logits)

    # Final loss is the mean loss for all token losses.
    final_loss = tf.math.divide(
        tf.reduce_sum(tf.multiply(losses, weights)),
        tf.reduce_sum(weights),
        name="final_loss")

  watched = t.watched_variables()
  gradients = t.gradient(final_loss, watched)

  for w, g in zip(watched, gradients):
    w.assign_sub(g)

  return final_loss
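# The loss above is a masked mean: padded positions receive weight 0 and so do
# not dilute the average. A tiny numeric check of the same formula (standalone
# sketch, not part of the original class):
losses_example = tf.constant([2.0, 4.0, 6.0])
weights_example = tf.constant([1.0, 1.0, 0.0])  # third position is padding
masked_mean = tf.reduce_sum(losses_example * weights_example) / tf.reduce_sum(
    weights_example)
# masked_mean -> 3.0, the mean over the two unmasked positions only.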
def decode_greedy(self, sequence_length, first_word):
  initial_state = self._lstm_cell.get_initial_state(
      dtype=tf.float32, batch_size=1)

  sequence = [first_word]
  current_word = first_word
  current_id = tf.expand_dims(self._words_to_indices(current_word), 0)
  current_state = initial_state

  for _ in range(sequence_length):
    token_embeddings = tf.nn.embedding_lookup(self._embeddings, current_id)
    lstm_outputs, current_state = self._lstm_cell(token_embeddings,
                                                  current_state)
    lstm_outputs = tf.reshape(lstm_outputs,
                              [-1, self._lstm_cell.output_size])
    logits = self._logit_layer(lstm_outputs)
    softmax = tf.nn.softmax(logits)

    next_ids = tf.math.argmax(softmax, axis=1)
    next_words = self._indices_to_words(next_ids)[0]

    current_id = next_ids
    current_word = next_words
    sequence.append(current_word)

  return sequence
def _tokenize(self, sentences):
  # Perform a minimalistic text preprocessing by removing punctuation and
  # splitting on spaces.
  normalized_sentences = tf.strings.regex_replace(
      input=sentences, pattern=r"\pP", rewrite="")
  normalized_sentences = tf.reshape(normalized_sentences, [-1])
  sparse_tokens = tf.string_split(normalized_sentences, " ")

  # Deal with a corner case: there is one empty sentence.
  sparse_tokens, _ = tf.sparse.fill_empty_rows(sparse_tokens, tf.constant(""))
  # Deal with a corner case: all sentences are empty.
  sparse_tokens = tf.sparse.reset_shape(sparse_tokens)
  sparse_token_ids = self._table.lookup(sparse_tokens.values)

  return (sparse_tokens.indices, sparse_token_ids, sparse_tokens.dense_shape)
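# For reference, a standalone TF2 sketch of the same minimal preprocessing,
# using `tf.strings.split` (which returns a RaggedTensor) in place of the
# TF1-style `tf.string_split`; the vocabulary lookup is omitted here:
example_sentences = tf.constant(["Hello, world!", "TensorFlow"])
example_normalized = tf.strings.regex_replace(example_sentences, r"\pP", "")
example_tokens = tf.strings.split(example_normalized, " ").to_sparse()
# Keep a row for every sentence, even the ones that end up empty.
example_tokens, _ = tf.sparse.fill_empty_rows(example_tokens, "")
example_dense = tf.sparse.to_dense(example_tokens, default_value="")
# example_dense -> [[b'Hello', b'world'], [b'TensorFlow', b'']]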
def softquantiles(x,
                  quantiles,
                  quantile_width=None,
                  axis=-1,
                  may_squeeze=True,
                  **kwargs):
  """Computes soft quantiles via optimal transport.

  This operator takes advantage of the fact that an exhaustive softsort is not
  required to recover a single quantile. Instead, one can transport all input
  values in x onto only 3 weighted values. Target weights are adjusted so that
  those values in x that are transported to the middle value in the target
  vector y correspond to those concentrating around the quantile of interest.

  This idea generalizes to more quantiles, interleaving small weights on the
  quantile indices and bigger weights in between, corresponding to the gap from
  one desired quantile to the next one.

  Args:
    x: Tensor<float> of any shape.
    quantiles: list<float> the quantiles to be returned. It can also be a
      single float.
    quantile_width: (float) mass given to the bucket supposed to attract points
      whose values concentrate around the desired quantile value. Bigger width
      means that we allow the soft quantile to be a mixture of more points
      further away from the quantile. If None, the width is set at 1/n where n
      is the number of values considered (the size along the 'axis').
    axis: (int) the axis along which to compute the quantile.
    may_squeeze: (bool) should we squeeze the output tensor in case of a single
      quantile.
    **kwargs: see SoftQuantilizer for possible extra parameters.

  Returns:
    A Tensor<float> similar to the input tensor, but the axis dimension is
    replaced by the number of quantiles specified in the quantiles list. Hence,
    if only one quantile is requested (quantiles is a float) only one value in
    that axis is returned. When several quantiles are requested, the tensor
    will have that many values in that axis.

  Raises:
    tf.errors.InvalidArgumentError when the quantiles and quantile width are
    not correct, namely quantiles are either not in sorted order or the
    quantile_width is too large.
  """
  if isinstance(quantiles, float):
    quantiles = [quantiles]
  quantiles = tf.constant(quantiles, tf.float32)

  # Preprocesses submitted quantiles to check that they satisfy elementary
  # constraints.
  valid_quantiles = tf.boolean_mask(
      quantiles, tf.logical_and(quantiles > 0.0, quantiles < 1.0))
  num_quantiles = tf.shape(valid_quantiles)[0]

  # Includes values on both ends of [0,1].
  extended_quantiles = tf.concat([[0.0], valid_quantiles, [1.0]], axis=0)

  # Builds filler_weights in between the target quantiles.
  filler_weights = extended_quantiles[1:] - extended_quantiles[:-1]
  if quantile_width is None:
    quantile_width = tf.reduce_min(
        tf.concat(
            [filler_weights,
             [1.0 / tf.cast(tf.shape(x)[axis], dtype=x.dtype)]],
            axis=0))

  # Takes into account quantile_width in the definition of weights.
  shift = -tf.ones(tf.shape(filler_weights), dtype=x.dtype)
  shift = shift + 0.5 * (
      tf.one_hot(0, num_quantiles + 1) +
      tf.one_hot(num_quantiles, num_quantiles + 1))
  filler_weights = filler_weights + quantile_width * shift

  assert_op = tf.Assert(tf.reduce_all(filler_weights >= 0.0), [filler_weights])
  with tf.control_dependencies([assert_op]):
    # Adds one more value to have tensors of the same shape to interleave them.
    quantile_weights = tf.ones(num_quantiles + 1) * quantile_width

    # Interleaves the filler_weights with the quantile weights.
    weights = tf.reshape(
        tf.stack([filler_weights, quantile_weights], axis=1), (-1,))[:-1]

    # Sends only the positive weights to the softsort operator.
    positive_weights = tf.boolean_mask(weights, weights > 0.0)
    all_quantiles = softsort(
        x,
        direction='ASCENDING',
        axis=axis,
        target_weights=positive_weights,
        **kwargs)

    # Recovers the indices corresponding to the desired quantiles.
    odds = tf.math.floormod(tf.range(weights.shape[0], dtype=tf.float32), 2)
    positives = tf.cast(weights > 0.0, tf.float32)
    indices = tf.cast(tf.math.cumsum(positives) * odds, dtype=tf.int32)
    indices = tf.boolean_mask(indices, indices > 0) - 1
    result = tf.gather(all_quantiles, indices, axis=axis)

    # In the specific case where we want a single quantile, squeezes the
    # quantile dimension.
    can_squeeze = tf.equal(tf.shape(result)[axis], 1)
    if tf.math.logical_and(can_squeeze, may_squeeze):
      result = tf.squeeze(result, axis=axis)
  return result
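# A minimal usage sketch for `softquantiles`, assuming `softsort` from the same
# module is in scope; the shapes follow the docstring above:
x_example = tf.random.normal([3, 100])
# Soft median of each row; the singleton quantile axis is squeezed: shape [3].
medians = softquantiles(x_example, 0.5, axis=-1)
# Three quantiles per row: shape [3, 3].
quantile_triples = softquantiles(x_example, [0.1, 0.5, 0.9], axis=-1)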
def _build_tables(self, prior): """Computes integer-valued probability tables used by the range coder. These tables must not be re-generated independently on the sending and receiving side, since small numerical discrepancies between both sides can occur in this process. If the tables differ slightly, this in turn would very likely cause catastrophic error propagation during range decoding. For a more in-depth discussion of this, see: > "Integer Networks for Data Compression with Latent-Variable Models"<br /> > J. Ballé, N. Johnston, D. Minnen<br /> > https://openreview.net/forum?id=S1zz2i0cY7 The tables are stored in `tf.Variable`s as attributes of this object. The recommended way is to train the model, instantiate an entropy model with `compression=True`, and then distribute the model to a sender and a receiver. Arguments: prior: The `tfp.distributions.Distribution` object (see initializer). """ offset = helpers.quantization_offset(prior) lower_tail = helpers.lower_tail(prior, self.tail_mass) upper_tail = helpers.upper_tail(prior, self.tail_mass) # Largest distance observed between lower tail and median, and between # median and upper tail. minima = offset - lower_tail minima = tf.cast(tf.math.ceil(minima), tf.int32) minima = tf.math.maximum(minima, 0) maxima = upper_tail - offset maxima = tf.cast(tf.math.ceil(maxima), tf.int32) maxima = tf.math.maximum(maxima, 0) # PMF starting positions and lengths. pmf_start = offset - tf.cast(minima, self.dtype) pmf_length = maxima + minima + 1 # Sample the densities in the computed ranges, possibly computing more # samples than necessary at the upper end. max_length = tf.math.reduce_max(pmf_length) if tf.executing_eagerly() and max_length > 2048: logging.warning( "Very wide PMF with %d elements may lead to out of memory issues. " "Consider priors with smaller dispersion or increasing `tail_mass` " "parameter.", int(max_length)) samples = tf.range(tf.cast(max_length, self.dtype), dtype=self.dtype) samples = tf.reshape(samples, [-1] + len(self.prior_shape) * [1]) samples += pmf_start pmf = prior.prob(samples) # Collapse batch dimensions of distribution. pmf = tf.reshape(pmf, [max_length, -1]) pmf = tf.transpose(pmf) pmf_length = tf.broadcast_to(pmf_length, self.prior_shape_tensor) pmf_length = tf.reshape(pmf_length, [-1]) cdf_length = pmf_length + 2 cdf_offset = tf.broadcast_to(-minima, self.prior_shape_tensor) cdf_offset = tf.reshape(cdf_offset, [-1]) # Prevent tensors from bouncing back and forth between host and GPU. with tf.device("/cpu:0"): def loop_body(args): prob, length = args prob = prob[:length] overflow = tf.math.maximum(1 - tf.reduce_sum(prob, keepdims=True), 0.) prob = tf.concat([prob, overflow], axis=0) cdf = range_coding_ops.pmf_to_quantized_cdf( prob, precision=self.range_coder_precision) return tf.pad( cdf, [[0, max_length - length]], mode="CONSTANT", constant_values=0) # TODO(jonycgn,ssjhv): Consider switching to Python control flow. cdf = tf.map_fn( loop_body, (pmf, pmf_length), dtype=tf.int32, name="pmf_to_cdf") if self.no_variables: self._cdf = cdf self._cdf_offset = cdf_offset self._cdf_length = cdf_length else: self._cdf = tf.Variable(cdf, trainable=False, name="cdf") self._cdf_offset = tf.Variable( cdf_offset, trainable=False, name="cdf_offset") self._cdf_length = tf.Variable( cdf_length, trainable=False, name="cdf_length")
def top_k_from_dist(self, user, embeddings, k):
  c = tf.math.softplus(self.c)
  user_emb_distance = tf.reshape(
      hyp_utils.hyp_distance_all_pairs(
          tf.reshape(user, [1, -1]), embeddings, c), [-1])
  return tf.math.top_k(-user_emb_distance, k=k)[1]
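# Negating the distances is what turns `tf.math.top_k` (which returns the
# largest entries) into a nearest-neighbour lookup. A tiny standalone example:
example_distances = tf.constant([0.3, 0.1, 0.7, 0.2])
nearest_two = tf.math.top_k(-example_distances, k=2)[1]
# nearest_two -> [1, 3], the indices of the two smallest distances.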
def _bin_positions(self, x):
  x = tf.reshape(x, [-1, self._nbins])
  return tf.math.softmax(x, axis=-1) * (2 - self._nbins * 1e-2) + 1e-2
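# The affine rescaling of the softmax keeps every bin width at least 1e-2 while
# the widths still sum to 2: softmax sums to 1, so the total is
# (2 - nbins * 1e-2) + nbins * 1e-2 = 2. This presumably parameterizes a spline
# bijector over an interval of width 2. A quick standalone check, where
# `example_nbins` is an arbitrary stand-in for `self._nbins`:
example_nbins = 32
example_logits = tf.random.normal([5, example_nbins])
example_widths = tf.math.softmax(
    example_logits, axis=-1) * (2 - example_nbins * 1e-2) + 1e-2
tf.debugging.assert_near(
    tf.reduce_sum(example_widths, axis=-1), 2.0 * tf.ones(5))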
def convolution_batch(x, kernel, rank, strides, padding, data_format=None, dilations=None, name=None): """Like `tf.nn.conv2d` except applies batch of kernels to batch of `x`.""" if rank != 2: raise NotImplementedError( 'Argument `rank` currently only supports `2`; ' 'saw "{}".'.format(rank)) if data_format is not None and data_format.upper() != 'NHWBC': raise ValueError( 'Argument `data_format` currently only supports "NHWBC"; ' 'saw "{}".'.format(data_format)) with tf.name_scope(name or 'conv2d_nhwbc'): # Prepare arguments. [ rank, _, # strides padding, dilations, data_format, ] = prepare_conv_args(rank, strides, padding, dilations) strides = prepare_tuple_argument(strides, rank + 2, arg_name='strides') dtype = dtype_util.common_dtype([x, kernel], dtype_hint=tf.float32) x = tf.convert_to_tensor(x, dtype=dtype, name='x') kernel = tf.convert_to_tensor(kernel, dtype=dtype, name='kernel') # Step 1: Transpose and double flatten kernel. # kernel.shape = B + F + [c, c']. Eg: [b, fh, fw, c, c'] kernel_shape = prefer_static.shape(kernel) kernel_batch_shape, kernel_event_shape = prefer_static.split( kernel_shape, num_or_size_splits=[-1, rank + 2]) kernel_batch_size = prefer_static.reduce_prod(kernel_batch_shape) kernel_ndims = prefer_static.rank(kernel) kernel_batch_ndims = kernel_ndims - rank - 2 perm = prefer_static.concat([ prefer_static.range(kernel_batch_ndims, kernel_batch_ndims + rank), prefer_static.range(0, kernel_batch_ndims), prefer_static.range(kernel_batch_ndims + rank, kernel_ndims), ], axis=0) # Eg, [1, 2, 0, 3, 4] kernel = tf.transpose(kernel, perm=perm) # F + B + [c, c'] kernel = tf.reshape(kernel, shape=prefer_static.concat([ kernel_event_shape[:rank], [ kernel_batch_size * kernel_event_shape[-2], kernel_event_shape[-1] ], ], axis=0)) # F + [bc, c'] # Step 2: Double flatten x. # x.shape = N + D + B + [c] x_shape = prefer_static.shape(x) [ x_sample_shape, x_rank_shape, x_batch_shape, x_channel_shape, ] = prefer_static.split( x_shape, num_or_size_splits=[-1, rank, kernel_batch_ndims, 1]) x = tf.reshape( x, # N + D + B + [c] shape=prefer_static.concat([ [prefer_static.reduce_prod(x_sample_shape)], x_rank_shape, [ prefer_static.reduce_prod(x_batch_shape) * prefer_static.reduce_prod(x_channel_shape) ], ], axis=0)) # [n] + D + [bc] # Step 3: Apply convolution. y = tf.nn.depthwise_conv2d(x, kernel, strides=strides, padding=padding, data_format='NHWC', dilations=dilations) # SAME: y.shape = [n, h, w, bcc'] # VALID: y.shape = [n, h-fh+1, w-fw+1, bcc'] # Step 4: Reshape/reduce for output. y_shape = prefer_static.shape(y) y = tf.reshape(y, shape=prefer_static.concat( [ x_sample_shape, y_shape[1:-1], kernel_batch_shape, kernel_event_shape[-2:], ], axis=0)) # N + D' + B + [c, c'] y = tf.reduce_sum(y, axis=-2) # N + D' + B + [c'] return y
def main(argv): del argv # unused arg if not FLAGS.use_gpu: raise ValueError('Only GPU is currently supported.') if FLAGS.num_cores > 1: raise ValueError('Only a single accelerator is currently supported.') tf.enable_v2_behavior() tf.random.set_seed(FLAGS.seed) tf.io.gfile.makedirs(FLAGS.output_dir) batch_size = FLAGS.per_core_batch_size * FLAGS.num_cores steps_per_eval = IMAGENET_VALIDATION_IMAGES // batch_size dataset_test = utils.ImageNetInput(is_training=False, data_dir=FLAGS.data_dir, batch_size=FLAGS.per_core_batch_size, use_bfloat16=False).input_fn() test_datasets = {'clean': dataset_test} corruption_types, max_intensity = utils.load_corrupted_test_info() for name in corruption_types: for intensity in range(1, max_intensity + 1): dataset_name = '{0}_{1}'.format(name, intensity) test_datasets[dataset_name] = utils.load_corrupted_test_dataset( name=name, intensity=intensity, batch_size=FLAGS.per_core_batch_size, drop_remainder=True, use_bfloat16=False) model = deterministic_model.resnet50(input_shape=(224, 224, 3), num_classes=NUM_CLASSES) logging.info('Model input shape: %s', model.input_shape) logging.info('Model output shape: %s', model.output_shape) logging.info('Model number of weights: %s', model.count_params()) # Search for checkpoints from their index file; then remove the index suffix. ensemble_filenames = tf.io.gfile.glob( os.path.join(FLAGS.checkpoint_dir, '**/*.index')) ensemble_filenames = [filename[:-6] for filename in ensemble_filenames] ensemble_size = len(ensemble_filenames) logging.info('Ensemble size: %s', ensemble_size) logging.info('Ensemble number of weights: %s', ensemble_size * model.count_params()) logging.info('Ensemble filenames: %s', str(ensemble_filenames)) checkpoint = tf.train.Checkpoint(model=model) # Write model predictions to files. num_datasets = len(test_datasets) for m, ensemble_filename in enumerate(ensemble_filenames): checkpoint.restore(ensemble_filename) for n, (name, test_dataset) in enumerate(test_datasets.items()): filename = '{dataset}_{member}.npy'.format(dataset=name, member=m) filename = os.path.join(FLAGS.output_dir, filename) if not tf.io.gfile.exists(filename): logits = [] test_iterator = iter(test_dataset) for _ in range(steps_per_eval): features, _ = next(test_iterator) logits.append(model(features, training=False)) logits = tf.concat(logits, axis=0) with tf.io.gfile.GFile(filename, 'w') as f: np.save(f, logits.numpy()) percent = (m * num_datasets + (n + 1)) / (ensemble_size * num_datasets) message = ( '{:.1%} completion for prediction: ensemble member {:d}/{:d}. ' 'Dataset {:d}/{:d}'.format(percent, m + 1, ensemble_size, n + 1, num_datasets)) logging.info(message) metrics = { 'test/negative_log_likelihood': tf.keras.metrics.Mean(), 'test/gibbs_cross_entropy': tf.keras.metrics.Mean(), 'test/accuracy': tf.keras.metrics.SparseCategoricalAccuracy(), 'test/ece': ed.metrics.ExpectedCalibrationError(num_bins=FLAGS.num_bins), } corrupt_metrics = {} for name in test_datasets: corrupt_metrics['test/nll_{}'.format(name)] = tf.keras.metrics.Mean() corrupt_metrics['test/accuracy_{}'.format(name)] = ( tf.keras.metrics.SparseCategoricalAccuracy()) corrupt_metrics['test/ece_{}'.format( name)] = ed.metrics.ExpectedCalibrationError( num_bins=FLAGS.num_bins) # Evaluate model predictions. 
for n, (name, test_dataset) in enumerate(test_datasets.items()): logits_dataset = [] for m in range(ensemble_size): filename = '{dataset}_{member}.npy'.format(dataset=name, member=m) filename = os.path.join(FLAGS.output_dir, filename) with tf.io.gfile.GFile(filename, 'rb') as f: logits_dataset.append(np.load(f)) logits_dataset = tf.convert_to_tensor(logits_dataset) test_iterator = iter(test_dataset) for step in range(steps_per_eval): _, labels = next(test_iterator) logits = logits_dataset[:, (step * batch_size):((step + 1) * batch_size)] labels = tf.cast(tf.reshape(labels, [-1]), tf.int32) negative_log_likelihood = tf.reduce_mean( ensemble_negative_log_likelihood(labels, logits)) per_probs = tf.nn.softmax(logits) probs = tf.reduce_mean(per_probs, axis=0) if name == 'clean': gibbs_ce = tf.reduce_mean(gibbs_cross_entropy(labels, logits)) metrics['test/negative_log_likelihood'].update_state( negative_log_likelihood) metrics['test/gibbs_cross_entropy'].update_state(gibbs_ce) metrics['test/accuracy'].update_state(labels, probs) metrics['test/ece'].update_state(labels, probs) else: corrupt_metrics['test/nll_{}'.format(name)].update_state( negative_log_likelihood) corrupt_metrics['test/accuracy_{}'.format(name)].update_state( labels, probs) corrupt_metrics['test/ece_{}'.format(name)].update_state( labels, probs) message = ( '{:.1%} completion for evaluation: dataset {:d}/{:d}'.format( (n + 1) / num_datasets, n + 1, num_datasets)) logging.info(message) corrupt_results = utils.aggregate_corrupt_metrics( corrupt_metrics, corruption_types, max_intensity, FLAGS.alexnet_errors_path) total_results = {name: metric.result() for name, metric in metrics.items()} total_results.update(corrupt_results) logging.info('Metrics: %s', total_results)
def sample_discount_curve_paths(self,
                                times,
                                curve_times,
                                num_samples,
                                time_step,
                                num_time_steps=None,
                                random_type=None,
                                seed=None,
                                skip=0,
                                name=None):
  """Returns a sample of simulated discount curves for the Hull-white model.

  Args:
    times: A real positive `Tensor` of shape `[num_times,]`. The times `t` at
      which the discount curves are to be evaluated.
    curve_times: A real positive `Tensor` of shape `[num_curve_times]`. The
      maturities at which discount curve is computed at each simulation time.
    num_samples: Positive scalar `int`. The number of paths to draw.
    time_step: Scalar real `Tensor`. Maximal distance between time grid points
      in Euler scheme. Used only when Euler scheme is applied.
      Default value: `None`.
    num_time_steps: An optional Scalar integer `Tensor` - a total number of
      time steps performed by the algorithm. The maximal distance between
      points in grid is bounded by
      `times[-1] / (num_time_steps - times.shape[0])`.
      Either this or `time_step` should be supplied.
      Default value: `None`.
    random_type: Enum value of `RandomType`. The type of (quasi)-random number
      generator to use to generate the paths.
      Default value: None which maps to the standard pseudo-random numbers.
    seed: Seed for the random number generator. The seed is only relevant if
      `random_type` is one of `[STATELESS, PSEUDO, HALTON_RANDOMIZED,
      PSEUDO_ANTITHETIC, STATELESS_ANTITHETIC]`. For `PSEUDO`,
      `PSEUDO_ANTITHETIC` and `HALTON_RANDOMIZED` the seed should be a Python
      integer. For `STATELESS` and `STATELESS_ANTITHETIC` it must be supplied
      as an integer `Tensor` of shape `[2]`.
      Default value: `None` which means no seed is set.
    skip: `int32` 0-d `Tensor`. The number of initial points of the Sobol or
      Halton sequence to skip. Used only when `random_type` is 'SOBOL',
      'HALTON', or 'HALTON_RANDOMIZED', otherwise ignored.
      Default value: `0`.
    name: Str. The name to give this op.
      Default value: `sample_discount_curve_paths`.

  Returns:
    A tuple containing three `Tensor`s.

    * The first element is a `Tensor` of shape
      `batch_shape + [num_samples, num_curve_times, num_times]` containing the
      simulated zero coupon bond curves `P(t, T)`.
    * The second element is a `Tensor` of shape
      `batch_shape + [num_samples, num_times]` containing the simulated short
      rate paths.
    * The third element is a `Tensor` of shape
      `batch_shape + [num_samples, num_times]` containing the simulated
      discount factor paths.

  ### References:
    [1]: Leif B.G. Andersen and Vladimir V. Piterbarg. Interest Rate Modeling,
      Volume II: Term Structure Models. 2010.
  """
  name = name or self._name + '_sample_discount_curve_paths'
  with tf.name_scope(name):
    times = tf.convert_to_tensor(times, self._dtype)
    num_times = tf.shape(times)[0]
    curve_times = tf.convert_to_tensor(curve_times, self._dtype)
    rate_paths, discount_factor_paths, x_t, y_t = self._sample_paths(
        times, time_step, num_time_steps, num_samples, random_type, skip,
        seed)

    # Reshape x_t to (batch_size, num_samples, 1, num_times, nfactors)
    x_t = tf.expand_dims(x_t, axis=self._batch_rank + 1)
    # Reshape y_t to (batch_size, num_samples, 1, num_times, nfactors**2)
    y_t = tf.expand_dims(y_t, axis=self._batch_rank + 1)

    # Reshape `times` and `curve_times` so that they have the dimensions of
    # ([num_samples, num_curve_times, num_sim_times]).
    num_curve_nodes = tf.shape(curve_times)[0]
    num_sim_steps = tf.shape(times)[0]
    times = tf.reshape(times, (1, 1, num_sim_steps))
    curve_times = tf.reshape(curve_times, (1, num_curve_nodes, 1))

    # Reshape `mean_reversion` to the dimensions of
    # (batch_shape, [num_samples, num_curve_times, num_sim_times]).
    mean_reversion = tf.reshape(
        self._mean_reversion,
        self._batch_shape + [1, 1, 1, self._factors])

    return (self._bond_reconstitution(times, times + curve_times,
                                      mean_reversion, x_t, y_t, num_samples,
                                      num_times),
            rate_paths, discount_factor_paths)
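# Because `times` was reshaped to (1, 1, num_sim_steps) and `curve_times` to
# (1, num_curve_nodes, 1), the sum `times + curve_times` above broadcasts into
# a full grid of bond maturities, one per (curve time, simulation time) pair.
# A minimal illustration of that broadcast:
example_times = tf.reshape(tf.constant([0.5, 1.0]), (1, 1, 2))
example_curve_times = tf.reshape(tf.constant([0.25, 1.0, 2.0]), (1, 3, 1))
example_maturities = example_times + example_curve_times  # shape (1, 3, 2)
# example_maturities[0] -> [[0.75, 1.25], [1.5, 2.0], [2.5, 3.0]]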
def _sample_n(self, n, seed): components_seed, mix_seed = samplers.split_seed( seed, salt='MixtureSameFamily') try: seed_stream = SeedStream(seed, salt='MixtureSameFamily') except TypeError as e: # Can happen for Tensor seeds. seed_stream = None seed_stream_err = e try: x = self.components_distribution.sample( # [n, B, k, E] n, seed=components_seed) if seed_stream is not None: seed_stream() # Advance even if unused. except TypeError as e: if ('Expected int for argument' not in str(e) and TENSOR_SEED_MSG_PREFIX not in str(e)): raise if seed_stream is None: raise seed_stream_err msg = ( 'Falling back to stateful sampling for `components_distribution` ' '{} of type `{}`. Please update to use `tf.random.stateless_*` ' 'RNGs. This fallback may be removed after 20-Aug-2020. {}') warnings.warn( msg.format(self.components_distribution.name, type(self.components_distribution), str(e))) x = self.components_distribution.sample( # [n, B, k, E] n, seed=seed_stream()) event_shape = None event_ndims = tensorshape_util.rank(self.event_shape) if event_ndims is None: event_shape = self.components_distribution.event_shape_tensor() event_ndims = ps.rank_from_shape(event_shape) event_ndims_static = tf.get_static_value(event_ndims) num_components = None if event_ndims_static is not None: num_components = tf.compat.dimension_value( x.shape[-1 - event_ndims_static]) # We could also check if num_components can be computed statically from # self.mixture_distribution's logits or probs. if num_components is None: num_components = tf.shape(x)[-1 - event_ndims] # TODO(jvdillon): Consider using tf.gather (by way of index unrolling). npdt = dtype_util.as_numpy_dtype(x.dtype) try: mix_sample = self.mixture_distribution.sample( n, seed=mix_seed) # [n, B] or [n] except TypeError as e: if ('Expected int for argument' not in str(e) and TENSOR_SEED_MSG_PREFIX not in str(e)): raise if seed_stream is None: raise seed_stream_err msg = ( 'Falling back to stateful sampling for `mixture_distribution` ' '{} of type `{}`. Please update to use `tf.random.stateless_*` ' 'RNGs. This fallback may be removed after 20-Aug-2020. ({})') warnings.warn( msg.format(self.mixture_distribution.name, type(self.mixture_distribution), str(e))) mix_sample = self.mixture_distribution.sample( n, seed=seed_stream()) # [n, B] or [n] mask = tf.one_hot( indices=mix_sample, # [n, B] or [n] depth=num_components, on_value=npdt(1), off_value=npdt(0)) # [n, B, k] or [n, k] # Pad `mask` to [n, B, k, [1]*e] or [n, [1]*b, k, [1]*e] . batch_ndims = ps.rank(x) - event_ndims - 1 mask_batch_ndims = ps.rank(mask) - 1 pad_ndims = batch_ndims - mask_batch_ndims mask_shape = ps.shape(mask) target_shape = ps.concat([ mask_shape[:-1], ps.ones([pad_ndims], dtype=tf.int32), mask_shape[-1:], ps.ones([event_ndims], dtype=tf.int32), ], axis=0) mask = tf.reshape(mask, shape=target_shape) if dtype_util.is_floating(x.dtype) or dtype_util.is_complex(x.dtype): masked = tf.math.multiply_no_nan(x, mask) else: masked = x * mask ret = tf.reduce_sum(masked, axis=-1 - event_ndims) # [n, B, E] if self._reparameterize: if event_shape is None: event_shape = self.components_distribution.event_shape_tensor() ret = self._reparameterize_sample(ret, event_shape=event_shape) return ret
def _reparameterize_sample(self, x, event_shape):
  """Adds reparameterization (pathwise) gradients to samples of the mixture.

  Implicit reparameterization gradients are
     dx/dphi = -(d transform(x, phi) / dx)^-1 * d transform(x, phi) / dphi,
  where transform(x, phi) is distributional transform that removes all
  parameters from samples x.

  We implement them by replacing x with
    -stop_gradient(d transform(x, phi) / dx)^-1 * transform(x, phi)
  for the backward pass (gradient computation). The derivative of this
  quantity w.r.t. phi is then the implicit reparameterization gradient.
  Note that this replaces the gradients w.r.t. both the mixture distribution
  parameters and components distributions parameters.

  Limitations:
    1. Fundamental: components must be fully reparameterized.
    2. Distributional transform is currently only implemented for factorized
       components.
    3. Distributional transform currently only works for known rank of the
       batch tensor.

  Args:
    x: Sample of mixture distribution
    event_shape: The event shape of this distribution

  Returns:
    Tensor with same value as x, but with reparameterization gradients
  """
  # Remove the existing gradients of x wrt parameters of the components.
  x = tf.stop_gradient(x)

  event_size = ps.cast(ps.reduce_prod(event_shape), dtype=tf.int32)
  x_2d_shape = [-1, event_size]  # [S*prod(B), prod(E)]

  # Perform distributional transform of x in [S, B, E] shape,
  # but have Jacobian of size [S*prod(B), prod(E), prod(E)].
  def reshaped_distributional_transform(x_2d):
    return tf.reshape(
        self._distributional_transform(tf.reshape(x_2d, ps.shape(x)),
                                       event_shape),
        x_2d_shape)

  # transform_2d: [S*prod(B), prod(E)]
  # jacobian: [S*prod(B), prod(E), prod(E)]
  x_2d = tf.reshape(x, x_2d_shape)
  transform_2d, jacobian = value_and_batch_jacobian(
      reshaped_distributional_transform, x_2d)

  # We only provide the first derivative; the second derivative computed by
  # autodiff would be incorrect, so we raise an error if it is requested.
  transform_2d = _prevent_2nd_derivative(transform_2d)

  # Compute [- stop_gradient(jacobian)^-1 * transform] by solving a linear
  # system. The Jacobian is lower triangular because the distributional
  # transform for the i-th event dimension does not depend on the next
  # dimensions.
  surrogate_x_2d = -tf.linalg.triangular_solve(
      tf.stop_gradient(jacobian), transform_2d[..., tf.newaxis],
      lower=True)  # [S*prod(B), prod(E), 1]
  surrogate_x = tf.reshape(surrogate_x_2d, ps.shape(x))

  # Replace gradients of x with gradients of surrogate_x, but keep the value.
  return x + (surrogate_x - tf.stop_gradient(surrogate_x))
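# The final line relies on the identity `x + (s - stop_gradient(s))`: the value
# stays equal to x while gradients flow through the surrogate s. A standalone
# sketch of the trick with scalar tensors (not part of the original class):
theta = tf.Variable(1.0)
with tf.GradientTape() as tape:
  value = tf.stop_gradient(theta * 3.0)  # "sample" with gradients removed
  surrogate = theta * 5.0                # carries the desired gradient
  y = value + (surrogate - tf.stop_gradient(surrogate))
# y has the value 3.0, but tape.gradient(y, theta) is 5.0 (from the surrogate).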
def _make_pairs(x):
  return tf.reshape(
      tf.tile(x[:, tf.newaxis, :], [1, 2, 1]), [-1, x.shape[-1]])
def DenseAR(x, h=None, hidden_layers=(), activation=tf.nn.relu, log_scale_clip=None, log_scale_clip_pre=None, train=False, dropout_rate=0.0, sigmoid_scale=False, log_scale_factor=1.0, log_scale_reg=0.0, shift_only=False, **kwargs): input_depth = int(x.shape.with_rank_at_least(1)[-1]) if input_depth is None: raise NotImplementedError( "Rightmost dimension must be known prior to graph execution.") input_shape = (np.int32(x.shape.as_list()) if x.shape.is_fully_defined() else tf.shape(x)) for i, units in enumerate(hidden_layers): x = MaskedDense(inputs=x, units=units, num_blocks=input_depth, exclusive=True if i == 0 else False, activation=activation, **kwargs) if h is not None: x += tfkl.Dense(units, use_bias=False, **kwargs)(h) if dropout_rate > 0: x = tfkl.Dropout(dropout_rate)(x, training=train) if shift_only: shift = MaskedDense(inputs=x, units=input_depth, num_blocks=input_depth, activation=None, **kwargs) return shift, None else: if log_scale_factor == 1.0 and log_scale_reg == 0.0 and not log_scale_clip_pre: x = MaskedDense(inputs=x, units=2 * input_depth, num_blocks=input_depth, activation=None, **kwargs) if h is not None: x += tfkl.Dense(2 * input_depth, use_bias=False, **kwargs)(h) x = tf.reshape(x, shape=tf.concat([input_shape, [2]], axis=0)) shift, log_scale = tf.unstack(x, num=2, axis=-1) else: shift = MaskedDense(inputs=x, units=input_depth, num_blocks=input_depth, activation=None, **kwargs) if log_scale_reg > 0.0: regularizer = lambda w: log_scale_reg * 2.0 * tf.nn.l2_loss(w) else: regularizer = None log_scale = MaskedDense(inputs=x, units=input_depth, num_blocks=input_depth, activation=None, use_bias=False, kernel_regularizer=regularizer, **kwargs) log_scale *= log_scale_factor if log_scale_clip_pre: log_scale = log_scale_clip_pre * tf.nn.tanh( log_scale / log_scale_clip_pre) log_scale += tf.get_variable("log_scale_bias", [1, input_depth], initializer=tf.zeros_initializer()) if h is not None: shift += tfkl.Dense(input_depth, use_bias=False, **kwargs)(h) log_scale += tfkl.Dense(input_depth, use_bias=False, **kwargs)(h) if sigmoid_scale: log_scale = tf.log_sigmoid(log_scale) if log_scale_clip: log_scale = log_scale_clip * tf.nn.tanh(log_scale / log_scale_clip) return shift, log_scale
def _sum_pairs(x):
  if x.shape[0] % 2 != 0:
    x = tf.concat(
        [x, tf.zeros(tf.concat([[1], tf.shape(x)[1:]], 0))], 0)
  return tf.reduce_sum(
      tf.reshape(x, [tf.shape(x)[0] // 2, 2, -1]), 1)
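# A small worked example of the pairing with the two helpers above: an odd
# number of rows is padded with a zero row before adjacent rows are summed,
# and `_make_pairs` then repeats each pair sum twice.
example_rows = tf.constant([[1.0], [2.0], [3.0]])
paired_sums = _sum_pairs(example_rows)   # -> [[3.0], [3.0]]  (1+2, 3+0)
repeated = _make_pairs(paired_sums)      # -> [[3.0], [3.0], [3.0], [3.0]]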
def EffectiveSampleSize(states,
                        filter_beyond_lag=300,
                        filter_threshold=0.05,
                        use_geyer=False,
                        center=True,
                        normalize=True):
  """ESS computation for one single Tensor argument."""

  def _axis_size(x, axis=None):
    """Get number of elements of `x` in `axis`, as type `x.dtype`."""
    if axis is None:
      return tf.cast(tf.size(x), x.dtype)
    return tf.cast(tf.reduce_prod(tf.gather(tf.shape(x), axis)), x.dtype)

  with tf.name_scope("effective_sample_size_single_state"):
    states = tf.convert_to_tensor(states, name="states")
    dt = states.dtype

    # filter_beyond_lag == None ==> auto_corr is the full sequence.
    auto_corr = SanitizedAutoCorrelationMean(
        states,
        axis=0,
        reduce_axis=1,
        center=center,
        normalize=normalize,
        max_lags=filter_beyond_lag)
    orig_auto_corr = auto_corr
    if use_geyer:

      def _sum_pairs(x):
        if x.shape[0] % 2 != 0:
          x = tf.concat(
              [x, tf.zeros(tf.concat([[1], tf.shape(x)[1:]], 0))], 0)
        return tf.reduce_sum(
            tf.reshape(x, [tf.shape(x)[0] // 2, 2, -1]), 1)

      def _make_pairs(x):
        return tf.reshape(
            tf.tile(x[:, tf.newaxis, :], [1, 2, 1]), [-1, x.shape[-1]])

      auto_corr_pairs = _make_pairs(_sum_pairs(auto_corr))[:auto_corr.shape[0]]
      mask = auto_corr_pairs < 0.
      mask = tf.cast(mask, dt)
      mask = tf.cumsum(mask, axis=0)
      mask = tf.maximum(1. - mask, 0.)
      auto_corr *= mask
    elif filter_threshold is not None:
      filter_threshold = tf.convert_to_tensor(
          filter_threshold, dtype=dt, name="filter_threshold")
      # Get a binary mask to zero out values of auto_corr below the threshold.
      #   mask[i, ...] = 1 if auto_corr[j, ...] > threshold for all j <= i,
      #   mask[i, ...] = 0, otherwise.
      # So, along dimension zero, the mask will look like [1, 1, ..., 0, 0,...]
      # Building step by step,
      #   Assume auto_corr = [1, 0.5, 0.0, 0.3], and filter_threshold = 0.2.
      # Step 1:  mask = [False, False, True, False]
      mask = tf.abs(auto_corr) < filter_threshold
      # Step 2:  mask = [0, 0, 1, 0]
      mask = tf.cast(mask, dtype=dt)
      # Step 3:  mask = [0, 0, 1, 1]
      mask = tf.cumsum(mask, axis=0)
      # Step 4:  mask = [1, 1, 0, 0]
      mask = tf.maximum(1. - mask, 0.)
      auto_corr *= mask

    # With R[k] := auto_corr[k, ...],
    # ESS = N / {1 + 2 * Sum_{k=1}^N (N - k) / N * R[k]}
    #     = N / {-1 + 2 * Sum_{k=0}^N (N - k) / N * R[k]}  (since R[0] = 1)
    #     approx N / {-1 + 2 * Sum_{k=0}^M (N - k) / N * R[k]}
    # where M is the filter_beyond_lag truncation point chosen above.

    # Get the factor (N - k) / N, and give it shape [M, 1,...,1], having total
    # ndims the same as auto_corr.
    n = _axis_size(states, axis=0)
    k = tf.range(0., _axis_size(auto_corr, axis=0))
    nk_factor = (n - k) / n
    if auto_corr.shape.ndims is not None:
      new_shape = [-1] + [1] * (auto_corr.shape.ndims - 1)
    else:
      new_shape = tf.concat(
          ([-1], tf.ones([tf.rank(auto_corr) - 1], dtype=tf.int32)), axis=0)
    nk_factor = tf.reshape(nk_factor, new_shape)

    # return tf.reduce_mean(n / (
    #     -1 + 2 * tf.reduce_sum(nk_factor * auto_corr, axis=0)), 0)
    # return n / (1.0 + 2 *
    #             tf.reduce_sum(nk_factor[1:, ...] * auto_corr[1:, ...],
    #                           axis=0))
    # return tf.reduce_mean(n / (-auto_corr[0] + 2 *
    #     tf.reduce_sum(nk_factor * auto_corr, axis=0)), 0)
    # print(auto_corr[0])
    return n / (orig_auto_corr[0] + 2 * tf.reduce_sum(
        nk_factor[1:, Ellipsis] * auto_corr[1:, Ellipsis], axis=0))
def pairwise_square_distance_tensor(
    x1, x2, feature_ndims, x1_example_ndims=1, x2_example_ndims=1):
  """Returns pairwise distance between x1 and x2.

  This method is a generalization of `pairwise_square_distance_matrix`.
  Given `x1` and `x2`, Tensors with shape `[..., N1, ... Nm, D1, ... Dk]` and
  `[..., M1, ... Ml, D1, ... Dk]`, compute the pairwise distance tensor `A` of
  shape `[..., N1, ... Nm, M1, ... Ml]`, where `m` is `x1_example_ndims` and
  `l` is `x2_example_ndims`.

  Args:
    x1: Floating point `Tensor` with shape `B1 + E1 + [D1, ..., Dk]`, where
      `B1` is a (possibly empty) batch shape, and `E1` is a list of
      `x1_example_ndims` values.
    x2: Floating point `Tensor` with shape `B2 + E2 + [D1, ..., Dk]`, where
      `B2` is a (possibly empty) batch shape that broadcasts with `B1`, and
      `E2` is a list of `x2_example_ndims` values.
    feature_ndims: The number of dimensions to consider for the euclidean norm.
      This is `k` from above.
    x1_example_ndims: Integer for number of example dimensions in `x1`. This is
      `len(E1)`.
    x2_example_ndims: Integer for number of example dimensions in `x2`. This is
      `len(E2)`.

  Returns:
    `Tensor` of shape `bc(B1, B2) + E1 + E2` representing the pairwise square
    distance tensor.
  """
  # Collapse all the example dimensions and then expand after.
  x1_shape = tf.shape(x1)
  x1_example_shape = x1_shape[
      -(feature_ndims + x1_example_ndims):-feature_ndims]

  x2_shape = tf.shape(x2)
  x2_example_shape = x2_shape[
      -(feature_ndims + x2_example_ndims):-feature_ndims]

  x1 = tf.reshape(x1, tf.concat(
      [x1_shape[:-(feature_ndims + x1_example_ndims)],
       [-1],
       x1_shape[-feature_ndims:]], axis=0))
  x2 = tf.reshape(x2, tf.concat(
      [x2_shape[:-(feature_ndims + x2_example_ndims)],
       [-1],
       x2_shape[-feature_ndims:]], axis=0))
  pairwise = pairwise_square_distance_matrix(
      x1, x2, feature_ndims=feature_ndims)
  # Now we need to undo the transformation.
  return tf.reshape(pairwise, tf.concat(
      [tf.shape(pairwise)[:-2], x1_example_shape, x2_example_shape], axis=0))
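# A shape-level usage sketch, assuming `pairwise_square_distance_matrix` from
# the same module is available:
x1_example = tf.random.normal([2, 5, 3])  # B1 = [2], E1 = [5], feature dim 3
x2_example = tf.random.normal([2, 7, 3])  # B2 = [2], E2 = [7], feature dim 3
example_dists = pairwise_square_distance_tensor(
    x1_example, x2_example, feature_ndims=1)
# example_dists.shape == [2, 5, 7]: squared distance for every example pair.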
def soft_multivariate_quantiles(x, quantiles, quantile_width=None, **kwargs): """Computes soft multivariate quantiles via optimal transport. Transport multivariate input values in x onto 2^d + 1 weighted points, {0,1}^d + [0.5, ..., 0.5]. Target weights are adjusted so that those values in x that are transported to the middle value in the target vector correspond to those concentrating around the quantile of interest. Args: x: Tensor<float> of shape [batch, N, d] quantiles: Tensor<float> of shape [r, d], r targeted quantiles of dimension d quantile_width: (float) mass given to the bucket supposed to attract points whose value concentrate around the desired quantile value. Bigger width means that we allow the soft quantile to be a mixture of more points further away from the quantile. If None, the width is set at 1/n where n is the number of values considered (the size along the 'axis'). **kwargs: see sinkhorn.autodiff_sinkhorn for possible extra parameters. Returns: A Tensor<float> [N,r,d] of multivariate quantiles per batch. """ quantiles = tf.constant(quantiles, tf.float32) batch_size = x.shape[0] n = tf.cast(x.shape[1], tf.float32) d = x.shape[2] if quantile_width is None: quantile_width = 2 / n num_quantiles = tf.shape(quantiles)[0] hypercube_vertices = tf.constant( list(itertools.product([-1, 1], repeat=d)), tf.float32) # weights attached to vertices for each quantile. this is n_quantiles x 2^r weights = quantiles[:, tf.newaxis, :]**( 0.5 * (1 - hypercube_vertices))[tf.newaxis, Ellipsis] weights *= (1 - quantiles)[:, tf.newaxis, :]**( 0.5 * (1 + hypercube_vertices))[tf.newaxis, Ellipsis] weights = (1 - quantile_width) * tf.reduce_prod(weights, axis=2) # adding weights for quantile itself (in position 0). weights = tf.concat((quantile_width * tf.ones((num_quantiles, 1)), weights), axis=1) # augmenting and formating as batch_size * 2^r +1 * num_quantiles weights = tf.reshape( tf.tile(tf.transpose(weights), [batch_size, 1]), [batch_size, 2**d + 1, num_quantiles]) # set target locations, by adding the point at 0 that will absorb the quantile # augment it with batch_size y = tf.concat((tf.zeros((1, d), dtype=tf.float32), hypercube_vertices), axis=0) y = tf.reshape(tf.tile(y, [batch_size, 1]), [batch_size, 2**d + 1, d]) # center x x_mean = tf.reduce_mean(x, axis=1) x = x - x_mean[:, tf.newaxis, :] transports = sinkhorn.autodiff_sinkhorn( x, y, tf.ones([batch_size, n, num_quantiles], dtype=tf.float32) / n, weights, **kwargs) # recover convex combinations resulting from transporting to central point in # in all batches and quantile variations. transports = 1 / quantile_width * tf.reshape(transports[:, :, 0, :], [batch_size, n, -1]) # apply these convex combinations to data points + recenter. all_soft_quantiles = tf.reduce_sum( transports[:, :, :, tf.newaxis] * x[:, :, tf.newaxis, :], axis=1) + x_mean[:, tf.newaxis, :] # reshape those quantiles after having applied convex combinations. return tf.reshape(all_soft_quantiles, [batch_size, num_quantiles, d])
def reshaped_distributional_transform(x_2d):
  # Nested helper from `_reparameterize_sample` above: applies the
  # distributional transform in the original [S, B, E] shape, then flattens
  # the result back to the 2-D `x_2d_shape`.
  return tf.reshape(
      self._distributional_transform(tf.reshape(x_2d, ps.shape(x)),
                                     event_shape),
      x_2d_shape)
def step_fn(inputs): """Per-Replica StepFn.""" images, labels = inputs if FLAGS.ensemble_size > 1: images = tf.tile(images, [FLAGS.ensemble_size, 1, 1, 1]) labels = tf.tile(labels, [FLAGS.ensemble_size]) with tf.GradientTape() as tape: logits = model(images, training=True) if FLAGS.use_bfloat16: logits = tf.cast(logits, tf.float32) probs = tf.nn.softmax(logits) if FLAGS.ensemble_size > 1: per_probs = tf.reshape( probs, tf.concat([[FLAGS.ensemble_size, -1], probs.shape[1:]], 0)) diversity_results = ed.metrics.average_pairwise_diversity( per_probs, FLAGS.ensemble_size) negative_log_likelihood = tf.reduce_mean( tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)) filtered_variables = [] for var in model.trainable_variables: # Apply l2 on the BN parameters and bias terms. This # excludes only fast weight approximate posterior/prior parameters, # but pay caution to their naming scheme. if ('kernel' in var.name or 'batch_norm' in var.name or 'bias' in var.name): filtered_variables.append(tf.reshape(var, (-1,))) l2_loss = FLAGS.l2 * 2 * tf.nn.l2_loss( tf.concat(filtered_variables, axis=0)) kl = sum(model.losses) / APPROX_IMAGENET_TRAIN_IMAGES kl_scale = tf.cast(global_step + 1, tf.float32) kl_scale /= steps_per_epoch * FLAGS.kl_annealing_epochs kl_scale = tf.minimum(1., kl_scale) kl_loss = kl_scale * kl loss = negative_log_likelihood + l2_loss + kl_loss # Scale the loss given the TPUStrategy will reduce sum all gradients. scaled_loss = loss / strategy.num_replicas_in_sync grads = tape.gradient(scaled_loss, model.trainable_variables) # Separate learning rate implementation. if FLAGS.fast_weight_lr_multiplier != 1.0: grads_and_vars = [] for grad, var in zip(grads, model.trainable_variables): # Apply different learning rate on the fast weights. This excludes BN # and slow weights, but pay caution to the naming scheme. if ('batch_norm' not in var.name and 'kernel' not in var.name): grads_and_vars.append((grad * FLAGS.fast_weight_lr_multiplier, var)) else: grads_and_vars.append((grad, var)) optimizer.apply_gradients(grads_and_vars) else: optimizer.apply_gradients(zip(grads, model.trainable_variables)) metrics['train/ece'].update_state(labels, probs) metrics['train/loss'].update_state(loss) metrics['train/negative_log_likelihood'].update_state( negative_log_likelihood) metrics['train/kl'].update_state(kl) metrics['train/kl_scale'].update_state(kl_scale) metrics['train/accuracy'].update_state(labels, logits) if FLAGS.ensemble_size > 1: for k, v in diversity_results.items(): training_diversity['train/' + k].update_state(v) global_step.assign_add(1)
def _distributional_transform(self, x, event_shape):
  """Performs distributional transform of the mixture samples.

  Distributional transform removes the parameters from samples of a
  multivariate distribution by applying conditional CDFs:
    (F(x_1), F(x_2 | x_1), ..., F(x_d | x_1, ..., x_d-1))
  (the indexing is over the 'flattened' event dimensions).
  The result is a sample of product of Uniform[0, 1] distributions.

  We assume that the components are factorized, so the conditional CDFs become
    F(x_i | x_1, ..., x_i-1) = sum_k w_i^k F_k (x_i),
  where w_i^k is the posterior mixture weight: for i > 0
    w_i^k = w_k prob_k(x_1, ..., x_i-1) / sum_k' w_k' prob_k'(x_1, ..., x_i-1)
  and w_0^k = w_k is the mixture probability of the k-th component.

  Args:
    x: Sample of mixture distribution
    event_shape: The event shape of this distribution

  Returns:
    Result of the distributional transform
  """
  if tensorshape_util.rank(x.shape) is None:
    # tf.math.softmax raises an error when applied to inputs of undefined
    # rank.
    raise ValueError('Distributional transform does not support inputs of '
                     'undefined rank.')

  # Obtain factorized components distribution and assert that it's
  # a scalar distribution.
  if isinstance(self._components_distribution, independent.Independent):
    univariate_components = self._components_distribution.distribution
  else:
    univariate_components = self._components_distribution

  with tf.control_dependencies([
      assert_util.assert_equal(
          univariate_components.is_scalar_event(),
          True,
          message='`univariate_components` must have scalar event')
  ]):
    event_ndims = ps.rank_from_shape(event_shape)
    x_padded = self._pad_sample_dims(
        x, event_ndims=event_ndims)  # [S, B, 1, E]
    log_prob_x = univariate_components.log_prob(x_padded)  # [S, B, k, E]
    cdf_x = univariate_components.cdf(x_padded)  # [S, B, k, E]

    # log prob_k (x_1, ..., x_i-1)
    event_size = ps.cast(ps.reduce_prod(event_shape), dtype=tf.int32)
    cumsum_log_prob_x = tf.reshape(
        tf.math.cumsum(
            # [S*prod(B)*k, prod(E)]
            tf.reshape(log_prob_x, [-1, event_size]),
            exclusive=True,
            axis=-1),
        ps.shape(log_prob_x))  # [S, B, k, E]

    event_ndims = ps.rank_from_shape(event_shape)
    logits_mix_prob = self.mixture_distribution.logits_parameter()
    logits_mix_prob = tf.reshape(
        logits_mix_prob,  # [k] or [B, k]
        ps.concat([
            ps.shape(logits_mix_prob),
            ps.ones([event_ndims], dtype=tf.int32),
        ], axis=0))  # [k, [1]*e] or [B, k, [1]*e]

    # Logits of the posterior weights: log w_k + log prob_k (x_1, ..., x_i-1)
    log_posterior_weights_x = logits_mix_prob + cumsum_log_prob_x

    component_axis = tensorshape_util.rank(x.shape) - event_ndims
    posterior_weights_x = tf.math.softmax(
        log_posterior_weights_x, axis=component_axis)
    return tf.reduce_sum(posterior_weights_x * cdf_x, axis=component_axis)
def _head(self, neck_outputs): # <tf.float32>[time * batch_size, 1, hidden_dim] visual_feature = neck_outputs['visual_feature'] # <tf.float32>[time * batch_size, num_tokens, hidden_dim] text_feature = neck_outputs['text_feature'] # <tf.float32>[time, batch_size, 1, hidden_dim] visual_feature = tf.reshape( visual_feature, [self._current_num_timesteps, self._current_batch_size] + visual_feature.shape[1:].as_list()) # <tf.float32>[batch_size, time, hidden_dim] visual_feature = tf.squeeze(visual_feature, axis=2) visual_feature = tf.transpose(visual_feature, [1, 0, 2]) first_true = utils.get_first_true_column( tf.reshape(neck_outputs[constants.DISC_MASK], [self._current_num_timesteps, self._current_batch_size])) # <tf.float32>[batch_size, num_tokens, hidden_dim] text_feature = tf.cond( tf.keras.backend.any(first_true), lambda: tf.boolean_mask(text_feature, tf.reshape(first_true, [-1])), lambda: tf.reshape(text_feature, [ self._current_num_timesteps, self._current_batch_size ] + text_feature.shape[1:].as_list())[0, :, :, :]) # visual_feature = tf.nn.l2_normalize(visual_feature, axis=2) # text_feature = tf.nn.l2_normalize(text_feature, axis=2) # <tf.float32>[batch_size, time, num_tokens] alpha_i_j = tf.matmul(visual_feature, tf.transpose(text_feature, perm=[0, 2, 1])) # <tf.float32>[batch_size, time, num_tokens] ealpha_i_j = tf.exp(alpha_i_j) sum_i_j = tf.tile( tf.expand_dims(tf.reduce_sum(ealpha_i_j, 2), 2), [1, 1, tf.shape(ealpha_i_j)[2]]) mask = tf.cast( tf.transpose( tf.reshape(neck_outputs[constants.DISC_MASK], [self._current_num_timesteps, self._current_batch_size]), perm=[1, 0]), tf.float32) # <tf.float32>[batch, time, num_tokens] c_i_j = tf.divide(ealpha_i_j, sum_i_j) # <tf.float32>[batch, time] score = tf.reduce_sum(c_i_j * alpha_i_j, 2) escore = tf.exp(-1 * score) * mask sum_escore = tf.tile( tf.expand_dims(tf.reduce_sum(escore, 1), 1), [1, tf.shape(escore)[1]]) score_weight = tf.divide(escore, sum_escore) similarities = tf.reduce_sum(mask * score * score_weight, 1) similarities = tf.expand_dims(similarities, axis=0) # [time_step, batch_size] similarities = tf.tile(similarities, [self._current_num_timesteps, 1]) # Apply an affine transform. similarities = similarities * self.affine_a + self.affine_b output_a = tf.reshape(tf.convert_to_tensor(self.affine_a), [1, 1]) output_b = tf.reshape(tf.convert_to_tensor(self.affine_b), [1, 1]) output_a = tf.tile(output_a, [self._current_num_timesteps, self._current_batch_size]) output_b = tf.tile(output_b, [self._current_num_timesteps, self._current_batch_size]) return common.AgentOutput( policy_logits=similarities, baseline=(output_a, output_b))
def __call__(self, roi_features, class_indices, is_training=None): """Mask branch for the Mask-RCNN model. Args: roi_features: A ROI feature tensor of shape [batch_size, num_rois, height_l, width_l, num_filters]. class_indices: a Tensor of shape [batch_size, num_rois], indicating which class the ROI is. is_training: `boolean`, if True if model is in training mode. Returns: mask_outputs: a tensor with a shape of [batch_size, num_masks, mask_height, mask_width, num_classes], representing the mask predictions. fg_gather_indices: a tensor with a shape of [batch_size, num_masks, 2], representing the fg mask targets. Raises: ValueError: If boxes is not a rank-3 tensor or the last dimension of boxes is not 4. """ def _get_stddev_equivalent_to_msra_fill(kernel_size, fan_out): """Returns the stddev of random normal initialization as MSRAFill.""" # Reference: https://github.com/pytorch/pytorch/blob/master/caffe2/operators/filler_op.h#L445-L463 # pylint: disable=line-too-long # For example, kernel size is (3, 3) and fan out is 256, stddev is 0.029. # stddev = (2/(3*3*256))^0.5 = 0.029 return (2 / (kernel_size[0] * kernel_size[1] * fan_out))**0.5 with backend.get_graph().as_default(): with tf.name_scope('mask_head'): _, num_rois, height, width, filters = roi_features.get_shape( ).as_list() net = tf.reshape(roi_features, [-1, height, width, filters]) for i in range(4): kernel_size = (3, 3) fan_out = 256 init_stddev = _get_stddev_equivalent_to_msra_fill( kernel_size, fan_out) net = tf.keras.layers.Conv2D( fan_out, kernel_size=kernel_size, strides=(1, 1), padding='same', dilation_rate=(1, 1), activation=None, kernel_initializer=tf.keras.initializers.RandomNormal( stddev=init_stddev), bias_initializer=tf.zeros_initializer(), name='mask-conv-l%d' % i)(net) net = self._batch_norm_relu()(net, is_training=is_training) kernel_size = (2, 2) fan_out = 256 init_stddev = _get_stddev_equivalent_to_msra_fill( kernel_size, fan_out) net = tf.keras.layers.Conv2DTranspose( fan_out, kernel_size=kernel_size, strides=(2, 2), padding='valid', activation=None, kernel_initializer=tf.keras.initializers.RandomNormal( stddev=init_stddev), bias_initializer=tf.zeros_initializer(), name='conv5-mask')(net) net = self._batch_norm_relu()(net, is_training=is_training) kernel_size = (1, 1) fan_out = self._num_classes init_stddev = _get_stddev_equivalent_to_msra_fill( kernel_size, fan_out) mask_outputs = tf.keras.layers.Conv2D( fan_out, kernel_size=kernel_size, strides=(1, 1), padding='valid', kernel_initializer=tf.keras.initializers.RandomNormal( stddev=init_stddev), bias_initializer=tf.zeros_initializer(), name='mask_fcn_logits')(net) mask_outputs = tf.reshape(mask_outputs, [ -1, num_rois, self._mask_target_size, self._mask_target_size, self._num_classes ]) with tf.name_scope('masks_post_processing'): # TODO(pengchong): Figure out the way not to use the static inferred # batch size. batch_size, num_masks = class_indices.get_shape().as_list() mask_outputs = tf.transpose(a=mask_outputs, perm=[0, 1, 4, 2, 3]) # Contructs indices for gather. batch_indices = tf.tile( tf.expand_dims(tf.range(batch_size), axis=1), [1, num_masks]) mask_indices = tf.tile( tf.expand_dims(tf.range(num_masks), axis=0), [batch_size, 1]) gather_indices = tf.stack( [batch_indices, mask_indices, class_indices], axis=2) mask_outputs = tf.gather_nd(mask_outputs, gather_indices) return mask_outputs
def _milstein_step(*, dim, i, written_count, current_state, result, drift_fn, volatility_fn, grad_volatility_fn, wiener_mean, num_samples, times, dt, sqrt_dt, keep_mask, random_type, seed, normal_draws, input_gradients, stratonovich_order, aux_normal_draws): """Performs one step of Milstein scheme.""" current_time = times[i + 1] written_count = tf.cast(written_count, tf.int32) if normal_draws is not None: dw = normal_draws[i] else: dw = random.mv_normal_sample((num_samples, ), mean=wiener_mean, random_type=random_type, seed=seed) if aux_normal_draws is not None: stratonovich_draws = [] for j in range(3): stratonovich_draws.append( tf.reshape(aux_normal_draws[j][i], [num_samples, dim, stratonovich_order])) else: stratonovich_draws = [] # Three sets of normal draws for stratonovich integrals. for j in range(3): stratonovich_draws.append( random.mv_normal_sample( (num_samples, ), mean=tf.zeros((dim, stratonovich_order), dtype=current_state.dtype, name='stratonovich_draws_{}'.format(j)), random_type=random_type, seed=seed)) if dim == 1: drift = drift_fn(current_time, current_state) vol = volatility_fn(current_time, current_state) grad_vol = grad_volatility_fn(current_time, current_state, tf.ones_like(current_state)) next_state = _milstein_1d(dw=dw, dt=dt[i], sqrt_dt=sqrt_dt[i], current_state=current_state, drift=drift, vol=vol, grad_vol=grad_vol) else: drift = drift_fn(current_time, current_state) vol = volatility_fn(current_time, current_state) # This is a list of size equal to the dimension of the state space `dim`. # It contains tensors of shape [num_samples, dim, wiener_dim] representing # the gradient of the volatility function. In our case, the dimension of the # wiener process `wiener_dim` is equal to the state dimension `dim`. grad_vol = [ grad_volatility_fn(current_time, current_state, start) for start in input_gradients ] next_state = _milstein_nd(dim=dim, num_samples=num_samples, dw=dw, dt=dt[i], sqrt_dt=sqrt_dt[i], current_state=current_state, drift=drift, vol=vol, grad_vol=grad_vol, stratonovich_draws=stratonovich_draws, stratonovich_order=stratonovich_order) result = utils.maybe_update_along_axis(tensor=result, do_update=keep_mask[i + 1], ind=written_count, axis=1, new_tensor=tf.expand_dims( next_state, axis=1)) written_count += tf.cast(keep_mask[i + 1], dtype=tf.int32) return i + 1, written_count, next_state, result
def _transpose_around_bijector_fn(self, bijector_fn, arg, src_event_ndims, dest_event_ndims=None, fn_reduces_event=False, **kwargs): # This function moves the axes corresponding to `self.sample_shape` to the # left of the batch shape, then applies `bijector_fn`, then moves the axes # corresponding to `self.sample_shape` back to the event part of the shape. # # `src_event_ndims` and `dest_event_ndims` indicate the expected event rank # (omitting `self.sample_shape`) before and after applying `bijector_fn`. # # This function arose because forward and inverse ended up being quite # similar. It was then only a small generalization to also support {F/I}LDJ. batch_ndims = ps.rank_from_shape(self.distribution.batch_shape_tensor, self.distribution.batch_shape) extra_sample_ndims = ps.rank_from_shape(self.sample_shape) arg_ndims = ps.rank(arg) # (1) Expand arg's dims. d = arg_ndims - batch_ndims - extra_sample_ndims - src_event_ndims arg = tf.reshape(arg, shape=ps.pad(ps.shape(arg), paddings=[[ps.maximum(0, -d), 0]], constant_values=1)) arg_ndims = ps.rank(arg) sample_ndims = ps.maximum(0, d) # (2) Transpose arg's dims. sample_dims = ps.range(0, sample_ndims) batch_dims = ps.range(sample_ndims, sample_ndims + batch_ndims) extra_sample_dims = ps.range( sample_ndims + batch_ndims, sample_ndims + batch_ndims + extra_sample_ndims) event_dims = ps.range(sample_ndims + batch_ndims + extra_sample_ndims, arg_ndims) perm = ps.concat( [sample_dims, extra_sample_dims, batch_dims, event_dims], axis=0) arg = tf.transpose(arg, perm=perm) # (3) Apply underlying bijector. result = bijector_fn(arg, **kwargs) # (4) Transpose sample_shape from the sample to the event shape. result_ndims = ps.rank(result) if fn_reduces_event: dest_event_ndims = 0 d = result_ndims - batch_ndims - extra_sample_ndims - dest_event_ndims if fn_reduces_event: # In some cases, fn may reduce event too far, i.e. ildj may return a # scalar `0.`, which won't work with the transpose we do below. result = tf.reshape(result, shape=ps.pad(ps.shape(result), paddings=[[ps.maximum(0, -d), 0]], constant_values=1)) result_ndims = ps.rank(result) sample_ndims = ps.maximum(0, d) sample_dims = ps.range(0, sample_ndims) extra_sample_dims = ps.range(sample_ndims, sample_ndims + extra_sample_ndims) batch_dims = ps.range(sample_ndims + extra_sample_ndims, sample_ndims + extra_sample_ndims + batch_ndims) event_dims = ps.range(sample_ndims + extra_sample_ndims + batch_ndims, result_ndims) perm = ps.concat( [sample_dims, batch_dims, extra_sample_dims, event_dims], axis=0) return tf.transpose(result, perm=perm)
def _slopes(self, x):
  x = tf.reshape(x, [-1, self._nbins - 1])
  return tf.math.softplus(x) + 1e-2
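# As with `_bin_positions`, the constant offset enforces a floor: softplus is
# strictly positive, so every interior knot slope is at least 1e-2, which keeps
# the spline strictly monotone. A quick standalone check with arbitrary sizes:
example_knot_slopes = tf.math.softplus(tf.random.normal([5, 31])) + 1e-2
tf.debugging.assert_greater_equal(
    example_knot_slopes, 1e-2 * tf.ones_like(example_knot_slopes))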
def main(unused_args): del unused_args # # General setup. # ebm_util.init_tf2() ebm_util.set_seed(FLAGS.seed) output_dir = FLAGS.logdir checkpoint_dir = os.path.join(output_dir, 'checkpoint') samples_dir = os.path.join(output_dir, 'samples') tf.io.gfile.makedirs(samples_dir) tf.io.gfile.makedirs(checkpoint_dir) log_f = tf.io.gfile.GFile(os.path.join(output_dir, 'log.out'), mode='w') logger = ebm_util.setup_logging('main', log_f, console=False) logger.info({k: v._value for (k, v) in FLAGS._flags().items()}) # pylint: disable=protected-access # # Data # if FLAGS.dataset == 'mnist': x_train = ebm_util.mnist_dataset(N_CH) elif FLAGS.dataset == 'celeba': x_train = ebm_util.celeba_dataset() else: raise ValueError(f'Unknown dataset. {FLAGS.dataset}') train_ds = tf.data.Dataset.from_tensor_slices(x_train).shuffle( 10000).batch(FLAGS.batch_size) # # Models # if FLAGS.q_type == 'mean_field_gaussian': q = MeanFieldGaussianQ() u = make_u() # # Optimizers # def lr_p(step): lr = FLAGS.p_learning_rate * (1. - (step / (1.5 * FLAGS.train_steps))) return lr def lr_q(step): lr = FLAGS.q_learning_rate * (1. - (step / (1.5 * FLAGS.train_steps))) return lr opt_q = tf.optimizers.Adam(learning_rate=ebm_util.LambdaLr(lr_q)) opt_p = tf.optimizers.Adam(learning_rate=ebm_util.LambdaLr(lr_p), beta_1=FLAGS.p_adam_beta_1) # # Checkpointing # global_step_var = tf.Variable(0, trainable=False) checkpoint = tf.train.Checkpoint(opt_p=opt_p, opt_q=opt_q, u=u, q=q, global_step_var=global_step_var) checkpoint_path = os.path.join(checkpoint_dir, 'checkpoint') if tf.io.gfile.exists(checkpoint_path + '.index'): print(f'Restoring from {checkpoint_path}') checkpoint.restore(checkpoint_path) # # Stats initialization # stat_i = [] stat_keys = [ 'E_pos', # Mean energy of the positive samples. 'E_neg_q', # Mean energy of the negative samples (pre-HMC). 'E_neg_p', # Mean energy of the negative samples (post-HMC). 'H', # Entropy of Q (if known). 'pd_pos', # Pairse differences of the positive samples. 'pd_neg_q', # Pairwise differences of the negative samples (pre-HMC). 'pd_neg_p', # Pairwise differences of the negative samples (post-HMC). 'hmc_disp', # L2 distance between initial and final entropyMC samples. 'hmc_p_accept', # entropyMC P(accept). 'hmc_step_size', # entropyMC step size. 'x_neg_p_min', # Minimum value of the negative samples (post-HMC). 'x_neg_p_max', # Maximum value of the negative samples (post-HMC). 'time', # Time taken to do the training step. ] stat = {k: [] for k in stat_keys} def array_to_str(a, fmt='{:>8.4f}'): return ' '.join([fmt.format(v) for v in a]) def stats_callback(step, entropy, pd_neg_q): del step, entropy, pd_neg_q step_size = FLAGS.mcmc_step_size train_ds_iter = iter(train_ds) x_pos_1 = ebm_util.data_preprocess(next(train_ds_iter)) x_pos_2 = ebm_util.data_preprocess(next(train_ds_iter)) global_step = global_step_var.numpy() while global_step < (FLAGS.train_steps + 1): for x_pos in train_ds: # Drop partial batches. 
if x_pos.shape[0] != FLAGS.batch_size: continue # # Update # start_time = time.time() x_pos = ebm_util.data_preprocess(x_pos) x_pos = ebm_util.data_discrete_noise(x_pos) if FLAGS.p_loss == 'neutra_hmc': (x_neg_q, x_neg_p, p_accept, step_size, pos_e, pos_e_updated, neg_e_q, neg_e_p, neg_e_p_updated) = train_p(q, u, x_pos, step_size, opt_p) elif FLAGS.p_loss == 'neutra_iid': (x_neg_q, x_neg_p, p_accept, step_size, pos_e, pos_e_updated, neg_e_q, neg_e_p, neg_e_p_updated) = train_p_mh(q, u, x_pos, step_size, opt_p) else: raise ValueError(f'Unknown P loss {FLAGS.p_loss}') if FLAGS.q_loss == 'forward_kl': train_q_fwd_kl(q, x_neg_p, opt_q) entropy = 0.0 mle_loss = 0.0 elif FLAGS.q_loss == 'reverse_kl': for _ in range(10): _, entropy = train_q_rev_kl(q, u, opt_q) mle_loss = 0.0 elif FLAGS.q_loss == 'reverse_kl_mle': for _ in range(FLAGS.q_sub_steps): alpha = FLAGS.q_rkl_weight (_, entropy, _, mle_loss, norm_grads_ebm, norm_grads_mle) = train_q_rev_kl_mle( q, u, x_pos, tf.convert_to_tensor(alpha), opt_q) elif FLAGS.q_loss == 'mle': mle_loss = train_q_mle(q, x_pos, opt_q) entropy = 0.0 else: raise ValueError(f'Unknown Q loss {FLAGS.q_loss}') end_time = time.time() # # Stats # hmc_disp = tf.reduce_mean( tf.norm(tf.reshape(x_neg_q, [64, -1]) - tf.reshape(x_neg_p, [64, -1]), axis=1)) if global_step % FLAGS.plot_steps == 0: # Positives + negatives. ebm_util.plot( tf.reshape(ebm_util.data_postprocess(x_neg_q), [FLAGS.batch_size, N_WH, N_WH, N_CH]), os.path.join(samples_dir, f'x_neg_q_{global_step}.png')) ebm_util.plot( tf.reshape(ebm_util.data_postprocess(x_neg_p), [FLAGS.batch_size, N_WH, N_WH, N_CH]), os.path.join(samples_dir, f'x_neg_p_{global_step}.png')) ebm_util.plot( tf.reshape(ebm_util.data_postprocess(x_pos), [FLAGS.batch_size, N_WH, N_WH, N_CH]), os.path.join(samples_dir, f'x_pos_{global_step}.png')) # Samples for various temperatures. for t in [0.1, 0.5, 1.0, 2.0, 4.0]: _, x_neg_q_t, _ = q.sample_with_log_prob(FLAGS.batch_size, temp=t) ebm_util.plot( tf.reshape(ebm_util.data_postprocess(x_neg_q_t), [FLAGS.batch_size, N_WH, N_WH, N_CH]), os.path.join(samples_dir, f'x_neg_t_{t}_{global_step}.png')) stats_callback(global_step, entropy, ebm_util.nearby_difference(x_neg_q)) stat_i.append(global_step) stat['E_pos'].append(pos_e_updated) stat['E_neg_q'].append(neg_e_q) stat['E_neg_p'].append(neg_e_p) stat['H'].append(entropy) stat['pd_neg_q'].append(ebm_util.nearby_difference(x_neg_q)) stat['pd_neg_p'].append(ebm_util.nearby_difference(x_neg_p)) stat['pd_pos'].append(ebm_util.nearby_difference(x_pos)) stat['hmc_disp'].append(hmc_disp) stat['hmc_p_accept'].append(p_accept) stat['hmc_step_size'].append(step_size) stat['x_neg_p_min'].append(tf.reduce_min(x_neg_p)) stat['x_neg_p_max'].append(tf.reduce_max(x_neg_p)) stat['time'].append(end_time - start_time) ebm_util.plot_stat(stat_keys, stat, stat_i, output_dir) # Doing a linear interpolation in the latent space. z_pos_1 = q.forward(x_pos_1)[0] z_pos_2 = q.forward(x_pos_2)[0] x_alphas = [] n_steps = 10 for j in range(0, n_steps + 1): alpha = (j / n_steps) z_alpha = (1. - alpha) * z_pos_1 + (alpha) * z_pos_2 x_alpha = q.reverse(z_alpha)[0] x_alphas.append(x_alpha) ebm_util.plot_n_by_m( ebm_util.data_postprocess( tf.reshape(tf.stack(x_alphas, axis=1), [ (n_steps + 1) * FLAGS.batch_size, N_WH, N_WH, N_CH ])), os.path.join(samples_dir, f'x_alpha_{global_step}.png'), FLAGS.batch_size, n_steps + 1) # Doing random perturbations in the latent space. 
for eps in [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 2e0, 2.5e0, 3e0]: z_pos_2_eps = z_pos_2 + eps * tf.random.normal( z_pos_2.shape) x_alpha = q.reverse(z_pos_2_eps)[0] ebm_util.plot( tf.reshape(ebm_util.data_postprocess(x_alpha), [FLAGS.batch_size, N_WH, N_WH, N_CH]), os.path.join(samples_dir, f'x_alpha_eps_{eps}_{global_step}.png')) # Checking the log-probabilities of positive and negative examples under # Q. z_neg_test, x_neg_test, _ = q.sample_with_log_prob( FLAGS.batch_size, temp=FLAGS.q_temperature) z_pos_test = q.forward(x_pos)[0] z_neg_test_pd = ebm_util.nearby_difference(z_neg_test) z_pos_test_pd = ebm_util.nearby_difference(z_pos_test) z_norms_neg = tf.reduce_mean(tf.norm(z_neg_test, axis=1)) z_norms_pos = tf.reduce_mean(tf.norm(z_pos_test, axis=1)) log_prob_neg = tf.reduce_mean(q.log_prob(x_neg_test)) log_prob_pos = tf.reduce_mean(q.log_prob(x_pos)) logger.info(' '.join([ f'i={global_step:6d}', # Pre-update, post-update (f'E_pos=[{pos_e:10.4f} {pos_e_updated:10.4f} ' + f'{pos_e_updated - pos_e:10.4f}]'), # Pre-update pre-HMC, pre-update post-HMC, post-update post-HMC (f'E_neg=[{neg_e_q:10.4f} {neg_e_p:10.4f} ' + f'{neg_e_p_updated:10.4f} {neg_e_p_updated - neg_e_p:10.4f}]' ), f'mle={tf.reduce_mean(mle_loss):8.4f}', f'H={entropy:8.4f}', f'norm_grads_ebm={norm_grads_ebm:8.4f}', f'norm_grads_mle={norm_grads_mle:8.4f}', f'pd(x_pos)={ebm_util.nearby_difference(x_pos):8.4f}', f'pd(x_neg_q)={ebm_util.nearby_difference(x_neg_q):8.4f}', f'pd(x_neg_p)={ebm_util.nearby_difference(x_neg_p):8.4f}', f'hmc_disp={hmc_disp:8.4f}', f'p(accept)={p_accept:8.4f}', f'step_size={step_size:8.4f}', # Min, max. (f'x_neg_q=[{tf.reduce_min(x_neg_q):8.4f} ' + f'{tf.reduce_max(x_neg_q):8.4f}]'), (f'x_neg_p=[{tf.reduce_min(x_neg_p):8.4f} ' + f'{tf.reduce_max(x_neg_p):8.4f}]'), f'z_neg_norm={array_to_str(z_norms_neg)}', f'z_pos_norm={array_to_str(z_norms_pos)}', f'z_neg_test_pd={z_neg_test_pd:>8.2f}', f'z_pos_test_pd={z_pos_test_pd:>8.2f}', f'log_prob_neg={log_prob_neg:12.2f}', f'log_prob_pos={log_prob_pos:12.2f}', ])) if global_step % FLAGS.save_steps == 0: global_step_var.assign(global_step) checkpoint.write(os.path.join(checkpoint_dir, 'checkpoint')) global_step += 1
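# A self-contained sketch of the linear learning-rate decay used by lr_p and
# lr_q in main() above. The base rate and step count below are hypothetical
# stand-ins for the corresponding FLAGS values.
def linear_decay(base_lr, step, train_steps):
  # Decays linearly and would only reach zero at 1.5 * train_steps.
  return base_lr * (1.0 - step / (1.5 * train_steps))

for step in [0, 50_000, 100_000]:
  # With base_lr=1e-4 and train_steps=100_000 the final-step rate is ~3.3e-5.
  print(step, linear_decay(1e-4, step, 100_000))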
def __call__(self, net, is_training=False): """Builds DropBlock layer. Args: net: `Tensor` input tensor. is_training: `bool` if True, the model is in training mode. Returns: A version of input tensor with DropBlock applied. """ if not is_training or self._dropblock_keep_prob is None: return net logging.info( 'Applying DropBlock: dropblock_size {}, net.shape {}'.format( self._dropblock_size, net.shape)) if self._data_format == 'channels_last': _, height, width, _ = net.get_shape().as_list() else: _, _, height, width = net.get_shape().as_list() total_size = width * height dropblock_size = min(self._dropblock_size, min(width, height)) # Seed_drop_rate is the gamma parameter of DropBlock. seed_drop_rate = (1.0 - self._dropblock_keep_prob ) * total_size / dropblock_size**2 / ( (width - self._dropblock_size + 1) * (height - self._dropblock_size + 1)) # Forces the block to be inside the feature map. w_i, h_i = tf.meshgrid(tf.range(width), tf.range(height)) valid_block = tf.logical_and( tf.logical_and(w_i >= int(dropblock_size // 2), w_i < width - (dropblock_size - 1) // 2), tf.logical_and(h_i >= int(dropblock_size // 2), h_i < height - (dropblock_size - 1) // 2)) if self._data_format == 'channels_last': valid_block = tf.reshape(valid_block, [1, height, width, 1]) else: valid_block = tf.reshape(valid_block, [1, 1, height, width]) randnoise = tf.random.uniform(net.shape, dtype=tf.float32) valid_block = tf.cast(valid_block, dtype=tf.float32) seed_keep_rate = tf.cast(1 - seed_drop_rate, dtype=tf.float32) block_pattern = (1 - valid_block + seed_keep_rate + randnoise) >= 1 block_pattern = tf.cast(block_pattern, dtype=tf.float32) if self._data_format == 'channels_last': ksize = [1, self._dropblock_size, self._dropblock_size, 1] else: ksize = [1, 1, self._dropblock_size, self._dropblock_size] block_pattern = -tf.nn.max_pool2d( -block_pattern, ksize=ksize, strides=[1, 1, 1, 1], padding='SAME', data_format='NHWC' if self._data_format == 'channels_last' else 'NCHW') percent_ones = tf.cast(tf.reduce_sum(input_tensor=block_pattern), tf.float32) / tf.cast( tf.size(input=block_pattern), tf.float32) net = net / tf.cast(percent_ones, net.dtype) * tf.cast( block_pattern, net.dtype) return net
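# A worked example of the seed_drop_rate (the DropBlock gamma) formula used
# above, with hypothetical numbers: a 32x32 feature map, dropblock_size=7 and
# dropblock_keep_prob=0.9.
keep_prob = 0.9
height = width = 32
dropblock_size = 7

total_size = width * height
gamma = ((1.0 - keep_prob) * total_size / dropblock_size**2 /
         ((width - dropblock_size + 1) * (height - dropblock_size + 1)))
# gamma is roughly 0.003: each valid position is seeded as a block centre with
# this probability so that about (1 - keep_prob) of activations get dropped.
print(gamma)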
def sample_and_preprocess(video, labels, seq_label, seq_len, name, num_steps, augment, sample_all=False, sample_all_stride=1, add_shape=False): """Samples frames and prepares them for training.""" if sample_all: # When dealing with very long videos we can choose to sub-sample to fit # data in memory. But be aware this also evaluates over a subset of frames. # Subsampling the validation set videos when reporting performance is not # recommended. steps = tf.range(0, seq_len, sample_all_stride) seq_len = tf.shape(steps)[0] chosen_steps = steps else: stride = CONFIG.DATA.STRIDE sampling_strategy = CONFIG.DATA.SAMPLING_STRATEGY # TODO(debidatta) : More flexible sampling if sampling_strategy == 'stride': # Offset can be set between 0 and maximum location from which we can get # total coverage of the video without having to pad. # This handles sampling over longer sequences. offset = tf.random.uniform( (), 0, tf.maximum(tf.cast(1, tf.int64), seq_len - stride * num_steps), dtype=tf.int64) # This handles sampling over shorter sequences by padding the last frame # many times. This is not ideal for the way alignment training batches are # created. steps = tf.minimum( seq_len - 1, tf.range(offset, offset + num_steps * stride + 1, stride)) steps = steps[:num_steps] elif sampling_strategy == 'offset_uniform': # Sample a random offset less than a provided max offset. Among all frames # higher than the chosen offset, randomly sample num_frames check1 = tf.debugging.assert_greater_equal( seq_len, tf.cast(CONFIG.DATA.RANDOM_OFFSET, tf.int64), message='Random offset is more than sequence length.') check2 = tf.less_equal( tf.cast(num_steps, tf.int64), seq_len - tf.cast(CONFIG.DATA.RANDOM_OFFSET, tf.int64), ) def _sample_random(): with tf.control_dependencies([tf.identity(check1.outputs[0])]): offset = CONFIG.DATA.RANDOM_OFFSET steps = tf.random.shuffle(tf.range(offset, seq_len)) steps = tf.gather(steps, tf.range(0, num_steps)) steps = tf.gather( steps, tf.nn.top_k(steps, k=num_steps).indices[::-1]) return steps def _sample_all(): return tf.range(0, num_steps, dtype=tf.int64) steps = tf.cond(check2, _sample_random, _sample_all) else: raise ValueError( 'Sampling strategy %s is unknown. Supported values are ' 'stride, offset_uniform .' % sampling_strategy) if not sample_all and 'tcn' in CONFIG.TRAINING_ALGO: pos_window = CONFIG.TCN.POSITIVE_WINDOW # pylint: disable=g-long-lambda pos_steps = tf.map_fn( lambda step: tf.random.uniform( (), minval=step - pos_window, maxval=step, dtype=tf.int64), steps) # pylint: enable=g-long-lambda steps = tf.stack([pos_steps, steps]) steps = tf.reshape(tf.transpose(steps), (-1, )) # Store chosen indices. chosen_steps = steps # Get multiple context steps depending on config at selected steps. 
steps = tf.reshape(tf.map_fn(get_steps, steps), [-1]) steps = tf.maximum(tf.cast(0, tf.int64), steps) steps = tf.minimum(seq_len - 1, steps) shape_all_steps = CONFIG.DATA.NUM_STEPS * num_steps if not sample_all and 'tcn' in CONFIG.TRAINING_ALGO: shape_all_steps *= 2 # Select data based on steps. video = tf.gather(video, steps) # Decode the encoded JPEG images. video = tf.map_fn(tf.image.decode_jpeg, video, parallel_iterations=FLAGS.num_parallel_calls, dtype=tf.uint8) # Take images in range [0, 255] and normalize to [0, 1]. video = tf.map_fn(normalize_input, video, parallel_iterations=FLAGS.num_parallel_calls, dtype=tf.float32) # Perform data-augmentation and return images in range [-1, 1]. video = preprocess_input(video, augment) if add_shape: video.set_shape( [shape_all_steps, CONFIG.IMAGE_SIZE, CONFIG.IMAGE_SIZE, 3]) if CONFIG.DATA.FRAME_LABELS: labels = tf.gather(labels, steps) if add_shape: labels.set_shape([shape_all_steps]) return { 'frames': video, 'frame_labels': labels, 'chosen_steps': chosen_steps, 'seq_lens': seq_len, 'seq_labels': seq_label, 'name': name }
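# A minimal standalone sketch of the 'stride' sampling branch in
# sample_and_preprocess above, using hypothetical values (seq_len=50,
# stride=3, num_steps=8) and plain TensorFlow only.
import tensorflow as tf

seq_len = tf.constant(50, tf.int64)
stride = tf.constant(3, tf.int64)
num_steps = 8

# Random offset that still allows full coverage without padding.
offset = tf.random.uniform(
    (), 0, tf.maximum(tf.cast(1, tf.int64), seq_len - stride * num_steps),
    dtype=tf.int64)
# Strided indices starting at the offset, clipped to the sequence length.
steps = tf.minimum(seq_len - 1,
                   tf.range(offset, offset + num_steps * stride + 1, stride))
steps = steps[:num_steps]  # exactly num_steps frame indices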
def __call__(self, fpn_features, boxes, outer_boxes, classes, is_training=None): """Generate the detection priors from the box detections and FPN features. This corresponds to Fig. 4 of the ShapeMask paper at https://arxiv.org/pdf/1904.03239.pdf Args: fpn_features: a dictionary of FPN features. boxes: a float tensor of shape [batch_size, num_instances, 4] representing the tight gt boxes from dataloader/detection. outer_boxes: a float tensor of shape [batch_size, num_instances, 4] representing the loose gt boxes from dataloader/detection. classes: an int Tensor of shape [batch_size, num_instances] of instance classes. is_training: training mode or not. Returns: crop_features: a float Tensor of shape [batch_size * num_instances, mask_crop_size, mask_crop_size, num_downsample_channels]. This is the instance feature crop. detection_priors: A float Tensor of shape [batch_size * num_instances, mask_size, mask_size, 1]. """ with backend.get_graph().as_default(): # Loads class-specific or class-agnostic shape priors. if self._shape_prior_path: if self._use_category_for_mask: fid = tf.io.gfile.GFile(self._shape_prior_path, 'rb') # The encoding='bytes' option handles the pickle incompatibility # between Python 2 and Python 3. class_tups = pickle.load(fid, encoding='bytes') max_class_id = class_tups[-1][0] + 1 class_masks = np.zeros( (max_class_id, self._num_clusters, self._mask_crop_size, self._mask_crop_size), dtype=np.float32) for cls_id, _, cls_mask in class_tups: assert cls_mask.shape == (self._num_clusters, self._mask_crop_size**2) class_masks[cls_id] = cls_mask.reshape( self._num_clusters, self._mask_crop_size, self._mask_crop_size) self.class_priors = tf.convert_to_tensor(value=class_masks, dtype=tf.float32) else: npy_path = tf.io.gfile.GFile(self._shape_prior_path) class_np_masks = np.load(npy_path) assert class_np_masks.shape == ( self._num_clusters, self._mask_crop_size, self._mask_crop_size), 'Invalid priors!!!' self.class_priors = tf.convert_to_tensor( value=class_np_masks, dtype=tf.float32) else: self.class_priors = tf.zeros([ self._num_clusters, self._mask_crop_size, self._mask_crop_size ], tf.float32) batch_size = boxes.get_shape()[0] min_level_shape = fpn_features[ self._min_mask_level].get_shape().as_list() self._max_feature_size = min_level_shape[1] detection_prior_levels = self._compute_box_levels(boxes) level_outer_boxes = outer_boxes / tf.pow( 2., tf.expand_dims(detection_prior_levels, -1)) detection_prior_levels = tf.cast(detection_prior_levels, tf.int32) uniform_priors = spatial_transform_ops.crop_mask_in_target_box( tf.ones([ batch_size, self._num_of_instances, self._mask_crop_size, self._mask_crop_size ], tf.float32), boxes, outer_boxes, self._mask_crop_size) # Prepare crop features. multi_level_features = self._get_multilevel_features(fpn_features) crop_features = spatial_transform_ops.single_level_feature_crop( multi_level_features, level_outer_boxes, detection_prior_levels, self._min_mask_level, self._mask_crop_size) # Predict and fuse shape priors.
shape_weights = self._classify_and_fuse_detection_priors( uniform_priors, classes, crop_features) fused_shape_priors = self._fuse_priors(shape_weights, classes) fused_shape_priors = tf.reshape(fused_shape_priors, [ batch_size, self._num_of_instances, self._mask_crop_size, self._mask_crop_size ]) predicted_detection_priors = spatial_transform_ops.crop_mask_in_target_box( fused_shape_priors, boxes, outer_boxes, self._mask_crop_size) predicted_detection_priors = tf.reshape( predicted_detection_priors, [-1, self._mask_crop_size, self._mask_crop_size, 1]) return crop_features, predicted_detection_priors
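# A conceptual standalone sketch of the prior-fusion step above: combine K
# cluster shape priors with per-instance softmax weights. The shapes, names
# and the einsum below are illustrative stand-ins, not the actual
# _fuse_priors implementation.
import tensorflow as tf

num_instances, num_clusters, crop_size = 6, 4, 32
priors = tf.random.uniform([num_clusters, crop_size, crop_size])
weights = tf.nn.softmax(tf.random.normal([num_instances, num_clusters]), axis=-1)

# Weighted sum over the cluster axis: [N, K] x [K, S, S] -> [N, S, S].
fused = tf.einsum('nk,khw->nhw', weights, priors)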
def get_batch_nodes(self, indices): radius_batch = tf.reshape(self.radius_by_batch, [1, -1, 1]) return self.get_hyperbolic_points(radius_batch, self.node(indices))
def __call__(self, crop_features, detection_priors, inst_classes, is_training=None): """Generate instance masks from FPN features and detection priors. This corresponds to Figs. 5-6 of the ShapeMask paper at https://arxiv.org/pdf/1904.03239.pdf Args: crop_features: a float Tensor of shape [batch_size * num_instances, mask_crop_size, mask_crop_size, num_downsample_channels]. This is the instance feature crop. detection_priors: a float Tensor of shape [batch_size * num_instances, mask_crop_size, mask_crop_size, 1]. This is the detection prior for the instance. inst_classes: an int Tensor of shape [batch_size, num_instances] of instance classes. is_training: a bool indicating whether the model is in training mode. Returns: mask_outputs: instance mask prediction as a float Tensor of shape [batch_size * num_instances, mask_size, mask_size, num_classes]. """ # Embed the anchor map into some feature space for anchor conditioning. detection_prior_features = tf.keras.layers.Conv2D( self._num_downsample_channels, kernel_size=(1, 1), bias_initializer=tf.zeros_initializer(), kernel_initializer=tf.keras.initializers.RandomNormal(mean=0., stddev=0.01), padding='same', name='anchor-conv')(detection_priors) prior_conditioned_features = crop_features + detection_prior_features coarse_output_features = self.coarsemask_decoder_net( prior_conditioned_features, is_training) coarse_mask_classes = tf.keras.layers.Conv2D( self._mask_num_classes, kernel_size=(1, 1), # Focal loss bias initialization to have foreground 0.01 probability. bias_initializer=tf.constant_initializer(-np.log((1 - 0.01) / 0.01)), kernel_initializer=tf.keras.initializers.RandomNormal(mean=0, stddev=0.01), padding='same', name='class-predict')(coarse_output_features) if self._use_category_for_mask: inst_classes = tf.cast(tf.reshape(inst_classes, [-1]), tf.int32) coarse_mask_classes_t = tf.transpose(a=coarse_mask_classes, perm=(0, 3, 1, 2)) # pylint: disable=g-long-lambda coarse_mask_logits = tf.cond( pred=tf.size(input=inst_classes) > 0, true_fn=lambda: tf.gather_nd( coarse_mask_classes_t, tf.stack([ tf.range(tf.size(input=inst_classes)), inst_classes - 1 ], axis=1)), false_fn=lambda: coarse_mask_classes_t[:, 0, :, :]) # pylint: enable=g-long-lambda coarse_mask_logits = tf.expand_dims(coarse_mask_logits, -1) else: coarse_mask_logits = coarse_mask_classes coarse_class_probs = tf.nn.sigmoid(coarse_mask_logits) class_probs = tf.cast(coarse_class_probs, prior_conditioned_features.dtype) return coarse_mask_classes, class_probs, prior_conditioned_features
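# A small standalone check of the per-class selection pattern used above: for
# each instance n, tf.gather_nd picks channel (class_n - 1) from the
# transposed [N, C, H, W] logits. The shapes and class ids below are made up.
import tensorflow as tf

n, c, h, w = 3, 5, 4, 4
logits_nchw = tf.random.normal([n, c, h, w])
classes = tf.constant([2, 5, 1], tf.int32)  # 1-based class ids

indices = tf.stack([tf.range(n), classes - 1], axis=1)  # e.g. [[0, 1], [1, 4], [2, 0]]
selected = tf.gather_nd(logits_nchw, indices)           # shape [n, h, w]
assert selected.shape == (n, h, w)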
def call(self, x): x = tf.reshape(x, shape=[-1, N_WH, N_WH, N_CH]) prior = tf.reduce_sum((x**2), axis=[1, 2, 3]) energy = tf.squeeze(self.net(x)) return FLAGS.p_prior_weight * prior + energy / FLAGS.p_temperature
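# A minimal sketch of the energy decomposition in call() above: a weighted
# quadratic prior term plus a temperature-scaled learned term. toy_net, the
# prior weight and the temperature are hypothetical stand-ins for self.net
# and the corresponding FLAGS.
import tensorflow as tf

def toy_net(x):
  # Stand-in for a CNN energy head: one scalar per example.
  return tf.reduce_mean(x, axis=[1, 2, 3])

def energy(x, prior_weight=1e-4, temperature=1.0):
  prior = tf.reduce_sum(x**2, axis=[1, 2, 3])  # per-sample squared norm
  return prior_weight * prior + toy_net(x) / temperature

x = tf.random.normal([2, 8, 8, 1])
print(energy(x))  # shape [2]: one scalar energy per sample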