def step_fn(inputs):
    """Per-Replica StepFn.

    Runs one training step: a forward pass under a gradient tape, a loss made
    of negative log-likelihood + L2 + annealed KL, a gradient update and the
    training-metric updates.

    Relies on names from the enclosing scope: `FLAGS`, `model`, `optimizer`,
    `strategy`, `metrics`, `training_diversity`, `train_dataset_size`, `ed`.

    Args:
        inputs: A pair `(images, labels)`.
            NOTE(review): the 4-element tile multiples below assume `images`
            is rank 4 and `labels` is rank 1 -- confirm against the caller.
    """
    images, labels = inputs
    if FLAGS.version2 and FLAGS.ensemble_size > 1:
        # Replicate the batch once per ensemble member.
        images = tf.tile(images, [FLAGS.ensemble_size, 1, 1, 1])
        # Labels are only replicated when the per-member predictions are kept
        # separate; member sampling / expected probs reduce them back below.
        if not (FLAGS.member_sampling or FLAGS.expected_probs):
            labels = tf.tile(labels, [FLAGS.ensemble_size])

    if FLAGS.num_train_samples > 1:
        images = tf.tile(images, [FLAGS.num_train_samples, 1, 1, 1])

    with tf.GradientTape() as tape:
        logits = model(images, training=True)
        probs = tf.nn.softmax(logits)
        # Diversity evaluation.
        if FLAGS.version2 and FLAGS.ensemble_size > 1:
            per_probs = tf.reshape(
                probs, tf.concat([[FLAGS.ensemble_size, -1], probs.shape[1:]], 0))
            diversity_results = ed.metrics.average_pairwise_diversity(
                per_probs, FLAGS.ensemble_size)

        if FLAGS.num_train_samples > 1:
            # Average the predictions over the training samples.
            probs = tf.reshape(
                probs, tf.concat(
                    [[FLAGS.num_train_samples, -1], probs.shape[1:]], 0))
            probs = tf.reduce_mean(probs, 0)

        if FLAGS.member_sampling and FLAGS.version2 and FLAGS.ensemble_size > 1:
            # Pick one ensemble member uniformly at random; the one-hot matmul
            # selects that member's row of flattened probabilities.
            idx = tf.random.uniform([], maxval=FLAGS.ensemble_size, dtype=tf.int64)
            idx_one_hot = tf.expand_dims(
                tf.one_hot(idx, FLAGS.ensemble_size, dtype=probs.dtype), 0)
            probs_shape = probs.shape
            probs = tf.reshape(probs, [FLAGS.ensemble_size, -1])
            probs = tf.matmul(idx_one_hot, probs)
            probs = tf.reshape(probs, tf.concat([[-1], probs_shape[1:]], 0))

        elif FLAGS.expected_probs and FLAGS.version2 and FLAGS.ensemble_size > 1:
            # Average the predictions over the ensemble members.
            probs = tf.reshape(
                probs, tf.concat([[FLAGS.ensemble_size, -1], probs.shape[1:]], 0))
            probs = tf.reduce_mean(probs, 0)

        negative_log_likelihood = tf.reduce_mean(
            tf.keras.losses.sparse_categorical_crossentropy(
                labels, probs))

        filtered_variables = []
        for var in model.trainable_variables:
            # Apply l2 on the slow weights and bias terms. This excludes BN
            # parameters and fast weight approximate posterior/prior parameters,
            # but pay caution to their naming scheme.
            if 'kernel' in var.name or 'bias' in var.name:
                filtered_variables.append(tf.reshape(var, (-1, )))

        l2_loss = FLAGS.l2 * 2 * tf.nn.l2_loss(
            tf.concat(filtered_variables, axis=0))
        kl = sum(model.losses) / train_dataset_size
        # Linear KL annealing: the scale grows with the optimizer step count
        # and is capped at 1.
        kl_scale = tf.cast(optimizer.iterations + 1, kl.dtype)
        kl_scale /= FLAGS.kl_annealing_steps
        kl_scale = tf.minimum(1., kl_scale)
        kl_loss = kl_scale * kl

        # Scale the loss given the TPUStrategy will reduce sum all gradients.
        loss = negative_log_likelihood + l2_loss + kl_loss
        scaled_loss = loss / strategy.num_replicas_in_sync

    grads = tape.gradient(scaled_loss, model.trainable_variables)

    # Separate learning rate implementation.
    grad_list = []
    if FLAGS.fast_weight_lr_multiplier != 1.0:
        grads_and_vars = list(zip(grads, model.trainable_variables))
        for vec, var in grads_and_vars:
            # Apply different learning rate on the fast weight approximate
            # posterior/prior parameters. This excludes BN and slow weights,
            # but pay caution to the naming scheme.
            if ('batch_norm' not in var.name and 'kernel' not in var.name):
                grad_list.append(
                    (vec * FLAGS.fast_weight_lr_multiplier, var))
            else:
                grad_list.append((vec, var))
        optimizer.apply_gradients(grad_list)
    else:
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

    # The metrics record the unscaled loss, not `scaled_loss`.
    metrics['train/ece'].update_state(labels, probs)
    metrics['train/loss'].update_state(loss)
    metrics['train/negative_log_likelihood'].update_state(
        negative_log_likelihood)
    metrics['train/accuracy'].update_state(labels, probs)
    # Same condition as the one that defined `diversity_results` above.
    if FLAGS.version2 and FLAGS.ensemble_size > 1:
        for k, v in diversity_results.items():
            training_diversity['train/' + k].update_state(v)
def grad(model, inputs, targets):
    """Compute the training loss and its gradients.

    The loss is `loss(model, inputs, targets, training=True)` plus every
    entry of `model.losses`.

    Returns:
        A pair `(loss_value, gradients)` where `gradients` are taken with
        respect to `model.trainable_variables`.
    """
    with tf.GradientTape() as recorder:
        total = loss(model, inputs, targets, training=True)
        # Fold the model's own loss terms into the objective.
        for extra_term in model.losses:
            total += extra_term
    gradients = recorder.gradient(total, model.trainable_variables)
    return total, gradients
def compare(a, b):
    """Assert that `g(a, b)` equals the taped gradient of `f(a, b)` w.r.t. `a`.

    `self`, `f` and `g` are taken from the enclosing scope.
    """
    with tf.GradientTape() as recorder:
        recorder.watch(a)
        result = f(a, b)
    reference = recorder.gradient(result, a)
    actual = g(a, b)
    self.assertAllEqual(reference, actual)
def f(x):
    """Return the gradient of `m(x**2)` with respect to `x`.

    `m` is taken from the enclosing scope.
    """
    with tf.GradientTape() as recorder:
        recorder.watch(x)
        output = m(x**2)
    return recorder.gradient(output, x)
def optimize():
    """Apply one optimizer step to the negative log-probability of `gp`.

    `gp`, `observations` and `optimizer` are taken from the enclosing scope.

    Returns:
        The loss value computed before the update was applied.
    """
    with tf.GradientTape() as recorder:
        neg_log_likelihood = -gp.log_prob(observations)
    gradients = recorder.gradient(neg_log_likelihood, gp.trainable_variables)
    updates = zip(gradients, gp.trainable_variables)
    optimizer.apply_gradients(updates)
    return neg_log_likelihood
def jit_with_grad(mat):
    """Call `alt_chol_jit(mat)` while a gradient tape is active.

    `alt_chol_jit` is taken from the enclosing scope; the tape itself is
    never queried.
    """
    with tf.GradientTape():
        result = alt_chol_jit(mat)
    return result
def train_op():
    """Apply one optimizer step to `trainable_log_rates`.

    `log_prob`, `trainable_log_rates` and `optimizer` are taken from the
    enclosing scope.

    Returns:
        A pair `(neg_log_prob, rates)` where `rates` is the exponential of
        `trainable_log_rates` read after the update.
    """
    with tf.GradientTape() as recorder:
        objective = -log_prob()
    rate_grad = recorder.gradient(objective, [trainable_log_rates])[0]
    optimizer.apply_gradients([(rate_grad, trainable_log_rates)])
    rates = tf.math.exp(trainable_log_rates)
    return objective, rates
def step_fn(inputs):
    """Apply one gradient update to `model2` on an `(images, targets)` batch.

    `compute_loss2`, `model2` and `optimizer` are taken from the enclosing
    scope. Gradients are taken with respect to `model2.variables`.
    """
    batch_images, batch_targets = inputs
    with tf.GradientTape() as recorder:
        batch_loss = compute_loss2(batch_images, batch_targets)
    gradients = recorder.gradient(batch_loss, model2.variables)
    updates = zip(gradients, model2.variables)
    optimizer.apply_gradients(updates)
def minimize_fn():
    """Apply one gradient update of `get_loss()` to `var_list`.

    `get_loss`, `var_list` and `opt` are taken from the enclosing scope.

    Returns:
        Whatever `opt.apply_gradients` returns.
    """
    with tf.GradientTape() as recorder:
        objective = get_loss()
    gradients = recorder.gradient(objective, var_list)
    updates = zip(gradients, var_list)
    return opt.apply_gradients(updates)
def fn():
    """Run `layer` in training mode on a zero-sized batch and return the tape.

    `layer` is taken from the enclosing scope. The input is a tensor of ones
    with shape `(0, 2, 2, 2)`.
    """
    with tf.GradientTape() as recorder:
        empty_batch = tf.ones((0, 2, 2, 2))
        layer(empty_batch, training=True)
    return recorder
def test_latent_dirichlet_allocation(self, jd_class):  # pylint: disable=g-doc-args
    """Tests Latent Dirichlet Allocation joint model.

    The LDA generative process can be written as:

    ```none
    N[i] ~ Poisson(xi)
    theta[i] ~ Dirichlet(alpha)
    Z[i] ~ Multinomial(N[i], theta[i])
    for k in 1...K:
      X[i,k] ~ Multinomial(Z[i, k], beta[k])
    ```

    Typically `xi` is specified and `alpha`, `beta` are fit using type-II
    maximum likelihood estimators.

    Reference: http://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf

    The test builds the model in the form expected by `jd_class`, samples a
    `[2, 4]` batch of documents, and checks the log-prob shape, the tracked
    trainable variables and that gradients with respect to them exist.

    `jd_class` is one of the three auto-batched joint distribution classes
    handled by the `if`/`elif` chain below. For any other value `model` is
    never assigned, so the constructor call would raise `NameError`.
    """
    seed = test_util.test_seed_stream()
    # Hyperparameters.
    num_topics = 3
    num_words = 10
    avg_doc_length = 5
    u = tfd.Uniform(low=-1., high=1.)
    # `alpha` is constrained through a Softplus bijector; `beta` is used as
    # unconstrained logits.
    alpha = tfp.util.TransformedVariable(u.sample([num_topics], seed=seed()),
                                         tfb.Softplus(), name='alpha')
    beta = tf.Variable(u.sample([num_topics, num_words], seed=seed()),
                       name='beta')

    # Note near 1:1 with mathematical specification. The main distinction is the
    # use of Independent--this lets us easily aggregate multinomials across
    # topics (and in any "shape" of documents).
    # NOTE(review): no `Independent` wrapper appears in the models below, so
    # the remark above may be stale -- confirm.
    def lda_coroutine_model():
        n = yield Root(tfd.Poisson(rate=avg_doc_length))
        theta = yield Root(tfd.Dirichlet(concentration=alpha))
        z = yield tfd.Multinomial(total_count=n, probs=theta)
        yield tfd.Multinomial(total_count=z, logits=beta)

    if jd_class is tfd.JointDistributionCoroutineAutoBatched:
        model = lda_coroutine_model
    elif jd_class is tfd.JointDistributionSequentialAutoBatched:
        model = [
            tfd.Poisson(rate=avg_doc_length),  # n
            tfd.Dirichlet(concentration=alpha),  # theta
            lambda theta, n: tfd.Multinomial(total_count=n, probs=theta),  # z
            lambda z: tfd.Multinomial(total_count=z, logits=beta)
        ]
    elif jd_class is tfd.JointDistributionNamedAutoBatched:
        model = collections.OrderedDict((
            ('n', tfd.Poisson(rate=avg_doc_length)),
            ('theta', tfd.Dirichlet(concentration=alpha)),
            ('z', lambda theta, n: tfd.Multinomial(total_count=n, probs=theta)),
            ('X', lambda z: tfd.Multinomial(total_count=z, logits=beta))))

    # TODO(b/159842104): Enable autovectorization for Multinomial sampling.
    lda = jd_class(model, validate_args=True, use_vectorized_map=False)

    # Now, let's sample some "documents" and compute the log-prob of each.
    docs_shape = [2, 4]  # That is, 8 docs in the shape of [2, 4].
    sample = lda.sample(docs_shape, seed=seed())
    log_probs = lda.log_prob(sample)
    self.assertEqual(docs_shape, log_probs.shape)

    # Verify we correctly track trainable variables.
    self.assertLen(lda.trainable_variables, 2)
    self.assertIs(alpha.pretransformed_input, lda.trainable_variables[0])
    self.assertIs(beta, lda.trainable_variables[1])

    # Ensure we can compute gradients.
    with tf.GradientTape() as tape:
        # Note: The samples are not taped, hence implicitly "stop_gradient."
        negloglik = -lda.log_prob(sample)
    grads = tape.gradient(negloglik, lda.trainable_variables)

    self.assertLen(grads, 2)
    self.assertAllEqual((alpha.pretransformed_input.shape, beta.shape),
                        (grads[0].shape, grads[1].shape))
    self.assertAllNotNone(grads)
def _reparameterize_sample(self, x):
    """Adds reparameterization (pathwise) gradients to samples of the mixture.

    Implicit reparameterization gradients are
       dx/dphi = -(d transform(x, phi) / dx)^-1 * d transform(x, phi) / dphi,
    where transform(x, phi) is distributional transform that removes all
    parameters from samples x.

    We implement them by replacing x with
      -stop_gradient(d transform(x, phi) / dx)^-1 * transform(x, phi)
    for the backward pass (gradient computation).
    The derivative of this quantity w.r.t. phi is then the implicit
    reparameterization gradient.
    Note that this replaces the gradients w.r.t. both the mixture
    distribution parameters and components distributions parameters.

    Limitations:
      1. Fundamental: components must be fully reparameterized.
      2. Distributional transform is currently only implemented for
        factorized components.
      3. Distributional transform currently only works for known rank of the
        batch tensor.

    Arguments:
      x: Sample of mixture distribution

    Returns:
      Tensor with same value as x, but with reparameterization gradients
    """
    # Remove the existing gradients of x wrt parameters of the components.
    x = tf.stop_gradient(x)

    x_2d_shape = [-1, self._event_size]  # [S*prod(B), prod(E)]

    # Perform distributional transform of x in [S, B, E] shape,
    # but have Jacobian of size [S*prod(B), prod(E), prod(E)].
    def reshaped_distributional_transform(x_2d):
        return tf.reshape(
            self._distributional_transform(tf.reshape(x_2d, tf.shape(x))),
            x_2d_shape)

    # transform_2d: [S*prod(B), prod(E)]
    # jacobian: [S*prod(B), prod(E), prod(E)]
    x_2d = tf.reshape(x, x_2d_shape)
    try:
        with tf.GradientTape() as tape:
            tape.watch(x_2d)
            transform_2d = reshaped_distributional_transform(x_2d)
        jacobian = tape.batch_jacobian(transform_2d, x_2d)
    except TypeError:  # TODO(b/139374388): Remove exception workaround.
        # Fallback: recompute under a persistent tape and request the
        # Jacobian with `experimental_use_pfor=False`.
        with tf.GradientTape(persistent=True) as tape:
            tape.watch(x_2d)
            transform_2d = reshaped_distributional_transform(x_2d)
        jacobian = tape.batch_jacobian(transform_2d, x_2d,
                                       experimental_use_pfor=False)

    # We only provide the first derivative; the second derivative computed by
    # autodiff would be incorrect, so we raise an error if it is requested.
    transform_2d = _prevent_2nd_derivative(transform_2d)

    # Compute [- stop_gradient(jacobian)^-1 * transform] by solving a linear
    # system. The Jacobian is lower triangular because the distributional
    # transform for i-th event dimension does not depend on the next
    # dimensions.
    surrogate_x_2d = -tf.linalg.triangular_solve(
        tf.stop_gradient(jacobian), tf.expand_dims(transform_2d, axis=-1),
        lower=True)  # [S*prod(B), prod(E), 1]
    surrogate_x = tf.reshape(surrogate_x_2d, tf.shape(x))

    # Replace gradients of x with gradients of surrogate_x, but keep the value.
    return x + (surrogate_x - tf.stop_gradient(surrogate_x))
def compute_jacobian(x):
    """Return the Jacobian of `tfp.math.log_cumsum_exp(x)` with respect to `x`."""
    with tf.GradientTape() as recorder:
        recorder.watch(x)
        cumulative = tfp.math.log_cumsum_exp(x)
    return recorder.jacobian(cumulative, x)
def helper_keras_gradtape(
    trial_dir,
    save_all=False,
    include_collections=None,
    reduction_config=None,
    save_config=None,
    hook=None,
    batch_size=64,
    persistent=False,
):
    """Train a small MNIST classifier for one epoch with a hooked GradientTape.

    Args:
        trial_dir: Output directory handed to `smd.KerasHook` when no hook is
            supplied.
        save_all: Forwarded to `smd.KerasHook`.
        include_collections: Forwarded to `smd.KerasHook`. When given and
            `save_all` is false, every other collection the new hook includes
            gets `SaveConfig(end_step=0)`.
        reduction_config: Forwarded to `smd.KerasHook`.
        save_config: Forwarded to `smd.KerasHook`; defaults to
            `SaveConfig(save_interval=3)`.
        hook: Existing hook to use instead of creating one.
        batch_size: Batch size of the training dataset.
        persistent: Whether the tape is persistent; if so the gradient is
            requested a second time from the same tape.
    """
    (x_train, y_train), _ = tf.keras.datasets.mnist.load_data()
    features = tf.cast(x_train[..., tf.newaxis] / 255, tf.float32)
    targets = tf.cast(y_train, tf.int64)
    dataset = tf.data.Dataset.from_tensor_slices((features, targets))
    dataset = dataset.shuffle(1000).batch(batch_size)

    layers = [
        # WA for TF issue https://github.com/tensorflow/tensorflow/issues/36279
        tf.keras.layers.Flatten(input_shape=(28, 28, 1)),
        tf.keras.layers.Dense(128, activation="relu"),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(10, activation="softmax"),
    ]
    model = tf.keras.models.Sequential(layers)

    if hook is None:
        if save_config is None:
            save_config = SaveConfig(save_interval=3)

        hook = smd.KerasHook(
            trial_dir,
            save_config=save_config,
            save_all=save_all,
            include_collections=include_collections,
            reduction_config=reduction_config,
        )

        # Silence every collection the caller did not explicitly ask for.
        restrict_collections = include_collections is not None and not save_all
        if restrict_collections:
            for collection_name in hook.include_collections:
                if collection_name in include_collections:
                    continue
                hook.get_collection(collection_name).save_config = SaveConfig(
                    end_step=0)

    opt = tf.keras.optimizers.Adam()
    hook.wrap_optimizer(opt)

    cce = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
    train_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
    n_epochs = 1
    for _ in range(n_epochs):
        for data, sparse_labels in dataset:
            one_hot_labels = tf.one_hot(sparse_labels, depth=10)
            wrapped_tape = hook.wrap_tape(
                tf.GradientTape(persistent=persistent))
            with wrapped_tape as tape:
                logits = model(data, training=True)
                loss_value = cce(one_hot_labels, logits)
            grads = tape.gradient(loss_value, model.variables)

            # A non-persistent tape releases its resources on the first
            # gradient() call; only a persistent tape can be queried again,
            # which is what this second call exercises.
            if persistent:
                _ = tape.gradient(loss_value, model.variables)
            opt.apply_gradients(zip(grads, model.variables))
            acc = train_acc_metric(sparse_labels, logits)
            hook.record_tensor_value(tensor_name="accuracy", tensor_value=acc)
        train_acc_metric.reset_states()

    hook.close()
def run_one_epoch(
    self,
    dataset: tf.data.Dataset,
    training: bool = False,
):
    """Run one pass over `dataset`, optionally applying gradient updates.

    For every batch the method computes logits and the loss, accumulates
    token statistics, decodes target and predicted token sequences, computes
    a smoothed corpus BLEU for the batch and prints a progress line. When
    `training` is true the gradients of the batch loss are applied to
    `self.trainable_variables`.

    Args:
        dataset: Iterable of `(batch_features, batch_labels)` pairs. Both are
            indexed by string keys (`"num_graphs_in_batch"`,
            `"target_value"`).
        training: Whether to run the model in training mode and update the
            weights.

    Returns:
        A tuple `(avg_loss, token_accuracy, ground_truth, predictions)`:
        the summed batch loss divided by the accumulated
        `num_graphs_in_batch`, the fraction of correctly predicted tokens,
        and the reference and hypothesis token lists of all batches.

    Raises:
        ZeroDivisionError: If `dataset` yields no batches.
    """
    total_loss, num_samples, num_tokens, num_correct_tokens = 0.0, 0, 0, 0
    ground_truth = []
    predictions = []
    # The smoothing function does not depend on the batch; build it once.
    smoothing = SmoothingFunction().method4

    for step, (batch_features, batch_labels) in enumerate(dataset):
        # NOTE(review): `batch_labels` is indexed by key below; if it is a
        # dict, `len` is the number of keys rather than the number of
        # samples -- confirm this is the intended "batch_size".
        self.hyperparameters["batch_size"] = len(batch_labels)

        with tf.GradientTape() as tape:
            model_outputs = self.compute_logits(
                batch_features,
                tf.cast(batch_labels["target_value"], tf.int32),
                training=training)
            result = self.compute_loss_and_acc(model_outputs, batch_features,
                                               batch_labels)

        total_loss += result.token_ce_loss
        num_samples += tf.cast(batch_features["num_graphs_in_batch"],
                               tf.float32)
        num_tokens += result.num_predictions
        num_correct_tokens += result.num_correct_token_predictions

        target_texts = self.get_text_from_tensor(
            batch_labels["target_value"])
        predicted_texts = self.get_text_from_tensor(
            tf.argmax(model_outputs, 2))

        # References drop their first token and are cut at "%END%";
        # each one is wrapped in a list because `corpus_bleu` expects a list
        # of references per hypothesis.
        ref = [([x[1:x.index("%END%")] if "%END%" in x else x[1:]])
               for x in target_texts]
        hyp = [(x[:x.index("%END%")] if "%END%" in x else x)
               for x in predicted_texts]

        try:
            bleu_score = corpus_bleu(ref, hyp, smoothing_function=smoothing)
        except Exception:
            # Best effort: a failed BLEU computation must not abort the
            # epoch. `Exception` (rather than a bare `except`) lets
            # KeyboardInterrupt and SystemExit propagate.
            bleu_score = 0

        ground_truth += ref
        predictions += hyp

        if training:
            gradients = tape.gradient(result.token_ce_loss,
                                      self.trainable_variables)
            self.optimizer.apply_gradients(
                zip(gradients, self.trainable_variables))

        print(
            " Batch %4i: Epoch avg. loss: %.5f || Batch loss: %.5f | acc: %.5f | bleu: %.5f"
            % (step, total_loss / num_samples, result.token_ce_loss,
               float(result.num_correct_token_predictions) /
               (float(result.num_predictions) + float(1e-7)), bleu_score),
            end="\n",
        )

    # Return to the start of the line and clear it (ANSI erase-line).
    print("\r\x1b[K", end="")
    return (total_loss / num_samples,
            float(num_correct_tokens) / (float(num_tokens) + 1e-7),
            ground_truth, predictions)
def plot_jacobians(self, which, intervals, arrow_intervals, scale=2,
                   figsize=None):
    """Plot the source density with Jacobian arrows of a 2D transform.

    Args:
        which: `"analysis"` or `"synthesis"`. For `"analysis"` the arrows
            come from the inverse of the analysis Jacobian evaluated on a
            grid of source points; for `"synthesis"` from the synthesis
            Jacobian evaluated on a grid of latent points.
        intervals: Two `(start, stop, num)` specs for the density image grid.
        arrow_intervals: Two `(start, stop, num)` specs for the arrow grid.
        scale: Forwarded to `plt.quiver`.
        figsize: Figure size; defaults to `(16, 14)`.

    Raises:
        ValueError: If the model or the interval lists are not 2D, or if
            `which` is not one of the two accepted values.
    """
    is_2d = (len(intervals) == len(arrow_intervals) == self.ndim_source ==
             self.ndim_latent == 2)
    if not is_2d:
        raise ValueError("This method is only defined for 2D models.")
    if which not in ("analysis", "synthesis"):
        raise ValueError("`which` must be 'analysis' or 'synthesis'.")

    def _mesh(specs):
        # One linspace per axis, combined into a grid with the axis
        # coordinates stacked along the last dimension.
        axes = [
            tf.linspace(float(spec[0]), float(spec[1]), int(spec[2]))
            for spec in specs
        ]
        return tf.stack(tf.meshgrid(*axes, indexing="ij"), axis=-1)

    def _flat_mesh(specs):
        # Same grid, flattened to a list of points.
        grid = _mesh(specs)
        return tf.reshape(grid, (-1, grid.shape[-1]))

    data = _mesh(intervals)
    data_dist = self.source.prob(data).numpy()

    if which == "analysis":
        arrow_data = _flat_mesh(arrow_intervals)
        with tf.GradientTape(watch_accessed_variables=False) as tape:
            tape.watch(arrow_data)
            arrow_latents = self.analysis(arrow_data)
        # First dimension is batch, second is latent dim, third is source dim.
        jacobian = tape.batch_jacobian(arrow_latents, arrow_data)
        jacobian = tf.linalg.inv(jacobian)
    else:
        arrow_latents = _flat_mesh(arrow_intervals)
        with tf.GradientTape(watch_accessed_variables=False) as tape:
            tape.watch(arrow_latents)
            arrow_data = self.synthesis(arrow_latents)
        jacobian = tape.batch_jacobian(arrow_data, arrow_latents)
    jacobian = tf.transpose(jacobian, (0, 2, 1))

    google_pink = (0xf4 / 255, 0x39 / 255, 0xa0 / 255)
    google_purple = (0xa1 / 255, 0x42 / 255, 0xf4 / 255)

    # The second source dimension is drawn on the x axis, the first on y.
    x_min, x_max = data[0, 0, 1], data[0, -1, 1]
    y_min, y_max = data[0, 0, 0], data[-1, 0, 0]

    plt.figure(figsize=figsize or (16, 14))
    plt.imshow(data_dist, vmin=0, vmax=data_dist.max(), origin="lower",
               extent=(x_min, x_max, y_min, y_max))
    # One arrow field per Jacobian row, each in its own colour.
    for row, colour in ((0, google_pink), (1, google_purple)):
        plt.quiver(
            arrow_data[:, 1], arrow_data[:, 0],
            jacobian[:, row, 1], jacobian[:, row, 0],
            pivot="tail", angles="xy", headlength=4, headaxislength=4,
            units="dots", color=colour, scale_units="xy", scale=scale,
        )
    plt.axis("image")
    plt.grid(False)
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xlabel("source dimension 1")
    plt.ylabel("source dimension 2")
def test_latent_dirichlet_allocation(self):
    """Checks sampling, log-prob and gradients of an LDA joint model.

    Generative process being modelled:

    ```none
    N[i] ~ Poisson(xi)
    theta[i] ~ Dirichlet(alpha)
    Z[i] ~ Multinomial(N[i], theta[i])
    for k in 1...K:
      X[i,k] ~ Multinomial(Z[i, k], beta[k])
    ```

    `xi` is fixed, while `alpha` and `beta` are the trainable quantities.

    Reference: http://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf
    """
    # Hyperparameters.
    num_topics, num_words, avg_doc_length = 3, 10, 5

    init_dist = tfd.Uniform(low=-1., high=1.)
    # `alpha` is a softplus view of an unconstrained variable.
    alpha = tfp.util.DeferredTensor(
        tf.math.softplus,
        tf.Variable(init_dist.sample([num_topics]), name='raw_alpha'))
    beta = tf.Variable(init_dist.sample([num_topics, num_words]), name='beta')

    # LDA model, written in the same order as the specification. The word
    # counts are wrapped in `Independent` so the per-topic multinomials are
    # aggregated into a single event.
    components = [
        tfd.Poisson(rate=avg_doc_length),  # n
        tfd.Dirichlet(concentration=alpha),  # theta
        lambda theta, n: tfd.Multinomial(total_count=n, probs=theta),  # z
        lambda z: tfd.Independent(  # x  pylint: disable=g-long-lambda
            tfd.Multinomial(total_count=z, logits=beta),
            reinterpreted_batch_ndims=1),
    ]
    lda = tfd.JointDistributionSequential(components)

    # Draw 8 documents arranged as a [2, 4] batch and score them.
    docs_shape = [2, 4]
    n, theta, z, x = lda.sample(docs_shape)
    documents = [n, theta, z, x]
    self.assertEqual(docs_shape, lda.log_prob(documents).shape)

    # Both parameters must be tracked, in declaration order.
    tracked = lda.trainable_variables
    self.assertLen(tracked, 2)
    self.assertIs(alpha.pretransformed_input, tracked[0])
    self.assertIs(beta, tracked[1])

    # Gradients must exist for both parameters. The samples were drawn
    # outside the tape, so they act as constants here.
    with tf.GradientTape() as recorder:
        negloglik = -lda.log_prob(documents)
    grads = recorder.gradient(negloglik, lda.trainable_variables)

    self.assertLen(grads, 2)
    expected_shapes = (alpha.pretransformed_input.shape, beta.shape)
    actual_shapes = (grads[0].shape, grads[1].shape)
    self.assertAllEqual(expected_shapes, actual_shapes)
    self.assertAllNotNone(grads)
# NOTE(review): the statements up to `return indeces_` look like the tail of a
# function whose `def` line is outside this view (presumably the
# `create_a_list_indeces` referenced in the commented-out call below, with
# `max_indeces` as its parameter) -- confirm and restore its indentation.
#
# Reads lines from stdin until one whose first or second value is 0. Every
# non-space character of a line is converted to an int, so only single-digit
# indices can be entered this way.
flag = True
indeces_ = []
while(flag):
    pair_ = [int(a) for a in list(input()) if a != ' ']
    # Abort on an index above the allowed maximum. This is an `assert`, so
    # the check is skipped when Python runs with -O.
    if any([a > max_indeces for a in pair_]):
        assert (False), ('We enter index more then available!')
    print(pair_)
    # A zero in either of the first two positions ends the input loop; that
    # terminating line is not stored.
    if(pair_[0] == 0 or pair_[1] == 0):
        flag = False
    else:
        indeces_.append(pair_)
return indeces_

#indeces_ = create_a_list_indeces(num_of_parameters)
#print([(iter_1, iter_2) \
#            for iter_1, iter_2 in indeces_])

# Build the model, wrap it into a callable and evaluate the gradient of that
# callable at a random point with one entry per model weight.
markov_theory = Markovitz_theory()
markov_theory._initial_model_param_()
funct_obj = wrapper_fuzzy_constraints_(markov_theory)
x = tf.Variable(np.random.random(markov_theory.weights.shape[0]))
print(x.shape)
with tf.GradientTape() as g:
    g.watch((x))
    f = funct_obj(x)
grad_ = g.gradient(f, x)
print(grad_.numpy())
def kernel(target_log_prob_fn,
           current_state,
           step_size,
           seed=None,
           current_target_log_prob=None,
           current_grads_target_log_prob=None,
           name="nuts_kernel"):
    """Simulates a No-U-Turn Sampler (NUTS) trajectory.

    Args:
      target_log_prob_fn: Python callable which takes an argument like
        `*current_state` and returns its (possibly unnormalized) log-density
        under the target distribution.
      current_state: List of `Tensor`s representing the states to simulate
        from.
      step_size: List of `Tensor`s representing the step sizes for the
        leapfrog integrator. Must have same shape as `current_state`.
      seed: Integer to seed the random number generator.
      current_target_log_prob: Scalar `Tensor` representing the value of
        `target_log_prob_fn` at the `current_state`.
      current_grads_target_log_prob: List of `Tensor`s representing gradient
        of `current_target_log_prob` with respect to `current_state`. Must
        have same shape as `current_state`.
      name: A name for the operation.

    Returns:
      next_state: List of `Tensor`s representing the next states of the NUTS
        trajectory. Has same shape as `current_state`.
      next_target_log_prob: Scalar `Tensor` representing the value of
        `target_log_prob_fn` at `next_state`.
      next_grads_target_log_prob: List of `Tensor`s representing the gradient
        of `next_target_log_prob` with respect to `next_state`.

    Raises:
      NotImplementedError: If the execution mode is not eager.
      ValueError: If the log-prob and gradients are computed here and the
        gradient with respect to any state part is `None`.
    """
    if not tf.executing_eagerly():
        raise NotImplementedError("`kernel` is only available in Eager mode.")

    with tf.name_scope(name):
        with tf.name_scope("initialize"):
            current_state = [tf.convert_to_tensor(s) for s in current_state]
            step_size = [tf.convert_to_tensor(s) for s in step_size]
            # Both values are recomputed unless the caller supplied both.
            if (current_target_log_prob is None
                    or current_grads_target_log_prob is None):
                with tf.GradientTape() as tape:
                    tape.watch(current_state)
                    current_target_log_prob = target_log_prob_fn(
                        *current_state)
                current_grads_target_log_prob = tape.gradient(
                    current_target_log_prob, current_state)
                if any(grad is None for grad in current_grads_target_log_prob):
                    raise ValueError("Gradient is None for a state.")

            seed_stream = tfp.distributions.SeedStream(seed, "nuts_kernel")
            # Draw one standard-normal momentum per state part.
            current_momentum = []
            for state_tensor in current_state:
                momentum_tensor = tf.random.normal(
                    shape=tf.shape(state_tensor),
                    dtype=state_tensor.dtype,
                    seed=seed_stream())
                current_momentum.append(momentum_tensor)

            # Draw a slice variable u ~ Uniform(0, p(initial state, initial
            # momentum)) and compute log u. For numerical stability, we perform this
            # in log space where log u = log (u' * p(...)) = log u' + log
            # p(...) and u' ~ Uniform(0, 1).
            log_slice_sample = tf.math.log(
                tf.random.uniform([], seed=seed_stream()))
            log_slice_sample += _log_joint(current_target_log_prob,
                                           current_momentum)

            # Initialize loop variables. It comprises a collection of information
            # about a "reverse" state, a collection of information about a "forward"
            # state, a collection of information about the next state,
            # the trajectory's tree depth, the number of candidate states, and
            # whether to continue the trajectory.
            reverse_state = current_state
            reverse_target_log_prob = current_target_log_prob
            reverse_grads_target_log_prob = current_grads_target_log_prob
            reverse_momentum = current_momentum
            forward_state = current_state
            forward_target_log_prob = current_target_log_prob
            forward_grads_target_log_prob = current_grads_target_log_prob
            forward_momentum = current_momentum
            next_state = current_state
            next_target_log_prob = current_target_log_prob
            next_grads_target_log_prob = current_grads_target_log_prob
            depth = 0
            num_states = 1
            continue_trajectory = True

        while continue_trajectory:
            # Grow the No-U-Turn Sampler trajectory by choosing a random direction and
            # simulating Hamiltonian dynamics in that direction. This extends either
            # the forward or reverse state.
            direction = tfp.math.random_rademacher([], seed=seed_stream())
            if direction < 0:
                # Extend backwards: only the reverse-most slots of the result
                # are kept, the forward-most ones are discarded.
                [
                    reverse_state,
                    reverse_target_log_prob,
                    reverse_grads_target_log_prob,
                    reverse_momentum,
                    _,
                    _,
                    _,
                    _,
                    next_state_in_subtree,
                    next_target_log_prob_in_subtree,
                    next_grads_target_log_prob_in_subtree,
                    num_states_in_subtree,
                    continue_trajectory,
                ] = _build_tree(
                    target_log_prob_fn=target_log_prob_fn,
                    current_state=reverse_state,
                    current_target_log_prob=reverse_target_log_prob,
                    current_grads_target_log_prob=reverse_grads_target_log_prob,
                    current_momentum=reverse_momentum,
                    direction=direction,
                    depth=depth,
                    step_size=step_size,
                    log_slice_sample=log_slice_sample,
                    seed=seed_stream())
            else:
                # Extend forwards: only the forward-most slots of the result
                # are kept, the reverse-most ones are discarded.
                [
                    _,
                    _,
                    _,
                    _,
                    forward_state,
                    forward_target_log_prob,
                    forward_grads_target_log_prob,
                    forward_momentum,
                    next_state_in_subtree,
                    next_target_log_prob_in_subtree,
                    next_grads_target_log_prob_in_subtree,
                    num_states_in_subtree,
                    continue_trajectory,
                ] = _build_tree(
                    target_log_prob_fn=target_log_prob_fn,
                    current_state=forward_state,
                    current_target_log_prob=forward_target_log_prob,
                    current_grads_target_log_prob=forward_grads_target_log_prob,
                    current_momentum=forward_momentum,
                    direction=direction,
                    depth=depth,
                    step_size=step_size,
                    log_slice_sample=log_slice_sample,
                    seed=seed_stream())

            if continue_trajectory:
                # If the built tree did not terminate, accept the tree's next state
                # with a certain probability.
                accept_state_in_subtree = _random_bernoulli(
                    [],
                    probs=tf.minimum(1., num_states_in_subtree / num_states),
                    dtype=tf.bool,
                    seed=seed_stream())
                if accept_state_in_subtree:
                    next_state = next_state_in_subtree
                    next_target_log_prob = next_target_log_prob_in_subtree
                    next_grads_target_log_prob = next_grads_target_log_prob_in_subtree

            # Continue the NUTS trajectory if the tree-building did not terminate, and
            # if the reverse-most and forward-most states do not exhibit a U-turn.
            has_no_u_turn = tf.logical_and(
                _has_no_u_turn(forward_state, reverse_state, forward_momentum),
                _has_no_u_turn(forward_state, reverse_state, reverse_momentum))
            continue_trajectory = continue_trajectory and has_no_u_turn
            num_states += num_states_in_subtree
            depth += 1

        return next_state, next_target_log_prob, next_grads_target_log_prob
def helper_test_keras_v2_gradienttape(script_mode: bool = False,
                                      json_file_contents="{}"):
    """Test the default ZCC behavior of saving losses and metrics in eager and non-eager modes.

    Trains a small Keras classifier for two epochs with a `tf.GradientTape`
    loop inside a `SagemakerSimulator`, then checks that a hook exists and
    that steps, tensors and the "losses" collection were saved.

    Args:
        script_mode: If true, a `smd.KerasHook` is created explicitly and
            wraps the tape. Otherwise no hook is created here and the test
            only picks up whatever `smd.get_hook()` returns afterwards.
        json_file_contents: Configuration passed to the simulator. In script
            mode the default `"{}"` selects a hook built from
            `sim.out_dir`; any other value selects
            `smd.KerasHook.create_from_json_file()`.
    """
    smd.del_hook()
    tf.keras.backend.clear_session()

    with SagemakerSimulator(json_file_contents=json_file_contents) as sim:
        model = tf.keras.models.Sequential([
            tf.keras.layers.Flatten(input_shape=(28, 28, 1)),  # WA for TF issue #36279
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dropout(0.2),
            tf.keras.layers.Dense(10, activation="softmax"),
        ])
        (x_train, y_train), _ = get_keras_data()
        dataset = tf.data.Dataset.from_tensor_slices(
            (tf.cast(x_train[..., tf.newaxis] / 255, tf.float32),
             tf.cast(y_train, tf.int64)))
        dataset = dataset.shuffle(1000).batch(64)

        opt = tf.keras.optimizers.RMSprop()
        cce = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
        train_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
        n_epochs = 2
        if script_mode:
            if json_file_contents == "{}":
                hook = smd.KerasHook(out_dir=sim.out_dir,
                                     export_tensorboard=True)
            else:
                hook = smd.KerasHook.create_from_json_file()

            for epoch in range(n_epochs):
                print("Epoch %d/%d" % (epoch + 1, n_epochs))
                for data, labels in dataset:
                    # Keep the sparse labels for the accuracy metric; the
                    # loss consumes the one-hot version.
                    dataset_labels = labels
                    labels = tf.one_hot(labels, depth=10)
                    with hook.wrap_tape(tf.GradientTape()) as tape:
                        logits = model(data, training=True)  # (batch, 10)
                        loss_value = cce(labels, logits)
                    grads = tape.gradient(loss_value, model.variables)
                    opt.apply_gradients(zip(grads, model.variables))
                    acc = train_acc_metric(dataset_labels, logits)
                    hook.record_tensor_value(tensor_name="accuracy",
                                             tensor_value=acc)
                log = "Epoch %d " % (epoch + 1)
                log += "Accuracy %.4f" % train_acc_metric.result()
                print(log)
                train_acc_metric.reset_states()
            hook = smd.get_hook()
            assert hook
            hook.close()
            # Check that hook created and tensors saved
            trial = smd.create_trial(path=sim.out_dir)
            assert len(trial.steps()) > 0, "Nothing saved at any step."
            assert len(trial.tensor_names()) > 0, "Tensors were not saved."
            assert len(trial.tensor_names(collection="losses")) > 0
        else:
            # ZCC support added from smdebug v0.8.0)
            for epoch in range(n_epochs):
                print("Epoch %d/%d" % (epoch + 1, n_epochs))
                for data, labels in dataset:
                    dataset_labels = labels
                    labels = tf.one_hot(labels, depth=10)
                    # Unlike the script-mode branch, the tape is neither
                    # wrapped by a hook nor non-persistent here.
                    with tf.GradientTape(persistent=True) as tape:
                        logits = model(data, training=True)  # (batch, 10)
                        loss_value = cce(labels, logits)
                    grads = tape.gradient(loss_value, model.variables)
                    opt.apply_gradients(zip(grads, model.variables))
                    acc = train_acc_metric(dataset_labels, logits)
                log = "Epoch %d " % (epoch + 1)
                log += "Accuracy %.4f" % train_acc_metric.result()
                print(log)
                train_acc_metric.reset_states()
            hook = smd.get_hook()
            if not is_tf_2_2():
                assert not hook  # only supported on TF 2.2 and greater
                return
            assert hook
            hook.close()
            # Check that hook created and tensors saved
            trial = smd.create_trial(path=sim.out_dir)
            assert len(trial.steps()) > 0, "Nothing saved at any step."
            assert len(trial.tensor_names()) > 0, "Tensors were not saved."
            assert len(trial.tensor_names(collection="losses")) > 0
def solve_nu_zeta(self, dataset: dataset_lib.OffpolicyDataset, target_policy: tf_policy.TFPolicy, regularizer: float = 1e-6): """Solves for density ratios and then approximates target policy value. Args: dataset: The dataset to sample experience from. target_policy: The policy whose value we want to estimate. regularizer: A small constant to add to matrices before inverting them or to floats before taking square root. Returns: Estimated average per-step reward of the target policy. """ if not hasattr(self, '_td_mat'): # Set up env_steps. episodes, valid_steps = dataset.get_all_episodes( limit=self._limit_episodes) total_num_steps_per_episode = tf.shape(valid_steps)[1] - 1 num_episodes = tf.shape(valid_steps)[0] num_samples = num_episodes * total_num_steps_per_episode valid_and_not_last = tf.logical_and(valid_steps, episodes.discount > 0) valid_indices = tf.squeeze( tf.where(tf.reshape(valid_and_not_last[:, :-1], [-1]))) initial_env_step = tf.nest.map_structure( lambda t: tf.squeeze( tf.reshape( tf.repeat( t[:, 0:1, ...], axis=1, repeats=total_num_steps_per_episode), [num_samples, -1])), episodes) initial_env_step = tf.nest.map_structure( lambda t: tf.gather(t, valid_indices), initial_env_step) tfagents_initial_env_step = dataset_lib.convert_to_tfagents_timestep( initial_env_step) env_step = tf.nest.map_structure( lambda t: tf.squeeze( tf.reshape(t[:, 0:total_num_steps_per_episode, ...], [num_samples, -1])), episodes) env_step = tf.nest.map_structure(lambda t: tf.gather(t, valid_indices), env_step) tfagents_env_step = dataset_lib.convert_to_tfagents_timestep(env_step) next_env_step = tf.nest.map_structure( lambda t: tf.squeeze( tf.reshape(t[:, 1:total_num_steps_per_episode + 1, ...], [num_samples, -1])), episodes) next_env_step = tf.nest.map_structure( lambda t: tf.gather(t, valid_indices), next_env_step) tfagents_next_env_step = dataset_lib.convert_to_tfagents_timestep( next_env_step) # get probabilities initial_target_probs = target_policy.distribution( 
tfagents_initial_env_step).action.probs_parameter() next_target_probs = target_policy.distribution( tfagents_next_env_step).action.probs_parameter() # First, get the nu_loss and data weights #current_nu_loss = self._get_nu_loss(initial_env_step, env_step, # next_env_step, target_policy) #data_weight, _ = self._get_weights(current_nu_loss) # # debug only and to reproduce dual dice result, DELETE # data_weight = tf.ones_like(data_weight) state_action_count = self._get_state_action_counts(env_step) counts = tf.reduce_sum(tf.one_hot(state_action_count, self._dimension), 0) gamma_sample = tf.pow(self._gamma, tf.cast(env_step.step_num, tf.float32)) # # debug only and to reproduce dual dice result, DELETE # gamma_sample = tf.ones_like(gamma_sample) # now we need to expand_dims to include action space in extra dimensions #data_weights = tf.reshape(data_weight, [-1, self._num_limits]) # both are data sample weights for L2 problem, needs to be normalized later #gamma_data_weights = tf.reshape(gamma_sample, [-1, 1]) * data_weights initial_states = tf.tile( tf.reshape(initial_env_step.observation, [-1, 1]), [1, self._num_actions]) initial_actions = tf.tile( tf.reshape(tf.range(self._num_actions), [1, -1]), [initial_env_step.observation.shape[0], 1]) initial_nu_indices = self._get_index(initial_states, initial_actions) # linear term w.r.t. 
initial distribution #b_vec_2 = tf.stack([ # tf.reduce_sum( # tf.reshape( # data_weights[:, itr] / tf.reduce_sum(data_weights[:, itr]), # [-1, 1]) * tf.reduce_sum( # tf.one_hot(initial_nu_indices, self._dimension) * # (1 - self._gamma) * # tf.expand_dims(initial_target_probs, axis=-1), # axis=1), # axis=0) for itr in range(self._num_limits) #], # axis=0) next_states = tf.tile( tf.reshape(next_env_step.observation, [-1, 1]), [1, self._num_actions]) next_actions = tf.tile( tf.reshape(tf.range(self._num_actions), [1, -1]), [next_env_step.observation.shape[0], 1]) next_nu_indices = self._get_index(next_states, next_actions) next_nu_indices = tf.where( tf.expand_dims(next_env_step.is_absorbing(), -1), -1 * tf.ones_like(next_nu_indices), next_nu_indices) nu_indices = self._get_index(env_step.observation, env_step.action) target_log_probabilities = target_policy.distribution( tfagents_env_step).action.log_prob(env_step.action) if not self._solve_for_state_action_ratio: policy_ratio = tf.exp(target_log_probabilities - env_step.get_log_probability()) else: policy_ratio = tf.ones([ target_log_probabilities.shape[0], ]) policy_ratios = tf.tile( tf.reshape(policy_ratio, [-1, 1]), [1, self._num_actions]) # the tabular feature vector a_vec = tf.one_hot(nu_indices, self._dimension) - tf.reduce_sum( self._gamma * tf.expand_dims(next_target_probs * policy_ratios, axis=-1) * tf.one_hot(next_nu_indices, self._dimension), axis=1) # linear term w.r.t. reward #b_vec_1 = tf.stack([ # tf.reduce_sum( # tf.reshape( # (gamma_data_weights[:, itr] / # tf.reduce_sum(gamma_data_weights[:, itr])) * self._reward_fn(env_step), #/ # #tf.cast(state_action_count, tf.float32), # [-1, 1]) * a_vec, # axis=0) for itr in range(self._num_limits) #], # axis=0) # quadratic term of feature # Get weighted outer product by using einsum to save computing resource! 
#a_mat = tf.stack([ # tf.einsum( # 'ai, a, aj -> ij', a_vec, # #1.0 / tf.cast(state_action_count, tf.float32), # gamma_data_weights[:, itr] / # tf.reduce_sum(gamma_data_weights[:, itr]), # a_vec) # for itr in range(self._num_limits) #], # axis=0) td_mat = tf.einsum('ai, a, aj -> ij', tf.one_hot(nu_indices, self._dimension), 1.0 / tf.cast(state_action_count, tf.float32), a_vec) weighted_rewards = policy_ratio * self._reward_fn(env_step) bias = tf.reduce_sum( tf.one_hot(nu_indices, self._dimension) * tf.reshape(weighted_rewards, [-1, 1]) * 1.0 / tf.cast(state_action_count, tf.float32)[:, None], axis=0) # Initialize self._nu = np.ones_like(self._nu) * bias[:, None] self._nu2 = np.ones_like(self._nu2) * bias[:, None] self._a_vec = a_vec self._td_mat = td_mat self._bias = bias self._weighted_rewards = weighted_rewards self._state_action_count = state_action_count self._nu_indices = nu_indices self._initial_nu_indices = initial_nu_indices self._initial_target_probs = initial_target_probs self._gamma_sample = gamma_sample self._gamma_sample = tf.ones_like(gamma_sample) saddle_bellman_residuals = ( tf.matmul(self._a_vec, self._nu) - self._weighted_rewards[:, None]) saddle_bellman_residuals *= -1 * self._algae_alpha_sign saddle_zetas = tf.gather(self._zeta, self._nu_indices) saddle_initial_nu_values = tf.reduce_sum( # Average over actions. self._initial_target_probs[:, :, None] * tf.gather(self._nu, self._initial_nu_indices), axis=1) saddle_init_nu_loss = ((1 - self._gamma) * saddle_initial_nu_values * self._algae_alpha_sign) saddle_bellman_residuals2 = ( tf.matmul(self._a_vec, self._nu2) - self._weighted_rewards[:, None]) saddle_bellman_residuals2 *= 1 * self._algae_alpha_sign saddle_zetas2 = tf.gather(self._zeta2, self._nu_indices) saddle_initial_nu_values2 = tf.reduce_sum( # Average over actions. 
self._initial_target_probs[:, :, None] * tf.gather(self._nu2, self._initial_nu_indices), axis=1) saddle_init_nu_loss2 = ((1 - self._gamma) * saddle_initial_nu_values2 * -1 * self._algae_alpha_sign) saddle_loss = 0.5 * ( saddle_init_nu_loss + saddle_bellman_residuals * saddle_zetas + -tf.math.abs(self._algae_alpha) * 0.5 * tf.square(saddle_zetas) + -saddle_init_nu_loss2 + -saddle_bellman_residuals2 * saddle_zetas2 + tf.math.abs(self._algae_alpha) * 0.5 * tf.square(saddle_zetas2)) # Binary search to find best alpha. left = tf.constant([-8., -8.]) right = tf.constant([32., 32.]) for _ in range(16): mid = 0.5 * (left + right) self._alpha.assign(mid) weights, log_weights = self._get_weights(saddle_loss * self._gamma_sample[:, None]) divergence = self._compute_divergence(weights, log_weights) divergence_violation = divergence - self._two_sided_limit left = tf.where(divergence_violation > 0., mid, left) right = tf.where(divergence_violation > 0., right, mid) self._alpha.assign(0.5 * (left + right)) weights, log_weights = self._get_weights(saddle_loss * self._gamma_sample[:, None]) gamma_data_weights = tf.stop_gradient(weights * self._gamma_sample[:, None]) #print(tf.concat([gamma_data_weights, saddle_loss], axis=-1)) avg_saddle_loss = ( tf.reduce_sum(gamma_data_weights * saddle_loss, axis=0) / tf.reduce_sum(gamma_data_weights, axis=0)) weighted_state_action_count = tf.reduce_sum( tf.one_hot(self._nu_indices, self._dimension)[:, :, None] * weights[:, None, :], axis=0) weighted_state_action_count = tf.gather(weighted_state_action_count, self._nu_indices) my_td_mat = tf.einsum( 'ai, ab, ab, aj -> bij', tf.one_hot(self._nu_indices, self._dimension), #1.0 / tf.cast(self._state_action_count, tf.float32), 1.0 / weighted_state_action_count, weights, self._a_vec) my_bias = tf.reduce_sum( tf.transpose(weights)[:, :, None] * tf.one_hot(self._nu_indices, self._dimension)[None, :, :] * tf.reshape(self._weighted_rewards, [1, -1, 1]) * #1.0 / tf.cast(self._state_action_count, 
tf.float32)[None, :, None], 1.0 / tf.transpose(weighted_state_action_count)[:, :, None], axis=1) #print('hello', saddle_initial_nu_values[:1], saddle_zetas[:3], # self._nu[:2], my_bias[:, :2], saddle_loss[:4]) with tf.GradientTape( watch_accessed_variables=False, persistent=True) as tape: tape.watch([self._nu, self._nu2, self._alpha]) bellman_residuals = tf.matmul( my_td_mat, tf.transpose(self._nu)[:, :, None]) - my_bias[:, :, None] bellman_residuals = tf.transpose(tf.squeeze(bellman_residuals, -1)) bellman_residuals = tf.gather(bellman_residuals, self._nu_indices) initial_nu_values = tf.reduce_sum( # Average over actions. self._initial_target_probs[:, :, None] * tf.gather(self._nu, self._initial_nu_indices), axis=1) bellman_residuals *= self._algae_alpha_sign init_nu_loss = ((1 - self._gamma) * initial_nu_values * self._algae_alpha_sign) nu_loss = ( tf.math.square(bellman_residuals) / 2.0 + tf.math.abs(self._algae_alpha) * init_nu_loss) loss = ( gamma_data_weights * nu_loss / tf.reduce_sum(gamma_data_weights, axis=0, keepdims=True)) bellman_residuals2 = tf.matmul( my_td_mat, tf.transpose(self._nu2)[:, :, None]) - my_bias[:, :, None] bellman_residuals2 = tf.transpose(tf.squeeze(bellman_residuals2, -1)) bellman_residuals2 = tf.gather(bellman_residuals2, self._nu_indices) initial_nu_values2 = tf.reduce_sum( # Average over actions. 
self._initial_target_probs[:, :, None] * tf.gather(self._nu2, self._initial_nu_indices), axis=1) bellman_residuals2 *= -1 * self._algae_alpha_sign init_nu_loss2 = ((1 - self._gamma) * initial_nu_values2 * -1 * self._algae_alpha_sign) nu_loss2 = ( tf.math.square(bellman_residuals2) / 2.0 + tf.math.abs(self._algae_alpha) * init_nu_loss2) loss2 = ( gamma_data_weights * nu_loss2 / tf.reduce_sum(gamma_data_weights, axis=0, keepdims=True)) divergence = self._compute_divergence(weights, log_weights) divergence_violation = divergence - self._two_sided_limit alpha_loss = (-tf.exp(self._alpha) * tf.stop_gradient(divergence_violation)) extra_loss = tf.reduce_sum(tf.math.square(self._nu[-1, :])) extra_loss2 = tf.reduce_sum(tf.math.square(self._nu2[-1, :])) nu_grad = tape.gradient(loss + extra_loss, [self._nu])[0] nu_grad2 = tape.gradient(loss2 + extra_loss2, [self._nu2])[0] avg_loss = tf.reduce_sum( 0.5 * (loss - loss2) / tf.math.abs(self._algae_alpha), axis=0) nu_jacob = tape.jacobian(nu_grad, [self._nu])[0] nu_hess = tf.stack([nu_jacob[:, i, :, i] for i in range(self._num_limits)], axis=0) nu_jacob2 = tape.jacobian(nu_grad2, [self._nu2])[0] nu_hess2 = tf.stack( [nu_jacob2[:, i, :, i] for i in range(self._num_limits)], axis=0) for idx, div in enumerate(divergence): tf.summary.scalar('divergence%d' % idx, div) #alpha_grads = tape.gradient(alpha_loss, [self._alpha]) #alpha_grad_op = self._alpha_optimizer.apply_gradients( # zip(alpha_grads, [self._alpha])) #self._alpha.assign(tf.minimum(8., tf.maximum(-8., self._alpha))) #print(self._alpha, tf.concat([weights, nu_loss], -1)) #regularizer = 0.1 nu_transformed = tf.transpose( tf.squeeze( tf.linalg.solve(nu_hess + regularizer * tf.eye(self._dimension), tf.expand_dims(-tf.transpose(nu_grad), axis=-1)))) self._nu = self._nu + 0.1 * nu_transformed nu_transformed2 = tf.transpose( tf.squeeze( tf.linalg.solve(nu_hess2 + regularizer * tf.eye(self._dimension), tf.expand_dims(-tf.transpose(nu_grad2), axis=-1)))) self._nu2 = self._nu2 + 0.1 
* nu_transformed2 print(avg_loss * self._algae_alpha_sign, avg_saddle_loss * self._algae_alpha_sign, self._nu[:2], divergence) #print(init_nu_loss[:8], init_nu_loss[-8:]) #print(bellman_residuals[:8]) #print(self._nu[:3], self._zeta[:3]) zetas = tf.matmul(my_td_mat, tf.transpose(self._nu)[:, :, None]) - my_bias[:, :, None] zetas = tf.transpose(tf.squeeze(zetas, -1)) zetas *= -self._algae_alpha_sign zetas /= tf.math.abs(self._algae_alpha) self._zeta = self._zeta + 0.1 * (zetas - self._zeta) zetas2 = tf.matmul(my_td_mat, tf.transpose(self._nu2)[:, :, None]) - my_bias[:, :, None] zetas2 = tf.transpose(tf.squeeze(zetas2, -1)) zetas2 *= 1 * self._algae_alpha_sign zetas2 /= tf.math.abs(self._algae_alpha) self._zeta2 = self._zeta2 + 0.1 * (zetas2 - self._zeta2) #self._zeta = ( # tf.einsum('ij,ja-> ia', self._td_mat, self._nu) - # tf.transpose(my_bias)) #self._zeta *= -tf.reshape(self._algae_alpha_sign, [1, self._num_limits]) #self._zeta /= tf.math.abs(self._algae_alpha) return [ avg_saddle_loss * self._algae_alpha_sign, avg_loss * self._algae_alpha_sign, divergence ]
def test_works_like_conv2d_transpose(self, input_shape, filter_shape,
                                     channels_out, strides, padding,
                                     dilations):
    """Compares the conv function under test with a reference transposed conv.

    The reference result comes from `tf.nn.conv2d_transpose`, or from a Keras
    `Conv2DTranspose` layer when any dilation is larger than 1. The test also
    requires the gradients of the tested output with respect to both the
    input and the kernel to be non-None.
    """
    # `_get_output_shape` always receives the strides exactly as passed in,
    # even when the function under test is given a single scalar stride.
    full_strides = strides
    if not self.unequal_strides_ok:
        if strides[0] != strides[1]:
            # The method under test does not support unequal strides, so this
            # parameterization is silently skipped.
            return
        strides = strides[0]

    inputs, kernel = _make_input_and_kernel(
        self.make_input,
        input_batch_shape=[],
        input_shape=input_shape,
        # A singleton kernel batch shape avoids the short circuit to
        # `conv2d_transpose`.
        kernel_batch_shape=[1],
        filter_shape=filter_shape,
        channels_out=channels_out,
        dtype=self.dtype)

    expected_shape, reference_strides = convolution_util._get_output_shape(
        rank=2,
        strides=full_strides,
        padding=padding,
        dilations=dilations,
        input_shape=input_shape,
        output_size=channels_out,
        filter_shape=filter_shape)

    # Reshape the kernel to filter_shape + [channels_in, channels_out], then
    # swap the two channel axes for the reference implementations.
    kernel_shape = ps.concat(
        [filter_shape, [input_shape[-1], channels_out]], axis=0)
    reference_kernel = tf.transpose(
        tf.reshape(kernel, kernel_shape), perm=[0, 1, 3, 2])

    has_dilation = any(d > 1 for d in dilations)
    if not has_dilation:
        y_expected = tf.nn.conv2d_transpose(
            inputs,
            reference_kernel,
            output_shape=expected_shape,
            strides=reference_strides,
            padding=padding,
            dilations=dilations)
    else:
        # conv2d_transpose does not support dilations > 1; use Keras instead.
        reference_layer = tf.keras.layers.Conv2DTranspose(
            filters=channels_out,
            kernel_size=filter_shape,
            strides=strides,
            padding=padding,
            dilation_rate=dilations,
            use_bias=False)
        reference_layer(inputs)  # build kernel
        reference_layer.kernel = reference_kernel
        y_expected = reference_layer(inputs)

    conv_fn = self.make_conv_fn(filter_shape, strides, padding, dilations)
    watched = [inputs, kernel]
    with tf.GradientTape() as tape:
        tape.watch(watched)
        y_actual = conv_fn(inputs, kernel)
    self.assertAllNotNone(tape.gradient(y_actual, watched))

    expected_value, actual_value = self.evaluate([y_expected, y_actual])
    self.assertAllClose(expected_value, actual_value, rtol=1e-5, atol=0)
def pass_arg(Xx, nsim, tr_size, num_iter):
    """Fits GP kernel hyperparameters with a physics penalty and samples a GP.

    The amplitude and the two per-feature length scales of an input-scaled
    exponentiated-quadratic kernel are optimised with Adam. The objective is
    `lam * physics_penalty - joint_log_prob`, where the penalty is the mean
    amount by which a GP sample at the unlabeled points falls below zero or
    rises above the initial porosity. After training, `nsim` samples are drawn
    from a `GaussianProcessRegressionModel` at `Xx`.

    Args:
        Xx: Index points at which the final samples are drawn.
        nsim: Number of samples to draw (converted with `int`).
        tr_size: Number of leading labeled rows used for training (converted
            with `int`).
        num_iter: Number of Adam steps (converted with `int`).

    Returns:
        A NumPy array holding the drawn samples with axis 1 squeezed out.

    Side effects:
        Prints `tr_size`, reads two data files under `../data/`, and resets
        the Python/NumPy/TensorFlow seeds and the TF1 Keras session.
    """
    print("Tr_size:", tr_size)

    def fix_seeds(seed):
        """Seeds every RNG in use and installs a single-threaded TF1 session."""
        random.seed(seed)
        np.random.seed(seed)
        tf.random.set_seed(seed)
        session_conf = tf.compat.v1.ConfigProto(
            intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
        sess = tf.compat.v1.Session(
            graph=tf.compat.v1.get_default_graph(), config=session_conf)
        # K.set_session(sess)
        tf.compat.v1.keras.backend.set_session(sess)

    fix_seeds(1)

    class InputTransformedKernel(
            tfp.math.psd_kernels.PositiveSemidefiniteKernel):
        """Kernel that applies `transformation` to both inputs of `kernel`."""

        def __init__(self,
                     kernel,
                     transformation,
                     name='InputTransformedKernel'):
            self._kernel = kernel
            self._transformation = transformation
            super(InputTransformedKernel,
                  self).__init__(feature_ndims=kernel.feature_ndims,
                                 dtype=kernel.dtype,
                                 name=name)

        def apply(self, x1, x2):
            return self._kernel.apply(self._transformation(x1),
                                      self._transformation(x2))

        def matrix(self, x1, x2):
            return self._kernel.matrix(self._transformation(x1),
                                       self._transformation(x2))

        @property
        def batch_shape(self):
            return self._kernel.batch_shape

        def batch_shape_tensor(self):
            # Call the wrapped kernel's method. Returning the bound method
            # itself (as the code did before) hands callers a function object
            # where they expect a shape tensor.
            return self._kernel.batch_shape_tensor()

    class InputScaledKernel(InputTransformedKernel):
        """Divides inputs by per-feature length scales before `kernel`."""

        def __init__(self, kernel, length_scales):
            super(InputScaledKernel, self).__init__(
                kernel, lambda x: x / tf.expand_dims(
                    length_scales, -(kernel.feature_ndims + 1)))

    # Load labeled data: the first two columns are the inputs, and the
    # second-to-last column is the only target. Per the original note, the
    # last column (porosity) is not needed as a prediction target.
    data = np.loadtxt('../data/labeled_data.dat')
    x_labeled = data[:, :2].astype(np.float64)
    y_labeled = data[:, -2:-1].astype(np.float64)

    # Normalize the inputs with MinMaxScaler.
    scaler = preprocessing.MinMaxScaler(feature_range=(0.0, 1.0))
    x_labeled = scaler.fit_transform(x_labeled)
    # y_labeled = scaler.fit_transform(y_labeled)

    tr_size = int(tr_size)

    # Training split; the remaining rows are not used in this function.
    trainX, trainY = x_labeled[:tr_size, :], y_labeled[:tr_size]
    # testX, testY = x_labeled[tr_size:,:], y_labeled[tr_size:]
    trainY = np.transpose(trainY)
    # testY = np.transpose(testY)

    data_phyloss = np.loadtxt('../data/unlabeled_data_BK_constw_v2_1525.dat')
    x_unlabeled = data_phyloss[:, :]

    # Initial porosity is the last column of the unlabeled file.
    initporo = x_unlabeled[:, -1]

    # Keep the first 1303 rows and the last 6 rows of the unlabeled set.
    # NOTE(review): these row counts are hard-coded to this data file.
    x_unlabeled1 = x_unlabeled[:1303, :2]
    x_unlabeled2 = x_unlabeled[-6:, :2]
    x_unlabeled = np.vstack((x_unlabeled1, x_unlabeled2))
    # NOTE(review): `fit_transform` re-fits the scaler on the unlabeled
    # inputs, so they are scaled with a different min/max than the labeled
    # inputs -- confirm this is intended (vs. `scaler.transform`).
    x_unlabeled = scaler.fit_transform(x_unlabeled)
    init_poro1 = initporo[:1303]
    init_poro2 = initporo[-6:]
    init_poro = np.hstack((init_poro1, init_poro2))

    def build_gp(amplitude, length_scale):
        """Defines the conditional dist. of GP outputs, given kernel parameters."""
        # The same kernel construction is used for the training prior here
        # and for the sampling models further below.
        se_kernel = tfk.ExponentiatedQuadratic(
            amplitude)  # length_scale = None here, implicitly

        # This is the "ARD" kernel: a separate length scale per input feature.
        kernel = InputScaledKernel(se_kernel, length_scale)

        # GP prior over the training inputs, used to train the parameters.
        return tfd.GaussianProcess(kernel=kernel, index_points=trainX)

    gp_joint_model = tfd.JointDistributionNamedAutoBatched({
        'amplitude':
        tfd.TransformedDistribution(distribution=tfd.Normal(
            loc=0., scale=np.float64(1.)),
                                    bijector=tfb.Exp(),
                                    batch_shape=[1]),
        'length_scale':
        tfd.TransformedDistribution(distribution=tfd.Normal(
            loc=0., scale=np.float64(1.)),
                                    bijector=tfb.Exp(),
                                    batch_shape=[2]),
        'observations':
        build_gp,
    })

    # Create the trainable model parameters, which we'll subsequently
    # optimize. Note that we constrain them to be strictly positive.
    constrain_positive = tfb.Shift(np.finfo(np.float64).tiny)(tfb.Exp())

    amplitude_var = tfp.util.TransformedVariable(
        initial_value=np.random.uniform(size=1),
        bijector=constrain_positive,
        name='amplitude',
        dtype=np.float64)

    length_scale_var = tfp.util.TransformedVariable(
        initial_value=np.random.uniform(size=[2]),
        bijector=constrain_positive,
        name='length_scale',
        dtype=np.float64)

    trainable_variables = [
        v.trainable_variables[0] for v in [amplitude_var, length_scale_var]
    ]

    @tf.function(autograph=False, experimental_compile=False)
    def target_log_prob(amplitude, length_scale, poroi, lam):
        """Returns the minimised loss: `lam * penalty - joint log-prob`."""
        tf.random.set_seed(1234)
        se_kernel = tfk.ExponentiatedQuadratic(
            amplitude)  # length_scale = None here, implicitly
        optimized_kernel = InputScaledKernel(se_kernel, length_scale)
        gprm = tfd.GaussianProcessRegressionModel(kernel=optimized_kernel,
                                                  index_points=x_unlabeled)
        samples = gprm.sample(1)
        pred = tf.squeeze(samples, axis=0)

        # Penalise predictions below zero or above the initial porosity.
        phyloss_poro = tf.math.reduce_mean(
            tf.nn.relu(tf.negative(pred)) + tf.nn.relu(pred - poroi))

        # print("phyloss_poro:",lam*phyloss_poro)
        # return lam*phyloss_poro
        return lam * phyloss_poro - gp_joint_model.log_prob(
            {
                'amplitude': amplitude,
                'length_scale': length_scale,
                'observations': trainY
            })

    fix_seeds(1)

    # Optimize the model parameters.
    num_iters = int(num_iter)
    lam = 100000
    optimizer = tf.optimizers.Adam(learning_rate=.1)

    # Store the loss values during training, so the progress can be plotted.
    lls_ = np.zeros(num_iters, np.float64)

    for i in range(num_iters):
        with tf.GradientTape() as tape:
            loss = target_log_prob(amplitude_var, length_scale_var, init_poro,
                                   lam)  # physics loss & normal loss

        # print(i,"loss_inloop:",loss)
        grads = tape.gradient(loss, trainable_variables)
        optimizer.apply_gradients(zip(grads, trainable_variables))
        lls_[i] = loss

    # print('Trained parameters:')
    # print('amplitude: {}'.format(amplitude_var._value().numpy()))
    # print('length_scale: {}'.format(length_scale_var._value().numpy()))

    # tf.random.set_seed(1234)
    fix_seeds(1)
    se_kernel = tfk.ExponentiatedQuadratic(
        amplitude_var)  # length_scale = None here, implicitly
    optimized_kernel = InputScaledKernel(se_kernel, length_scale_var)
    gprm = tfd.GaussianProcessRegressionModel(kernel=optimized_kernel,
                                              index_points=Xx)
    preds = gprm.sample(int(nsim))
    # Drops axis 1, which must have size 1 for `tf.squeeze` to succeed
    # (presumably the batch dimension from the shape-[1] amplitude -- confirm).
    samples = np.array(tf.squeeze(preds, axis=1))

    return samples
def compute_and_plot_saliency(model, image_path):
    """
    This function computes and plots the saliency plot.

    You need to compute the matrix M detailed in section 3.1 in
    K. Simonyan, A. Vedaldi, and A. Zisserman,
    "Deep inside convolutional networks: Visualising image classification
    models and saliency maps," 2013, Available at
    https://arxiv.org/abs/1312.6034.

    The code inside the placeholder must define two names that the plotting
    code below reads: `M` (the saliency map handed to `plt.imshow`) and
    `top_class` (an index into `LABELS`). Until that code is written, this
    function raises `NotImplementedError`.

    :param model: Model which is used
    :param image_path: Path to the image to be analysed
    :return: None
    :raises NotImplementedError: while the saliency computation is missing
    """
    raw_image = tf.dtypes.cast(decode_jpeg(image_path), tf.float32)

    # Sub-model mapping the model input to the output of the 'classifier'
    # layer.
    logits_tensor = model.get_layer('classifier')
    logits_model = tf.keras.Model(model.input, logits_tensor.output)

    with tf.GradientTape() as t:
        ######### Your code starts here #########
        # An empty `with` body is a syntax error that would stop the whole
        # module from importing (breaking the 'brute' and 'conv' schemes too),
        # so fail explicitly here instead. Replace this `raise` with the
        # computation of `M` and `top_class`.
        raise NotImplementedError(
            'compute_and_plot_saliency: the saliency computation has not '
            'been implemented yet.')
        ######### Your code ends here #########

    plt.subplot(2, 1, 1)
    plt.imshow(M)
    plt.title('Saliency with respect to predicted class %s' %
              LABELS[top_class])
    plt.subplot(2, 1, 2)
    plt.imshow(decode_jpeg(image_path).numpy())
    plt.savefig("../plots/saliency.png")
    plt.show()


def plot_classification(image_path, classification_array):
    """Plots three class-probability maps next to the image itself.

    Channels 0, 1 and 2 of `classification_array` are drawn as heat maps in
    the first three panels of a 2x2 grid, titled with the matching `LABELS`
    entries. The fourth panel shows the image. The figure is saved to
    `../plots/detect.png` and then displayed.

    :param image_path: Path to the image that was classified
    :param classification_array: Array whose first two axes index the grid of
        classified windows and whose last axis indexes the class
    :return: None
    """
    nH, nW, _ = classification_array.shape
    image_data = decode_jpeg(image_path).numpy()
    aspect_ratio = float(image_data.shape[0]) / image_data.shape[1]
    plt.figure(figsize=(8, 8 * aspect_ratio))

    # One heat map per class. The aspect ratio is rescaled so that each
    # nH x nW grid is drawn with the same proportions as the image.
    for class_idx in range(3):
        panel = plt.subplot(2, 2, class_idx + 1)
        plt.imshow(classification_array[:, :, class_idx],
                   interpolation='none',
                   cmap='jet')
        plt.title('%s probability' % LABELS[class_idx])
        panel.set_aspect(aspect_ratio * nW / nH)
        plt.colorbar()

    plt.subplot(2, 2, 4)
    plt.imshow(image_data)
    plt.savefig("../plots/detect.png")
    plt.show()


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--image', type=str)
    parser.add_argument('--scheme', type=str)
    FLAGS, _ = parser.parse_known_args()
    maybe_makedirs("../plots")

    model = tf.keras.models.load_model('./trained_models/trained.h5')
    # Dispatch on the requested visualisation scheme.
    if FLAGS.scheme == 'brute':
        plot_classification(
            FLAGS.image,
            compute_brute_force_classification(model, FLAGS.image, 8, 8))
    elif FLAGS.scheme == 'conv':
        plot_classification(
            FLAGS.image,
            compute_convolutional_KxK_classification(model, FLAGS.image))
    elif FLAGS.scheme == 'saliency':
        compute_and_plot_saliency(model, FLAGS.image)
    else:
        print('Unrecognized scheme:', FLAGS.scheme)
def testDistribution(self, dist_name, data):
    """Checks variable handling of a drawn distribution.

    Covers, in order: pass-through of Variable parameters to the accessor
    properties, bounded parameter reads in statistics, `sample`,
    `kl_divergence` and the probability evaluations, and non-None gradients
    for `sample` (if fully reparameterized), `kl_divergence` and `log_prob`,
    subject to the module-level exemption tables.
    """
    seed = test_util.test_seed()

    # Explicitly draw event_dim here to avoid relying on _params_event_ndims
    # later, so this test can support distributions that do not implement the
    # slicing protocol.
    event_dim = data.draw(hps.integers(min_value=2, max_value=6))
    dist = data.draw(
        dhps.distributions(dist_name=dist_name,
                           event_dim=event_dim,
                           enable_vars=True))
    # The second distribution (used for KL) gets a batch shape that is
    # broadcast-compatible with the first one's.
    other_batch_shape = data.draw(
        tfp_hps.broadcast_compatible_shape(dist.batch_shape))
    dist2 = data.draw(
        dhps.distributions(dist_name=dist_name,
                           batch_shape=other_batch_shape,
                           event_dim=event_dim,
                           enable_vars=True))
    self.evaluate([v.initializer for v in dist.variables])

    # Check that the distribution passes Variables through to the accessor
    # properties (without converting them to Tensor or anything like that).
    for param_name, param_value in six.iteritems(dist.parameters):
        if tensor_util.is_ref(param_value):
            self.assertIs(getattr(dist, param_name), param_value)

    # Check that standard statistics do not read distribution parameters more
    # than twice (once in the stat itself and up to once in any validation
    # assertions).
    max_permissible = 2 + extra_tensor_conversions_allowed(dist)
    stat_names = [
        'covariance', 'entropy', 'mean', 'mode', 'stddev', 'variance'
    ]
    drawn_stats = data.draw(
        hps.sets(hps.one_of(map(hps.just, stat_names)),
                 min_size=3,
                 max_size=3))
    for stat in sorted(drawn_stats):
        hp.note('Testing excessive var usage in {}.{}'.format(
            dist_name, stat))
        try:
            with tfp_hps.assert_no_excessive_var_usage(
                    'statistic `{}` of `{}`'.format(stat, dist),
                    max_permissible=max_permissible):
                getattr(dist, stat)()
        except NotImplementedError:
            # Not every distribution implements every statistic.
            pass

    # Check that `sample` doesn't read distribution parameters more than
    # twice, and that it produces non-None gradients (if the distribution is
    # fully reparameterized).
    with tf.GradientTape() as tape:
        # TDs do bijector assertions twice (once by distribution.sample, and
        # once by bijector.forward).
        max_permissible = 2 + extra_tensor_conversions_allowed(dist)
        with tfp_hps.assert_no_excessive_var_usage(
                'method `sample` of `{}`'.format(dist),
                max_permissible=max_permissible):
            sample = dist.sample(seed=seed)
    if dist.reparameterization_type == tfd.FULLY_REPARAMETERIZED:
        sample_grads = tape.gradient(sample, dist.variables)
        exempt_names = NO_SAMPLE_PARAM_GRADS.get(dist_name, ())
        for grad, var in zip(sample_grads, dist.variables):
            # Strip trailing digits, `_` and `:` before consulting the
            # exemption table.
            var_name = var.name.rstrip('_0123456789:')
            if var_name in exempt_names:
                continue
            if grad is None:
                raise AssertionError(
                    'Missing sample -> {} grad for distribution {}'.format(
                        var_name, dist_name))

    # Turn off validations, since TODO(b/129271256) log_prob can choke on
    # dist's own samples. Also, to relax conversion counts for KL (might do
    # >2 w/ validate_args).
    dist = dist.copy(validate_args=False)
    dist2 = dist2.copy(validate_args=False)

    # Test that KL divergence reads distribution parameters at most once, and
    # that it produces non-None gradients. The `try` spans both orderings, so
    # a missing KL registration for the first pair also skips the second.
    try:
        for d1, d2 in ((dist, dist2), (dist2, dist)):
            if dist_name in SKIP_KL_CHECK_DIST_VAR_GRADS:
                continue
            usage_label = (
                '`kl_divergence` of (`{}` (vars {}), `{}` (vars {}))'.format(
                    d1, d1.variables, d2, d2.variables))
            with tf.GradientTape() as tape:
                # No validation => 1 convert per var.
                with tfp_hps.assert_no_excessive_var_usage(
                        usage_label, max_permissible=1):
                    kl = d1.kl_divergence(d2)
            wrt_vars = list(d1.variables) + list(d2.variables)
            kl_grads = tape.gradient(kl, wrt_vars)
            for grad, var in zip(kl_grads, wrt_vars):
                if grad is None and dist_name not in NO_KL_PARAM_GRADS:
                    raise AssertionError(
                        'Missing KL({} || {}) -> {} grad:\n'  # pylint: disable=duplicate-string-formatting-argument
                        '{} vars: {}\n{} vars: {}'.format(
                            d1, d2, var, d1, d1.variables, d2,
                            d2.variables))
    except NotImplementedError:
        # Raised by kl_divergence if no registered KL is found.
        pass

    # Test that log_prob produces non-None gradients, except for
    # distributions on the NO_LOG_PROB_PARAM_GRADS blocklist.
    if dist_name not in NO_LOG_PROB_PARAM_GRADS:
        with tf.GradientTape() as tape:
            lp = dist.log_prob(tf.stop_gradient(sample))
        lp_grads = tape.gradient(lp, dist.variables)
        for grad, var in zip(lp_grads, dist.variables):
            if grad is None:
                raise AssertionError(
                    'Missing log_prob -> {} grad for distribution {}'.format(
                        var, dist_name))

    # Test that all forms of probability evaluation avoid reading
    # distribution parameters more than once.
    evaluative_names = [
        'log_prob', 'prob', 'log_cdf', 'cdf', 'log_survival_function',
        'survival_function'
    ]
    drawn_evaluatives = data.draw(
        hps.sets(hps.one_of(map(hps.just, evaluative_names)),
                 min_size=3,
                 max_size=3))
    for evaluative in sorted(drawn_evaluatives):
        hp.note('Testing excessive var usage in {}.{}'.format(
            dist_name, evaluative))
        try:
            # No validation => 1 convert. But for TD we allow 2:
            # dist.log_prob(bijector.inverse(samp)) + bijector.ildj(samp)
            max_permissible = 2 + extra_tensor_conversions_allowed(dist)
            with tfp_hps.assert_no_excessive_var_usage(
                    'evaluative `{}` of `{}`'.format(evaluative, dist),
                    max_permissible=max_permissible):
                getattr(dist, evaluative)(sample)
        except NotImplementedError:
            pass
u = 0 jds = tfd.JointDistributionSequential([ tfd.Normal(loc=x, scale=1.), # m tfd.Normal(loc=y, scale=1.), # b lambda b, m: tfd.Normal(loc=m * X + b, scale=1.) # Y ]) return jds.log_prob(x, y, z) print('gradient ', tfp.math.value_and_gradient(logp, [[1.0], 2.0, [5.0, 3.0, 2.2]])) x = tf.Variable(0.1) beta = tf.Variable(1.36) q = tf.Variable(2.1) x.assign(2.25) with tf.GradientTape() as tape: upgamma = (3 - q) / (2 * (q - 1)) num = (3.14**.5) * tf.math.exp(tf.math.lgamma(upgamma)) downgamma = 1 / (q - 1) den = ((q - 1)**.5) * tf.math.exp(tf.math.lgamma(downgamma)) cq = num / den pd = tf.math.pow( (1 - (1 - q) * beta * x**2), 1 / (1 - q)) * (beta**.5) * (1 / cq) lpd = tf.math.log(pd) x.assign(4.2) with tf.GradientTape() as tape1: trial = tf.math.exp(x) #print(tape.gradient(lpd,[beta,q])) #print(tape1.gradient(trial,[x]))
def step_fn(inputs):
    """Per-Replica StepFn.

    Tiles the images once per ensemble member, samples a lambda vector per
    tiled example, applies one optimizer step (with a separate learning-rate
    multiplier for the fast weights) and updates the training metrics.
    """
    images, labels = inputs
    images = tf.tile(images, [FLAGS.ensemble_size, 1, 1, 1])

    # Generate lambdas: one row of `lambdas_config.dim` values per tiled
    # example.
    lambdas = tf.reshape(
        log_uniform_sample(per_core_batch_size, lambda_parameters),
        (FLAGS.ensemble_size * per_core_batch_size, lambdas_config.dim))

    with tf.GradientTape() as tape:
        logits = model([images, lambdas], training=True)
        if FLAGS.use_bfloat16:
            logits = tf.cast(logits, tf.float32)

        if not FLAGS.use_gibbs_ce:
            # Ensemble CE uses no tiling of the labels.
            negative_log_likelihood = ensemble_crossentropy(
                labels, logits, FLAGS.ensemble_size)
        else:
            # Gibbs CE is the average of the single-model CEs; the labels
            # are tiled only in this case.
            labels = tf.tile(labels, [FLAGS.ensemble_size])
            per_example_ce = tf.keras.losses.sparse_categorical_crossentropy(
                labels, logits, from_logits=True)
            negative_log_likelihood = tf.reduce_mean(per_example_ce)

        # Note: Divide l2_loss by sample_size (this differs from
        # uncertainty_baselines implementation.)
        l2_loss = sum(model.losses) / train_sample_size
        loss = negative_log_likelihood + l2_loss
        # Scale the loss given the TPUStrategy will reduce sum all gradients.
        scaled_loss = loss / strategy.num_replicas_in_sync

    trainable = model.trainable_variables
    grads = tape.gradient(scaled_loss, trainable)

    def _is_fast_weight(variable):
        """True for non-batch-norm variables named with 'alpha' or 'gamma'."""
        name = variable.name
        return ('batch_norm' not in name
                and ('alpha' in name or 'gamma' in name))

    # Separate learning rate for fast weights.
    grads_and_vars = [
        (grad * FLAGS.fast_weight_lr_multiplier
         if _is_fast_weight(var) else grad, var)
        for grad, var in zip(grads, trainable)
    ]
    optimizer.apply_gradients(grads_and_vars)

    probs = tf.nn.softmax(logits)
    # Stack the per-member probabilities along a new leading axis.
    per_probs_stacked = tf.stack(
        tf.split(probs, num_or_size_splits=FLAGS.ensemble_size, axis=0),
        axis=0)

    # NOTE(review): `labels` is tiled only in the Gibbs-CE branch, while
    # `probs`/`logits` always cover the tiled batch -- confirm the metric
    # inputs line up when `use_gibbs_ce` is off.
    metrics['train/ece'].update_state(labels, probs)
    metrics['train/loss'].update_state(loss)
    metrics['train/negative_log_likelihood'].update_state(
        negative_log_likelihood)
    metrics['train/accuracy'].update_state(labels, logits)

    diversity_results = um.average_pairwise_diversity(
        per_probs_stacked, FLAGS.ensemble_size)
    for key, value in diversity_results.items():
        metrics['train/' + key].update_state(value)

    # Recover the (possibly rescaled) gradients from the applied pairs.
    if grads_and_vars:
        grads, _ = zip(*grads_and_vars)
def testTrain(self, layer_id, rng_updater_id, batch_size, trax_has_weights,
              explicit_build, use_model):
    """Tests training (forward and backward pass) for TraxKerasLayer.

    The same three update steps are applied to a bare Trax layer and to the
    `TraxKerasLayer` wrapping it; outputs, weights, state and rng must agree
    after every step.

    Args:
      layer_id: an integer, the index into `_LAYERS`.
      rng_updater_id: an integer, the index into `_RNG_UPDATERS`.
      batch_size: an integer or `None`, the value for the `batch_size`
        argument in `TraxKerasLayer.__init__`.
      trax_has_weights: bool, whether to make the trax layer contain weights
        at the time when `TraxKerasLayer.build` is called.
      explicit_build: bool, whether to explicitly call
        `TraxKerasLayer.build`.
      use_model: bool, whether to build a `tf.keras.Model` out of the
        `TraxKerasLayer` layer and use the model to do the training instead
        of the bare layer. If `True`, we will also test checkpointing and
        restoring using the model.
    """
    with trax.fastmath.use_backend("tensorflow-numpy"):
        (make_trax_layer, input_shapes_no_batch, dtype,
         allow_none_batch) = _LAYERS[layer_id]
        # We make a fresh trax layer for each test case, so that different
        # test cases won't interfere with each other.
        trax_layer = make_trax_layer()
        if batch_size is None and not allow_none_batch:
            self.skipTest("This Trax layer can't handle None batch size.")
        rng_updater = _RNG_UPDATERS[rng_updater_id]

        input_shapes = math_lib.nested_map(lambda s: [batch_size] + s,
                                           input_shapes_no_batch)
        input_sig = trax2keras.tensor_shapes_to_shape_dtypes(
            input_shapes, dtype)
        initializer_rng = math_lib.random.get_prng(765)
        weights, state = trax_layer.init(input_sig, rng=initializer_rng)
        generator = tf.random.Generator.from_seed(567)

        def get_inputs():
            return dummy_inputs(generator, input_sig)

        if trax_has_weights:
            # Calling the layer once stores the weights on it before the
            # Keras wrapper is built.
            trax_layer(to_arrays(get_inputs()), weights=weights, state=state)

        rng = math_lib.random.get_prng(1234)
        keras_layer = trax2keras.TraxKerasLayer(
            trax_layer,
            batch_size=batch_size,
            initializer_rng=initializer_rng,
            rng=rng,
            rng_updater=rng_updater)
        if explicit_build:
            keras_layer.build(input_shapes)
        if use_model:
            model_input = tf.keras.Input(shape=input_shapes_no_batch,
                                         dtype=dtype)
            model_output = keras_layer(model_input)
            keras_model = tf.keras.Model(inputs=model_input,
                                         outputs=model_output)

        lr = 0.1  # learning rate
        for _ in range(3):
            inputs = get_inputs()

            # Reference step on the raw Trax layer.
            with tf.GradientTape() as trax_tape:
                trax_tape.watch(
                    [leaf.data for leaf in tf.nest.flatten(weights)])
                trax_outputs, state = trax_layer.pure_fn(to_arrays(inputs),
                                                         weights=weights,
                                                         state=state,
                                                         rng=rng)
            trax_grads = trax_tape.gradient(
                *to_tensors([trax_outputs, weights]))
            # `g` may be `tf.IndexedSlices`, so we need to
            # `convert_to_tensor` before multiplication.
            weights = tf.nest.map_structure(
                lambda w, g: w + jnp.asarray(lr * tf.convert_to_tensor(g),
                                             w.dtype),
                weights, trax_grads)
            rng = rng_updater(rng)

            # The same step through the Keras wrapper (or the model around
            # it).
            with tf.GradientTape() as keras_tape:
                keras_callable = keras_model if use_model else keras_layer
                keras_outputs = keras_callable(inputs)
            if isinstance(keras_outputs, tuple) and len(keras_outputs) == 1:
                keras_outputs = keras_outputs[0]
            self.assertAllClose(to_tensors(trax_outputs), keras_outputs)
            keras_vars = keras_layer.trainable_variables
            keras_grads = keras_tape.gradient(keras_outputs, keras_vars)
            tf.nest.map_structure(
                lambda v, g: v.assign_add(  # pylint: disable=g-long-lambda
                    tf.cast(lr * tf.convert_to_tensor(g), v.dtype)),
                keras_vars, keras_grads)

            # Both sides must agree on weights, state and rng after the step.
            weight_atol = 2e-4 if has_gpu() else 1e-6
            self.assertAllClose(to_tensors(weights),
                                read_values(keras_layer._weights),
                                rtol=2e-6,
                                atol=weight_atol)
            self.assertAllClose(to_tensors(state),
                                read_values(keras_layer._state))
            self.assertAllClose(to_tensors(rng),
                                read_values(keras_layer._rng))

        if use_model:
            # Save and reload the model; both copies must produce the same
            # outputs.
            fname = os.path.join(self.get_temp_dir(), "checkpoint")
            keras_model.save(fname)
            loaded_model = tf.keras.models.load_model(fname)
            for _ in range(2):
                inputs = get_inputs()
                self.assertAllClose(keras_model(inputs),
                                    loaded_model(inputs))
def testBijector(self, bijector_name, data):
    """Exercise gradients, monotonicity, batch shape and dtypes of a bijector.

    A bijector is drawn via `self._draw_bijector`, then four methods are
    differentiated in turn under a fresh `tf.GradientTape`: `forward`,
    `forward_log_det_jacobian`, `inverse` and `inverse_log_det_jacobian`.
    Each result's gradients with respect to the method input and the
    bijector's floating-point trainable variables are handed to
    `assert_no_none_grad`. Every call is additionally wrapped in
    `tfp_hps.assert_no_excessive_var_usage`.

    Further checks performed afterwards:
      * for bijectors whose forward and inverse min event ndims are both 0,
        the sign of the forward derivative is compared against
        `bijector._internal_is_increasing()`;
      * `_is_permutation` implies `_is_constant_jacobian` and a zero ILDJ;
      * `experimental_batch_shape` / `experimental_batch_shape_tensor` agree
        with the batch shape computed from the tensors themselves;
      * `forward_dtype` / `inverse_dtype` agree with the dtypes of `ys`/`xs`.

    Args:
      bijector_name: Name forwarded to `self._draw_bijector` to select the
        bijector under test; also matched against 'Tanh' for the skip rule.
      data: Draw source used through `data.draw(...)` and passed to the
        `_draw_*` helpers. Presumably a Hypothesis data object -- the
        decorators are outside this block, confirm there.
    """
    tfp_hps.guitar_skip_if_matches('Tanh', bijector_name, 'b/144163991')
    bijector, event_dim = self._draw_bijector(bijector_name, data)
    # Forward mapping: Check differentiation through forward mapping with
    # respect to the input and parameter variables. Also check that any
    # variables are not referenced overmuch.
    xs = self._draw_domain_tensor(bijector, data, event_dim)
    # Differentiate w.r.t. the input plus every floating-point trainable
    # variable of the bijector.
    wrt_vars = [xs] + [
        v for v in bijector.trainable_variables if v.dtype.is_floating
    ]
    with tf.GradientTape() as tape:
        with tfp_hps.assert_no_excessive_var_usage(
                'method `forward` of {}'.format(bijector)):
            tape.watch(wrt_vars)
            # TODO(b/73073515): Fix graph mode gradients with bijector caching.
            # NOTE(review): the `+ 0` presumably yields a fresh tensor so the
            # bijector cache is not hit -- confirm against the bug above.
            ys = bijector.forward(xs + 0)
    grads = tape.gradient(ys, wrt_vars)
    assert_no_none_grad(bijector, 'forward', wrt_vars, grads)
    # For scalar bijectors, verify correctness of the _is_increasing method.
    # TODO(b/148459057): Except, don't verify Softfloor on Guitar because
    # of numerical problem.
    def exception(bijector):
        # True only when running under Guitar and the bijector is a Softfloor,
        # possibly wrapped in one or more inverting bijectors (unwrapped
        # recursively through `.bijector`).
        if not tfp_hps.running_under_guitar():
            return False
        if isinstance(bijector, tfb.Softfloor):
            return True
        if is_invert(bijector):
            return exception(bijector.bijector)
        return False
    if (bijector.forward_min_event_ndims == 0
            and bijector.inverse_min_event_ndims == 0
            and not exception(bijector)):
        # `grads[0]` is the gradient w.r.t. `xs`, the first entry of
        # `wrt_vars` for the forward pass above.
        dydx = grads[0]
        hp.note('dydx: {}'.format(dydx))
        isfinite = tf.math.is_finite(dydx)
        incr_or_slope_eq0 = bijector._internal_is_increasing() | tf.equal(
            dydx, 0)  # pylint: disable=protected-access
        # Only finite derivative entries are compared. `&` binds tighter than
        # `|`, so the right-hand side is `(isfinite & (dydx >= 0)) | zeros`.
        # NOTE(review): OR-ing with `tf.zeros_like(...)` presumably only
        # broadcasts the right-hand side to the left-hand shape -- confirm.
        self.assertAllEqual(
            isfinite & incr_or_slope_eq0,
            isfinite & (dydx >= 0) | tf.zeros_like(incr_or_slope_eq0))
    # FLDJ: Check differentiation through forward log det jacobian with
    # respect to the input and parameter variables. Also check that any
    # variables are not referenced overmuch.
    event_ndims = data.draw(
        hps.integers(min_value=bijector.forward_min_event_ndims,
                     max_value=xs.shape.ndims))
    with tf.GradientTape() as tape:
        max_permitted = _ldj_tensor_conversions_allowed(bijector,
                                                        is_forward=True)
        with tfp_hps.assert_no_excessive_var_usage(
                'method `forward_log_det_jacobian` of {}'.format(bijector),
                max_permissible=max_permitted):
            tape.watch(wrt_vars)
            # TODO(b/73073515): Fix graph mode gradients with bijector caching.
            ldj = bijector.forward_log_det_jacobian(
                xs + 0, event_ndims=event_ndims)
    grads = tape.gradient(ldj, wrt_vars)
    assert_no_none_grad(bijector, 'forward_log_det_jacobian', wrt_vars, grads)
    # Inverse mapping: Check differentiation through inverse mapping with
    # respect to the codomain "input" and parameter variables. Also check that
    # any variables are not referenced overmuch.
    ys = self._draw_codomain_tensor(bijector, data, event_dim)
    # From here on `wrt_vars` is headed by `ys` instead of `xs`.
    wrt_vars = [ys] + [
        v for v in bijector.trainable_variables if v.dtype.is_floating
    ]
    with tf.GradientTape() as tape:
        with tfp_hps.assert_no_excessive_var_usage(
                'method `inverse` of {}'.format(bijector)):
            tape.watch(wrt_vars)
            # TODO(b/73073515): Fix graph mode gradients with bijector caching.
            # `xs` is rebound to the inverse image of the freshly drawn `ys`;
            # the batch-shape and dtype checks below read this value.
            xs = bijector.inverse(ys + 0)
    grads = tape.gradient(xs, wrt_vars)
    assert_no_none_grad(bijector, 'inverse', wrt_vars, grads)
    # ILDJ: Check differentiation through inverse log det jacobian with respect
    # to the codomain "input" and parameter variables. Also check that any
    # variables are not referenced overmuch.
    event_ndims = data.draw(
        hps.integers(min_value=bijector.inverse_min_event_ndims,
                     max_value=ys.shape.ndims))
    with tf.GradientTape() as tape:
        max_permitted = _ldj_tensor_conversions_allowed(bijector,
                                                        is_forward=False)
        with tfp_hps.assert_no_excessive_var_usage(
                'method `inverse_log_det_jacobian` of {}'.format(bijector),
                max_permissible=max_permitted):
            tape.watch(wrt_vars)
            # TODO(b/73073515): Fix graph mode gradients with bijector caching.
            ldj = bijector.inverse_log_det_jacobian(
                ys + 0, event_ndims=event_ndims)
    grads = tape.gradient(ldj, wrt_vars)
    assert_no_none_grad(bijector, 'inverse_log_det_jacobian', wrt_vars, grads)
    # Verify that `_is_permutation` implies constant zero Jacobian.
    # `ldj` here is the inverse log det jacobian computed just above.
    if bijector._is_permutation:
        self.assertTrue(bijector._is_constant_jacobian)
        self.assertAllEqual(ldj, 0.)
    # Verify correctness of batch shape.
    # `event_ndims` is the value drawn for the ILDJ check, i.e. it refers to
    # `ys`; `inverse_event_ndims` maps it to the matching ndims for `xs`, which
    # are stripped from the trailing end of each `xs` shape.
    xs_batch_shapes = tf.nest.map_structure(
        lambda x, nd: ps.shape(x)[:ps.rank(x) - nd], xs,
        bijector.inverse_event_ndims(event_ndims))
    # Broadcast the per-part batch shapes together into a single shape.
    empirical_batch_shape = functools.reduce(
        ps.broadcast_shape,
        nest.flatten_up_to(bijector.forward_min_event_ndims, xs_batch_shapes))
    batch_shape = bijector.experimental_batch_shape(
        y_event_ndims=event_ndims)
    # The static shape is compared only when fully defined; the tensor-valued
    # shape is always compared.
    if tensorshape_util.is_fully_defined(batch_shape):
        self.assertAllEqual(empirical_batch_shape, batch_shape)
    self.assertAllEqual(
        empirical_batch_shape,
        bijector.experimental_batch_shape_tensor(
            y_event_ndims=event_ndims))
    # Check that the outputs of forward_dtype and inverse_dtype match the dtypes
    # of the outputs of forward and inverse.
    self.assertAllEqualNested(ys.dtype, bijector.forward_dtype(xs.dtype))
    self.assertAllEqualNested(xs.dtype, bijector.inverse_dtype(ys.dtype))
def grad(model, inputs):
    """Differentiate the summed `model.losses` after a training-mode call.

    Args:
      model: Callable invoked as `model(inputs, training=True)`; must expose
        `losses` and `trainable_variables`.
      inputs: Value forwarded unchanged to `model`.

    Returns:
      A 2-tuple `(losses, gradients)`: `model.losses` as read after the
      forward pass, and the gradient of their sum with respect to
      `model.trainable_variables`.
    """
    with tf.GradientTape() as recorder:
        # The forward outputs are discarded; only `model.losses` is read.
        model(inputs, training=True)
        # Reduce inside the tape context so the summation itself is recorded.
        total_loss = sum(model.losses)
    current_losses = model.losses
    gradients = recorder.gradient(total_loss, model.trainable_variables)
    return current_losses, gradients